 Load the Dataset

In [13]:
import pandas as pd

# Load your dataset
df = pd.read_csv('./FullData.csv')

# Display the first few rows to understand its structure
print(df.head())


                Name Nationality National_Position  National_Kit   
0  Cristiano Ronaldo    Portugal                LS           7.0  \
1       Lionel Messi   Argentina                RW          10.0   
2             Neymar      Brazil                LW          10.0   
3        Luis Suárez     Uruguay                LS           9.0   
4       Manuel Neuer     Germany                GK           1.0   

           Club Club_Position  Club_Kit Club_Joining  Contract_Expiry  Rating   
0   Real Madrid            LW       7.0   07/01/2009           2021.0      94  \
1  FC Barcelona            RW      10.0   07/01/2004           2018.0      93   
2  FC Barcelona            LW      11.0   07/01/2013           2021.0      92   
3  FC Barcelona            ST       9.0   07/11/2014           2021.0      92   
4     FC Bayern            GK       1.0   07/01/2011           2021.0      92   

   ... Long_Shots Curve Freekick_Accuracy Penalties  Volleys GK_Positioning   
0  ...         90    81  

Task 1: Create Attribute Dependent or Overall Best Teams
For creating the best teams based on attributes or overall ratings, you'll want to sort the players by their overall rating or specific attributes and then pick the top players for each position.

In [14]:
# Example for overall best team (top 11 players regardless of position)
best_team = df.sort_values(by='Rating', ascending=False).head(11)
print("Overall Best Team:")
print(best_team[['Name', 'Nationality', 'Club', 'Rating']])


Overall Best Team:
                  Name Nationality             Club  Rating
0    Cristiano Ronaldo    Portugal      Real Madrid      94
1         Lionel Messi   Argentina     FC Barcelona      93
2               Neymar      Brazil     FC Barcelona      92
3          Luis Suárez     Uruguay     FC Barcelona      92
4         Manuel Neuer     Germany        FC Bayern      92
5               De Gea       Spain   Manchester Utd      90
6   Robert Lewandowski      Poland        FC Bayern      90
7          Gareth Bale       Wales      Real Madrid      90
8   Zlatan Ibrahimović      Sweden   Manchester Utd      90
14     Gonzalo Higuaín   Argentina         Juventus      89
17       Sergio Agüero   Argentina  Manchester City      89


Task 2: Create the Fastest/Slowest Teams
Speed can be a combination of 'Acceleration' and 'Speed' attributes. For simplicity, let's just use one of these or an average if both are available.

In [15]:
# Assuming 'Speed' is an attribute, adjust if using a different attribute like 'Pace' or 'Acceleration'
fastest_team = df.sort_values(by='Speed', ascending=False).head(11)
slowest_team = df.sort_values(by='Speed').head(11)

print("Fastest Team:")
print(fastest_team[['Name', 'Nationality', 'Club', 'Speed']])

print("\nSlowest Team:")
print(slowest_team[['Name', 'Nationality', 'Club', 'Speed']])


Fastest Team:
                           Name    Nationality             Club  Speed
35    Pierre-Emerick Aubameyang          Gabon    Bor. Dortmund     96
1461          Jonathan Biabiany         France            Inter     96
8598               Anibal Chalá        Ecuador        FC Dallas     96
5091              Ernest Asante          Ghana  FC Nordsjælland     95
2209                Jürgen Damm         Mexico           Tigres     95
7                   Gareth Bale          Wales      Real Madrid     95
8305               Mathis Bolly    Ivory Coast   Greuther Fürth     95
1748              Víctor Ibarbo       Colombia       Sagan Tosu     95
393                    Williams          Spain  Athletic Bilbao     95
6060         Michael O'Halloran       Scotland          Rangers     94
6743              Gboly Ariyibi  United States    Nott'm Forest     94

Slowest Team:
                        Name     Nationality             Club  Speed
16884         Miłosz Mleczko          Poland      

Task 3: Attributes by World Areas
To see which areas of the world provide which attributes, group players by their Nationality or a broader region classification, then calculate the average for attributes like 'Stamina' and 'Pace'.


In [16]:
# You might need to map countries to continents or regions first if going by areas rather than nationality

# Example for averaging 'Stamina' by nationality
average_stamina_by_nationality = df.groupby('Nationality')['Stamina'].mean().sort_values(ascending=False)
print("Average Stamina by Nationality:")
print(average_stamina_by_nationality.head(10))


Average Stamina by Nationality:
Nationality
Chad          85.000000
Aruba         79.000000
Guam          78.000000
Barbados      78.000000
Montserrat    76.000000
Niger         75.500000
Cuba          75.000000
Belize        75.000000
Uzbekistan    74.666667
Suriname      74.500000
Name: Stamina, dtype: float64


Task 4: Best Players by Position
Group players by position, then find the player with the highest rating in each position.

In [17]:
# First, identify the best players by position based on their highest rating
best_by_position = df.loc[df.groupby('Club_Position')['Rating'].idxmax()]

# Now, sort these players by 'Rating' in descending order
best_by_position_sorted = best_by_position.sort_values(by='Rating', ascending=False)

# Print the sorted list
print("Best Players by Position in Descending Order of Rating:")
print(best_by_position_sorted[['Name', 'Club_Position', 'Rating']])


Best Players by Position in Descending Order of Rating:
                    Name Club_Position  Rating
0      Cristiano Ronaldo            LW      94
1           Lionel Messi            RW      93
3            Luis Suárez            ST      92
4           Manuel Neuer            GK      92
15          Thiago Silva           LCB      89
12           Luka Modrić           RCM      89
13            Mesut Özil           CAM      89
10        Jérôme Boateng           Sub      89
32               Iniesta            LM      88
19     Antoine Griezmann            RS      88
28          Philipp Lahm            RB      88
29                  Pepe           RCB      88
18            Paul Pogba           LCM      88
37          Arturo Vidal           LDM      87
42          Arjen Robben            RM      87
56      Radja Nainggolan            LF      86
49       Sergio Busquets           CDM      86
50                Thiago           RDM      86
45           David Alaba            LB      86
71  

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Assuming df is your DataFrame, already loaded

# Convert Preferred Foot to a binary feature: Right=1, Left=0
# If 'Preferred_Foot' is not binary, adjust accordingly. This step assumes binary representation.
df['Preffered_Foot'] = df['Preffered_Foot'].map({'Right': 1, 'Left': 0})

# Assuming the list of features you're interested in includes technical skills
features = [
    'Dribbling', 'Marking', 'Speed', 'Finishing', 'Agility', 'Strength', 'Jumping', 
    'Heading', 'Vision', 'Short_Pass', 'Ball_Control', 'Long_Pass', 'Crossing', 
    'Standing_Tackle', 'Sliding_Tackle', 'Preffered_Foot'  # Assuming correct column name
]  # Now including 'Preferred_Foot'


# Map specific positions to broader categories
position_groupings = {
    'LF': 'ST', 'RF': 'ST', 'CF': 'ST', 'LS': 'ST', 'RS': 'ST', 'ST': 'ST',
    'LW': 'Winger', 'RW': 'Winger',
    'RDM': 'Midfield', 'LDM': 'Midfield', 'LM': 'Midfield', 'RM': 'Midfield', 'CM': 'Midfield', 'RCM': 'Midfield',
    'LCM': 'Midfield', 'CAM': 'Midfield', 'CDM': 'Midfield', 'RAM': 'Midfield', 'LAM': 'Midfield',
    'LWB': 'Wingback', 'RWB': 'Wingback',
    'LB': 'Defender', 'RB': 'Defender', 'CB': 'Defender', 'RCB': 'Defender', 'LCB': 'Defender'
}


# Filter out rows with 'RES' or 'SUB' in Club_Position
cleaned_df = df.loc[~df['Club_Position'].isin(['Res', 'Sub'])]

# Apply the position groupings to the 'Club_Position' column
cleaned_df['Position_Group'] = cleaned_df['Club_Position'].map(position_groupings)

# Fill in any missing groupings with the original position (for positions already broad or not mapped)
cleaned_df['Position_Group'].fillna(cleaned_df['Club_Position'], inplace=True)

# Apply the position groupings to the 'National_Position' column, with an adjustment for NaN values
cleaned_df['National_Position_Group'] = cleaned_df['National_Position'].map(position_groupings)
cleaned_df['National_Position_Group'].fillna('', inplace=True)

# Drop rows with any NaN values in the features or target
cleaned_df = cleaned_df.dropna(subset=features + ['Club_Position'])

# Since 'National_Position_Group' is categorical, apply one-hot encoding to it and any other categorical features
cleaned_df = pd.get_dummies(cleaned_df, columns=['National_Position_Group'], drop_first=True)

# Ensure the features list is updated to include these new one-hot encoded columns
features.extend([col for col in cleaned_df.columns if 'National_Position_Group' in col])

# Define X and y from the cleaned DataFrame
X = cleaned_df[features]
y = cleaned_df['Position_Group']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Initialize GridSearchCV with the RandomForestClassifier, parameter grid, and number of folds for cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the GridSearchCV to the data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best parameters found: ", grid_search.best_params_)

# Use the best estimator from the grid search to make predictions
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Evaluate the best model with accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with best parameters:", accuracy)

# Feature importance from the best model
importances = best_rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature rankings
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print(f"{f + 1}. feature {features[indices[f]]} ({importances[indices[f]]})")

Fitting 5 folds for each of 108 candidates, totalling 540 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Position_Group'] = cleaned_df['Club_Position'].map(position_groupings)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Position_Group'].fillna(cleaned_df['Club_Position'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['National_Position_Group'] = cleaned_df['National_Position'].map(position_groupings)
A va

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators

In [32]:

# Retrieve names for the test set (assuming 'Name' was part of your original DataFrame)
test_names = cleaned_df.loc[y_test.index, 'Name']




# Create a DataFrame with actual and predicted values, including player names
results_df = pd.DataFrame({
    'Name': test_names,
    'Actual Position': y_test,
    'Predicted Position': y_pred
}).reset_index(drop=True)

# Set the maximum number of rows to display
pd.set_option('display.max_rows', 50)


# Display the DataFrame
print(results_df)


# Function to display a DataFrame in chunks
#def display_df_in_chunks(df, chunk_size=50):
    #total_rows = len(df)
    #for start in range(0, total_rows, chunk_size):
        #end = min(start + chunk_size, total_rows)
       # display(df[start:end])

# Use the function to display your DataFrame
#display_df_in_chunks(results_df, chunk_size=50)

                        Name Actual Position Predicted Position
0           Maciej Jankowski              ST           Midfield
1     Saad Abdulameer Luaibi        Midfield           Midfield
2                 Tom Miller        Defender           Defender
3            Anders Trondsen        Midfield           Midfield
4        Mahmood Moaaz Hasah        Defender           Defender
...                      ...             ...                ...
1385        Cristian Ledesma        Midfield           Midfield
1386            Alfred Gomis              GK                 GK
1387           Yuriy Lodygin              GK                 GK
1388         James Rodríguez        Midfield           Midfield
1389        Kalifa Coulibaly              ST                 ST

[1390 rows x 3 columns]


In [33]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Assuming your model was trained with scaled numerical data
scaler = StandardScaler()

def preprocess(age, height, weight):
    # Convert inputs to float
    age = float(age) / 100
    height = (float(height) - 1.5) / 1
    weight = (float(weight) - 50) / 100
    
    # Reshape data to match the input shape of your model
    data = [[age, height, weight]]
    
    return data



In [34]:
my_model = best_rf

# Gather input data

# Ask the user for the values of the features
dribbling = float(input("Enter dribbling: "))
marking = float(input("Enter marking: "))
speed = float(input("Enter speed: "))
finishing = float(input("Enter finishing: "))
agility = float(input("Enter agility: "))
strength = float(input("Enter strength: "))
jumping = float(input("Enter jumping: "))
heading = float(input("Enter heading: "))
vision = float(input("Enter vision: "))
short_pass = float(input("Enter short pass: "))
ball_control = float(input("Enter ball control: "))
long_pass = float(input("Enter long pass: "))
crossing = float(input("Enter crossing: "))
standing_tackle = float(input("Enter standing tackle: "))
sliding_tackle = float(input("Enter sliding tackle: "))
preferred_foot = float(input("Enter preferred foot (Right=1, Left=0): "))
national_position_group_1 = 0
national_position_group_2 = 0
national_position_group_3 = 0
national_position_group_4 = 0
national_position_group_5 = 0

# Create input_data with the values provided by the user
input_data = [[dribbling, marking, speed, finishing, agility, strength, jumping, heading, vision, short_pass, ball_control, long_pass, crossing, standing_tackle, sliding_tackle, preferred_foot, national_position_group_1, national_position_group_2, national_position_group_3, national_position_group_4, national_position_group_5]]

# # Assuming le is the LabelEncoder you used for encoding the positions
# le = LabelEncoder()

# # Fit the LabelEncoder with the positions
# le.fit(['ST', 'Winger', 'Midfield', 'Wingback', 'Defender'])

# Use the model to make a prediction
prediction = my_model.predict(input_data)


print("Predicted position: ", prediction)

Predicted position:  ['Midfield']


