In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

In [2]:
df = pd.read_csv("clean.csv")

In [3]:
df = df.dropna(subset=["transfer_fee_amnt"])


In [4]:
features = [
    "season", "window", "player_age", "player_nation", "player_pos",
    "market_val_amnt", "is_free", "is_loan", "is_loan_end", "is_retired"
]
target = "transfer_fee_amnt"

In [5]:
X = df[features]
y = df[target]



In [6]:
numeric_features = ["season", "player_age", "market_val_amnt"]
categorical_features = ["window", "player_nation", "player_pos"]
boolean_features = ["is_free", "is_loan", "is_loan_end", "is_retired"]


In [7]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

In [8]:
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])


In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ],
    remainder="passthrough"  # Keep boolean features
)

In [10]:
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [13]:
y_pred = model.predict(X_test)

In [14]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


In [15]:
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f}")

Root Mean Squared Error (RMSE): 19,115,976.38


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28263 entries, 0 to 28262
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   league                28263 non-null  object 
 1   season                28263 non-null  int64  
 2   window                28263 non-null  object 
 3   team_id               28263 non-null  int64  
 4   team_name             28263 non-null  object 
 5   team_country          28263 non-null  object 
 6   dir                   28263 non-null  object 
 7   player_id             28263 non-null  int64  
 8   player_name           28263 non-null  object 
 9   player_age            28259 non-null  float64
 10  player_nation         28261 non-null  object 
 11  player_nation2        9676 non-null   object 
 12  player_pos            28263 non-null  object 
 13  counter_team_id       28263 non-null  object 
 14  counter_team_name     28263 non-null  object 
 15  counter_team_countr

In [17]:
df.to_csv('clean.csv',index=False)

In [18]:
df.head(50)

Unnamed: 0,league,season,window,team_id,team_name,team_country,dir,player_id,player_name,player_age,...,counter_team_id,counter_team_name,counter_team_country,transfer_fee_amnt,market_val_amnt,is_free,is_loan,is_loan_end,is_retired,transfer_id
0,GB1,2009,s,985,Manchester United,England,in,33544,Antonio Valencia,23.0,...,1071,Wigan Athletic,England,18800000.0,,False,False,False,False,310832
1,GB1,2009,s,985,Manchester United,England,in,62049,Mame Diouf,21.0,...,687,Molde FK,Norway,4500000.0,1600000.0,False,False,False,False,319841
2,GB1,2009,s,985,Manchester United,England,in,43261,Gabriel Obertan,20.0,...,40,FC Girondins Bordeaux,France,4000000.0,400000.0,False,False,False,False,315185
3,GB1,2009,s,985,Manchester United,England,in,1397,Michael Owen,29.0,...,762,Newcastle United,England,0.0,,True,False,False,False,306421
4,GB1,2009,s,985,Manchester United,England,left,8198,Cristiano Ronaldo,24.0,...,418,Real Madrid,Spain,94000000.0,45000000.0,False,False,False,False,308498
5,GB1,2009,s,985,Manchester United,England,left,42411,Fraizer Campbell,21.0,...,289,Sunderland AFC,England,4100000.0,700000.0,False,False,False,False,316117
6,GB1,2009,s,985,Manchester United,England,left,64484,Manucho,26.0,...,366,Real Valladolid CF,Spain,2750000.0,,False,False,False,False,320029
7,GB1,2009,s,985,Manchester United,England,left,15449,Lee Martin,22.0,...,677,Ipswich Town,England,2250000.0,250000.0,False,False,False,False,314073
8,GB1,2009,s,31,Liverpool FC,England,in,3881,Glen Johnson,24.0,...,1020,Portsmouth FC,England,20500000.0,,False,False,False,False,303572
9,GB1,2009,s,31,Liverpool FC,England,in,5957,Alberto Aquilani,25.0,...,12,AS Roma,Italy,20000000.0,,False,False,False,False,329599


In [19]:
joblib.dump(model, "transfer_fee_model.pkl")
print("✅ Model saved as 'transfer_fee_model.pkl'")

✅ Model saved as 'transfer_fee_model.pkl'
