In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
# Load data

df = pd.read_csv("../data/CompleteDataset2.csv", low_memory=False)

In [2]:
selected_columns = [
    "Overall",
    "Potential",
    "Ball control",
    "Composure",
    "Reactions",
    "Short passing",
    "Age",
    "Value",
]

df = df[selected_columns]

In [3]:
def parseValue(x):
    x = str(x).replace("€", "")
    if "M" in str(x):
        x = str(x).replace("M", "")
        x = float(x) * 1000000
    elif "K" in str(x):
        x = str(x).replace("K", "")
        x = float(x) * 1000
    return float(x)

In [4]:
df["Value"] = df["Value"].apply(parseValue)

In [5]:
df.head()

Unnamed: 0,Overall,Potential,Ball control,Composure,Reactions,Short passing,Age,Value
0,94,94,93,95,96,83,32,95500000.0
1,93,93,95,96,95,88,30,105000000.0
2,92,94,95,92,88,81,25,123000000.0
3,92,92,91,83,93,83,30,97000000.0
4,92,92,48,70,85,55,31,61000000.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17981 entries, 0 to 17980
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Overall        17981 non-null  int64  
 1   Potential      17981 non-null  int64  
 2   Ball control   17981 non-null  object 
 3   Composure      17981 non-null  object 
 4   Reactions      17981 non-null  object 
 5   Short passing  17981 non-null  object 
 6   Age            17981 non-null  int64  
 7   Value          17981 non-null  float64
dtypes: float64(1), int64(3), object(4)
memory usage: 1.1+ MB


In [7]:
numeric_columns = ["Ball control", "Composure", "Reactions", "Short passing"]
df[numeric_columns] = (
    df[numeric_columns].apply(pd.to_numeric, errors="coerce").fillna(0)
)

In [8]:
imputer = SimpleImputer(strategy="mean")
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [9]:
clf = RandomForestRegressor(n_estimators=15, random_state=42)
rfe = RFE(estimator=clf, n_features_to_select=6, step=1)
x_selected = rfe.fit_transform(df_imputed.drop("Value", axis=1), df_imputed["Value"])

In [10]:
x_train, x_test, y_train, y_test = train_test_split(
    x_selected, df_imputed["Value"], test_size=0.3, random_state=42
)

In [11]:
clf.fit(x_train, y_train)

In [12]:
y_pred = clf.predict(x_test)

In [13]:
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")

Mean Absolute Error: 240653.70713623727
Mean Squared Error: 804770967123.9088


In [14]:
from sklearn.metrics import r2_score

In [15]:
y_pred = clf.predict(x_test)

In [16]:
r2 = r2_score(y_test, y_pred)

In [17]:
print(f"The coefficient of determination (R²): {r2}")

The coefficient of determination (R²): 0.9751446544171671


In [18]:
for i in range(len(rfe.support_)):
    if rfe.support_[i]:
        print(df_imputed.columns[i])

Overall
Potential
Ball control
Composure
Reactions
Age


In [19]:
data = [[90, 95, 95, 96, 94, 90]]
print(clf.predict(data))

[92866666.66666667]


In [20]:
print(clf.predict(data))

[92866666.66666667]


In [21]:
data = [[93, 93, 9, 96, 94, 30]]
print(clf.predict(data))

[81800000.]


In [22]:
data = [[93, 93, 90, 96, 94, 30]]
print(clf.predict(data))

[83933333.33333333]


In [24]:
joblib.dump(clf, "../Models/priceQuess.pkl")

['../Models/priceQuess.pkl']

In [25]:
clf = joblib.load("../Models/priceQuess.pkl")

In [26]:
data = [[93, 93, 90, 96, 94, 30]]
print(clf.predict(data))

[83933333.33333333]
