In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import roc_auc_score,confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder

from data_process import process_raw_data, process_curated_data, data_encoding

In [10]:
df_raw = process_raw_data(yr_list = [2022,2023,2024])
df_cur = process_curated_data(df_raw = df_raw)
df_encoded = data_encoding(df_cur = df_cur)

### Starting data processing
    ## Processing data of 2022
    ## Processing data of 2023
    ## Processing data of 2024


In [11]:
df_encoded

Unnamed: 0,pontos_num,rodada_id,variacao_num,media_num,preco_inicial,benefit_ratio,apelido_abel_ferreira,apelido_acevedo,apelido_ademir,apelido_adriano_martins,...,clube_id_293,clube_id_356,clube_id_373,clube_id_1371,posicao_id_ata,posicao_id_gol,posicao_id_lat,posicao_id_mei,posicao_id_tec,posicao_id_zag
0,1.00,19,-0.57,4.42,12.76,0.078370,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,4.70,19,-0.40,3.41,13.02,0.360983,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,4.20,19,-0.22,6.76,13.77,0.305011,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,5.90,19,0.80,2.45,5.15,1.145631,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,6.80,19,-0.75,8.41,18.53,0.366972,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7258,1.30,13,0.31,1.30,2.00,0.650000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7259,-0.30,13,-0.44,0.10,2.92,-0.102740,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7260,14.10,13,3.89,7.90,1.55,9.096774,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7261,3.68,13,-0.15,4.82,8.28,0.444444,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
X = df_encoded.iloc[:,1:].values
y = df_encoded.iloc[:,0].values

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [13]:
rf = RandomForestRegressor()
rf.fit(X_train,y_train)

In [14]:
y_pred = rf.predict(X_test)
rf_score = rf.score(X_test,y_test)

print(f"Random Forest Score: {rf_score}")

Random Forest Score: 0.978832113250017


In [15]:
cval_rf = cross_val_score(rf,X_train,y_train, cv=5)

print(f"Random Forest Cross Val Score {cval_rf}")

Random Forest Cross Val Score [0.97299136 0.97805745 0.97785293 0.97996277 0.98148115]


In [16]:
import pickle

# Nome do arquivo onde o modelo será salvo
model_filename = 'random_forest_model.pkl'

# Salvar o modelo
with open(model_filename, 'wb') as file:
    pickle.dump(rf, file)

print(f"Modelo salvo como {model_filename}")

Modelo salvo como random_forest_model.pkl
