In [1]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import smogn
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [4]:
# Load the feature table; the regression label is stored in the 'target' column.
df = pd.read_csv('features.csv')

# Predictors: every column except the label and the columns listed below.
# `columns=` already selects the axis, so no separate `axis=1` is needed.
x = df.drop(columns=['target', 'Medu', 'health', 'Dalc', 'Walc', 'traveltime', 'Mjob', 'internet',
                     'Pstatus', 'goout', 'Fjob', 'guardian', 'activities'])
y = df['target']

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Renumber the training rows 0..n-1 so features and label share the same
# index when they are recombined into a single frame below.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# smogn works on one DataFrame that contains both the features and the label.
df_train = X_train.copy()
df_train['target'] = y_train

# The resampling step below is stochastic. Seed the global generators right
# before the call so the synthetic rows are repeatable across kernel restarts.
# NOTE(review): this assumes smogn draws from the `random` / `numpy.random`
# global state — confirm against the installed smogn version.
random.seed(42)
np.random.seed(42)

# Only the training split is resampled; X_test / y_test are left untouched.
df_train_bal = smogn.smoter(
    data=df_train,
    y='target',
    k=5,
    samp_method='extreme'
)

# Split the resampled frame back into predictors and label.
X_train_smogn = df_train_bal.drop(columns='target')
y_train_smogn = df_train_bal['target']

dist_matrix: 100%|#############################| 55/55 [00:00<00:00, 108.15it/s]
synth_matrix: 100%|############################| 55/55 [00:00<00:00, 117.00it/s]
r_index: 100%|#################################| 27/27 [00:00<00:00, 705.65it/s]


In [6]:
# Base estimator; the fixed seed keeps every candidate forest repeatable.
rf = RandomForestRegressor(random_state=42)

# Hyper-parameter grid: 4 * 4 * 3 * 2 * 2 * 1 = 192 combinations.
param_grid = dict(
    n_estimators=[30, 50, 100, 200],
    max_depth=[5, 10, 15, 20],
    min_samples_split=[10, 15, 20],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
    bootstrap=[True],
)

# Exhaustive 5-fold search scored by R^2, run in parallel on all cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# NOTE(review): the search is fitted on the original training split
# (X_train / y_train), not on X_train_smogn / y_train_smogn — confirm
# that this is the intended baseline.
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score.
print(f'Melhores parâmetros: {grid_search.best_params_}')
print(f'Melhor score (CV): {grid_search.best_score_:.4f}')

Fitting 5 folds for each of 192 candidates, totalling 960 fits
Melhores parâmetros: {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 30}
Melhor score (CV): 0.2839
