# Grid-Search


---


[documentação](https://scikit-learn.org/stable/index.html) <br>
dataset: [fonte](https://www.kaggle.com/datasets/hellbuoy/car-price-prediction?select=CarPrice_Assignment.csv)

---

>[Grid-Search](#scrollTo=QoBv84MIUa-h)

>>[1.  vamos importar o dataset](#scrollTo=_78JL1jFVQST)

>>[2.  vamos fazer o train test split](#scrollTo=bP8vEZsNl-gh)

>>[3.  vamos aplicar o GridSearchCV](#scrollTo=Z_3KDut4mKXy)



## 1.&nbsp; classificação

### 1.1.&nbsp; vamos importar o dataset

In [1]:
# vamos importar as bibliotecas
import pandas as pd
import numpy as np
# vamos importar o dataframe do ficheiro .csv
df_contract_renewal = pd.read_csv("South_China.csv")

# veja as 5 primeiras linhas do dataframe
df_contract_renewal.head()
# veja a forma do dataframe: quantas linhas, quantas colunas?
df_contract_renewal.shape
# veja a informação sobre o dataframe
df_contract_renewal.info()
# veja a descrição das variáveis numéricas
df_contract_renewal.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     1500 non-null   int64  
 1   Registration_Duration  1500 non-null   int64  
 2   Revenue                1500 non-null   int64  
 3   Cost                   1500 non-null   float64
 4   Renewal                1500 non-null   bool   
dtypes: bool(1), float64(1), int64(3)
memory usage: 48.5 KB


Unnamed: 0,ID,Registration_Duration,Revenue,Cost
count,1500.0,1500.0,1500.0,1500.0
mean,10750.5,33.756,58.387333,6.19105
std,433.157015,10.928133,61.862056,8.206847
min,10001.0,19.0,13.0,0.5136
25%,10375.75,25.0,28.0,1.82114
50%,10750.5,33.0,40.0,3.735905
75%,11125.25,40.0,64.0,7.245237
max,11500.0,72.0,981.0,96.4712


### 1.2.&nbsp; vamos fazer o train test split

In [2]:
# defina a variável alvo
target_variable = 'Renewal'

# train_test split usando a função train_test_split
X = df_contract_renewal.drop(["ID", target_variable], axis = 1)
y = df_contract_renewal[target_variable]*1

# importe a função train_test_split e defina X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state = 12,
                                                    stratify = y)

### 1.3.&nbsp; vamos aplicar o GridSearchCV

In [3]:
# Vamos importar as bibliotecas
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    make_scorer
)

# Vamos definir o cv
skf = StratifiedKFold(n_splits=5)

# Vamos criar o modelo de KNN
knn = KNeighborsClassifier()

# Vamos definir a grid dos hyperparameter
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Vamos definir as métricas de scoring
scoring = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc'
}

# Vamos aplicar a grid search ao modelo de KNN
grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid_knn, cv=skf, scoring=scoring, refit='roc_auc')
grid_search_knn.fit(X_train, y_train)

# Vamos ver os melhores hyperparameters
best_params_knn = grid_search_knn.best_params_
best_score_knn = grid_search_knn.best_score_
print("Best Parameters for KNN:", best_params_knn)
print("Best Cross-Validation AUC for KNN:", best_score_knn)

# Vamos ver os resultados para cada combinação
pd.DataFrame(grid_search_knn.cv_results_)

# Vamos definir o modelo de Random Forest
rf = RandomForestClassifier(random_state=42)

# Vamos definir a grid para os hyperparameters
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Vamos aplicar a grid search ao modelo de Random Forest
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=skf, scoring=scoring, refit='roc_auc')
grid_search_rf.fit(X_train, y_train)

# Vamos ver os melhores hyperparameters
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_
print("Best Parameters for Random Forest:", best_params_rf)
print("Best Cross-Validation AUC for Random Forest:", best_score_rf)

# Vamos ver os resultados para cada combinação
pd.DataFrame(grid_search_rf.cv_results_)

# Vamos fazer o fit do melhor modelo de KNN
best_knn = grid_search_knn.best_estimator_
y_pred_knn = best_knn.predict(X_test)
y_pred_knn_proba = best_knn.predict_proba(X_test)[:, 1]

# Vamos avaliar as métricas finais
print("\nKNN Model Evaluation")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_knn_proba))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))

# Vamos fazer o fit do melhor modelo de Random Forest
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
y_pred_rf_proba = best_rf.predict_proba(X_test)[:, 1]

# Vamos avaliar as métricas finais
print("\nRandom Forest Model Evaluation")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_rf_proba))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Best Parameters for KNN: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
Best Cross-Validation AUC for KNN: 0.7197110288321455
Best Parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 50}
Best Cross-Validation AUC for Random Forest: 0.7671537781825226

KNN Model Evaluation
Accuracy: 0.7066666666666667
AUC-ROC: 0.7204545454545455
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.51      0.56       110
           1       0.74      0.82      0.78       190

    accuracy                           0.71       300
   macro avg       0.68      0.67      0.67       300
weighted avg       0.70      0.71      0.70       300

Confusion Matrix:
 [[ 56  54]
 [ 34 156]]

Random Forest Model Evaluation
Accuracy: 0.7233333333333334
AUC-ROC: 0.7449282296650719
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.53      0.58       110
 

## 2.&nbsp; regressão

### 2.1.&nbsp; vamos importar o dataset

In [4]:
# packages gerais
import pandas as pd
import numpy as np

# dataset
df_car_price = pd.read_csv("CarPrice_Assignment.csv")

# exploração inicial
df_car_price.head()
# df_car_price.info()
# df_car_price.shape
# df_car_price.describe()

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495.0
1,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500.0
2,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500.0
3,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950.0
4,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450.0


### 2.2.&nbsp; vamos fazer o train test split

In [5]:
# definimos a variável alvo
target_variable = "price"

# train_test split usando a função train_test_split
# -> não consideramos stratification

X = df_car_price.drop([target_variable], axis = 1)
y = df_car_price[target_variable]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.3,
                                                    random_state = 12)

### 2.3.&nbsp; vamos aplicar o GridSearchCV

In [6]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

# Vamos definir o cv
kf = KFold(n_splits=5)

# Vamos criar o modelo de random forest
rf = RandomForestRegressor(random_state=42)

# Vamos definir a grid para os hyperparameters
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Vamos definir as métricas de scoring
scoring = {
    'mse': 'neg_mean_squared_error',
    'r2': 'r2'
}

# Vamos aplicar a grid search ao modelo de Random Forest
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=kf, scoring=scoring, refit='r2')
grid_search_rf.fit(X_train, y_train)

# Vamos ver os melhores hyperparameters
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_
print("Best Parameters for Random Forest Regressor:", best_params_rf)
print("Best Cross-Validation R2 Score for Random Forest Regressor:", best_score_rf)

# Vamos ver os resultados para cada combinação
pd.DataFrame(grid_search_rf.cv_results_)

# Vamos fazer o fit do melhor modelo de Random Forest
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Vamos avaliar as métricas finais
print("\nRandom Forest Regressor Evaluation")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_rf))
print("R2 Score:", r2_score(y_test, y_pred_rf))

Best Parameters for Random Forest Regressor: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best Cross-Validation R2 Score for Random Forest Regressor: 0.9186841721582377

Random Forest Regressor Evaluation
Mean Squared Error: 5468781.391901354
R2 Score: 0.8990393705689095
