# Grid-Search


---


[documentação](https://scikit-learn.org/stable/index.html) <br>
dataset: [fonte](https://www.kaggle.com/datasets/hellbuoy/car-price-prediction?select=CarPrice_Assignment.csv)

---

>[Grid-Search](#scrollTo=QoBv84MIUa-h)

>>[1.  vamos importar o dataset](#scrollTo=_78JL1jFVQST)

>>[2.  vamos fazer o train test split](#scrollTo=bP8vEZsNl-gh)

>>[3.  vamos aplicar o GridSearchCV](#scrollTo=Z_3KDut4mKXy)



## 1.&nbsp; classificação

### 1.1.&nbsp; vamos importar o dataset

In [7]:
# Import the libraries used throughout the notebook
import pandas as pd
import numpy as np
# Load the dataframe from the .csv file
# NOTE(review): absolute, machine-specific path — adjust it to where the file lives locally
df_contract_renewal = pd.read_csv("C:/Users/yfreitas/Documents/pythonSamples/Machine Learning/Files/South_China.csv")

# Show the first 5 rows of the dataframe.
# A bare expression is only rendered when it is the last statement of a
# notebook cell, so the intermediate results are printed explicitly.
print(df_contract_renewal.head())
# Show the shape of the dataframe: (number of rows, number of columns)
print(df_contract_renewal.shape)
# Show column names, non-null counts and dtypes (info() prints its own report)
df_contract_renewal.info()
# Summary statistics of the numeric columns (last expression of the cell)
df_contract_renewal.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     1500 non-null   int64  
 1   Registration_Duration  1500 non-null   int64  
 2   Revenue                1500 non-null   int64  
 3   Cost                   1500 non-null   float64
 4   Renewal                1500 non-null   bool   
dtypes: bool(1), float64(1), int64(3)
memory usage: 48.5 KB


Unnamed: 0,ID,Registration_Duration,Revenue,Cost
count,1500.0,1500.0,1500.0,1500.0
mean,10750.5,33.756,58.387333,6.19105
std,433.157015,10.928133,61.862056,8.206847
min,10001.0,19.0,13.0,0.5136
25%,10375.75,25.0,28.0,1.82114
50%,10750.5,33.0,40.0,3.735905
75%,11125.25,40.0,64.0,7.245237
max,11500.0,72.0,981.0,96.4712


### 1.2.&nbsp; vamos fazer o train test split

In [10]:
# Import the splitting helper from scikit-learn
from sklearn.model_selection import train_test_split

# Name of the column to predict
target_variable = 'Renewal'

# Features: every column except the row identifier and the target
X = df_contract_renewal.drop(columns=["ID", target_variable])
# Target: multiplying the boolean column by 1 turns True/False into 1/0
y = df_contract_renewal[target_variable] * 1

# Hold out 20% of the rows for testing, stratify on y so that both parts keep
# the class proportions, and fix the seed so that the split is reproducible
split_options = {"test_size": 0.2, "random_state": 12, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### 1.3.&nbsp; vamos aplicar o GridSearchCV com StratifiedKFold a um classificador KNN

In [11]:
# Import the model-selection tools, the classifiers and the evaluation metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    make_scorer
)

# Cross-validation scheme: 5 stratified folds
skf = StratifiedKFold(n_splits=5)

# Metrics computed for every candidate during the search
scoring = {metric: metric for metric in ('accuracy', 'roc_auc')}

# Hyperparameter grid to explore: 4 x 2 x 2 = 16 combinations
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# Base KNN model; the grid overrides the searched parameters
knn = KNeighborsClassifier()

# Run the grid search; with several metrics, refit='roc_auc' names the one
# used to select the best candidate and to refit the final estimator.
# NOTE(review): the features are passed unscaled although KNN is distance
# based — consider adding a scaler in a Pipeline; confirm the effect on the scores.
grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid_knn,
    scoring=scoring,
    refit='roc_auc',
    cv=skf,
)
grid_search_knn.fit(X_train, y_train)

# Best hyperparameters and the corresponding mean cross-validated AUC
best_params_knn, best_score_knn = (
    grid_search_knn.best_params_,
    grid_search_knn.best_score_,
)
print("Best Parameters for KNN:", best_params_knn)
print("Best Cross-Validation AUC for KNN:", best_score_knn)

# Full table of results, one row per hyperparameter combination
pd.DataFrame(grid_search_knn.cv_results_)

Best Parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
Best Cross-Validation AUC for KNN: 0.71989169590823


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_weights,params,split0_test_accuracy,split1_test_accuracy,...,std_test_accuracy,rank_test_accuracy,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,split3_test_roc_auc,split4_test_roc_auc,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc
0,0.0008,0.00098,0.014864,0.003247,euclidean,3,uniform,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",0.65,0.7,...,0.029977,7,0.655615,0.709554,0.698864,0.68526,0.651303,0.680119,0.02313,13
1,0.00127,0.00097,0.004879,0.003866,euclidean,3,distance,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",0.658333,0.683333,...,0.020344,12,0.655091,0.703873,0.706041,0.681429,0.65104,0.679495,0.023272,14
2,0.000808,0.000761,0.013199,0.006662,euclidean,5,uniform,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",0.6375,0.679167,...,0.022883,11,0.668548,0.704172,0.699462,0.692848,0.67433,0.687872,0.014011,11
3,0.001062,0.000899,0.005759,0.005557,euclidean,5,distance,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",0.633333,0.666667,...,0.023422,14,0.672772,0.696322,0.704919,0.684471,0.673954,0.686488,0.012536,12
4,0.001029,0.000918,0.010895,0.007726,euclidean,7,uniform,"{'metric': 'euclidean', 'n_neighbors': 7, 'wei...",0.691667,0.670833,...,0.012638,4,0.700882,0.727123,0.691761,0.706183,0.701187,0.705427,0.011804,6
5,0.001004,0.000634,0.003652,0.001939,euclidean,7,distance,"{'metric': 'euclidean', 'n_neighbors': 7, 'wei...",0.670833,0.6625,...,0.014767,8,0.68978,0.709143,0.702714,0.707047,0.693637,0.700464,0.007543,8
6,0.00122,0.000768,0.016069,0.003645,euclidean,9,uniform,"{'metric': 'euclidean', 'n_neighbors': 9, 'wei...",0.6875,0.716667,...,0.012191,1,0.727534,0.742113,0.691836,0.726279,0.7087,0.719292,0.017341,2
7,0.001205,0.000984,0.005761,0.004034,euclidean,9,distance,"{'metric': 'euclidean', 'n_neighbors': 9, 'wei...",0.679167,0.6875,...,0.015,5,0.710937,0.721516,0.699013,0.716287,0.697769,0.709105,0.009374,4
8,0.003684,0.0064,0.008432,0.007045,manhattan,3,uniform,"{'metric': 'manhattan', 'n_neighbors': 3, 'wei...",0.625,0.6875,...,0.028504,14,0.643316,0.703648,0.702714,0.672564,0.656111,0.675671,0.024303,15
9,0.000393,0.000786,0.006722,0.008112,manhattan,3,distance,"{'metric': 'manhattan', 'n_neighbors': 3, 'wei...",0.645833,0.679167,...,0.013591,16,0.65311,0.69535,0.704134,0.665202,0.647998,0.673159,0.022584,16


### 1.4.&nbsp; Predict with the best KNN estimator found by the grid search

In [12]:
# Retrieve the best KNN estimator selected by the grid search
# (nothing is fitted in this cell; the estimator is used as returned by the search)
best_knn = grid_search_knn.best_estimator_

# Scores for class 1 (second column of predict_proba), needed for the AUC
y_pred_knn_proba = best_knn.predict_proba(X_test)[:, 1]
# Hard class predictions on the held-out test set
y_pred_knn = best_knn.predict(X_test)

# Compute the final test-set metrics
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_auc = roc_auc_score(y_test, y_pred_knn_proba)
knn_report = classification_report(y_test, y_pred_knn)
knn_confusion = confusion_matrix(y_test, y_pred_knn)

# Report them
print("\nKNN Model Evaluation")
print("Accuracy:", knn_accuracy)
print("AUC-ROC:", knn_auc)
print("Classification Report:\n", knn_report)
print("Confusion Matrix:\n", knn_confusion)


KNN Model Evaluation
Accuracy: 0.6733333333333333
AUC-ROC: 0.7066028708133971
Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.42      0.48       110
           1       0.71      0.82      0.76       190

    accuracy                           0.67       300
   macro avg       0.64      0.62      0.62       300
weighted avg       0.66      0.67      0.66       300

Confusion Matrix:
 [[ 46  64]
 [ 34 156]]


### 1.5.&nbsp; Random Forest classification with StratifiedKFold and GridSearchCV

In [13]:
# Hyperparameter grid to explore: 3 x 4 x 3 = 36 combinations
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# Base Random Forest model with a fixed seed for reproducible results
rf = RandomForestClassifier(random_state=42)

# Run the grid search, reusing the cross-validation scheme (skf) and the
# metrics (scoring) defined for the KNN search; the best model is picked by ROC AUC
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    scoring=scoring,
    refit='roc_auc',
    cv=skf,
)
grid_search_rf.fit(X_train, y_train)

# Best hyperparameters and the corresponding mean cross-validated AUC
best_params_rf, best_score_rf = (
    grid_search_rf.best_params_,
    grid_search_rf.best_score_,
)
print("Best Parameters for Random Forest:", best_params_rf)
print("Best Cross-Validation AUC for Random Forest:", best_score_rf)

# Full table of results, one row per hyperparameter combination
pd.DataFrame(grid_search_rf.cv_results_)

Best Parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 50}
Best Cross-Validation AUC for Random Forest: 0.7671537781825226


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,...,std_test_accuracy,rank_test_accuracy,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,split3_test_roc_auc,split4_test_roc_auc,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc
0,0.071793,0.006479,0.004565,0.005978,,2,50,"{'max_depth': None, 'min_samples_split': 2, 'n...",0.670833,0.704167,...,0.013591,32,0.743795,0.772316,0.737104,0.751709,0.720419,0.745069,0.017083,32
1,0.139065,0.0072,0.005235,0.005124,,2,100,"{'max_depth': None, 'min_samples_split': 2, 'n...",0.6875,0.716667,...,0.012191,35,0.752579,0.772802,0.730226,0.748854,0.725678,0.746028,0.016921,28
2,0.272566,0.007723,0.024715,0.008714,,2,200,"{'max_depth': None, 'min_samples_split': 2, 'n...",0.691667,0.708333,...,0.00677,30,0.744655,0.765513,0.737403,0.749831,0.720795,0.743639,0.014687,34
3,0.068115,0.007716,0.008481,0.007059,,5,50,"{'max_depth': None, 'min_samples_split': 5, 'n...",0.695833,0.725,...,0.010069,22,0.756728,0.778297,0.729366,0.755052,0.724363,0.748761,0.019732,26
4,0.129453,0.007157,0.010506,0.007522,,5,100,"{'max_depth': None, 'min_samples_split': 5, 'n...",0.675,0.725,...,0.021344,22,0.758373,0.770634,0.736693,0.76403,0.719781,0.749902,0.018884,21
5,0.249755,0.007636,0.025995,0.008672,,5,200,"{'max_depth': None, 'min_samples_split': 5, 'n...",0.683333,0.729167,...,0.018745,16,0.756878,0.772279,0.748804,0.76696,0.732477,0.755479,0.014069,17
6,0.06456,0.006225,0.006251,0.007656,,10,50,"{'max_depth': None, 'min_samples_split': 10, '...",0.720833,0.733333,...,0.010069,4,0.773176,0.791717,0.758971,0.762527,0.748929,0.767064,0.01456,3
7,0.126228,0.007727,0.006752,0.00733,,10,100,"{'max_depth': None, 'min_samples_split': 10, '...",0.725,0.7375,...,0.010737,4,0.770634,0.790147,0.756429,0.763504,0.748479,0.765838,0.014207,11
8,0.241707,0.007636,0.025122,0.00695,,10,200,"{'max_depth': None, 'min_samples_split': 10, '...",0.7,0.7375,...,0.015501,10,0.76712,0.787679,0.76084,0.768462,0.746225,0.766065,0.013379,8
9,0.055238,0.007171,0.014943,0.003658,10.0,2,50,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",0.708333,0.725,...,0.007638,13,0.762261,0.773475,0.748056,0.746,0.71948,0.749854,0.01818,23


### 1.6.&nbsp; Predict with the best Random Forest estimator found by the grid search

In [14]:
# Retrieve the best Random Forest estimator selected by the grid search
# (nothing is fitted in this cell; the estimator is used as returned by the search)
best_rf = grid_search_rf.best_estimator_

# Scores for class 1 (second column of predict_proba), needed for the AUC
y_pred_rf_proba = best_rf.predict_proba(X_test)[:, 1]
# Hard class predictions on the held-out test set
y_pred_rf = best_rf.predict(X_test)

# Compute the final test-set metrics
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, y_pred_rf_proba)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

# Report them
print("\nRandom Forest Model Evaluation")
print("Accuracy:", rf_accuracy)
print("AUC-ROC:", rf_auc)
print("Classification Report:\n", rf_report)
print("Confusion Matrix:\n", rf_confusion)


Random Forest Model Evaluation
Accuracy: 0.7233333333333334
AUC-ROC: 0.7449282296650719
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.53      0.58       110
           1       0.75      0.84      0.79       190

    accuracy                           0.72       300
   macro avg       0.70      0.68      0.69       300
weighted avg       0.72      0.72      0.72       300

Confusion Matrix:
 [[ 58  52]
 [ 31 159]]
