In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# GridSeacrhCV kullanarak, bir makine öğrenmesi modelinin hiper-parametreleri farklı değerler için test edilebilir.
# Bu farklı parametreler için model performansı değerlendirilebilir.

In [83]:
# Örnek
from sklearn import svm, datasets
iris = datasets.load_iris()

In [84]:
X, y = iris.data, iris.target

In [85]:
from sklearn.model_selection import train_test_split

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [87]:
svc = svm.SVC() # default C=1.0, kernel='rbf'  ,https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [88]:
svc.fit(X_train,y_train)

SVC()

In [89]:
tahminler = svc.predict(X_test)

In [90]:
from sklearn.metrics import classification_report

In [91]:
print(classification_report(y_test, tahminler))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00         6

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [93]:
print(confusion_matrix(y_test,tahminler))

[[14  0  0]
 [ 0 10  0]
 [ 0  0  6]]


In [33]:
# GridSearch kullanmadan, farklı hiper-parametreler için ayrı ayrı model oluşturup modelin test edilmesi
kernels = ['rbf','linear','poly']
C_params = [1, 5, 10, 15, 20]
# for döngüsü
for kernel in kernels:
    for C in C_params:
        svc = svm.SVC(kernel=kernel, C=C)
        svc.fit(X_train,y_train)
        tahminler = svc.predict(X_test)
        print(f"Kernel={kernel}, C={C}")
        print(classification_report(y_test,tahminler))
        

Kernel=rbf, C=1
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      0.91      0.95        11
           2       0.92      1.00      0.96        12

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30

Kernel=rbf, C=5
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      0.91      0.95        11
           2       0.92      1.00      0.96        12

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30

Kernel=rbf, C=10
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      0.91      0.95        11
           2       0.92   

In [37]:
# GridSearch ile çözüm
from sklearn.model_selection import GridSearchCV

parametreler = {'kernel':['rbf','linear','poly'], 'C':[1, 5, 10, 15, 20]}
svc = svm.SVC()
clf = GridSearchCV(svc, parametreler)

In [42]:
clf.fit(X_train,y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 5, 10, 15, 20],
                         'kernel': ['rbf', 'linear', 'poly']})

In [44]:
clf.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'param_kernel', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [45]:
clf.best_estimator_

SVC(C=1, kernel='linear')

In [46]:
tahminler = clf.best_estimator_.predict(X_test)

In [47]:
print(classification_report(y_test,tahminler))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      0.91      0.95        11
           2       0.92      1.00      0.96        12

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [None]:
# Cross Validation
# https://en.wikipedia.org/wiki/Cross-validation_(statistics)

In [48]:
parametreler = {'kernel':['rbf','linear','poly'], 'C':[1, 5, 10, 15, 20]}
svc = svm.SVC()
clf = GridSearchCV(svc, parametreler, cv=7) # cv: Cross Validation=> default:5
clf.fit(X, y)
clf.best_estimator_

SVC(C=10, kernel='linear')

In [49]:
tahminler = clf.best_estimator_.predict(X_test)

In [50]:
print(classification_report(y_test,tahminler))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      0.91      0.95        11
           2       0.92      1.00      0.96        12

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [51]:
# Örnek-2

In [53]:
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/yasarkucukefe/YBS462/main/data/loan_data.csv")

In [54]:
df_final = pd.get_dummies(df, columns=['purpose'], drop_first=True)

In [55]:
y = df_final['not.fully.paid']
X = df_final.drop('not.fully.paid',axis=1)

In [70]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30)

In [56]:
X.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,1,0,0,0,0,0


In [58]:
from sklearn.ensemble import RandomForestClassifier

In [59]:
rf = RandomForestClassifier()

In [71]:
params = {'n_estimators':[10, 100, 200], 'criterion':['gini','entropy'], 'min_samples_split':[2,3,4]} # 3 * 2 * 3 = 18 model

In [72]:
clf = GridSearchCV(rf, params, verbose=2)

In [73]:
clf.fit(X_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END criterion=gini, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END criterion=gini, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END criterion=gini, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END criterion=gini, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END criterion=gini, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END criterion=gini, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END criterion=gini, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END criterion=gini, min_samples_split=2, n_estimators=100; total time=   1.6s
[CV] END criterion=gini, min_samples_split=2, n_estimators=100; total time=   1.7s
[CV] END criterion=gini, min_samples_split=2, n_estimators=100; total time=   1.6s
[CV] END criterion=gini, min_samples_split=2, n_estimators=200; total time=   3.5s
[CV] END criterion=gini, min_sa

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [10, 100, 200]},
             verbose=2)

In [75]:
clf.best_estimator_

RandomForestClassifier(min_samples_split=4, n_estimators=200)

In [74]:
clf.best_params_

{'criterion': 'gini', 'min_samples_split': 4, 'n_estimators': 200}

In [78]:
tahminler = clf.best_estimator_.predict(X_test)

In [79]:
print(classification_report(y_test,tahminler))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92      2430
           1       0.43      0.02      0.04       444

    accuracy                           0.84      2874
   macro avg       0.64      0.51      0.48      2874
weighted avg       0.78      0.84      0.78      2874



In [81]:
from sklearn.metrics import confusion_matrix

In [82]:
print(confusion_matrix(y_test,tahminler))

[[2418   12]
 [ 435    9]]
