# Hyperparameter in einer Pipeline optimieren
### GridSearch ermöglicht das Durchprobieren verschiedener Parameter
### Vorheriger Train-Test-Split 

In [1]:
import pandas as pd

# Load the classification data set from the CSV file (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer="data/classification.csv")

# Preview the first five rows to check that the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,age,interest,success
0,23.657801,18.859917,0.0
1,22.573729,17.969223,0.0
2,32.553424,29.463651,0.0
3,6.718035,25.704665,1.0
4,14.401919,16.770856,0.0


In [2]:
from sklearn.model_selection import train_test_split

# Feature matrix: the "age" and "interest" columns as a NumPy array.
X = df[["age", "interest"]].values
# Target vector: the "success" label column.
y = df["success"].values

# Hold out a test split before any tuning happens. The shuffle seed is fixed so
# that Restart Kernel -> Run All reproduces exactly the same split; without
# random_state the split (and every result derived from it) changes per run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [3]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Chain feature scaling and the k-nearest-neighbours classifier into a single
# estimator. The step names ("scaler", "knn") act as prefixes for addressing a
# step's parameters as "<step>__<parameter>", for example:
#     pipeline.set_params(knn__n_neighbors=1)
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier()),
    ]
)

In [4]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Search over the number of neighbours of the "knn" pipeline step.
# Candidates are 1, 4, 7, ..., 97 (step 3, upper bound 100 excluded).
# Only the training split is passed to fit(); the test split stays untouched.
clf = GridSearchCV(
    estimator=pipeline,
    param_grid={"knn__n_neighbors": np.arange(start=1, stop=100, step=3)},
).fit(X_train, y_train)

# Show the parameter combination selected by the search.
print(clf.best_params_)

{'knn__n_neighbors': 7}


In [5]:
# Evaluate the fitted grid search on the test split. This data was never passed
# to fit(), so it is completely unseen by the model.
test_score = clf.score(X_test, y_test)
print(test_score)

0.9733333333333334


In [6]:
# Mean cross-validated score of the best parameter combination. It is measured
# on the validation folds that the grid search holds out from the training
# split, so it is a validation estimate rather than a score on the data the
# model was fitted on.
cv_score = clf.best_score_
print(cv_score)

0.9280808080808081
