<a href="https://colab.research.google.com/github/Zerzavot/Algorithms-Homework-1/blob/master/Decision_Trees_and_Ensembles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the scikit-learn breast-cancer dataset and pull the feature matrix
# and the target vector out of the returned container by key.
data = load_breast_cancer()

X, y = data["data"], data["target"]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The scaler is fitted on the training portion only and then applied to both
# portions, so the test set is transformed with the training statistics.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# Baseline model: a decision tree whose depth is capped at 3.
# random_state is pinned so that re-running this cell reproduces the same
# tree and the same scores (the train/test split and the ensemble models in
# this notebook are seeded with 42 as well).
mdl = DecisionTreeClassifier(max_depth=3, random_state=42)

mdl.fit(X_train, y_train)
ypred_train = mdl.predict(X_train)
ypred_test = mdl.predict(X_test)

# Report F1 on both portions to expose any train/test gap.
print("Train F1 Score:", f1_score(y_train, ypred_train))
print("Test F1 Score:", f1_score(y_test, ypred_test))

Train F1 Score: 0.9826388888888888
Test F1 Score: 0.9583333333333334


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for the decision tree: 3 depths x 4 split sizes.
params = {"max_depth": [2, 3, 4], "min_samples_split": [2, 5, 10, 15]}

# The base estimator is seeded so the search picks the same parameters on
# every run, consistent with the seeded random-forest and gradient-boosting
# searches in this notebook.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than by F1 -- confirm that is intended.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    cv=5,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4],
                         'min_samples_split': [2, 5, 10, 15]})

In [None]:
# Show the estimator selected by the preceding grid search; its repr lists the
# hyperparameters that won the cross-validated comparison.
grid_search.best_estimator_

DecisionTreeClassifier(max_depth=4, min_samples_split=10)

In [None]:
# Decision tree rebuilt with the hyperparameters reported by the grid search
# (max_depth=4, min_samples_split=10).
# random_state is pinned so the fitted tree and the scores below are
# reproducible when the cell is re-run.
mdl = DecisionTreeClassifier(max_depth=4, min_samples_split=10,
                             random_state=42)

mdl.fit(X_train, y_train)
ypred_train = mdl.predict(X_train)
ypred_test = mdl.predict(X_test)

# Report F1 on both portions to expose any train/test gap.
print("Train F1 Score:", f1_score(y_train, ypred_train))
print("Test F1 Score:", f1_score(y_test, ypred_test))

Train F1 Score: 0.9947643979057592
Test F1 Score: 0.9436619718309859


In [None]:
# Empty score table: one row per model will be added later, with the train
# and test F1 scores as columns.
score_columns = ["Train", "Test"]
results = pd.DataFrame([], columns=score_columns)
results

Unnamed: 0,Train,Test


In [None]:
# Store the F1 scores of the most recently fitted model (the tuned decision
# tree) as the "Decision Tree" row of the score table.
train_f1 = f1_score(y_train, ypred_train)
test_f1 = f1_score(y_test, ypred_test)
results.loc["Decision Tree"] = [train_f1, test_f1]
results

Unnamed: 0,Train,Test
Decision Tree,0.994764,0.943662


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 250 trees, each limited to depth 10; seeded for
# reproducibility.
mdl = RandomForestClassifier(n_estimators=250,
                             max_depth=10, random_state=42)

mdl.fit(X_train, y_train)

ypred_train = mdl.predict(X_train)
ypred_test = mdl.predict(X_test)

# Row label uses a single space ("Random Forest"). The previous label had two
# spaces, which the rendered table hides but which makes
# results.loc["Random Forest"] fail with a KeyError.
results.loc["Random Forest"] = [f1_score(y_train, ypred_train),
                                f1_score(y_test, ypred_test)]
results

Unnamed: 0,Train,Test
Decision Tree,0.994764,0.943662
Random Forest,1.0,0.972222


In [None]:
# Search grid for the random forest: 5 depths x 5 forest sizes.
params = {
    "max_depth": [2, 5, 10, 15, 30],
    "n_estimators": [30, 50, 100, 250, 500],
}

# 5-fold cross-validated search over the grid, using every available core.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    params,
    cv=5,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

# Display the winning configuration.
grid_search.best_estimator_

RandomForestClassifier(max_depth=10, n_estimators=250, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=0.5, fitted on the scaled training data.
mdl = LogisticRegression(C=0.5)
mdl.fit(X_train, y_train)

# Predict on both portions and record the F1 scores in the score table.
ypred_train, ypred_test = mdl.predict(X_train), mdl.predict(X_test)
results.loc["Logistic Regression"] = [
    f1_score(y_train, ypred_train),
    f1_score(y_test, ypred_test),
]
results

Unnamed: 0,Train,Test
Decision Tree,0.994764,0.943662
Random Forest,1.0,0.972222
Logistic Regression,0.989547,0.979021


In [None]:
# Candidate values of C for logistic regression, spanning 1e-3 to 1e3.
params = {"C": [1e-3, 1e-2, 0.1, 0.5, 1, 10, 1e2, 1e3]}

# 5-fold cross-validated search over C, using every available core.
grid_search = GridSearchCV(
    LogisticRegression(),
    params,
    cv=5,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

# Display the winning configuration.
grid_search.best_estimator_

LogisticRegression(C=0.5)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 250 depth-3 stages with learning rate 0.1, seeded for
# reproducibility.
mdl = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=250,
    max_depth=3,
    random_state=42,
)
mdl.fit(X_train, y_train)

# Predict on both portions and record the F1 scores in the score table.
ypred_train, ypred_test = mdl.predict(X_train), mdl.predict(X_test)
results.loc["Gradient Boost"] = [
    f1_score(y_train, ypred_train),
    f1_score(y_test, ypred_test),
]
results

Unnamed: 0,Train,Test
Decision Tree,0.994764,0.943662
Random Forest,1.0,0.972222
Logistic Regression,0.989547,0.979021
Gradient Boost,1.0,0.965035


In [None]:
# Search grid for gradient boosting: 5 depths x 5 ensemble sizes x 5 learning
# rates = 125 combinations, each evaluated with 5-fold cross-validation.
params = {
    "max_depth": [2, 3, 5, 7, 10],
    "n_estimators": [30, 50, 100, 250, 500],
    "learning_rate": [1e-4, 1e-3, 1e-2, 0.1, 0.5],
}

grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    params,
    cv=5,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

# Display the winning configuration.
grid_search.best_estimator_

GradientBoostingClassifier(n_estimators=250, random_state=42)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k=5, fitted on the scaled training data.
mdl = KNeighborsClassifier(n_neighbors=5)
mdl.fit(X_train, y_train)

# Predict on both portions and record the F1 scores in the score table.
ypred_train, ypred_test = mdl.predict(X_train), mdl.predict(X_test)
results.loc["KNN"] = [
    f1_score(y_train, ypred_train),
    f1_score(y_test, ypred_test),
]
results

Unnamed: 0,Train,Test
Decision Tree,0.994764,0.943662
Random Forest,1.0,0.972222
Logistic Regression,0.989547,0.979021
Gradient Boost,1.0,0.965035
KNN,0.984509,0.957746


In [None]:
# Candidate neighbourhood sizes for KNN.
params = {"n_neighbors": [2, 3, 5, 7, 9, 13, 15, 25]}

# 5-fold cross-validated search over k, using every available core.
grid_search = GridSearchCV(
    KNeighborsClassifier(),
    params,
    cv=5,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

# Display the winning configuration.
grid_search.best_estimator_

KNeighborsClassifier()