<a href="https://colab.research.google.com/github/zerotodeeplearning/ztdl-masterclasses/blob/master/solutions_do_not_open/Machine_Learning_with_Scikit_Learn_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Copyright 2020 Catalit LLC.

In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Machine Learning with Scikit Learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Base URL of the course's raw data folder on GitHub. It ends with a slash,
# so file names are appended directly (e.g. url + 'weight-height.csv').
url = "https://raw.githubusercontent.com/zerotodeeplearning/ztdl-masterclasses/master/data/"

## Regression

In [None]:
# Download the weight/height CSV from the data folder into a DataFrame.
df = pd.read_csv(url + 'weight-height.csv')

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

In [None]:
# Height against weight, coloured by the Gender column.
# The trailing semicolon keeps the Axes repr out of the cell output.
sns.scatterplot(data=df, x='Height', y='Weight', hue='Gender');

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Single feature (the double brackets keep X two-dimensional) and the target.
X = df[['Height']].to_numpy()
y = df['Weight'].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Least-squares linear fit on the training split only; the test split is
# kept aside for the score cell further down.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# For a scikit-learn regressor, score() is R^2 -- here on the data the model was fit on.
model.score(X_train, y_train)

In [None]:
# R^2 on the held-out rows; compare with the training score to spot over-fitting.
model.score(X_test, y_test)

### Exercise 1

This dataset has more columns: `sqft`, `bdrms` and `age` are the features, and `price` is the target to predict.

- replace the dataset above with `housing-data.csv`
- adapt the code so that there are no errors:
    - plot it using `sns.pairplot`
    - add more columns in the feature definition `X = ...`
- train and evaluate the model
- bonus points if you try with a different model like `Ridge` or `Lasso`

In [None]:
# Replace the previous DataFrame with the housing data from the same data folder.
df = pd.read_csv(url + 'housing-data.csv')

In [None]:
# Preview the first rows of the housing table.
df.head()

In [None]:
# Scatter plot for every pair of numeric columns, with each column's
# distribution on the diagonal.
sns.pairplot(df);

In [None]:
# Every column except the target is used as a feature.
X = df.drop(columns='price').to_numpy()
y = df['price'].to_numpy()

# Same 80/20 split and seed as in the single-feature example.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression with its default settings (no arguments are passed),
# fit on the training split only.
model = Ridge()
model.fit(X_train, y_train)

In [None]:
# R^2 of the Ridge model on its own training data.
model.score(X_train, y_train)

In [None]:
# R^2 of the Ridge model on the held-out rows.
model.score(X_test, y_test)

## Classification

In [None]:
# Replace the previous DataFrame with the ISP data used for classification.
df = pd.read_csv(url + 'isp_data.csv')

In [None]:
# Preview the first rows of the ISP table.
df.head()

In [None]:
# Download against upload, coloured by the class label.
sns.scatterplot(data=df, x='download', y='upload', hue='label');

In [None]:
# Two features and the class label to predict.
feature_cols = ['download', 'upload']
X = df[feature_cols].values
y = df['label'].values

# 80/20 split with a fixed seed so the scores below are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix

In [None]:
# A depth-3 tree keeps the fitted rules small enough to plot further down.
# random_state is pinned because scikit-learn shuffles the candidate features
# at every split, so equally good splits could otherwise be chosen differently
# from run to run. The seed matches the one used for train_test_split.
model = DecisionTreeClassifier(max_depth=3, random_state=0)
model.fit(X_train, y_train)

In [None]:
# For a scikit-learn classifier, score() is mean accuracy -- here on the training split.
model.score(X_train, y_train)

In [None]:
# Mean accuracy on the held-out rows.
model.score(X_test, y_test)

In [None]:
# Predict for every row (train and test together): the confusion matrix and
# the misclassified-point overlay that follow describe the full dataset,
# not just the held-out split.
y_pred = model.predict(X)

In [None]:
# Rows are true labels, columns are predicted labels.
confusion_matrix(y, y_pred)

In [None]:
# Boolean mask keeps the feature rows whose prediction disagrees with the label.
wrong_pred = X[y != y_pred]

In [None]:
# Draw the fitted tree on a wide canvas so the node text has room.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    model,
    feature_names=['download', 'upload'],
    rounded=True,
    fontsize=14,
    ax=ax,
);

In [None]:
def plot_decision_boundary(model, X, ax, resolution=101, padding=0.1):
    """Shade the regions a two-feature classifier predicts on ``ax``.

    A regular grid is laid over the bounding box of the first two columns of
    ``X`` (widened by ``padding`` on every side), the model is asked to
    predict every grid point, and the predictions are drawn as a translucent
    filled contour.

    Parameters
    ----------
    model : object
        Anything with a ``predict(points)`` method that accepts an
        ``(n_points, 2)`` array and returns one label per point.
    X : array-like, shape (n_samples, >=2)
        Data whose first two columns define the plotted area.
    ax : matplotlib Axes (or any object with a compatible ``contourf``)
        Target axes; it is drawn on in place.
    resolution : int, default 101
        Number of grid points along each axis.
    padding : float, default 0.1
        Margin added around the data range, in data units.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``X`` is not 2-D with at least one row and two columns, or if
        ``resolution`` is smaller than 2.
    """
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[0] == 0 or X.shape[1] < 2:
        raise ValueError(
            "X must be a 2-D array with at least one row and two columns"
        )
    if resolution < 2:
        raise ValueError("resolution must be at least 2")

    x_min = X[:, 0].min() - padding
    x_max = X[:, 0].max() + padding
    y_min = X[:, 1].min() - padding
    y_max = X[:, 1].max() + padding

    hticks = np.linspace(x_min, x_max, resolution)
    vticks = np.linspace(y_min, y_max, resolution)
    aa, bb = np.meshgrid(hticks, vticks)
    # One (x, y) row per grid point, in the layout predict() expects.
    ab = np.c_[aa.ravel(), bb.ravel()]

    c = np.asarray(model.predict(ab)).ravel()
    if not np.issubdtype(c.dtype, np.number):
        # contourf needs numbers: map non-numeric labels (e.g. strings) to
        # integer codes 0..k-1 in sorted label order.
        _, c = np.unique(c, return_inverse=True)
    cc = c.reshape(aa.shape)

    ax.contourf(aa, bb, cc, cmap='bwr', alpha=0.2)

In [None]:
# Scatter the labelled points, overlay the misclassified ones as larger
# translucent red dots, then shade the regions the model predicts.
ax = sns.scatterplot(data=df, x='download', y='upload', hue='label')
miss_x, miss_y = wrong_pred[:, 0], wrong_pred[:, 1]
ax.plot(miss_x, miss_y, 'or', markersize=10, alpha=0.4)
plot_decision_boundary(model, X, ax)

### Exercise 2

Use a different classifier. Replace the `DecisionTreeClassifier` with another classifier, e.g.:
- `LogisticRegression`
- `SVC`
- `RandomForestClassifier`

or any other model from the scikit-learn classifier comparison: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
Then compare its behavior with that of the decision tree.

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with its default settings, fit on the same training
# split the decision tree used. This rebinds `model`.
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the logistic model on the training split.
model.score(X_train, y_train)

In [None]:
# Mean accuracy of the logistic model on the held-out rows.
model.score(X_test, y_test)

In [None]:
# Predict for every row (train and test together) with the current model;
# this overwrites the earlier y_pred.
y_pred = model.predict(X)

In [None]:
# Rows are true labels, columns are predicted labels (full dataset).
confusion_matrix(y, y_pred)

In [None]:
# Feature rows the current model misclassifies; overwrites the earlier wrong_pred.
wrong_pred = X[y != y_pred]

In [None]:
# Same overlay as for the decision tree, now for the current model: labelled
# points, misclassified rows as translucent red dots, predicted regions shaded.
ax = sns.scatterplot(data=df, x='download', y='upload', hue='label')
miss_x, miss_y = wrong_pred[:, 0], wrong_pred[:, 1]
ax.plot(miss_x, miss_y, 'or', markersize=10, alpha=0.4)
plot_decision_boundary(model, X, ax)

## Clustering

In [None]:
# `url` already ends with '/', so the file name is appended without a leading
# slash; a leading slash would put '//' in the request path.
df = pd.read_csv(url + 'iris.csv')

In [None]:
# Preview the first rows of the iris table.
df.head()

In [None]:
# Two of the measurements against each other, with no per-species colouring.
df.plot.scatter(x='sepal_length', y='petal_length', title='Iris Flowers');

In [None]:
# Clustering is unsupervised: keep every column except the species label.
X = df.drop(columns='species').to_numpy()

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Two clusters. k-means starts from random centroids, so the seed is pinned to
# keep cluster numbering and centres identical across runs. n_init is spelled
# out because its default differs between scikit-learn versions.
model = KMeans(n_clusters=2, n_init=10, random_state=0)
model.fit(X)

In [None]:
# One row per cluster, one column per feature, in the same column order as X.
centers = model.cluster_centers_
centers

In [None]:
# Look up the plotted features by name so the centroid coordinates always
# match the axes, rather than relying on hard-coded column positions.
# The feature order is every column except 'species', as used to build X.
feature_names = df.columns.drop('species')
x_idx = feature_names.get_loc('sepal_length')
y_idx = feature_names.get_loc('petal_length')

# Points coloured by assigned cluster, with the cluster centres as large red dots.
plt.scatter(df.sepal_length, df.petal_length, c=model.labels_)
plt.scatter(centers[:, x_idx], centers[:, y_idx], marker='o', c='r', s=100)
plt.xlabel('sepal_length')
plt.ylabel('petal_length');