# DS-SF-23 | Codealong 09 | Introduction to Logistic Regression | Answer Key

In [None]:
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from sklearn import linear_model

# Keep notebook output compact: show at most 10 rows and 10 columns of any
# frame, rendered as an HTML table.
pd.options.display.max_rows = 10
pd.options.display.notebook_repr_html = True
pd.options.display.max_columns = 10

%matplotlib inline
plt.style.use('ggplot')

## `Iris` dataset, Take 2

In [None]:
# Load iris.csv from the `datasets` folder that sits one directory above this notebook.
df = pd.read_csv(
    os.path.join('..', 'datasets', 'iris.csv')
)

In [None]:
df

### Feature matrix and label vector

In [None]:
# Feature matrix: the four measurement columns, every row.
X = df.loc[:, ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']]

In [None]:
X

## Activity | Create binary/dummy variables for the outcome variable

In [None]:
# One indicator column per distinct value of Species (column names are the
# species names themselves, no prefix).
ys = pd.get_dummies(df['Species'])

In [None]:
ys

## Activity | Run logistic regression to learn whether or not an iris plant is a Setosa

In [None]:
# Fit a logistic regression on the four features against the Setosa
# indicator column.
model_Setosa = linear_model.LogisticRegression()

model_Setosa.fit(X, ys.Setosa)

# Learned weights (one per feature column of X) and the intercept.
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(model_Setosa.coef_)
print(model_Setosa.intercept_)

## Activity | What's the accuracy of your model?

In [None]:
model_Setosa.score(X, ys.Setosa)

## Activity | Plot the log odds (that it is a Setosa) for each plant in the dataset

In [None]:
# Log odds per plant: intercept + sum over features of (coefficient * value).
# Each column of X is scaled by its coefficient, then the row is summed.
y_logit_Setosa = (
    X.mul(model_Setosa.coef_[0], axis = 'columns').sum(axis = 'columns')
    + model_Setosa.intercept_
)

In [None]:
y_logit_Setosa.plot()

## Activity | Using just the log odds, confirm the model accuracy

In [None]:
((y_logit_Setosa >= 0) == (ys.Setosa == 1)).sum()

In [None]:
((y_logit_Setosa >= 0) == (ys.Setosa == 1)).mean()

## Activity | Plot the odds (that it is a Setosa) for each plant in the dataset

In [None]:
# Odds are the exponential of the log odds; a log odds of 0 maps to odds of 1.
y_odds_Setosa = np.exp(y_logit_Setosa)

In [None]:
y_odds_Setosa.plot()

## Activity | Now, using just the odds, confirm the model accuracy

In [None]:
((y_odds_Setosa >= 1) == (ys.Setosa == 1)).sum()

In [None]:
((y_odds_Setosa >= 1) == (ys.Setosa == 1)).mean()

## Activity | Plot the class probabilities (that it is a Setosa) for each plant in the dataset

In [None]:
# Convert odds to a probability: p = odds / (1 + odds), so odds of 1 give p = 0.5.
y_p_Setosa = y_odds_Setosa / (1 + y_odds_Setosa)

In [None]:
y_p_Setosa

In [None]:
y_p_Setosa.plot()

## Activity | Using the class probabilities, confirm the model accuracy once more

In [None]:
((y_p_Setosa >= .5) == (ys.Setosa == 1)).sum()

In [None]:
((y_p_Setosa >= .5) == (ys.Setosa == 1)).mean()

In [None]:
y_p_Setosa

## Activity | Finally, compute the outcome class for the Setosa model and verify the model accuracy

In [None]:
# Predicted class per plant: round the probability to 0.0 or 1.0.
# NOTE(review): NumPy-style rounding sends exactly 0.5 to 0.0 (half to even),
# while the accuracy checks on y_p_Setosa use `>= .5` -- the two would disagree
# only for a probability of exactly 0.5.
y_class_Setosa = y_p_Setosa.round()

In [None]:
y_class_Setosa.plot(ylim = (0, 1.01))

In [None]:
(y_class_Setosa == ys.Setosa).sum()

In [None]:
(y_class_Setosa == ys.Setosa).mean()

## Versicolor

### Activity

In [None]:
# Same setup as for Setosa, but the target is the Versicolor indicator column.
model_Versicolor = linear_model.LogisticRegression()
model_Versicolor.fit(X, ys['Versicolor'])

In [None]:
model_Versicolor.score(X, ys.Versicolor)

In [None]:
# Log odds per plant: intercept + sum over features of (coefficient * value).
y_logit_Versicolor = (
    X.mul(model_Versicolor.coef_[0], axis = 'columns').sum(axis = 'columns')
    + model_Versicolor.intercept_
)

In [None]:
y_logit_Versicolor.plot()

In [None]:
((y_logit_Versicolor >= 0) == (ys.Versicolor == 1)).mean()

In [None]:
((y_logit_Versicolor >= 0) == (ys.Versicolor == 1)).sum()

In [None]:
((y_logit_Versicolor >= 0) != (ys.Versicolor == 1)).sum()

In [None]:
# Odds are the exponential of the log odds; a log odds of 0 maps to odds of 1.
y_odds_Versicolor = np.exp(y_logit_Versicolor)

In [None]:
y_odds_Versicolor.plot()

In [None]:
# Convert odds to a probability: p = odds / (1 + odds), so odds of 1 give p = 0.5.
y_p_Versicolor = y_odds_Versicolor / (1 + y_odds_Versicolor)

In [None]:
# Plot the Versicolor class probabilities (the odds themselves are already
# plotted in an earlier cell).
y_p_Versicolor.plot()

In [None]:
# Predicted class per plant: round the probability to 0.0 or 1.0.
# NOTE(review): NumPy-style rounding sends exactly 0.5 to 0.0 (half to even),
# whereas a log odds of exactly 0 passes the `>= 0` test used in the checks.
y_class_Versicolor = y_p_Versicolor.round()

In [None]:
y_class_Versicolor.plot()

In [None]:
pd.DataFrame({'class': y_class_Versicolor, 'index': y_class_Versicolor.index}).plot(kind = 'scatter', x = 'index', y = 'class')

In [None]:
pd.DataFrame({'class': ys.Versicolor, 'index': ys.Versicolor.index}).plot(kind = 'scatter', x = 'index', y = 'class')

## Virginica

### Activity

In [None]:
# Build and fit in one expression; `fit` hands back the fitted estimator,
# which is what gets bound to model_Virginica.
model_Virginica = (
    linear_model.LogisticRegression()
    .fit(X, ys.Virginica)
)

In [None]:
model_Virginica.score(X, ys.Virginica)

In [None]:
# Log odds per plant: intercept + sum over features of (coefficient * value).
y_logit_Virginica = (
    X.mul(model_Virginica.coef_[0], axis = 'columns').sum(axis = 'columns')
    + model_Virginica.intercept_
)

In [None]:
y_logit_Virginica.plot()

In [None]:
((y_logit_Virginica >= 0) == (ys.Virginica == 1)).mean()

In [None]:
((y_logit_Virginica >= 0) == (ys.Virginica == 1)).sum()

In [None]:
((y_logit_Virginica >= 0) != (ys.Virginica == 1)).sum()

In [None]:
# Odds are the exponential of the log odds; a log odds of 0 maps to odds of 1.
y_odds_Virginica = np.exp(y_logit_Virginica)

In [None]:
y_odds_Virginica.plot()

In [None]:
# Convert odds to a probability: p = odds / (1 + odds), so odds of 1 give p = 0.5.
y_p_Virginica = y_odds_Virginica / (1 + y_odds_Virginica)

In [None]:
y_p_Virginica.plot()

In [None]:
# Predicted class per plant: round the probability to 0.0 or 1.0.
# NOTE(review): NumPy-style rounding sends exactly 0.5 to 0.0 (half to even),
# whereas a log odds of exactly 0 passes the `>= 0` test used in the checks.
y_class_Virginica = y_p_Virginica.round()

In [None]:
y_class_Virginica.plot()

In [None]:
pd.DataFrame({'class': y_class_Virginica, 'index': y_class_Virginica.index}).plot(kind = 'scatter', x = 'index', y = 'class')

In [None]:
pd.DataFrame({'class': ys.Virginica, 'index': ys.Virginica.index}).plot(kind = 'scatter', x = 'index', y = 'class')

## Activity | Let's combine the models together!

In [None]:
# Put the three independent 0/1 predictions side by side, one column per
# species, so rows claimed by zero, one, or several models can be counted.
y_hats = pd.DataFrame({
    'Setosa': y_class_Setosa,
    'Versicolor': y_class_Versicolor,
    'Virginica': y_class_Virginica,
})

In [None]:
y_hats.sum()

In [None]:
y_hats.sum().sum()

In [None]:
y_hats.sum(axis = 1).value_counts()

In [None]:
# Put the three per-species probabilities side by side, one column per species.
y_hats = pd.DataFrame({
    'Setosa': y_p_Setosa,
    'Versicolor': y_p_Versicolor,
    'Virginica': y_p_Virginica,
})

# Highest of the three model probabilities for each plant.
y_hats['p'] = y_hats.max(axis = 1)

# Label each plant with the species whose probability equals the row maximum.
# np.select takes the first matching condition, so Virginica is tested before
# Versicolor, and Setosa is the fallback when neither matches.
y_hats['Class'] = np.select(
    [y_hats.Virginica == y_hats.p, y_hats.Versicolor == y_hats.p],
    ['Virginica', 'Versicolor'],
    default = 'Setosa',
)

In [None]:
y_hats

In [None]:
(y_hats.Class == df.Species).mean()

In [None]:
(y_hats.Class == df.Species).sum()

In [None]:
(y_hats.Class != df.Species).sum()