# DS_HK_15 | Class 09 | Introduction to Logistic Regression

### Guided Practice: Logit Function and Odds

In [None]:
from IPython.core.display import HTML

# Stretch the notebook container to full width and centre Plotly graph divs.
_notebook_css = """
<style>
.container { width:100% !important;}
.plotly-graph-div.js-plotly-plot {margin:0 auto;}
</style>
"""
HTML(_notebook_css)

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline

import plotly
import plotly.figure_factory as ff

plotly.offline.init_notebook_mode(connected=True)

import cufflinks as cf
cf.go_offline()

In [None]:
def logit_func(odds):
    """Return the log odds (logit) for the given odds.

    `odds` is passed straight to ``np.log``, so it is the natural logarithm.
    """
    log_odds = np.log(odds)
    return log_odds

def sigmoid_func(logit):
    """Map a logit (log odds) back to a probability: 1 / (1 + e^(-logit))."""
    denominator = 1 + np.exp(-logit)
    return 1. / denominator


In [None]:
# Each entry is "odds in favour to 1", stored as a single float.
odds_set = [in_favour / against for in_favour, against in (
    (5., 1),
    (20., 1),
    (1.1, 1),
    (1.8, 1),
    (1.6, 1),
)]

In [None]:
# Probabilities for each team: odds -> log odds -> probability
probabilities = [sigmoid_func(logit_func(team_odds)) for team_odds in odds_set]

for probability in probabilities:
    print(probability)

### College Admissions

In [None]:
# Load the college admissions data (path is relative to this notebook)
admissions_csv = '../../assets/dataset/collegeadmissions.csv'
df = pd.read_csv(admissions_csv)

In [None]:
# Preview the first rows of the freshly loaded admissions frame.
df.head()

In [None]:
# One-hot encode `rank` into rank_<value> indicator columns.
rank_dummies = pd.get_dummies(df['rank'], prefix='rank')

# Drop indicator columns left over from an earlier run of this cell, so that
# re-running it does not raise on overlapping column names in join().
kept_columns = [col for col in df.columns if col not in rank_dummies.columns]
df = df[kept_columns].join(rank_dummies)

In [None]:
# Preview again to confirm the rank_* indicator columns were appended.
df.head()

In [None]:
# Overlaid GPA histograms, one per value of `admit`.
df.groupby('admit')['gpa'].hist()

In [None]:
# Split the GPA values by admission outcome
x1 = df.loc[df.admit == 1, 'gpa'].values
x2 = df.loc[df.admit == 0, 'gpa'].values

# Data and labels must be in the same order for create_distplot
hist_data = [x1, x2]
group_labels = ['Admited', 'Not Admited']

# Build the distribution plot with a custom bin size, then title it
fig = ff.create_distplot(hist_data, group_labels, bin_size=0.05)
fig['layout'].update(title='GPA')

# Render inline
plotly.offline.iplot(fig, filename='Distplot with Multiple Datasets')

In [None]:
def genHist(the_df, colour_col, value_col, bin_size):
    """Draw overlaid distribution plots of ``value_col``, one per group.

    Parameters
    ----------
    the_df : pandas.DataFrame
        Frame that holds both columns.
    colour_col : str
        Column whose distinct (non-null) values define the groups.
    value_col : str
        Column whose values are plotted; also used as the figure title.
    bin_size : float
        Bin width forwarded to ``ff.create_distplot``.

    Returns
    -------
    None
        The figure is rendered with ``plotly.offline.iplot``.
    """
    # Resolve the groups once, from the frame that was passed in, so the data
    # series and their labels are guaranteed to be in the same order.
    groups = sorted(the_df[colour_col].dropna().unique())
    hist_data = [
        the_df.loc[the_df[colour_col] == group, value_col].values
        for group in groups
    ]
    # create_distplot documents group_labels as a list of strings.
    group_labels = [str(group) for group in groups]

    fig = ff.create_distplot(hist_data, group_labels, bin_size=bin_size)
    fig['layout'].update(title=value_col)
    plotly.offline.iplot(fig)

In [None]:
# Same GPA-by-admit plot as above, now produced through the reusable helper.
genHist(df, 'admit', 'gpa', 0.05)

In [None]:
# Single-feature model: admit ~ gpa
ft_list = ['gpa']

lm = LogisticRegression()
lm.fit(X=df[ft_list], y=df['admit'])

In [None]:
# Coefficient and intercept of the model fitted above.
# print() is used as a function so the cell also runs on Python 3.
print(lm.coef_)
print(lm.intercept_)

In [None]:
# Two-feature model: admit ~ gpa + gre
ft_list = ['gpa', 'gre']

lm = LogisticRegression()
lm.fit(X=df[ft_list], y=df['admit'])

In [None]:
# Coefficients (one per feature in ft_list) and intercept of the model above.
# print() is used as a function so the cell also runs on Python 3.
print(lm.coef_)
print(lm.intercept_)

In [None]:
# Single indicator feature: admit ~ rank_1
ft_list = ["rank_1"]

lm = LogisticRegression()
lm.fit(X=df[ft_list], y=df['admit'])

In [None]:
# Coefficient and intercept of the rank_1-only model.
# print() is used as a function so the cell also runs on Python 3.
print(lm.coef_)
print(lm.intercept_)

In [None]:
# List every column currently in df, to choose features from.
df.columns.tolist()

In [None]:
# Use every column except the first as a feature.
# NOTE(review): assumes the first column is the target `admit` - confirm.
ft_list = list(df.columns[1:])

lm = LogisticRegression()
lm.fit(X=df[ft_list], y=df['admit'])

In [None]:
# Mean of `admit`; if admit is coded 0/1 this is the share of admitted rows.
print(df.admit.mean())

In [None]:
# Drop the raw `rank` column from the feature list.
# Filtering (rather than list.remove) keeps the cell safe to re-run:
# remove() raises ValueError once 'rank' is already gone.
ft_list = [feature for feature in ft_list if feature != 'rank']

In [None]:
# Refit on the current feature list; X and y are reused by the cells below.
X, y = df[ft_list], df["admit"]
lm.fit(X, y)

In [None]:
predicted = lm.predict(X)
threshold = 0.75

# predict_proba returns one column per class, ordered as in lm.classes_.
# Take the column for the positive class (admit == 1); column 0 is the
# probability of NOT being admitted, which would invert the thresholding below.
positive_col = list(lm.classes_).index(1)
predicted_proba = [row[positive_col] for row in lm.predict_proba(X)]
predicted_proba[:5]

In [None]:
# Label a row 1 only when its probability is strictly above the threshold.
predicted_classes = [1 if proba > threshold else 0 for proba in predicted_proba]

In [None]:
# Accuracy of the custom-threshold labels (threshold = 0.75).
# accuracy_score is already imported in the notebook's import cell.
accuracy_score(y, predicted_classes)

In [None]:
# Accuracy of lm.predict(), i.e. the default decision threshold of 0.5
accuracy_score(y, predicted)

In [None]:
# Confusion matrix for the custom-threshold labels
# (rows = actual class, columns = predicted class).
# confusion_matrix is already imported in the notebook's import cell.
confusion_matrix(y, predicted_classes)

Below is some code to walk through confusion matrices. It'll be useful for working through the Titanic problem.

The ROC curve below is built from a range of thresholds: where the false positive rate (x-axis) is ~0, the true positive rate (y-axis) is also expected to be ~0, and likewise both approach 1 in the top right-hand corner of the figure.

The second chart, which does not play with thresholds, shows the one true TPR and FPR point, joined to (0,0) and (1,1).

The first chart will be more effective as you compare models and determine where the decision line should exist for the data. The second simplifies the first in case this idea of thresholds is confusing.

In [None]:
# roc_curve returns (fpr, tpr, thresholds); compute it once with a 1-D target.
# NOTE(review): predicted_classes are hard 0/1 labels, so the curve has a single
# interior point; pass predicted probabilities for a multi-threshold curve.
fpr, tpr, roc_thresholds = roc_curve(df['admit'], predicted_classes)

# Column names are kept as-is because the AUC-titled plot further down uses them:
# "ROC" holds the false positive rate (x) and "RandomGuess" the true positive rate (y).
df_result = pd.DataFrame(fpr, index=tpr, columns=["ROC"])
df_result["RandomGuess"] = df_result.index

df_result.iplot(x = "ROC", xTitle = "False Positive Rate", y = "RandomGuess", yTitle = "True Positive Rate", title = "ROC Curve")

Finally, you can use the `roc_auc_score` function to calculate the area under these curves (AUC).

In [None]:
# Area under the ROC curve for the thresholded 0/1 predictions.
roc_auc_score(df['admit'], predicted_classes)

In [None]:
# Same ROC plot as before, with the AUC (two decimals) appended to the title.
auc_score = roc_auc_score(df['admit'], predicted_classes)
title = "ROC Curve - AUC: {:.2f}".format(auc_score)
df_result.iplot(
    x="ROC", xTitle="False Positive Rate",
    y="RandomGuess", yTitle="True Positive Rate",
    title=title
)

### Titanic Problem

**Goals**

1. Spend a few minutes determining which data would be most important to use in the prediction problem. You may need to create new features based on the data available. Consider using a feature selection aid in sklearn. At worst, identify one or two strong features that would be useful to include in the model.
2. Spend 1-2 minutes considering which _metric_ makes the most sense to optimize. Accuracy? FPR or TPR? AUC? Given the business problem (understanding survival rate aboard the Titanic), why should you use this metric?
3. Build a tuned Logistic model. Be prepared to explain your design (including regularization), metric, and feature set in predicting survival using the tools necessary (such as a fit chart).

In [None]:
# Load the Titanic passenger data (path is relative to this notebook)
titanic_csv = '../../assets/dataset/titanic.csv'
titanic = pd.read_csv(titanic_csv)

In [None]:
# Preview the first rows of the raw Titanic frame.
titanic.head()

In [None]:
# (rows, columns) of the raw frame.
titanic.shape

In [None]:
# Column dtypes and non-null counts, to spot columns with missing values.
titanic.info()

In [None]:
# Index by passenger id. Guarded so the cell can be re-run: once PassengerId
# is the index it is no longer a column and set_index would raise KeyError.
if titanic.index.name != 'PassengerId':
    titanic = titanic.set_index('PassengerId')

# One-hot encode Pclass into pclass_<value> columns, first dropping any copies
# left by an earlier run so join() does not fail on overlapping names.
pclass_dummies = pd.get_dummies(titanic.Pclass, prefix="pclass")
kept_columns = [col for col in titanic.columns if col not in pclass_dummies.columns]
titanic = titanic[kept_columns].join(pclass_dummies)

# 1 when Sex == 'male', 0 for anything else (including missing values).
titanic['is_male'] = (titanic.Sex == 'male').astype(int)

In [None]:
%matplotlib inline
titanic.groupby('Survived').Age.hist()

In [None]:
# Shape after the indexing / dummy-encoding step above.
titanic.shape

In [None]:
# Fill missing ages with the mean age of the passenger's (Sex, Pclass) group.
titanic['Age'] = (
    titanic
    .groupby(['Sex', 'Pclass'])['Age']
    .transform(lambda ages: ages.fillna(ages.mean()))
)

# Binary flags: 1 when the count is greater than zero, otherwise 0.
titanic['had_parents'] = (titanic.Parch > 0).astype(int)
titanic['had_siblings'] = (titanic.SibSp > 0).astype(int)

In [None]:
#Old Way
# NOTE: sklearn.grid_search and sklearn.cross_validation were deprecated in
# scikit-learn 0.18 and removed in 0.20, so this cell only runs on older
# releases. It is kept to contrast with the "New Way" cell that follows.
# LogisticRegression is already imported in the notebook's import cell.
from sklearn import grid_search, cross_validation

feature_set = titanic[['is_male', 'pclass_1', 'pclass_2', 'Fare', 'Age', 'had_parents', 'had_siblings']]

# Regularisation strengths 10^5 ... 10^-4, with and without class balancing.
# This is a plain dict: the original trailing comma wrapped it in a tuple.
para_dict = {'C': [10**-i for i in range(-5, 5)], 'class_weight': [None, 'balanced']}

gs = grid_search.GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=para_dict,
    cv=cross_validation.KFold(n=len(titanic), n_folds=10),
    scoring='roc_auc'
)

gs.fit(feature_set, titanic.Survived)
gs.grid_scores_

In [None]:
# Best mean cross-validated score (scoring='roc_auc') found by the search.
gs.best_score_

In [None]:
#New Way
from sklearn import model_selection

# use model_selection.GridSearchCV instead of grid_search.GridSearchCV, and
# model_selection.KFold(n_splits=...) instead of
# cross_validation.KFold(n=..., n_folds=...): the new splitter takes no sample
# count, it gets the data when GridSearchCV calls split().
gs = model_selection.GridSearchCV(
    estimator=LogisticRegression(),
    param_grid={'C': [10**-i for i in range(-5, 5)], 'class_weight': [None, 'balanced']},
    cv=model_selection.KFold(n_splits=10),
    scoring='roc_auc'
)

# same as before
gs.fit(feature_set, titanic.Survived)

In [None]:
# use cv_results_ instead of grid_scores_
# cv_results_ contains a lot more info now
gs.cv_results_

In [None]:
# Mean test-fold score for each parameter combination.
gs.cv_results_["mean_test_score"]

In [None]:
# Standard deviation of the test-fold scores for each parameter combination.
gs.cv_results_["std_test_score"]

In [None]:
# Combine (formatted mean, rounded std, params) per parameter combination.
# list() materialises the result: on Python 3 zip() is a lazy, single-use
# iterator, so displaying it would only show "<zip object ...>".
gs_cv_result = list(zip(["%.2f" % x for x in gs.cv_results_["mean_test_score"]],
                        [round(x, 2) for x in gs.cv_results_["std_test_score"]],
                        gs.cv_results_["params"]))

In [None]:
# Display the combined (mean, std, params) result built above.
gs_cv_result

In [None]:
# One record per parameter combination: formatted mean, rounded std, and the
# two tuned parameters pulled out of the params dict.
gs_cv_result = [
    {
        "mean": "%.2f" % mean_score,
        "std": round(std_score, 2),
        "C": params["C"],
        "Weight": params["class_weight"],
    }
    for mean_score, std_score, params in zip(
        gs.cv_results_["mean_test_score"],
        gs.cv_results_["std_test_score"],
        gs.cv_results_["params"],
    )
]
gs_cv_result

In [None]:
# Same attribute as with the old grid_search API: the estimator refit with
# the best-scoring parameter combination.
print(gs.best_estimator_)

In [None]:
# Same attribute as with the old grid_search API: the best mean
# cross-validated score (scoring='roc_auc').
print(gs.best_score_)

In [None]:
# Two randomly drawn passengers, restricted to the model's feature columns.
titanic.sample(2)[['is_male', 'pclass_1', 'pclass_2', 'Fare', 'Age', 'had_parents', 'had_siblings']]

In [None]:
# Class probabilities for two randomly drawn passengers.
# NOTE(review): sample(2) is called again here without a random_state, so these
# are generally not the same two rows displayed in the previous cell.
gs.predict_proba(titanic.sample(2)[['is_male', 'pclass_1', 'pclass_2', 'Fare', 'Age', 'had_parents', 'had_siblings']])