# Class 10 - Starter Code

In [None]:
import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn import neighbors
from sklearn import grid_search
from sklearn import metrics
from sklearn import linear_model
from sklearn import dummy

import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid", font_scale=1)
%matplotlib inline

### Load Dataset and Pre-Process

In [None]:
# read the flight-delay records from the shared assets folder
flights_csv = '../../assets/dataset/flight_delays.csv'
df = pd.read_csv(flights_csv)

# preview the first few rows to sanity-check the load
df.head()

In [None]:
# drop the all-blank column
# the membership check makes this cell safe to run more than once: without it,
# a second run raises KeyError because the column has already been removed
if 'Unnamed: 15' in df.columns:
    df = df.drop('Unnamed: 15', axis=1)

In [None]:
# remove every row that contains at least one missing value,
# announcing it only when there is actually something to remove
has_missing = df.isnull().values.any()
if has_missing:
    print("There are missing values")
    df = df.dropna()
    print("Missing values dropped")

In [None]:
# show the dtype pandas inferred for each column
column_types = df.dtypes
print(column_types)

# Part 1: Visualizing models over variables

### 1.1 Visualizing over Day of Week and Time of Day

API Docs for [sklearn.linear_model.LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [None]:
# one-hot encode the `DAY_OF_WEEK` feature
# the raw column is removed once encoded, so its absence means the encoding
# has already happened and the cell can be re-run without side effects
if 'DAY_OF_WEEK' in df.columns:
    day_dummies = pd.get_dummies(df['DAY_OF_WEEK'], prefix='DAY_OF_WEEK')
    # attach the indicator columns and discard the original in one step
    df = df.join(day_dummies).drop(['DAY_OF_WEEK'], axis=1)

# model inputs: every day-of-week indicator column, then the departure-time column
features = [col for col in df.columns if 'DAY_OF_WEEK_' in col] + ['CRS_DEP_TIME']
print(features)

In [None]:
# design matrix (selected feature columns) and target column
X = df[features]
y = df['DEP_DEL15']

# fit a logistic regression on the full data set
lm = linear_model.LogisticRegression()
lm.fit(X, y)

# keep the second column of predict_proba, i.e. the probability assigned to
# the second class in lm.classes_ (presumably delayed=1 -- confirm)
class_probabilities = lm.predict_proba(X)
df['probability'] = class_probabilities[:, 1]

In [None]:
# plot predicted delay probability against departure time, one colour per day
ax = plt.subplot(111)
# NOTE(review): labels are attached to the DAY_OF_WEEK_* columns purely by
# position, so 'Sun' goes with the first indicator column. If the data follows
# the US BTS coding (1 = Monday ... 7 = Sunday) these labels are shifted by one
# day -- confirm against the dataset's code book before relying on the legend.
labels = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
# evenly spaced colours from the `jet` colormap, one per label
# (pyplot already exposes the colormap module as plt.cm, so no extra import is needed)
colors = [plt.cm.jet(x) for x in np.linspace(0, 1, len(labels))]

# `features` ends with 'CRS_DEP_TIME'; zip stops after the last label, so only
# the day-of-week indicator columns are visited
for day_col, label, color in zip(features, labels, colors):
    df[df[day_col] == 1].plot(x='CRS_DEP_TIME', y='probability', kind='scatter', label=label, color=color, ax=ax)

ax.set(title='Probability of Delay\n Based on Day of Week and Time of Day')
plt.legend(loc='upper left')

### 1.2 Visualizing over Airline Carrier

In [None]:
# create dummy variables for `CARRIER` feature
### FILL IN ###
    
# list of features to use to build model
### FILL IN ###

In [None]:
# set X and y and build model
### FILL IN ###

# train model
### FILL IN ###

# predict
### FILL IN ###

In [None]:
# plot predicted delay probability against departure time, one colour per carrier
ax = plt.subplot(111)
# every label is itself the name of a carrier indicator column
labels = [i for i in df.columns if 'CARRIER_' in i]
# evenly spaced colours from the `jet` colormap, one per carrier column
# (pyplot already exposes the colormap module as plt.cm, so no extra import is needed)
colors = [plt.cm.jet(x) for x in np.linspace(0, 1, len(labels))]

for carrier_col, color in zip(labels, colors):
    # select rows through the carrier column being labelled, not through the
    # position in `features`: the legend stays correct however `features`
    # was ordered or whatever extra columns it contains
    df[df[carrier_col] == 1].plot(x='CRS_DEP_TIME', y='probability', kind='scatter', label=carrier_col, color=color, ax=ax)

ax.set(title='Probability of Delay\n Based on Carrier and Time of Day')

# Shrink current axis by 20% to make room for the legend
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

plt.show()

# Part 2: Visualizing Performance Against Baseline

### 2.1 Visualizing ROC Curves

API Docs for [sklearn.metrics.roc_curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html)

In [None]:
# one-hot encode `DAY_OF_WEEK` unless an earlier cell already did so
# (the raw column is removed after encoding, which is what makes a re-run safe)
if 'DAY_OF_WEEK' in df.columns:
    dow_dummies = pd.get_dummies(df['DAY_OF_WEEK'], prefix='DAY_OF_WEEK')
    df = df.join(dow_dummies)
    df = df.drop(['DAY_OF_WEEK'], axis=1)

# rebuild the feature list: day-of-week indicator columns, then departure time
features = []
for column in df.columns:
    if 'DAY_OF_WEEK_' in column:
        features.append(column)
features.append('CRS_DEP_TIME')
print(features)

In [None]:
# set X and y and build model
X = df[features]
y = df['DEP_DEL15']

# train and predict using dummy model (the baseline)
# random_state is fixed so that, when the classifier's strategy draws random
# predictions (this depends on the scikit-learn version's default), the baseline
# column and the curves built from it are identical on every run
dm = dummy.DummyClassifier(random_state=0)
dm.fit(X, y)
# second column of predict_proba = probability of the second class in dm.classes_
df['probability_dm'] = dm.predict_proba(X)[:, 1]

# train and predict using logistic model
lm = linear_model.LogisticRegression()
lm.fit(X, y)
df['probability_lm'] = lm.predict_proba(X)[:, 1]

In [None]:
# draw one ROC curve per model on a shared axis
ax = plt.subplot(111)
roc_inputs = [
    ('probability_dm', 'Dummy Model'),
    ('probability_lm', 'Logistic Regression'),
]
for score_col, curve_label in roc_inputs:
    # roc_curve yields false positive rates, true positive rates and thresholds
    fpr, tpr, thresholds = metrics.roc_curve(df['DEP_DEL15'], df[score_col])
    ax.plot(fpr, tpr, label=curve_label)

ax.set(
    title='Area Under the Curve for Prediction (delayed=1)',
    ylabel='True Positive Rate',
    xlabel='False Positive Rate',
    xlim=(0, 1),
    ylim=(0, 1),
)
plt.legend(loc='lower right')

### 2.2 Visualizing Precision-Recall Curves

API Docs for [sklearn.metrics.precision_recall_curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html)

In [None]:
# plot precision-recall curve
# Exercise cell: each `### FILL IN ###` is to be replaced with a call on the
# `metrics` module (see the precision_recall_curve docs linked above) that takes
# the true labels and one model's predicted probabilities.
ax = plt.subplot(111)
# baseline model: element 0 of the result is drawn on the y axis (Precision)
# and element 1 on the x axis (Recall)
vals = metrics.### FILL IN ###
ax.plot(vals[1], vals[0], label='Dummy Model')
# logistic regression: same call, using its probability column instead
vals = metrics.### FILL IN ###
ax.plot(vals[1], vals[0], label='Logistic Regression')

# both axes are clipped to the [0, 1] range
ax.set(title='Precision-Recall Curve for Prediction (delayed=1)', ylabel='Precision', xlabel='Recall', xlim=(0, 1), ylim=(0, 1))
plt.legend(loc='upper right')