In [None]:
import numpy as np
import pandas as pd
import patsy

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# sklearn.cross_validation and sklearn.grid_search were removed in scikit-learn 0.20;
# prefer sklearn.model_selection and fall back to the old modules on older installs.
try:
    from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
except ImportError:
    from sklearn.cross_validation import train_test_split, cross_val_score
    from sklearn.grid_search import GridSearchCV

from IPython.core.display import HTML, Image

# Advanced Model Tuning

In [None]:
# Show the image at this URL inline in the notebook, with width=400.
Image(url='http://pix-media.s3.amazonaws.com/blog/1086/t-pain.jpg', width=400)

## AKA Autotune...

## AKA...<br><br>One simple trick to minimize your loss functions!

# We are going to build a model using SF crime data. It will use day, time, and district to predict the crime type.

## Load our data set 

In [None]:
# Read the SF crime training CSV and keep only the rows without missing values.
sf_crime = pd.read_csv('./assets/datasets/sf_crime_train.csv').dropna()

In [None]:
# Preview the first rows of the cleaned frame.
sf_crime.head()

## Data type conversions and transformations

In [None]:
# Convert the Dates column to datetimes and build a DatetimeIndex over its values.
sf_crime['Dates'] = pd.to_datetime(sf_crime['Dates'])
sf_crime_dates = pd.DatetimeIndex(sf_crime['Dates'].values, dtype='datetime64[ns]', freq=None)

# Add hour, month and year columns taken from each timestamp.
for part in ('hour', 'month', 'year'):
    sf_crime[part] = getattr(sf_crime_dates, part)

## Let's see what all the listed crimes are

In [None]:
# List the distinct crime categories present in the data.
sf_crime['Category'].unique()

## Select a subsection of the listed crimes

In [None]:
# Restrict the data to the three crime categories we want to model.
subset = ['VEHICLE THEFT', 'BURGLARY', 'DRUG/NARCOTIC']
is_selected = sf_crime['Category'].isin(subset)
sf_crime_sub = sf_crime[is_selected]

In [None]:
# Preview the first rows of the filtered subset.
sf_crime_sub.head()

## Check the total number of districts

In [None]:
# Distinct police districts that appear in the subset...
sf_crime_sub['PdDistrict'].unique()

In [None]:
# ...and how many distinct districts there are.
sf_crime_sub['PdDistrict'].nunique()

## Set up our design matrix and target vector with Patsy

### Patsy allows us to use R-style formulas to do this 

In [None]:
# Right-hand-side-only formula: hour, day of week and district, each wrapped in
# patsy's C() so it is treated as a categorical variable.
formula = '~ C(hour) + C(DayOfWeek) + C(PdDistrict)'
X = patsy.dmatrix(formula, sf_crime_sub)

# Target vector: the crime category of every row in the subset.
y = sf_crime_sub['Category'].values

In [None]:
# sf_crime_sub[["hour","DayOfWeek","PdDistrict"]]

In [None]:
# sf_crime_sub["hour"] = sf_crime_sub["hour"].astype('category', ordered=True)
# X = pd.get_dummies(sf_crime_sub[["hour","DayOfWeek","PdDistrict"]])
# X.head()

In [None]:
# Inspect the target vector of crime-category labels.
y

## Let's look at our design matrix as a DataFrame

In [None]:
# Wrap the design matrix in a DataFrame labelled with patsy's generated column
# names, and append the target so features and label can be viewed side by side.
pdf = pd.DataFrame(X, columns=X.design_info.column_names).assign(Target=y)
pdf

## Let's see how many districts are listed in our design matrix 

In [None]:
# Number of distinct districts in the raw subset, for comparison with the design matrix.
sf_crime_sub['PdDistrict'].nunique()

In [None]:
# Design-matrix columns that encode a police district.
[col for col in pdf.columns if 'PdDistrict' in col]

In [None]:
# Count the distinct district columns in the design matrix.
len({col for col in pdf.columns if 'PdDistrict' in col})

## And how many hours?

In [None]:
# Number of distinct hour values in the raw subset.
sf_crime_sub['hour'].nunique()

In [None]:
# Count the distinct hour columns in the design matrix.
len({col for col in pdf.columns if 'hour' in col})

## Check: Why is there one fewer in both?

## Set up our training and testing sets

In [None]:
# Hold out 33% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=77)

## Now let's fit a standard logistic regression model

In [None]:
# Logistic regression using the liblinear solver; all other settings are left at their defaults.
lr = LogisticRegression(solver='liblinear')

In [None]:
# Fit on the training split only; the test split is kept for evaluation below.
lr_model = lr.fit(X_train, y_train)

### Make our predictions

In [None]:
# Predicted crime category for every row of the held-out test set.
lr_ypred = lr_model.predict(X_test)

### Check our misclassifications with a confusion matrix

In [None]:
# Confusion matrix as a labelled DataFrame: the same class labels, in the same
# order, are used for the rows and the columns.
lr_cm = pd.DataFrame(
    confusion_matrix(y_test, lr_ypred, labels=lr.classes_),
    index=lr.classes_,
    columns=lr.classes_,
)
lr_cm

### Check our precision, recall, and f1

In [None]:
# Text report of the classification metrics on the test set, with classes listed in lr.classes_ order.
print(classification_report(y_test, lr_ypred, labels=lr.classes_))

## Check the CV Score

In [None]:
# Mean of the 3-fold cross-validation scores, computed on the full data set (X, y).
cross_val_score(lr, X, y, cv=3).mean()

## Let's now use a penalized regression - we'll use lasso (l1)

In [None]:
lr_l1 = LogisticRegression(C=1.5, penalty='l1', solver='liblinear')
lr_l1_model = lr_l1.fit(X_train, y_train)

In [None]:
lr_l1_model = lr_l1.fit(X_train, y_train)

In [None]:
# Predictions of the L1-penalised model on the held-out test set.
lr_l1_ypred = lr_l1_model.predict(X_test)

## Get confusion matrix

In [None]:
# Confusion matrix for the L1 model as a labelled DataFrame, with the same
# class labels on the rows and the columns.
lr_l1_cm = pd.DataFrame(
    confusion_matrix(y_test, lr_l1_ypred, labels=lr_l1.classes_),
    index=lr_l1.classes_,
    columns=lr_l1.classes_,
)
lr_l1_cm

## Get classification report

In [None]:
# Text report of the L1 model's classification metrics on the test set.
print(classification_report(y_test, lr_l1_ypred, labels=lr_l1.classes_))

## Get mean cross val score

In [None]:
# Mean of the 3-fold cross-validation scores for the L1 model on the full data set.
cross_val_score(lr_l1, X, y, cv=3).mean()

 ## Looks like a minimal improvement with L1 penalty at 1.5, how about other values?

## We can build a function to test this

In [None]:
def test_penalties(c_val):
    """Cross-validate an L1-penalised logistic regression for one value of C.

    Parameters
    ----------
    c_val : float
        Value passed as ``C`` to ``LogisticRegression``.

    Returns
    -------
    The per-fold scores returned by ``cross_val_score`` with ``cv=3``,
    evaluated on the notebook-level ``X`` and ``y``.
    """
    model = LogisticRegression(C=c_val, penalty='l1', solver='liblinear')
    return cross_val_score(model, X, y, cv=3)

In [None]:
# Candidate C values to try, one per row.
c_grid = [.001, .01, .1, 1, 1.5, 2.5, 5, 10, 100]
test_cs = pd.DataFrame({'c_vals': c_grid})

# One row of per-fold scores for each C, averaged across the folds.
fold_scores = pd.DataFrame([test_penalties(c) for c in test_cs['c_vals']])
score_frame = fold_scores.mean(axis=1).to_frame('score')

pd.concat([test_cs, score_frame], axis=1)

## Sklearn has a function that will do this for us already

In [None]:
# fit model with three folds (cv=3) and lasso (l1) regularization
# use Cs=20 to test a grid of 20 distinct C values
# remember: C is the inverse of regularization strength
logreg_cv = LogisticRegressionCV(Cs=20, solver='liblinear', cv=3, penalty='l1', scoring='f1')
cv_model = logreg_cv.fit(X_train, y_train)

## Find best C per class

In [None]:
print('best C for class:')
# C_ holds the C selected for each class, aligned with classes_. Cs_ is only the
# grid of candidate values that was searched, so it must not be paired with classes_.
best_C = dict(zip(logreg_cv.classes_, logreg_cv.C_))
print(best_C)

## Get the classification report for best model

In [None]:
# Text report for the cross-validated model's predictions on the held-out test set.
print(classification_report(y_test, logreg_cv.predict(X_test)))

## E1. 

## Using the data set (pdf), fit a model to predict between "Burglary" and "Drug/Narcotic" crimes
## One model should use l1 and the other should use an l2 penalty
## Make sure to use train_test_split
## Print out a confusion matrix and a classification report for both
## Finally, build a third model that uses LogisticRegressionCV
## Print out a confusion matrix, classification report and the best value of C

## Select the appropriate rows

## Apply train test split

## Fit our 2 models

## Use our fitted models to make predictions

## Get our confusion matrices

## L1 confusion matrix

## L2 matrix

## L1 model classification report

## L2 classification report

## Now using LRCV

## Get our best C

## Our confusion matrix

## Our classification report

## Introducing GridSearchCV

## To start we'll select a model and penalties and some hyperparameters 
## Then we'll pass those to GridSearchCV

In [None]:
# Base estimator plus the hyperparameter values to search over.
logreg = LogisticRegression(solver='liblinear')
C_vals = [0.0001, 0.001, 0.01, 0.1, .15, .25, .275, .33, 0.5, .66, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
penalties = ['l1', 'l2']
param_grid = {'penalty': penalties, 'C': C_vals}

# Search every (penalty, C) combination with cv=15 on the full X and y.
gs = GridSearchCV(logreg, param_grid, verbose=False, cv=15)
gs.fit(X, y)

## Now let's find the best parameters

In [None]:
# The penalty / C combination that the grid search selected.
gs.best_params_

## Use this parameter to .fit, .predict, and print a classification_report for our X and Y

In [None]:
logreg = LogisticRegression(C=gs.best_params_['C'], penalty=gs.best_params_['penalty'])
cv_model = logreg.fit(vice_X_train, vice_y_train)

In [None]:
cv_pred = cv_model.predict(vice_X_test)

## Now let's check our stats...

In [None]:
cm3 = confusion_matrix(vice_y_test, cv_pred, labels=logreg.classes_)
cm3 = pd.DataFrame(cm3, columns=logreg.classes_, index=logreg.classes_)

In [None]:
cm3

In [None]:
print(classification_report(vice_y_test, cv_pred, labels=logreg.classes_))

## Independent Practice

## Use GridSearchCV with knn on the iris data set
## Use train_test_split with a test size of .66
## Set a parameter dictionary with the number of neighbors and one other parameter
## Get your best estimator and print out a classification report

## First, we load our data

In [None]:
# Load the iris data set that ships with scikit-learn.
from sklearn.datasets import load_iris
iris = load_iris()

## Set our X matrix and y vector

In [None]:
# NOTE: this rebinds the notebook-level X and y (until now the crime design matrix
# and labels) to the iris data and target, so earlier cells that read X / y
# will see the iris data if they are re-run after this point.
X = iris.data
y = iris.target

## Train test split our data

In [None]:
# Keep 66% of the rows for testing, as the exercise asks. random_state is fixed
# (same seed as the crime-data split) so the split and the results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.66, random_state=77)

## Next we set up our possible params and choose a model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier with default settings; the grid search below supplies the parameters.
knn = KNeighborsClassifier()

In [None]:
# Search space: n_neighbors from 1 to 30, crossed with both weighting schemes.
param_dict = {
    'n_neighbors': range(1, 31),
    'weights': ['uniform', 'distance'],
}

## These are then passed into GridSearchCV and fit

In [None]:
# Grid search over param_dict, ranking the candidates by accuracy.
gscv = GridSearchCV(knn, param_dict, scoring='accuracy')

In [None]:
# Run the search on the training split only.
gscv_model = gscv.fit(X_train, y_train)

## Let's see our best model

In [None]:
# The estimator configured with the best parameter combination found.
gscv_model.best_estimator_

## We could actually call fit on this model as it is a model object

## Can also just retrieve the params

In [None]:
# Just the winning parameter values, as a dict.
gscv.best_params_

## Using the gridsearched params we can get predictions

In [None]:
# Predict the test-set labels with the fitted grid search.
gscv_ypred = gscv.predict(X_test)

## Now get our reports

In [None]:
# Text report of the classification metrics for the grid-searched model on the test set.
print(classification_report(y_test, gscv_ypred))

In [None]:
# Full parameter dict of the best estimator, not just the parameters that were searched.
gscv_model.best_estimator_.get_params()