In [138]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn import feature_extraction, feature_selection
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import grid_search
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer

from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

In [2]:
%load_ext sql

  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")


## Pre-Task: Describe the goals of your study

## Part 1: Aquire the Data

In [3]:
#psql -h dsi.c20gkj5cvu3l.us-east-1.rds.amazonaws.com -p 5432 -U dsi_student titanic
#password: gastudents

#### 1. Connect to the remote database

In [4]:
%%sql postgresql://dsi_student:gastudents@dsi.c20gkj5cvu3l.us-east-1.rds.amazonaws.com/titanic
SELECT table_name
FROM information_schema.tables
where table_catalog = 'titanic'
and table_schema = 'public'
ORDER BY table_name;

8 rows affected.


table_name
account
account_information
evictions_simple
howie
jacques
table1
train
user


In [5]:
%%sql
select * from train limit 3

3 rows affected.


index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


#### 2. Query the database and aggregate the data

In [6]:
df = %sql select * from train;
df = df.DataFrame()

891 rows affected.


In [7]:
df.head()

Unnamed: 0,index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### 5. What are the risks and assumptions of our data? 

## Part 2: Exploratory Data Analysis

#### 1. Describe the Data

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
index          891 non-null int64
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [9]:
from sklearn.feature_extraction import DictVectorizer as DV

In [10]:
class SeriesImputer(BaseEstimator, TransformerMixin):
    """ Impute missing values.
       Columns of dtype object are imputed with the most frequent value 
       in column.

       Columns of other types are imputed with mean of column.
    """
       # def __init__(self)
   
    def fit(self, X, y=None):
        if len(X.mode()) != 0 :
            self.fill = (X.mode()[0] if X.dtype == np.dtype('O') else X.mean())
        else :      
            self.fill = X[X.first_valid_index()]        
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [11]:
class OneHotEncoderPandasCategoricalSeries(BaseEstimator, TransformerMixin):
    def __init__(self, sparse=False, **kwargs):       
        self.ohe = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=sparse, **kwargs)
        self.NA_CAT = '-NA-'
        
    def _X_for_onehotenc(self, X):       
        return X.cat.codes.reshape(-1,1 )
      
    
    def fit(self, X, y=None):   
        X2 = pd.Series(pd.Categorical(X, categories=list(X.cat.categories)+[self.NA_CAT]))
        X2.fillna(self.NA_CAT, inplace=True)        
        self.categories = X2.cat.categories
        self.ohe.fit(self._X_for_onehotenc(X2))
        return self
                
    def transform(self, X, y=None): 
        X = pd.Series(pd.Categorical(X, categories=list(X.cat.categories)+[self.NA_CAT]))
        X.fillna(self.NA_CAT, inplace=True)
        if (len(X.cat.categories) != len(self.categories)) or np.all(X.cat.categories != self.categories):
            X = pd.Series(pd.Categorical(X, categories=self.categories))
        
        print X.cat.categories
        print X.cat.codes.unique()
        return self.ohe.transform(self._X_for_onehotenc(X))

In [12]:
class DebugTransformer():
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        print X.shape
        return X

In [13]:
class CatOneColumn(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):    
        #self.categories = X.astype('category').cat.categories
        return self

    def transform(self, X, y=None):
        #X = pd.Series(pd.Categorical(X, categories=self.categories))
        X =  X.astype('category')
        return X
    

In [170]:
class DictVectSeries(BaseEstimator, TransformerMixin):
 

    def fit(self, X, y=None):    
        #self.categories = X.astype('category').cat.categories
        #X.to_dict()
        X = X.astype(str)
        
        samples = [dict(enumerate(sample)) for sample in X]
        self.dict_ = DictVectorizer()
        
        self.dict_.fit(samples)
        
        return self

    def transform(self, X, y=None):
        #X = pd.Series(pd.Categorical(X, categories=self.categories))
        X = X.astype(str)
        samples = [dict(enumerate(sample)) for sample in X]        
        return self.dict_.transform(samples)
    

In [14]:
class SelectOneColumn(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # return X.ix[:, self.column]
        return X[self.column]

In [15]:
class GetVectorizeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self = self
        
    def fit(self, X, *_):
        return self
    
    def transform(self, X, *_):
       # if isinstance(X, pd.Series):
        return feature_extraction.text.TfidfVectorizer().fit_transform(X)
       # else:
     #       raise TypeError("This transformer only works with Pandas Dataframes")
    

    

In [16]:
class DFback(BaseEstimator, TransformerMixin):
    def __init__(self):
        self = self

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # return X.ix[:, self.column]
        return pd.DataFrame(X)

In [17]:
class DenseTransformer(BaseEstimator, TransformerMixin):

    def fit_transform(self, X, y=None):
        return self.transform(X)

    def transform(self, X):
        return X.todense()

    def fit(self, X, y=None):
        return self

In [18]:
#OneHotEncoderPandasCategoricalSeries() df['Sex']

In [168]:
Embarked_pipe = Pipeline([('Col_Embarked', SelectOneColumn('Embarked')), 
                          ('Fill_na_Embarked', SeriesImputer()),
                        
                        ('Dummy_Embarked', DictVectSeries())])

In [171]:
SibSp_pipe = Pipeline([('Col_SibSp', SelectOneColumn('SibSp')), 
                       ('Fill_na_SibSp', SeriesImputer()),
                      
                      ('Dummy_SibSp', DictVectSeries())])

In [172]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33, random_state=42)
    
fit_model = SibSp_pipe.fit(X_train)
X_train_t = fit_model.transform(X_train)

In [77]:
D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]

In [78]:
D

[{'bar': 2, 'foo': 1}, {'baz': 1, 'foo': 3}]

In [124]:

class RowIterator(BaseEstimator, TransformerMixin):
    """ Prepare dataframe for DictVectorizer """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return (item for _, item in X.iteritems())




In [155]:
pptest = make_pipeline(DictVectSeries())

In [156]:
pptest.fit(s)

Pipeline(steps=[('dictvectseries', DictVectSeries())])

In [157]:
pptest.transform(s)

<3x2 sparse matrix of type '<type 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [158]:
pptest.transform(shitass)

<2x2 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [159]:
type(s)

pandas.core.series.Series

In [141]:
samples = [dict(enumerate(sample)) for sample in s]

In [142]:
vectorizer = make_pipeline(DictVectorizer())



In [146]:
# now you can use vectorizer as you might expect, e.g.
hihi = vectorizer.fit(samples)

In [149]:
hihi.transform(samplesShit)

<2x2 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [148]:
samplesShit = [dict(enumerate(sample)) for sample in shitass]

In [147]:
shitass = pd.Series(['A','D'])

In [145]:
hihi

<3x2 sparse matrix of type '<type 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [128]:
type(hihi)

generator

In [95]:
s = pd.Series(['A','B','A'])

In [96]:
s

0    A
1    B
2    A
dtype: object

In [134]:
ss = dict(s)

In [135]:
DictVectorizer().fit(ss)

AttributeError: 'numpy.int64' object has no attribute 'iteritems'

In [27]:
s = pd.Series(pd.Categorical([1,2,3], categories=[1,2]))

In [28]:
s

0    1.0
1    2.0
2    NaN
dtype: category
Categories (2, int64): [1, 2]

In [29]:
ohe = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=True, categorical_features=[1,2,3])

In [30]:
s2 = pd.Series(pd.Categorical(s, categories=[1,2,'NA']))

In [31]:
s2.fillna('NA').cat.codes

0    0
1    1
2    2
dtype: int8

In [32]:
import sklearn

In [33]:
sklearn.__version__

'0.17.1'

In [34]:
ohe.transform(pd.Series(s))



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [35]:
Age_pipe = Pipeline([('Col_Age', SelectOneColumn('Age')), 
                      ('Fill_na_Age', SeriesImputer()),
                     ('DataFrame_Age',DFback()),
                      ('StandardScaler_Age', MinMaxScaler())])

In [36]:
Fare_pipe = Pipeline([('Col_Fare', SelectOneColumn('Fare')), 
                      ('Fill_na_Fare', SeriesImputer()),
                      ('DataFrame_Age',DFback()),
                      ('StandardScaler_Fare', MinMaxScaler())])

In [37]:
#testpipe = Pipeline([('test', CatOneColumn())])

In [38]:
Embarked_pipe = Pipeline([('Col_Embarked', SelectOneColumn('Embarked')), 
                          ('Fill_na_Embarked', SeriesImputer()),
                          ('Cat_Embarked', CatOneColumn()),
                        ('Dummy_Embarked', OneHotEncoderPandasCategoricalSeries())])
Sex_pipe = Pipeline([('Col_Sex', SelectOneColumn('Sex')), 
                      ('Fill_na_Sex', SeriesImputer()),
                      ('Cat_Sex', CatOneColumn()),
                      ('Dummy_Sex', OneHotEncoderPandasCategoricalSeries())])
Pclass_pipe = Pipeline([('Col_Pclass', SelectOneColumn('Pclass')), 
                        ('Fill_na_Pclass', SeriesImputer()),
                        ('Cat_Pclass', CatOneColumn()),
                      ('Dummy_Pclass', OneHotEncoderPandasCategoricalSeries())])
SibSp_pipe = Pipeline([('Col_SibSp', SelectOneColumn('SibSp')), 
                       ('Fill_na_SibSp', SeriesImputer()),
                       ('Cat_SibSp', CatOneColumn()),
                      ('Dummy_SibSp', OneHotEncoderPandasCategoricalSeries())])
Parch_pipe = Pipeline([('Col_Parch', SelectOneColumn('Parch')), 
                       ('Fill_na_Parch', SeriesImputer()),
                       ('Cat_Parch', CatOneColumn()),
                      ('Dummy_Parch', OneHotEncoderPandasCategoricalSeries())])


In [39]:
Name_pipe = Pipeline([('Col_Name', SelectOneColumn('Name')), 
                       ('Fill_na_Name', SeriesImputer()),
                      ('Vectorize_Name', TfidfVectorizer()),
                         ('Dense_Name', DenseTransformer())])
Ticket_pipe = Pipeline([('Col_Ticket', SelectOneColumn('Ticket')), 
                       ('Fill_na_Ticket', SeriesImputer()),
                      ('Vectorize_Ticket', TfidfVectorizer()),
                         ('Dense_Ticket', DenseTransformer())])
Cabin_pipe = Pipeline([('Col_Cabin', SelectOneColumn('Cabin')), 
                       ('Fill_na_Cabin', SeriesImputer()),
                      ('Vectorize_Cabin', TfidfVectorizer()),
                         ('Dense_Cabin', DenseTransformer())])

In [40]:
union = make_union(Age_pipe, Fare_pipe, 
                   Embarked_pipe, Sex_pipe, Pclass_pipe, SibSp_pipe, Parch_pipe, 
                   Name_pipe,Ticket_pipe, Cabin_pipe
                  )

In [41]:
X = df #[list(set(df.columns)-set(['Cabin']))]
y = df[u'Survived']

In [42]:
#union.fit(X)

In [43]:
cv = cross_validation.KFold(len(y), n_folds=5, shuffle=True)

In [44]:
result = cross_validation.cross_val_score(LogisticRegression(), 
                                          X=union.fit_transform(X), y=y, cv=cv)

Index([u'C', u'Q', u'S', u'-NA-'], dtype='object')
[2 0 1]
Index([u'female', u'male', u'-NA-'], dtype='object')
[1 0]
Index([1, 2, 3, u'-NA-'], dtype='object')
[2 0 1]
Index([0, 1, 2, 3, 4, 5, 8, u'-NA-'], dtype='object')
[1 0 3 4 2 5 6]
Index([0, 1, 2, 3, 4, 5, 6, u'-NA-'], dtype='object')
[0 1 2 5 3 4 6]


'Categorical.from_codes(codes, categories)'?
'Categorical.from_codes(codes, categories)'?


In [45]:
result

array([ 0.83240223,  0.83146067,  0.80898876,  0.80337079,  0.83146067])

In [46]:
result.mean()

0.82153662670265537

In [47]:
X['SibSp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

# Now, use feature_selection first and then LogisticRegression

In [48]:
union_pipe = make_pipeline(union, feature_selection.SelectFromModel(LogisticRegressionCV()))

In [49]:
result = cross_validation.cross_val_score(LogisticRegression(), 
                                          X=union_pipe.fit(X,y).transform(X), y=y, cv=cv, n_jobs=-1)

Index([u'C', u'Q', u'S', u'-NA-'], dtype='object')
[2 0 1]
Index([u'female', u'male', u'-NA-'], dtype='object')
[1 0]
Index([1, 2, 3, u'-NA-'], dtype='object')
[2 0 1]
Index([0, 1, 2, 3, 4, 5, 8, u'-NA-'], dtype='object')
[1 0 3 4 2 5 6]
Index([0, 1, 2, 3, 4, 5, 6, u'-NA-'], dtype='object')
[0 1 2 5 3 4 6]


'Categorical.from_codes(codes, categories)'?
'Categorical.from_codes(codes, categories)'?


Index([u'C', u'Q', u'S', u'-NA-'], dtype='object')
[2 0 1]
Index([u'female', u'male', u'-NA-'], dtype='object')
[1 0]
Index([1, 2, 3, u'-NA-'], dtype='object')
[2 0 1]
Index([0, 1, 2, 3, 4, 5, 8, u'-NA-'], dtype='object')
[1 0 3 4 2 5 6]
Index([0, 1, 2, 3, 4, 5, 6, u'-NA-'], dtype='object')
[0 1 2 5 3 4 6]


In [50]:
result.mean()

0.82043186240662858

### Some improvements

In [51]:
Xt = union_pipe.fit(X,y).transform(X)

Index([u'C', u'Q', u'S', u'-NA-'], dtype='object')
[2 0 1]
Index([u'female', u'male', u'-NA-'], dtype='object')
[1 0]
Index([1, 2, 3, u'-NA-'], dtype='object')
[2 0 1]
Index([0, 1, 2, 3, 4, 5, 8, u'-NA-'], dtype='object')
[1 0 3 4 2 5 6]
Index([0, 1, 2, 3, 4, 5, 6, u'-NA-'], dtype='object')
[0 1 2 5 3 4 6]


'Categorical.from_codes(codes, categories)'?
'Categorical.from_codes(codes, categories)'?


Index([u'C', u'Q', u'S', u'-NA-'], dtype='object')
[2 0 1]
Index([u'female', u'male', u'-NA-'], dtype='object')
[1 0]
Index([1, 2, 3, u'-NA-'], dtype='object')
[2 0 1]
Index([0, 1, 2, 3, 4, 5, 8, u'-NA-'], dtype='object')
[1 0 3 4 2 5 6]
Index([0, 1, 2, 3, 4, 5, 6, u'-NA-'], dtype='object')
[0 1 2 5 3 4 6]


In [52]:
grid_search_pipe = Pipeline([('Logistic_Reg', LogisticRegression())])

In [53]:
grid_search_pipe.get_params().keys()

['Logistic_Reg__multi_class',
 'Logistic_Reg__dual',
 'Logistic_Reg__fit_intercept',
 'Logistic_Reg__max_iter',
 'Logistic_Reg__intercept_scaling',
 'Logistic_Reg__warm_start',
 'Logistic_Reg__penalty',
 'Logistic_Reg__n_jobs',
 'Logistic_Reg__C',
 'Logistic_Reg__solver',
 'steps',
 'Logistic_Reg__class_weight',
 'Logistic_Reg__random_state',
 'Logistic_Reg__tol',
 'Logistic_Reg',
 'Logistic_Reg__verbose']

In [54]:
logreg_parameters = {
    'Logistic_Reg__penalty':['l1','l2'],
    'Logistic_Reg__C':np.logspace(-5,1,50),
    'Logistic_Reg__solver':['liblinear']
}

In [55]:
gs = grid_search.GridSearchCV(grid_search_pipe, param_grid=logreg_parameters, cv=cv, verbose=1, n_jobs=-1)

In [56]:
gs.fit(Xt, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  49 tasks       | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 199 tasks       | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 449 tasks       | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    5.7s finished


GridSearchCV(cv=sklearn.cross_validation.KFold(n=891, n_folds=5, shuffle=True, random_state=None),
       error_score='raise',
       estimator=Pipeline(steps=[('Logistic_Reg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'Logistic_Reg__solver': ['liblinear'], 'Logistic_Reg__penalty': ['l1', 'l2'], 'Logistic_Reg__C': array([  1.00000e-05,   1.32571e-05,   1.75751e-05,   2.32995e-05,
         3.08884e-05,   4.09492e-05,   5.42868e-05,   7.19686e-05,
         9.54095e-05,   1.26486e-04,   1.67683e-04,   2.2...    2.44205e+00,   3.23746e+00,   4.29193e+00,   5.68987e+00,
         7.54312e+00,   1.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [57]:
gs.best_score_

0.84399551066217737

In [58]:
gs.best_estimator_

Pipeline(steps=[('Logistic_Reg', LogisticRegression(C=7.5431200633546069, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [59]:
gs.best_params_

{'Logistic_Reg__C': 7.5431200633546069,
 'Logistic_Reg__penalty': 'l2',
 'Logistic_Reg__solver': 'liblinear'}

### Nice!

#### 2. Visualize the Data

## Part 3: Data Wrangling

#### 1. Create Dummy Variables for *Sex* 

## Part 4: Logistic Regression and Model Validation

#### 1. Define the variables that we will use in our classification analysis

#### 2. Transform "Y" into a 1-Dimensional Array for SciKit-Learn

#### 3. Conduct the logistic regression

#### 4. Examine the coefficients to see our correlations

#### 6. Test the Model by introducing a *Test* or *Validaton* set 

#### 7. Predict the class labels for the *Test* set

#### 8. Predict the class probabilities for the *Test* set

#### 9. Evaluate the *Test* set

#### 10. Cross validate the test set

#### 11. Check the Classification Report

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

def print_cm_cr(y_true, y_pred, names):
    """prints the confusion matrix and the classification report"""
    cm = confusion_matrix(y_true, y_pred)
    cols = ['pred_' + c for c in names]
    dfcm = pd.DataFrame(cm, columns = cols, index = names)
    print dfcm
    print
    print classification_report(y_true, y_pred)

In [None]:
from sklearn.cross_validation import train_test_split

In [None]:
def do_cm_cr(X, y, names):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33, random_state=42)
    
    fit_model = union.fit(X_train,y_train)
    X_train_t = fit_model.transform(X_train)
    
    cv = cross_validation.KFold(len(y_train), n_folds=5, shuffle=True)
    gs = grid_search.GridSearchCV(grid_search_pipe, param_grid=logreg_parameters, cv=cv, verbose=1, n_jobs=-1)
    
    gs.fit(X_train_t, y_train)
    
    X_test_t = fit_model.transform(X_test)
    y_pred = gs.predict(X_test_t)
    
    print_cm_cr(y_test, y_pred, names)



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33, random_state=42)
    
fit_model = SibSp_pipe.fit(X_train,y_train)
X_train_t = fit_model.transform(X_train)

In [None]:
X

In [None]:
X_train_t

In [None]:
testdf = pd.DataFrame(X_train_t)

In [None]:
testdf

In [None]:
X_test

In [None]:
X_test['SibSp'].value_counts()

In [None]:
X_test_t = fit_model.transform(X_test)

In [None]:
do_cm_cr(X, y, ['survived', 'dead'])

#### 12. What do the classification metrics tell us?

#### 13. Check the Confusion Matrix

#### 14. What does the Confusion Matrix tell us? 

#### 15. Plot the ROC curve

#### 16. What does the ROC curve tell us?

## Part 5: Gridsearch

#### 1. Use GridSearchCV with logistic regression to search for optimal parameters 

- Use the provided parameter grid. Feel free to add if you like (such as n_jobs).
- Use 5-fold cross-validation.

In [None]:
logreg_parameters = {
    'penalty':['l1','l2'],
    'C':np.logspace(-5,1,50),
    'solver':['liblinear']
}

#### 2. Print out the best parameters and best score. Are they better than the vanilla logistic regression?

#### 3. Explain the difference between the difference between the L1 (Lasso) and L2 (Ridge) penalties on the model coefficients.

#### 4. What hypothetical situations are the Ridge and Lasso penalties useful?

#### 5. [BONUS] Explain how the regularization strength (C) modifies the regression loss function. Why do the Ridge and Lasso penalties have their respective effects on the coefficients?

#### 6.a. [BONUS] You decide that you want to minimize false positives. Use the predicted probabilities from the model to set your threshold for labeling the positive class to need at least 90% confidence. How and why does this affect your confusion matrix?

## Part 6: Gridsearch and kNN

#### 1. Perform Gridsearch for the same classification problem as above, but use KNeighborsClassifier as your estimator

At least have number of neighbors and weights in your parameters dictionary.

#### 2. Print the best parameters and score for the gridsearched kNN model. How does it compare to the logistic regression model?

#### 3. How does the number of neighbors affect the bias-variance tradeoff of your model?

#### [BONUS] Why?

#### 4. In what hypothetical scenario(s) might you prefer logistic regression over kNN, aside from model performance metrics?

#### 5. Fit a new kNN model with the optimal parameters found in gridsearch. 

#### 6. Construct the confusion matrix for the optimal kNN model. Is it different from the logistic regression model? If so, how?

#### 7. [BONUS] Plot the ROC curves for the optimized logistic regression model and the optimized kNN model on the same plot.

## Part 7: [BONUS] Precision-recall

#### 1. Gridsearch the same parameters for logistic regression but change the scoring function to 'average_precision'

`'average_precision'` will optimize parameters for area under the precision-recall curve instead of for accuracy.

#### 2. Examine the best parameters and score. Are they different than the logistic regression gridsearch in part 5?

#### 3. Create the confusion matrix. Is it different than when you optimized for the accuracy? If so, why would this be?

#### 4. Plot the precision-recall curve. What does this tell us as opposed to the ROC curve?

[See the sklearn plotting example here.](http://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html)

## Part 8: [VERY BONUS] Decision trees, ensembles, bagging

#### 1. Gridsearch a decision tree classifier model on the data, searching for optimal depth. Create a new decision tree model with the optimal parameters.

#### 2. Compare the performace of the decision tree model to the logistic regression and kNN models.

#### 3. Plot all three optimized models' ROC curves on the same plot. 

#### 4. Use sklearn's BaggingClassifier with the base estimator your optimized decision tree model. How does the performance compare to the single decision tree classifier?

#### 5. Gridsearch the optimal n_estimators, max_samples, and max_features for the bagging classifier.

#### 6. Create a bagging classifier model with the optimal parameters and compare it's performance to the other two models.