In [1]:
import os
import requests
from requests import session
import pandas as pd
import numpy as np
%matplotlib inline

## Import data

In [45]:
# Load the processed train and test sets, indexed by passenger id.
train_df = pd.read_csv(
    '../data/processed/train_data.csv',
    index_col='PassengerId',
)
test_df = pd.read_csv(
    '../data/processed/test_data.csv',
    index_col='PassengerId',
)

## Data preparation

In [46]:
# Feature matrix: every column from 'Age' onward, cast to float.
# `.values` replaces `DataFrame.as_matrix()`, which is deprecated and was
# removed in pandas 1.0; `.values` is available in old and new pandas alike.
X = train_df.loc[:, 'Age':].values.astype('float')
# Target vector as a flat 1-D NumPy array.
y = train_df['Survived'].values.ravel()

  """Entry point for launching an IPython kernel.


In [28]:
# Sanity-check the shapes of the feature matrix and the target vector.
# Formatting into a single string prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print('{} {}'.format(X.shape, y.shape))

(891L, 32L) (891L,)


In [47]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Single-string formatting prints identically on Python 2 and Python 3
# (the bare `print a, b` statement is a SyntaxError on Python 3).
print('{} {}'.format(X_train.shape, y_train.shape))
print('{} {}'.format(X_test.shape, y_test.shape))

(712L, 32L) (712L,)
(179L, 32L) (179L,)


In [32]:
# Mean of the binary target in each split, i.e. the share of positive labels.
# Formatted as one string so the cell runs on both Python 2 and Python 3.
print('{} {}'.format(y_train.mean(), y_test.mean()))

0.38342696629213485 0.3854748603351955


## Baseline model

In [5]:
from sklearn.dummy import DummyClassifier

In [6]:
# Baseline model using the 'most_frequent' strategy.
model_dummy = DummyClassifier(
    strategy='most_frequent',
    random_state=0,
)

In [7]:
# Fit the baseline on the training split.
model_dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [8]:
# Performance metric: the baseline's score on the held-out split.
model_dummy.score(X_test, y_test)

0.6145251396648045

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [10]:
# Accuracy of the baseline's predictions on the held-out split.
accuracy_score(y_true=y_test, y_pred=model_dummy.predict(X_test))

0.6145251396648045

In [46]:
# Confusion matrix of the baseline's predictions on the held-out split.
confusion_matrix(y_true=y_test, y_pred=model_dummy.predict(X_test))

array([[110,   0],
       [ 69,   0]], dtype=int64)

## First Kaggle submission

In [11]:
# converting to the matrix


  


In [13]:
# get predictions


In [38]:
def get_submission_file(model, filename):
    """Write a submission CSV with ``model``'s predictions for the test set.

    Reads the notebook-level ``test_df`` DataFrame, predicts a label for each
    of its rows, and saves a two-column CSV (``PassengerId``, ``Survived``).

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    filename : str
        Name of the CSV file to create; it is appended to ``'../data/external/'``.

    Returns
    -------
    None
        The result is the file written to disk.
    """
    # `.values` replaces `DataFrame.as_matrix()`, which is deprecated and was
    # removed in pandas 1.0.
    # NOTE(review): every column of test_df is fed to the model; this assumes
    # its columns match the features the model was trained on - confirm.
    test_X = test_df.values.astype('float')
    predictions = model.predict(test_X)
    df_submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})
    # index=False: PassengerId is already written as a regular column.
    df_submission.to_csv('../data/external/' + filename, index=False)

In [21]:
get_submission_file(model_dummy,'01_submission.csv')

  


## Logistic Regression

In [50]:
#import function
from sklearn.linear_model import LogisticRegression

In [25]:
# Create the model.
# The solver is pinned to 'liblinear', the solver shown in the fitted model's
# recorded repr. Newer scikit-learn releases default to 'lbfgs', so relying on
# the default would make the scores and coefficients depend on the version.
model_lr_1 = LogisticRegression(random_state=0, solver='liblinear')

In [26]:
# Fit the logistic regression on the training split.
model_lr_1.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
model_lr_1.score(X_test,y_test)

0.8324022346368715

In [30]:
accuracy_score(y_test, model_lr_1.predict(X_test))


0.8324022346368715

In [31]:
confusion_matrix(y_test, model_lr_1.predict(X_test))

array([[95, 15],
       [15, 54]], dtype=int64)

In [34]:
precision_score(y_test, model_lr_1.predict(X_test))

0.782608695652174

In [36]:
model_lr_1.coef_

array([[-0.03147953,  0.00431171, -0.4505999 ,  0.        , -0.85367437,
         0.12880321, -0.1574353 , -0.41494899,  0.48948549,  1.10379061,
         0.37731386, -0.1100642 , -0.32267072,  0.95024469,  0.48958562,
        -0.34555635,  0.25340044,  1.2590464 ,  0.48939953, -1.48672669,
         1.20514836, -0.13370371, -0.49229037,  0.14594108,  0.24652032,
         0.28989946,  0.41191309,  0.50945462,  0.44620851,  0.13861084,
         0.55142075,  0.54285321]])

In [48]:
get_submission_file(model_lr_1,'02_submission.csv')


  


## Fine tuning predictive model

In [52]:
# Base estimator for the hyper-parameter search.
# The solver is pinned to 'liblinear' because the search grid includes the
# 'l1' penalty: 'liblinear' supports both 'l1' and 'l2', whereas the default
# solver in newer scikit-learn ('lbfgs') rejects 'l1'.
model_lr = LogisticRegression(random_state=0, solver='liblinear')


In [53]:
from sklearn.model_selection import GridSearchCV

In [87]:
# Search grid: regularisation parameter C crossed with the penalty type.
parameters = {
    'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
    'penalty': ['l1', 'l2'],
}
# Grid search over the logistic-regression estimator with cv=3.
clf = GridSearchCV(estimator=model_lr, param_grid=parameters, cv=3)

In [95]:
clf.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1.0, 10.0, 50.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [96]:
clf.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [97]:
clf.best_score_

0.8328651685393258

In [98]:
clf.score(X_test,y_test)


0.8268156424581006

In [99]:
get_submission_file(clf,'03_submission.csv')

  


## Saving model

In [100]:
import pickle

In [101]:
# Persist the tuned grid-search object for later reuse.
# The path is assembled with os.path.join so it resolves to the models
# directory on any OS; a backslash-separated literal only forms a directory
# path on Windows and relies on unrecognised escape sequences.
model_file_path = os.path.join('..', 'models', 'lr_model.pl')
# The context manager closes the file even if pickling raises. The name
# `model_file_pickle` stays bound afterwards, so a later `.close()` call on
# it is a harmless no-op.
with open(model_file_path, 'wb') as model_file_pickle:
    pickle.dump(clf, model_file_pickle)

In [102]:
model_file_pickle.close()