# Data Science

## Making the training and test data sets

In [None]:
import numpy as nm
import pandas as pd
import os

In [None]:
# Load the labelled training data and the unlabelled test data from CSV files
# in the working directory. Replace the 'Enter index column' placeholder with
# the name of the identifier column; both frames use it as their index.
train_df=pd.read_csv('train.csv', index_col='Enter index column')
test_df=pd.read_csv('test.csv', index_col='Enter index column')

In [None]:
# Feature matrix: every column from the named start column to the last column,
# converted to a float NumPy array. `.values` is used instead of `as_matrix()`,
# which was deprecated in pandas 0.23 and removed in pandas 1.0.
X = train_df.loc[:, 'Replace by columns name from where to start except the answer column':].values.astype('float')

# Target vector: the answer column flattened to a 1-D array.
y = train_df['Replace by answer column'].values.ravel()

# Sanity check: X and y must have the same number of rows.
# A single-argument print(...) produces the same output on Python 2 and 3.
print('{0} {1}'.format(X.shape, y.shape))

## Splitting of datasets into train and test for input and output

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation. A fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Report the shapes of both splits. The single-argument print(...) form runs
# on Python 2 and Python 3 with identical output.
print('{0} {1}'.format(X_train.shape, y_train.shape))
print('{0} {1}'.format(X_test.shape, y_test.shape))

In [None]:
# Print the mean of the target in the train and test splits.
#
# Assuming a 0/1 target (e.g. survived / did not survive), the mean is the
# share of positive cases. The two values should be close to each other, which
# indicates that the train and test splits are similar in nature.
print('Mean of result in train : {0:.3f}'.format(nm.mean(y_train)))
print('Mean of result in test : {0:.3f}'.format(nm.mean(y_test)))

## Making a baseline model for comparison purposes

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline model: with strategy='most_frequent' it always predicts the class
# that occurs most often in the training labels, ignoring the features.
model_dummy= DummyClassifier(strategy='most_frequent', random_state=0)

# Fit the baseline on the training split (features and labels).
model_dummy.fit(X_train,y_train)

In [None]:
# score() predicts labels for X_test with the baseline model and compares them
# with the actual labels in y_test, returning the share of correct predictions.
# A value of 0.61, for example, means the baseline is right 61% of the time
# without learning anything from the features; real models should beat this.
# print(...) with a single argument works on both Python 2 and Python 3.
print('Score for baseline model : {0: .2f}'.format(model_dummy.score(X_test, y_test)))

## Now computing performance metrics so that we can compare the baseline scores with other algorithms such as logistic regression


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

# all of these are different kinds of performance metrics

In [None]:
# Accuracy of the baseline model.
# accuracy_score receives the actual labels (y_test) and the labels predicted
# by the baseline for X_test, and returns the fraction that match.
# print(...) with a single argument works on both Python 2 and Python 3.
print('accuracy for baseline model : {0: .2f}'.format(accuracy_score(y_test, model_dummy.predict(X_test))))

In [None]:
# Confusion matrix of the baseline model. All metric functions used here take
# the same two arguments: actual labels first, predicted labels second.
# print(...) with a single argument works on both Python 2 and Python 3.
print('confusion matrix for baseline model : \n {0}'.format(confusion_matrix(y_test, model_dummy.predict(X_test))))

In [None]:
# Precision and recall of the baseline model.
# If the baseline never predicts the positive class, precision is undefined
# and scikit-learn emits a warning while reporting 0; that warning is expected
# here and both scores come out as zero.
# print(...) with a single argument works on both Python 2 and Python 3.
print('precision score for baseline model : \n {0}'.format(precision_score(y_test, model_dummy.predict(X_test))))
print('recall score for baseline model : \n {0}'.format(recall_score(y_test, model_dummy.predict(X_test))))

## Storing the output, i.e. the predicted values, in a CSV file (for submission or just for reference)

In [None]:
# test_df is the data frame for which we do not have answers. Convert it to a
# float NumPy array so the fitted model can predict on it.
# `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
test_X = test_df.values.astype('float')

In [None]:
# Predict labels for the unlabelled test features with the baseline (dummy)
# model; `predictions` holds one predicted value per row of test_X.

predictions=model_dummy.predict(test_X)

In [None]:
# Build the data frame that will be saved / submitted: one column with the
# index values of test_df (e.g. a passenger ID) and one column with the
# predicted value for that row. Replace both placeholder column names.
#
# Remember these predictions come from the baseline model, so they can be wrong.

df_submissions=pd.DataFrame({'Enter the Index': test_df.index,'Enter the name of the answers column': predictions})

In [None]:
# Show the first rows of the data frame of predicted values created above.
df_submissions.head()

In [None]:
# Write the predictions to 01_dummy.csv.
# index=False prevents pandas from writing the data frame's own row index as
# an extra column. The path is relative, so after running this line the file
# appears in the current working directory.
df_submissions.to_csv('01_dummy.csv', index=False)

# Logistic Regression Model


In [None]:
from sklearn.linear_model import LogisticRegression

# Create a logistic regression model object with default hyperparameters;
# random_state=0 keeps runs reproducible.
model_lr_1=LogisticRegression(random_state=0)

In [None]:
# Train the model on the training split.
# In a notebook the fitted estimator is echoed after this cell, listing its
# hyperparameters (C, penalty, ...). These control regularization and can be
# tuned for better results; they are explored in the grid search section below.

model_lr_1.fit(X_train, y_train)

In [None]:
# Score the logistic regression model on the held-out test split.
# A score higher than the baseline model's shows that logistic regression is
# at least better than always predicting the most frequent class.
# print(...) with a single argument works on both Python 2 and Python 3.
print('score for logistic regression v 1 : {0:.2f}'.format(model_lr_1.score(X_test, y_test)))

In [None]:
# Same performance metrics as for the baseline, now for logistic regression,
# so the two models can be compared directly (actual labels first, predicted
# labels second). print(...) with a single argument works on Python 2 and 3.
print('accuracy for LR model : {0: .2f}'.format(accuracy_score(y_test, model_lr_1.predict(X_test))))
print('confusion matrix for LR model : \n {0}'.format(confusion_matrix(y_test, model_lr_1.predict(X_test))))
print('precision score for LR model : \n {0}'.format(precision_score(y_test, model_lr_1.predict(X_test))))
print('recall score for LR model : \n {0}'.format(recall_score(y_test, model_lr_1.predict(X_test))))

# Compare these values with the baseline cells above; in the author's run
# every metric improved over the baseline model.

### Grid Search (Hyperparameter Optimization technique)

In [None]:
# Base estimator for the grid search. The parameter grid used with this model
# tries both 'l1' and 'l2' penalties, so the solver is pinned to 'liblinear',
# which supports both. It was the default solver in older scikit-learn
# releases; the newer default ('lbfgs') rejects the 'l1' penalty.
model_lr = LogisticRegression(random_state=0, solver='liblinear')

# GridSearchCV performs the hyperparameter optimization.
from sklearn.model_selection import GridSearchCV

In [None]:
# Parameter dictionary to try during the grid search: several values for the
# regularization parameter C and both the 'l1' and 'l2' penalties. Every
# combination of the listed values is evaluated.
# NOTE(review): 'l1' is only supported by some solvers (e.g. liblinear);
# confirm that model_lr uses a solver that accepts both penalties.
parameters = {'C':[1.0,10.0,50,100,1000], 'penalty' : ['l1','l2']}

# clf is the grid search object:
#   - first argument: the estimator whose hyperparameters are optimized
#   - param_grid: the parameter values to try
#   - cv=3: evaluate each combination with 3-fold cross validation
clf=GridSearchCV(model_lr,param_grid=parameters, cv=3)

In [None]:
# Run the grid search on the training data.
# The estimator was already given to the grid search object when it was
# created, so it does not have to be named again here; only the training
# features and labels are passed (the test split is not used for the search).
clf.fit(X_train, y_train)

In [None]:
# best_params_ is an attribute of the fitted grid search holding the parameter
# combination that scored best during cross validation (in the author's run:
# C=1 with the 'l1' penalty).
clf.best_params_

In [None]:
# best_score_ is the mean cross-validated score of the best parameter
# combination. In the author's run there was no significant difference from
# the untuned model; more advanced algorithms may show larger improvements.
# print(...) with a single argument works on both Python 2 and Python 3.
print('best score :{0:.2f}'.format(clf.best_score_))

In [None]:
# Evaluate the tuned model on the held-out test split.
# print(...) with a single argument works on both Python 2 and Python 3.
print('score for Logistic Regression version : {0:.2f}'.format(clf.score(X_test, y_test)))

## Feature Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# Feature normalization with a MinMaxScaler.

scaler=MinMaxScaler()

# fit_transform does two things in one call: it fits the scaler on X_train
# (learning each feature's range) and returns the scaled training features.
X_train_scaled=scaler.fit_transform(X_train)

In [None]:
# Minimum and maximum of the first scaled feature column, to check the result
# of the normalization.
X_train_scaled[:,0].min(),X_train_scaled[:,0].max()

In [None]:
# Normalize the test features with the scaler fitted on the training data
# (transform only; the scaler is not re-fitted on the test split).
X_test_scaled=scaler.transform(X_test)

## Feature Standardization

In [None]:
# Feature standardization with a StandardScaler. This rebinds `scaler`,
# `X_train_scaled` and `X_test_scaled`, replacing the min-max versions above.
scaler=StandardScaler()

# Fit on the training features only, then apply the same transformation to
# the test features.
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [None]:
# Grid search for logistic regression on the standardized features.
# solver='liblinear' is set explicitly because the grid tries the 'l1' penalty,
# which the default solver of newer scikit-learn releases does not support.
model_lr = LogisticRegression(random_state=0, solver='liblinear')
parameters = {'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']}

clf = GridSearchCV(model_lr, param_grid=parameters, cv=3)

# Fit on the *scaled* training features: this model is later scored on
# X_test_scaled, so it must be trained in the same feature space. Fitting on
# the raw X_train would make that evaluation meaningless.
clf.fit(X_train_scaled, y_train)

In [None]:
# Best mean cross-validated score found by the grid search fitted above.
clf.best_score_

In [2]:
# Evaluate the grid-searched model on the standardized test features.
# print(...) with a single argument works on both Python 2 and Python 3.
print('score for logistic regression - v2 : {0:.2f}'.format(clf.score(X_test_scaled, y_test)))

# Compare this score with the unscaled logistic regression score above. In
# the author's run standardization gave no improvement; it is still worth
# trying to see whether it helps on a given data set.
# NOTE(review): this score is only meaningful if clf was fitted on the
# standardized training features (X_train_scaled); confirm the fit cell.

## Model persistence

### Our work on logistic regression is done. We now save the trained model to disk so that it can be reused directly without retraining; this is called model persistence

In [None]:
import pickle

In [None]:
# Create two files and open them for writing in binary mode ('wb').
# Besides the model, the fitted scaler is saved as well (scaler_model.pkl) so
# the standardization created in the section above can be applied again later.
# NOTE: these handles stay open until the close() calls in a later cell.

model_file_pickle=open('lr_model.pkl','wb')
scaler_file_pickle=open('scaler_model.pkl','wb')

In [None]:
# pickle.dump writes an object to an open file: the first argument is the
# object to persist, the second the file object to write it to.
#
# clf is the fitted grid search object, created as
#   clf=GridSearchCV(model_lr,param_grid=parameters, cv=3)
# After fitting it can be used as the tuned logistic regression model, because
# its score/predict calls use the estimator with the best parameters found.
# scaler is the fitted scaler object.
pickle.dump(clf, model_file_pickle)
pickle.dump(scaler, scaler_file_pickle)

In [None]:
# Close both files so the pickled data is flushed to disk.
model_file_pickle.close()
scaler_file_pickle.close()

### Loading the persistent file


In [None]:
# Load the persisted model and scaler back from disk.
#
# The files are opened in binary read mode ('rb'): pickle data is bytes, and
# text mode ('r') fails on Python 3 and can corrupt the data on Windows.
# The `with` blocks close each file even if loading raises.
# NOTE: only unpickle files you created yourself; loading a pickle from an
# untrusted source can execute arbitrary code.
with open('lr_model.pkl', 'rb') as model_file_pickle:
    clf_loaded = pickle.load(model_file_pickle)

with open('scaler_model.pkl', 'rb') as scaler_file_pickle:
    scaler_loaded = pickle.load(scaler_file_pickle)

In [None]:
# Display the model object restored from lr_model.pkl.
clf_loaded

In [None]:
# Display the scaler object restored from scaler_model.pkl.
scaler_loaded

In [None]:
# Transform the test features using the scaler restored from disk.
X_test_scaled = scaler_loaded.transform(X_test)

# Score the restored model on the scaled and on the raw test features.
# print(...) with a single argument works on both Python 2 and Python 3.
print('score for persisited logistc regression : {0:.2f}'.format(clf_loaded.score(X_test_scaled, y_test)))
print('score for persisited logistc regression non scaled: {0:.2f}'.format(clf_loaded.score(X_test, y_test)))

# NOTE(review): which of the two scores is meaningful depends on whether the
# persisted model was fitted on scaled or on raw features; the input must be
# in the same feature space the model was trained on.
#
# The whole fitted model was saved, so no retraining is needed: only test
# data is passed here, no training data. This is a persisted model, one that
# has already learnt and can be used for scoring straight after loading.