#### The scikit-learn (`sklearn`) package is used for model building and evaluation

In [1]:
import sklearn as sk
# Count the names exposed by the top-level sklearn namespace (quick sanity check that the package imports).
len(dir(sk))

29

In [2]:
## a few commonly used classes and functions from the sklearn package
from sklearn.linear_model import RidgeClassifierCV,RidgeCV, LinearRegression, LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import cross_val_score,KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.svm import SVC,SVR
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.metrics import accuracy_score,auc, f1_score,confusion_matrix
from sklearn.impute import SimpleImputer

In [3]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning for cleaner cell output.
# NOTE(review): this also hides pandas deprecation warnings and scikit-learn
# convergence warnings, so real problems in later cells may go unnoticed.
warnings.filterwarnings('ignore')

In [4]:
#### pre-processing data (training set)
df = pd.read_csv('train.csv')

# Encode 'Sex' as a binary indicator: male -> 0, female -> 1
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Fill NaN in 'Embarked' with 'S' (the level chosen as most frequent).
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and leaves `df`
# unchanged when pandas Copy-on-Write is active.
df['Embarked'] = df['Embarked'].fillna('S')

# Fill missing 'Age' with the mean of the observed ages
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Drop free-text / high-cardinality columns that are not used as features
df = df.drop(columns=['Name', 'Cabin', 'Ticket'])

# log(1 + x) transform of 'Fare' to reduce skewness
df['Fare'] = np.log1p(df['Fare'])

# One-hot encode 'Embarked'. The column is named explicitly so no other column
# can be encoded by accident, and dtype=int keeps the indicators as 0/1
# integers regardless of the pandas version's default dummy dtype.
df2 = pd.get_dummies(df, columns=['Embarked'], dtype=int)
df2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,0,22.0,1,0,2.110213,0,0,1
1,2,1,1,1,38.0,1,0,4.280593,1,0,0
2,3,1,3,1,26.0,0,0,2.188856,0,0,1
3,4,1,1,1,35.0,1,0,3.990834,0,0,1
4,5,0,3,0,35.0,0,0,2.202765,0,0,1


In [5]:
# (rows, columns) of the encoded training frame
df2.shape

(891, 11)

In [6]:
#### pre-processing data (test set) -- mirrors the training-set steps
df_test = pd.read_csv('test.csv')

#### drop Name, Cabin, Ticket
df_test = df_test.drop(columns=['Name', 'Cabin', 'Ticket'])

# Encode 'Sex' with the same mapping used for the training set
df_test['Sex'] = df_test['Sex'].map({'male': 0, 'female': 1})

# Impute missing 'Age' with the training-set mean age
df_test['Age'] = df_test['Age'].fillna(df['Age'].mean())

# Transform 'Fare' to reduce skewness, then impute what is still missing.
# `df['Fare']` was already log1p-transformed in the training cell, so its mean
# is on the same scale as the transformed test fares. (The previous code
# filled missing fares with the mean *Age*, which was a copy-paste bug.)
df_test['Fare'] = np.log1p(df_test['Fare']).fillna(df['Fare'].mean())

# One-hot encode 'Embarked' with 0/1 integer indicators
df_test = pd.get_dummies(df_test, columns=['Embarked'], dtype=int)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,892,3,0,34.5,0,0,2.178064,0,1,0
1,893,3,1,47.0,1,0,2.079442,0,0,1
2,894,2,0,62.0,0,0,2.369075,0,1,0
3,895,3,0,27.0,0,0,2.268252,0,0,1
4,896,3,1,22.0,1,1,2.586824,0,0,1


In [7]:
# (rows, columns) of the encoded test frame
df_test.shape

(418, 10)

In [8]:
#### separate the model inputs (features) from the target column

feature_cols = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]
X = df2[feature_cols]
y = df2['Survived']


In [9]:
### scaling: - crucial for linear models
# The scaler is fitted on the training features only; the same fitted
# statistics are then applied to both the training and the test features.

scale = StandardScaler().fit(X)
X_ = scale.transform(X)                       # scaled train features
X_test = scale.transform(df_test[X.columns])  # scaled test features, same column order as X

In [10]:
# X_test

In [11]:
# unscaled training features, for comparison with the scaled array
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,0,22.0,1,0,2.110213,0,0,1
1,1,1,38.0,1,0,4.280593,1,0,0
2,3,1,26.0,0,0,2.188856,0,0,1
3,1,1,35.0,1,0,3.990834,0,0,1
4,3,0,35.0,0,0,2.202765,0,0,1


In [12]:
# scaled training features (NumPy array returned by scale.transform)
X_

array([[ 0.82737724, -0.73769513, -0.5924806 , ..., -0.48204268,
        -0.30756234,  0.61583843],
       [-1.56610693,  1.35557354,  0.63878901, ...,  2.0745051 ,
        -0.30756234, -1.62380254],
       [ 0.82737724,  1.35557354, -0.2846632 , ..., -0.48204268,
        -0.30756234,  0.61583843],
       ...,
       [ 0.82737724,  1.35557354,  0.        , ..., -0.48204268,
        -0.30756234,  0.61583843],
       [-1.56610693, -0.73769513, -0.2846632 , ...,  2.0745051 ,
        -0.30756234, -1.62380254],
       [ 0.82737724, -0.73769513,  0.17706291, ..., -0.48204268,
         3.25137334, -1.62380254]])

In [13]:
#### base model: logistic regression with default settings on the scaled features

model_cl = LogisticRegression().fit(X_, y)  # training
predict_train = model_cl.predict(X_)        # predictions on the same rows used for training

In [14]:
# one prediction per training row is expected
print(X_.shape, predict_train.shape, sep='\n')

(891, 9)
(891,)


In [15]:
### model evaluation
# Accuracy on the training data itself (the model was fitted on these rows)
accuracy_score(y_true=y, y_pred=predict_train)

0.7968574635241302

In [16]:
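# auc(y, predict_train)  # NOTE: sklearn.metrics.auc integrates generic (x, y) curve points; for ROC AUC use roc_auc_score with predicted probabilities instead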

In [18]:
### prediction on the test.csv
# NOTE: despite its name, `y_test` holds the model's *predicted* labels for the
# scaled test features, not ground-truth labels.

y_test = model_cl.predict(X_test)
# y_test  (uncomment to display the predicted labels)

In [19]:
# one prediction per test row is expected
print(X_test.shape, y_test.shape, sep='\n')

(418, 9)
(418,)


In [20]:
### attach the predicted labels to df_test as a new 'Survived' column
df_test = df_test.assign(Survived=y_test)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Survived
0,892,3,0,34.5,0,0,2.178064,0,1,0,0
1,893,3,1,47.0,1,0,2.079442,0,0,1,0
2,894,2,0,62.0,0,0,2.369075,0,1,0,0
3,895,3,0,27.0,0,0,2.268252,0,0,1,0
4,896,3,1,22.0,1,1,2.586824,0,0,1,1


In [21]:
### write the passenger ids and predicted labels to a csv file (row index omitted)
submission = df_test[['PassengerId', 'Survived']]
submission.to_csv('submission.csv', index=False)

In [22]:
# 5-fold cross-validated accuracy of the base model on the scaled features.
# NOTE(review): X_ was scaled with statistics from the full training set, so
# each validation fold has influenced the scaler -- consider a Pipeline.
scores = cross_val_score(model_cl, X_, y, scoring='accuracy', cv=5)

# per-fold scores, then their mean and standard deviation
for summary in (scores, np.mean(scores), np.std(scores)):
    print(summary)

[0.78212291 0.7877095  0.76966292 0.76966292 0.81920904]
0.7856734568958578
0.018188247191924134


In [23]:
#### base model without scaling features
model_cl_unscaled = LogisticRegression().fit(X, y)  # training on the raw features
predict_train_unscaled = model_cl_unscaled.predict(X)
print(accuracy_score(y, predict_train_unscaled))

### note: training accuracy is only marginally lower without scaling (0.7957 vs 0.7969)

0.7957351290684624
