# 3. Statistical analysis

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image, display
%matplotlib inline

In [27]:
# Load the preprocessed train and test splits produced by the earlier notebooks.
train, test = (
    pd.read_csv('../data/Preprocessed/train.csv'),
    pd.read_csv('../data/Preprocessed/test.csv'),
)

In [28]:
# List the available columns before choosing the features used for modelling.
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Cabin_First',
       'FamilyCount', 'title'],
      dtype='object')

In [46]:
# Restrict the training frame to the columns used in this analysis and, in the
# same step, discard every row that has a missing value in any of them.
trainML = train[[
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
    'Fare', 'Embarked', 'FamilyCount', 'Cabin_First', 'title',
]].dropna()

In [47]:
# Sanity check: after dropna() every column should report zero missing values.
trainML.isna().sum()


Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
FamilyCount    0
Cabin_First    0
title          0
dtype: int64

## 3.1 Starting regressions


In [51]:
from sklearn.linear_model import LogisticRegression
# A single estimator instance with default settings. The cells below call
# log_reg.fit(...) again on their own features, so log_reg always holds the
# model fitted by whichever of those cells was executed last.
log_reg = LogisticRegression()

### 3.1.1 Logistic Regression: Survival on age

In [52]:
# Single-feature model: Age is the only predictor (kept 2-D via the double brackets).
X_Age = trainML[['Age']].to_numpy()
y = trainML['Survived'].to_numpy()
# fit() returns the estimator itself, so training and predicting can be chained.
y_predict = log_reg.fit(X_Age, y).predict(X_Age)
# Share of correct predictions on the same rows the model was trained on.
np.mean(y == y_predict)

0.681592039800995

### 3.1.2 Logistic regression: Survival on sex / Pclass

In [53]:
# One-hot encode Sex; only this column is used as a predictor in this cell.
X_sex = pd.get_dummies(trainML['Sex']).to_numpy()
y = trainML['Survived'].to_numpy()
# fit() returns the estimator itself, so training and predicting can be chained.
y_predict = log_reg.fit(X_sex, y).predict(X_sex)
# Share of correct predictions on the same rows the model was trained on.
np.mean(y == y_predict)

0.746268656716418

### 3.1.3 Random forest (on numerical variables)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Predictors: the numerical columns selected for this baseline.
X = trainML[['Age', 'SibSp', 'Parch',
       'Fare', 'FamilyCount']].values
y = trainML['Survived'].values
# A random forest is a stochastic estimator; fixing random_state makes the
# fitted model, and therefore the accuracy shown below, reproducible on re-run.
RF = RandomForestClassifier(random_state=0)
RF.fit(X, y)
# Predict on the training rows themselves: the figure below is in-sample
# accuracy, not an estimate of performance on unseen passengers.
y_predict = RF.predict(X)
(y == y_predict).mean()


## 3.2 Logistic regression
It is useful for predicting a binary output variable. To perform it we use the logistic function (the inverse of the logit, which is the canonical link function for a binary response in the GLM framework), and the coefficients are estimated by maximizing the log-likelihood. In a nutshell, this means that we do not model the response Y directly; instead we model the probability that Y belongs to a particular category given the input.
From a computational perspective, recall that finding the maximum of the likelihood requires computing the derivative of the log-likelihood. This derivative is no longer linear in $\beta$, which means that the problem does not have a closed-form solution. We therefore need an iterative algorithm:
- Newton-Raphson: at every step we have to compute the inverse of $(X^T W(t) X)$, where $W(t)$ changes at every step, so it is computationally expensive
- Majorize-Minimize: minimize the loss function by approximating it with a simpler one


In [34]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [42]:
# Note: train_ML (built from the full `train` frame, no dropna) is a different
# object from the earlier trainML; only the numerical predictors and the target are kept.
train_ML = train.loc[:, ['Age', 'SibSp', 'Parch', 'Fare', 'FamilyCount', 'Survived']]
# Separate the target column from the predictor matrix.
y = train_ML['Survived']
X = train_ML.drop(columns='Survived')

In [45]:
# Fit the multiple logistic regression on all predictors in X and predict the
# same rows back; the value displayed is therefore in-sample accuracy.
log_reg.fit(X, y)
y_pred = log_reg.predict(X)
(y == y_pred).mean()

0.6936026936026936

### 3.2.1 accuracy scores and confusion matrix on basic Multiple logistic regression
To actually evaluate the performance of our model we need to know the true outcomes; therefore, as in our case, we rely on a train/test split to check whether the model is actually good at predicting them. The following metrics will be useful for comparing the performance of different ML models.

In [55]:
# train_test_split was used without ever being imported, which raises a
# NameError on a fresh kernel; import it here, next to its first use.
from sklearn.model_selection import train_test_split

X = train_ML.drop('Survived', axis = 1)
y = train_ML['Survived']
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)

In [56]:
# Train on the training split only, then predict the held-out rows
# (fit() returns the estimator, so the two calls can be chained).
y_pred = log_reg.fit(X_train, y_train).predict(X_test)
y_pred

array([0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0])

In [57]:
# scikit-learn's signature is accuracy_score(y_true, y_pred). Accuracy is
# symmetric, so the value is unchanged, but the conventional order is used here.
accuracy_score(y_test, y_pred)

0.711864406779661

In [58]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted ones. The arguments were previously
# swapped, which displayed the transpose of the conventional matrix.
confusion_matrix(y_test, y_pred)

array([[178,   6],
       [ 79,  32]])

### Further addons, categorical variables
