## Importing the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer

# Switch matplotlib over to seaborn's default theme.
sns.set()

## Loading the raw data

In [2]:
# Read train.csv (path is relative to the notebook's working directory).
raw_data = pd.read_csv('train.csv')

# Preview the first rows.
raw_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Preprocessing

In [3]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
raw_data.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Pears, Mrs. Thomas (Edith Wearne)",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


### Dealing with missing values

In [4]:
# Missing-value count per column, followed by the total number of rows.
print(raw_data.isnull().sum(), len(raw_data))

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64 891


In [5]:
# Replace the missing ages with the mean of the observed ages.
# - `verbose` is no longer accepted by SimpleImputer in recent scikit-learn, so it is omitted.
# - The column is addressed by name instead of by position.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Work on a copy so `raw_data` keeps the values exactly as they were loaded.
raw_data_with_mean_age = raw_data.copy()
raw_data_with_mean_age['Age'] = np.ravel(imputer.fit_transform(raw_data[['Age']]))

In [6]:
# Re-count missing values per column after the Age imputation, then show the summary statistics again.
print(raw_data_with_mean_age.isnull().sum())
raw_data_with_mean_age.describe(include='all')

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,891.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Pears, Mrs. Thomas (Edith Wearne)",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,13.002015,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,22.0,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,29.699118,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,35.0,1.0,0.0,,31.0,,


In [7]:
# Remove the identifier and free-text columns, plus Cabin and Embarked.
# Cabin is left out for now because most of its values are missing.
preprocessed_data = raw_data_with_mean_age.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']
)
preprocessed_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [8]:
# One indicator column per value of Sex; the original text column is then removed.
processed_sexes = pd.get_dummies(preprocessed_data['Sex'])

preprocessed_data = (
    preprocessed_data
    .assign(Female=processed_sexes['female'], Male=processed_sexes['male'])
    .drop(columns=['Sex'])
)
preprocessed_data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Female,Male
0,0,3,22.0,1,0,7.25,0,1
1,1,1,38.0,1,0,71.2833,1,0
2,1,3,26.0,0,0,7.925,1,0
3,1,1,35.0,1,0,53.1,1,0
4,0,3,35.0,0,0,8.05,0,1


In [9]:
# Reverse the ordinal class coding: first class (1) becomes 3 and third class (3) becomes 1.
# NOTE(review): the column is overwritten in place, so running this cell a second
# time swaps the values back to the original coding.
preprocessed_data['Pclass'] = preprocessed_data['Pclass'].map({ 1: 3, 3: 1, 2: 2 })
preprocessed_data.head()
# Disabled experiment: keep only the rows whose Fare is below its 99th percentile.
#q = preprocessed_data['Fare'].quantile(0.99)
#preprocessed_data = preprocessed_data[preprocessed_data['Fare']<q]
#preprocessed_data.describe(include='all')

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Female,Male
0,0,1,22.0,1,0,7.25,0,1
1,1,3,38.0,1,0,71.2833,1,0
2,1,1,26.0,0,0,7.925,1,0
3,1,3,35.0,1,0,53.1,1,0
4,0,1,35.0,0,0,8.05,0,1


### Multicollinearity

In [10]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Columns checked against each other for multicollinearity.
variables = preprocessed_data[['Pclass', 'SibSp', 'Parch', 'Fare']]

# One variance inflation factor per selected column, labelled with the column name.
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(variables.values, column_position)
        for column_position in range(variables.shape[1])
    ],
    'features': variables.columns,
})
vif

Unnamed: 0,VIF,features
0,1.98516,Pclass
1,1.439724,SibSp
2,1.492792,Parch
3,2.016145,Fare


## Logistic Regression Model

### Declare the inputs and the targets

In [11]:
# Separate the predictor columns from the label column.
inputs = preprocessed_data.drop(columns=['Survived'])
targets = preprocessed_data['Survived']

### Scale the data

In [12]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling parameters from the model inputs.
# NOTE(review): the scaler is fitted on all rows, before the train/test split in a
# later cell, so rows that end up in the test split also influence the scaling.
scaler = StandardScaler()
scaler.fit(inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [13]:
# Apply the fitted scaler to the model inputs.
inputs_scaled = scaler.transform(inputs)

### Train Test Split

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(inputs_scaled, targets, test_size=0.2, random_state=365)

### Training the model

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [16]:
# Fit a logistic regression with default settings on the training split,
# then report its accuracy on that same split.
reg = LogisticRegression().fit(x_train, y_train)
reg.score(x_train, y_train)



0.7991573033707865

In [17]:
# Accuracy on the held-out test split.
reg.score(x_test, y_test)

0.7988826815642458

In [18]:
# Show the training labels renumbered from 0; the result is only printed, y_train itself is unchanged.
print(y_train.reset_index(drop=True))

0      0
1      1
2      0
3      0
4      1
      ..
707    0
708    0
709    1
710    1
711    0
Name: Survived, Length: 712, dtype: int64


In [19]:
class TitanicDisasterPredictor:
    """Turn raw passenger rows into model inputs and predict survival.

    The preprocessing mirrors what the notebook applies to the training data:
    mean imputation, reversed ``Pclass`` coding, ``Female``/``Male`` indicator
    columns and scaling. It relies on the notebook-level ``scaler`` and ``reg``
    objects, which must already be fitted when ``preprocess``/``getResult`` run.
    """

    # Column order the scaler and the model were fitted on.
    FEATURE_COLUMNS = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Female', 'Male']
    # Numeric columns whose gaps are filled with the column mean.
    IMPUTED_COLUMNS = ('Age', 'Fare')
    # Same recoding the training cells apply: first class becomes 3, third class becomes 1.
    PCLASS_RECODING = {1: 3, 3: 1, 2: 2}

    def __init__(self, raw_data):
        """Store the raw passenger DataFrame; nothing is computed yet.

        Args:
            raw_data: DataFrame with at least the columns ``Pclass``, ``Sex``,
                ``Age``, ``SibSp``, ``Parch`` and ``Fare``.
        """
        self.raw_data = raw_data
        self.scaled_data = None

    def preprocess(self):
        """Build ``self.scaled_data`` from ``self.raw_data``.

        The caller's DataFrame is left untouched; all work happens on a copy.
        Missing values are filled with the mean of the data passed in, not
        with the training-set mean.
        """
        data = self.raw_data.copy()

        # A separate imputer per column, addressed by name rather than by position.
        for column in self.IMPUTED_COLUMNS:
            column_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
            data[column] = np.ravel(column_imputer.fit_transform(data[[column]]))

        # The scaler and the model were fitted on the reversed class coding, so
        # the same recoding has to be applied here before scaling.
        data['Pclass'] = data['Pclass'].map(self.PCLASS_RECODING)

        # Explicit comparisons instead of get_dummies so both indicator columns
        # exist even when the data contains only one sex.
        data['Female'] = (data['Sex'] == 'female').astype(int)
        data['Male'] = (data['Sex'] == 'male').astype(int)

        # Selecting the feature columns drops everything else and fixes the order.
        self.scaled_data = scaler.transform(data[self.FEATURE_COLUMNS])

    def getResult(self):
        """Return the model's predictions for the preprocessed rows.

        Runs ``preprocess`` first if it has not been called yet.
        """
        if self.scaled_data is None:
            self.preprocess()
        return reg.predict(self.scaled_data)

In [20]:
# Predict survival for the rows in test.csv.
test_data = pd.read_csv('test.csv')

predictor = TitanicDisasterPredictor(test_data)
predictor.preprocess()

# Build the result with assign() on the selection instead of adding a column to
# it afterwards; the latter triggers pandas' SettingWithCopyWarning.
final_result = test_data[['PassengerId']].assign(Survived=predictor.getResult())
final_result.describe(include='all')

In [21]:
# Write the predictions to result.csv without the DataFrame index column.
final_result.to_csv(path_or_buf='result.csv',index=False)