In [209]:
# Headers

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Model Requirements
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [210]:
# Importing Data

data = pd.read_csv("./Dataset/titanicSurvival/train.csv")
test_data = pd.read_csv("./Dataset/titanicSurvival/test.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [211]:
print("The shape of the dataset is {0} passengers and {1} features".format(data.shape[0],data.shape[1]))

The shape of the dataset is 891 passengers and 12 features


# Data Analysis
"""

1. Checking if we have Null Values or not in the data?
    if we have we replace them mostly with statistical median else we replace them with mean
2. Check if the data is skewed?
    if skewed we need to transform it
3. Analyze how different features effect your output
4. Remove Dummies and also perform encoding for categorical data

"""


In [212]:
# Checking for null values
data.isnull().sum()
data["Cabin"].replace("nan","NaN",inplace=True)

In [243]:
# Filling Missing Data

data["Age"].fillna(data["Age"].median(),inplace=True)
data["Cabin"].fillna(data["Cabin"].mode(),inplace=True)

test_data["Age"].fillna(test_data["Age"].median(),inplace=True)
test_data["Cabin"].fillna(test_data["Cabin"].mode(),inplace=True)
test_data["Fare"].dropna(inplace = True)

In [244]:
data["Cabin"].unique()

array(['B96 B98', 'C85', 'G6', 'C123', nan, 'E46', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'E10', 'E44', 'A34',
       'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14', 'B37',
       'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38', 'B39',
       'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41',
       'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48', 'E58',
       'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [247]:
# Checking for null values
test_data_encode.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          326
OHE_female       0
OHE_male         0
OHE_C            0
OHE_Q            0
OHE_S            0
dtype: int64

In [248]:
# Data Pre Processing
"""
1. Label Encoding
2. One hot Encoding
3. Dummy Variable Trap
"""

data_columns = ["Sex","Embarked"]
df_encode = pd.get_dummies(data=data,prefix="OHE",prefix_sep="_",columns=data_columns,dtype = 'int8')

test_data_encode = pd.get_dummies(data=test_data,prefix="OHE",prefix_sep="_",columns=data_columns,dtype = 'int8')



In [249]:
print(test_data_encode.columns)
print(df_encode.columns)

Index(['PassengerId', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'OHE_female', 'OHE_male', 'OHE_C', 'OHE_Q', 'OHE_S'],
      dtype='object')
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'OHE_female', 'OHE_male', 'OHE_C', 'OHE_Q',
       'OHE_S'],
      dtype='object')


In [250]:
#data
X = df_encode.drop(['PassengerId','Survived','Name','Cabin','Ticket'],axis = 1)
y = df_encode['Survived']

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state=28)



In [251]:
model = LinearRegression()
model.fit(x_train,y_train)

sk_theta = [model.intercept_]+list(model.coef_)
parameter_df = pd.Series(sk_theta, name='Sklearn_theta')
parameter_df

0     1.298318
1    -0.200815
2    -0.005194
3    -0.048699
4     0.007040
5    -0.000442
6     0.259916
7    -0.259916
8    -0.139328
9    -0.157278
10   -0.202874
Name: Sklearn_theta, dtype: float64

In [254]:
formatted_test_data = test_data_encode.drop(['PassengerId','Name','Cabin','Ticket'],axis = 1)
formatted_test_data = formatted_test_data.dropna()
print(formatted_test_data.columns, X.columns)
from sklearn.metrics import mean_squared_error
pred_values = model.predict(formatted_test_data)

pred_values

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'OHE_female', 'OHE_male',
       'OHE_C', 'OHE_Q', 'OHE_S'],
      dtype='object') Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'OHE_female', 'OHE_male',
       'OHE_C', 'OHE_Q', 'OHE_S'],
      dtype='object')


array([ 0.09601465,  0.4569894 ,  0.15316529,  0.0890071 ,  0.59154888,
        0.15628406,  0.639309  ,  0.24436551,  0.71976712,  0.01592719,
        0.08934611,  0.38427989,  0.95000205,  0.04646319,  0.83466528,
        0.83165706,  0.29223367,  0.18435449,  0.56046606,  0.57952334,
        0.33761029,  0.19197279,  1.00028995,  0.56907992,  0.82517438,
       -0.08174133,  1.08345198,  0.17656306,  0.40826156,  0.04940006,
        0.11398887,  0.19790952,  0.53778767,  0.56768214,  0.46819694,
        0.19733834,  0.60910957,  0.64000444,  0.09902536,  0.06785686,
        0.09515999,  0.48272807,  0.0166464 ,  0.79215311,  0.84886567,
        0.09972177,  0.45138423,  0.13500679,  0.87270038,  0.57297073,
        0.43482169,  0.35055133,  0.74931706,  0.76079969,  0.35030995,
        0.02610351,  0.04779185,  0.09984336,  0.03701971,  0.9150835 ,
        0.14128894,  0.26171182,  0.13615913,  0.6808209 ,  0.4314035 ,
        0.80419863,  0.70152985,  0.37183409,  0.52461821,  0.70