In [1]:
import pandas as pd
import numpy as np 
import sklearn
import matplotlib.pyplot as plt

In [50]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
y_test = pd.read_csv('gender_submission.csv').set_index('PassengerId').to_numpy()


In [3]:
corr_matrix = train_data.corr()

In [4]:
corr_matrix['Survived'].sort_values(ascending = False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

#### Dropping the not so important features 

Name, ticket and cabin number aren't much important features in my opinion and transforming them into numbers can also be tedious task. So, I decided to left out this features.

In [24]:
train_clean = train_data.drop(['Name', 'Ticket', 'Cabin'], axis = 1)
test_clean = test_data.drop(['Name', 'Ticket', 'Cabin'], axis = 1)

#### Setting PassengerID was Index 

In [25]:
train_clean.set_index('PassengerId', inplace = True)
test_clean.set_index('PassengerId', inplace = True)

#### Encoding Text data into Numerical Data 

The sex and embarked columns contains text data. ML algorithms works on numerical data that's why I encoded the data in this columns into numerical values.

In [26]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
transformation_list = [2 , 7]
for i in transformation_list: #Transforming different column entries with a for loop
    train_clean[train_clean.columns[i]] = encoder.fit_transform( train_clean[train_clean.columns[i]])
    test_clean[test_clean.columns[i-1]] = encoder.fit_transform(test_clean[test_clean.columns[i-1]])


#### Filling up missing values 

The Age feature has multiple mixing values. Though it may seem irrational still I decided to fill up the empty space with the median of this feature.

In [29]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'median')
train_transformed = train_clean['Age'].to_numpy().reshape(-1, 1) # Simple Imputer excepts input in the form of array 
# but the 'Age' column is a Series object. So, converted it into a array like form.
test_transformed = test_clean['Age'].to_numpy().reshape(-1,1)
train_age_transformed = imputer.fit_transform(train_transformed)
test_age_tranformed = imputer.fit_transform(test_transformed)
train_clean['Age'] = train_age_transformed
test_clean['Age'] = test_age_tranformed

#### Normalizing/ Scaling the data 

Age and Fare are on very different range and it can effect the model's effort to find the global minima. So, using the Standard Scaler I scaled down these two features to the range of -1 to 1.  

In [31]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
transformation_list = [3, 6]
for i in transformation_list:
    train_clean[train_clean.columns[i]] = scaler.fit_transform( train_clean[train_clean.columns[i]].to_numpy().reshape(-1,1))
    test_clean[test_clean.columns[i-1]] = scaler.fit_transform( test_clean[train_clean.columns[i-1]].to_numpy().reshape(-1,1))

In [37]:
feature_vector = train_clean.iloc[:,1:]

In [38]:
target = train_clean.iloc[:, 0]

In [63]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
sgd_clf = SGDClassifier()
log_reg = LogisticRegression()
sgd_clf.fit(feature_vector, target)
y_pred = sgd_clf.predict(test_clean)

In [61]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
cross_val_score(sgd_clf, feature_vector, target, cv = 3)

array([0.76430976, 0.78787879, 0.79461279])

In [72]:
from sklearn.metrics import f1_score, recall_score, precision_score
f1_score(y_test, y_pred)

0.9770491803278688

In [73]:
recall_score(y_test, y_pred)

0.9802631578947368

In [58]:
test_clean

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,1,0.755929,0,0,-0.400248,1
893,3,0,-1.322876,1,0,-0.400248,2
894,2,1,0.755929,0,0,-0.400248,1
895,3,1,0.755929,0,0,-0.400248,2
896,3,0,-1.322876,1,1,0.619896,2
...,...,...,...,...,...,...,...
1305,3,1,0.755929,0,0,-0.400248,2
1306,1,0,-1.322876,0,0,-0.400248,0
1307,3,1,0.755929,0,0,-0.400248,2
1308,3,1,0.755929,0,0,-0.400248,2


In [66]:
survivor_prediction = pd.Series(y_pred, index = test_clean.index)

In [70]:
survivor_prediction.to_csv('Survivor_prediction.csv')