In [2]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [3]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [4]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
train_df.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [7]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
drop_cols = ["PassengerId","Name","Ticket","Cabin"]
cat_cols = ["Sex","Embarked"]
non_cat_cols = ["Pclass","Age","SibSp","Parch","Fare"]

In [9]:
X = train_df.drop("Survived",axis=1)
y = train_df["Survived"]
X.drop(drop_cols,axis=1,inplace=True)
test_df.drop(drop_cols,axis=1,inplace=True)

In [10]:
from sklearn.preprocessing import LabelEncoder
for col in cat_cols:
    X[col] = (X[col]).fillna("Not available") 
    test_df[col] = (test_df[col]).fillna("Not available") 
    X[col] = X[col].astype('str')
    test_df[col] = test_df[col].astype('str')
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    test_df[col] = le.fit_transform(test_df[col])

In [11]:
X.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,2.308642,0.647587,29.699118,0.523008,0.381594,32.204208,2.343434
std,0.836071,0.47799,14.526497,1.102743,0.806057,49.693429,1.167398
min,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,2.0,0.0,20.125,0.0,0.0,7.9104,2.0
50%,3.0,1.0,28.0,0.0,0.0,14.4542,3.0
75%,3.0,1.0,38.0,1.0,0.0,31.0,3.0
max,3.0,1.0,80.0,8.0,6.0,512.3292,3.0


In [12]:
test_df.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,332.0,418.0,418.0,417.0,418.0
mean,2.26555,0.636364,30.27259,0.447368,0.392344,35.627188,1.401914
std,0.841838,0.481622,14.181209,0.89676,0.981429,55.907576,0.854496
min,1.0,0.0,0.17,0.0,0.0,0.0,0.0
25%,1.0,0.0,21.0,0.0,0.0,7.8958,1.0
50%,3.0,1.0,27.0,0.0,0.0,14.4542,2.0
75%,3.0,1.0,39.0,1.0,0.0,31.5,2.0
max,3.0,1.0,76.0,8.0,9.0,512.3292,2.0


In [13]:
X.isna().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [16]:
for col in non_cat_cols:
    null_ind_train = np.where(X[col].isna())[0]
    null_ind_test = np.where(test_df[col].isna())[0]
    if len(null_ind_train)==0 and len(null_ind_test)==0:
        continue    
    X_temp = X.drop(null_ind_train,axis=0).drop(non_cat_cols,axis=1)
    y_temp = y.drop(null_ind_train,axis=0)
    model = LinearRegression()
    model.fit(X_temp,y_temp)
    if len(null_ind_train)>0:
        predictions_train = model.predict(X.drop(non_cat_cols,axis=1).iloc[null_ind_train,:])
        X.loc[null_ind_train,col]=predictions_train
    if len(null_ind_test)>0:
        predictions_test = model.predict(test_df.drop(non_cat_cols,axis=1).iloc[null_ind_test,:])
        test_df.loc[null_ind_test,col]=predictions_test

In [17]:
test_df.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [18]:
X.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [19]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

# Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
classifier1 = LogisticRegression()

In [21]:
classifier1.fit(X_train,y_train)

LogisticRegression()

In [22]:
print("Train Accuracy:",accuracy_score(y_train,classifier1.predict(X_train)))

Train Accuracy: 0.7993579454253612


In [23]:
print("Test Accuracy:",accuracy_score(y_test,classifier1.predict(X_test)))

Test Accuracy: 0.7761194029850746


# Decision Tree Classifier

In [24]:
from sklearn.tree import DecisionTreeClassifier
classifier2=DecisionTreeClassifier()

In [25]:
from sklearn.tree import DecisionTreeClassifier
sample_split = range(2,25)
features = ["sqrt","log2",None]
crit = ["gini","entropy"]
class_wgt = [None,"balanced"]
split = ["best","random"]

max_accuracy = 0
for min_samples_split in sample_split:
    for max_features in features:
        for criterion in crit:
            for class_weight in class_wgt:
                for splitter in split:
                    classifier2=DecisionTreeClassifier(splitter=splitter,class_weight=class_weight,criterion=criterion,min_samples_split=min_samples_split,max_features=max_features,random_state=3)
                    classifier2.fit(X_train,y_train)
                    train_accuracy = accuracy_score(y_train,classifier2.predict(X_train))
                    test_accuracy = accuracy_score(y_test,classifier2.predict(X_test))
                    if test_accuracy>max_accuracy:
                        train_at_max = train_accuracy
                        max_accuracy = test_accuracy
                        best_min_samples_split = min_samples_split
                        best_max_features = max_features
                        best_criterion = criterion
                        best_class_weight = class_weight
                        best_splitter = splitter
                    if test_accuracy>=0.85:
                        print("Train Accuracy:",train_accuracy)
                        print("Test Accuracy:",test_accuracy)

print("Train at max_test: ",train_at_max)
print("Test max: ",max_accuracy)
print("best_min_samples_split: ",min_samples_split)
print("best_max_features: ",max_features)
print("best_criterion: ",criterion)
print("best_class_weight: ",class_weight)
print("best_splitter: ",splitter)

Train at max_test:  0.9117174959871589
Test max:  0.835820895522388
best_min_samples_split:  24
best_max_features:  None
best_criterion:  entropy
best_class_weight:  balanced
best_splitter:  random


# K-nearest neighbour classification

In [72]:
from sklearn.neighbors import KNeighborsClassifier
classifier3=KNeighborsClassifier()

In [73]:
classifier3.fit(X_train,y_train)

KNeighborsClassifier()

In [74]:
print("Train Accuracy:",accuracy_score(y_train,classifier3.predict(X_train)))

Train Accuracy: 0.78330658105939


In [75]:
print("Test Accuracy:",accuracy_score(y_test,classifier3.predict(X_test)))

Test Accuracy: 0.6828358208955224


# Polynomial Features

In [118]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)

In [102]:
X1 = poly.fit_transform(X)


In [103]:
from sklearn.preprocessing import MinMaxScaler

In [104]:
scaler = MinMaxScaler()
X1 = scaler.fit_transform(X1)

In [105]:
X_train,X_test,y_train,y_test = train_test_split(X1,y,test_size=0.3)

# Logistic Regression

In [106]:
from sklearn.linear_model import LogisticRegression
classifier1 = LogisticRegression()

In [107]:
classifier1.fit(X_train,y_train)

LogisticRegression()

In [108]:
print("Train Accuracy:",accuracy_score(y_train,classifier1.predict(X_train)))

Train Accuracy: 0.8186195826645265


In [109]:
print("Test Accuracy:",accuracy_score(y_test,classifier1.predict(X_test)))

Test Accuracy: 0.8134328358208955


# Decision Tree Classifier

In [163]:
X_train.shape

(623, 120)

In [164]:
from sklearn.tree import DecisionTreeClassifier
sample_split = range(2,25)
features = ["sqrt","log2",None]
crit = ["gini","entropy"]
class_wgt = [None,"balanced"]
split = ["best","random"]

max_accuracy = 0
for min_samples_split in sample_split:
    for max_features in features:
        for criterion in crit:
            for class_weight in class_wgt:
                for splitter in split:
                    classifier2=DecisionTreeClassifier(splitter=splitter,class_weight=class_weight,criterion=criterion,min_samples_split=min_samples_split,max_features=max_features,random_state=3)
                    classifier2.fit(X_train,y_train)
                    train_accuracy = accuracy_score(y_train,classifier2.predict(X_train))
                    test_accuracy = accuracy_score(y_test,classifier2.predict(X_test))
                    if test_accuracy>max_accuracy:
                        train_at_max = train_accuracy
                        max_accuracy = test_accuracy
                        best_min_samples_split = min_samples_split
                        best_max_features = max_features
                        best_criterion = criterion
                        best_class_weight = class_weight
                        best_splitter = splitter
                    if test_accuracy>=0.85:
                        print("Train Accuracy:",train_accuracy)
                        print("Test Accuracy:",test_accuracy)

print("Train at max_test: ",train_at_max)
print("Test max: ",max_accuracy)
print("best_min_samples_split: ",min_samples_split)
print("best_max_features: ",max_features)
print("best_criterion: ",criterion)
print("best_class_weight: ",class_weight)
print("best_splitter: ",splitter)

Train at max_test:  0.9101123595505618
Test max:  0.8432835820895522
best_min_samples_split:  24
best_max_features:  None
best_criterion:  entropy
best_class_weight:  balanced
best_splitter:  random


# K-nearest neighbour classification

In [114]:
from sklearn.neighbors import KNeighborsClassifier
classifier3=KNeighborsClassifier()

In [115]:
classifier3.fit(X_train,y_train)

KNeighborsClassifier()

In [116]:
print("Train Accuracy:",accuracy_score(y_train,classifier3.predict(X_train)))

Train Accuracy: 0.8475120385232745


In [117]:
print("Test Accuracy:",accuracy_score(y_test,classifier3.predict(X_test)))

Test Accuracy: 0.7761194029850746
