In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

In [3]:
dataset_train = pd.read_csv('train.csv')
dataset_test = pd.read_csv('test.csv')
dataset_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
dataset_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [6]:
dataset_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
dataset_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [8]:
dataset_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
dataset_test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


# Preprocessing both train and test data

In [10]:
Y = dataset_train['Survived'] # taking the label column
dataset_train = dataset_train.drop(['Survived'], axis =1)

In [11]:
dataset_train = dataset_train.drop(['Ticket'], axis = 1)
dataset_test = dataset_test.drop(['Ticket'], axis = 1)

In [12]:
gender={"male":0, "female":1}
dataset_train['Sex'] = dataset_train['Sex'].map(gender)
dataset_test['Sex'] = dataset_test['Sex'].map(gender)

In [13]:
dataset_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,,S


In [14]:
dataset_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,12.2875,,S


In [15]:
dataset_train['Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [16]:
top_occurance = 'S'

In [17]:
dataset_train['Embarked'] = dataset_train['Embarked'].fillna(top_occurance)

In [18]:
embarked_loc = {"S": 0, "C": 1, "Q": 2}
dataset_train['Embarked'] = dataset_train['Embarked'].map(embarked_loc)
dataset_test['Embarked'] = dataset_test['Embarked'].map(embarked_loc)

In [19]:
dataset_test['Fare'].describe()

count    417.000000
mean      35.627188
std       55.907576
min        0.000000
25%        7.895800
50%       14.454200
75%       31.500000
max      512.329200
Name: Fare, dtype: float64

In [20]:
mean_fare_train = dataset_train['Fare'].mean()
mean_fare_test = dataset_test['Fare'].mean()
dataset_train['Fare'] = dataset_train['Fare'].fillna(mean_fare_train)
dataset_test['Fare'] = dataset_test['Fare'].fillna(mean_fare_test)

In [21]:
dataset_train['Fare']=dataset_train['Fare'].astype(int)
dataset_test['Fare']=dataset_test['Fare'].astype(int)

In [22]:
train_age = dataset_train['Age'].mean()
test_age = dataset_train['Age'].mean()
dataset_train['Age'] = dataset_train['Age'].fillna(train_age)
dataset_test['Age'] =  dataset_test['Age'].fillna(test_age)

In [23]:
dataset_train['Age'] = dataset_train['Age'].astype(int)
dataset_test['Age'] =  dataset_test['Age'].astype(int)

In [24]:
dataset_train.loc[dataset_train['Age'] <= 11, 'Age'] = 0
dataset_test.loc[dataset_test['Age'] <= 11, 'Age'] = 0

dataset_train.loc[(dataset_train['Age'] > 11) & (dataset_train['Age'] <= 18), 'Age'] = 1
dataset_test.loc[(dataset_test['Age'] > 11) & (dataset_test['Age'] <= 18), 'Age'] = 1

dataset_train.loc[(dataset_train['Age'] > 18) & (dataset_train['Age'] <= 22), 'Age'] = 2
dataset_test.loc[(dataset_test['Age'] > 18) & (dataset_test['Age'] <= 22), 'Age'] = 2

dataset_train.loc[(dataset_train['Age'] > 22) & (dataset_train['Age'] <= 27), 'Age'] = 3
dataset_test.loc[(dataset_test['Age'] > 22) & (dataset_test['Age'] <= 27), 'Age'] = 3

dataset_train.loc[(dataset_train['Age'] > 27) & (dataset_train['Age'] <= 33), 'Age'] = 4
dataset_test.loc[(dataset_test['Age'] > 27) & (dataset_test['Age'] <= 33), 'Age'] = 4

dataset_train.loc[(dataset_train['Age'] > 33) & (dataset_train['Age'] <= 40), 'Age'] = 5
dataset_test.loc[(dataset_test['Age'] > 33) & (dataset_test['Age'] <= 40), 'Age'] = 5


dataset_train.loc[(dataset_train['Age'] > 40) & (dataset_train['Age'] <= 66), 'Age'] = 6
dataset_test.loc[(dataset_test['Age'] > 40) & (dataset_test['Age'] <= 66), 'Age'] = 6


dataset_train.loc[ dataset_train['Age'] > 66, 'Age'] = 6
dataset_test.loc[ dataset_test['Age'] > 66, 'Age'] = 6

In [25]:
dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int64
Age            891 non-null int32
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null int32
Cabin          204 non-null object
Embarked       891 non-null int64
dtypes: int32(2), int64(6), object(2)
memory usage: 62.7+ KB


In [26]:
dataset_train = dataset_train.drop(['Cabin', 'Name'], axis =1)
dataset_test = dataset_test.drop(['Cabin', 'Name'], axis =1)

In [27]:
dataset_train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,0,2,1,0,7,0
1,2,1,1,5,1,0,71,1
2,3,3,1,3,0,0,7,0
3,4,1,1,5,1,0,53,0
4,5,3,0,5,0,0,8,0


In [28]:
dataset_train.loc[ dataset_train['Fare'] <= 7.91, 'Fare'] = 0
dataset_test.loc[ dataset_test['Fare'] <= 7.91, 'Fare'] = 0

dataset_train.loc[(dataset_train['Fare'] > 7.91) & (dataset_train['Fare'] <= 14.454), 'Fare'] = 1
dataset_test.loc[(dataset_test['Fare'] > 7.91) & (dataset_test['Fare'] <= 14.454), 'Fare'] = 1

dataset_train.loc[(dataset_train['Fare'] > 14.454) & (dataset_train['Fare'] <= 31), 'Fare']   = 2
dataset_test.loc[(dataset_test['Fare'] > 14.454) & (dataset_test['Fare'] <= 31), 'Fare']   = 2

dataset_train.loc[(dataset_train['Fare'] > 31) & (dataset_train['Fare'] <= 99), 'Fare']   = 3
dataset_test.loc[(dataset_test['Fare'] > 31) & (dataset_test['Fare'] <= 99), 'Fare']   = 3

dataset_train.loc[(dataset_train['Fare'] > 99) & (dataset_train['Fare'] <= 250), 'Fare']   = 4
dataset_test.loc[(dataset_test['Fare'] > 99) & (dataset_test['Fare'] <= 250), 'Fare']   = 4

dataset_train.loc[ dataset_train['Fare'] > 250, 'Fare'] = 5
dataset_test.loc[ dataset_test['Fare'] > 250, 'Fare'] = 5


In [29]:
dataset_train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,0,2,1,0,0,0
1,2,1,1,5,1,0,3,1
2,3,3,1,3,0,0,0,0
3,4,1,1,5,1,0,3,0
4,5,3,0,5,0,0,1,0


In [30]:
dataset_train = dataset_train.drop(['PassengerId'], axis =1)
dataset_test = dataset_test.drop(['PassengerId'], axis =1)

## SGD

In [31]:
sgd = linear_model.SGDClassifier(max_iter=5, tol=None)
sgd.fit(dataset_train, Y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=None,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [32]:
Y_pred = sgd.predict(dataset_test)

In [33]:
Y_pred.size

418

In [34]:
sgd.score(dataset_train, Y)
acc_sgd = round(sgd.score(dataset_train, Y) * 100, 2)

In [35]:
acc_sgd

74.41

## Random Forest

In [36]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(dataset_train, Y)

Y_pred_rand = random_forest.predict(dataset_test)

random_forest.score(dataset_train, Y)
acc_random_forest = round(random_forest.score(dataset_train, Y) * 100, 2)

In [37]:
acc_random_forest

90.46

## Logistic Regression

In [38]:
logreg = LogisticRegression()
logreg.fit(dataset_train, Y)

Y_pred_log = logreg.predict(dataset_test)

acc_log = round(logreg.score(dataset_train, Y) * 100, 2)



In [39]:
acc_log

80.81

## K-Nearest Neighbour

In [40]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(dataset_train, Y)
Y_pred_knn = knn.predict(dataset_test)
acc_knn = round(knn.score(dataset_train, Y) * 100, 2)

In [41]:
acc_knn

87.09

## Gaussian

In [42]:
gaussian = GaussianNB()
gaussian.fit(dataset_train, Y)
Y_pred_gauss = gaussian.predict(dataset_test)
acc_gaussian = round(gaussian.score(dataset_train, Y) * 100, 2)

In [43]:
acc_gaussian

76.88

## Decision Tree

In [44]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(dataset_train, Y)
Y_pred_dec = decision_tree.predict(dataset_test)
acc_decision_tree = round(decision_tree.score(dataset_train, Y) * 100, 2)

In [45]:
acc_decision_tree

90.46