In [1]:
#importing libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
#Load data
#Read the training CSV into a DataFrame and display it as the cell output.
data = pd.read_csv(filepath_or_buffer='titanic_train.csv')
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
#Check columns with missing values
#Column-wise reduction: True for every column holding at least one NaN.
data.isnull().any(axis=0)

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [4]:
#Removing columns with less significant information
#Features are whatever remains after dropping the listed columns; the
#target is the 'Survived' column of the same frame.
x_train = data.drop(
    columns=[
        'PassengerId', 'Survived', 'Name', 'Ticket', 'SibSp',
        'Fare', 'Cabin', 'Embarked', 'Age',
    ]
)
y_train = data['Survived']
x_train

Unnamed: 0,Pclass,Sex,Parch
0,3,male,0
1,1,female,0
2,3,female,0
3,1,female,0
4,3,male,0
...,...,...,...
886,2,male,0
887,1,female,0
888,3,female,2
889,1,male,0


In [5]:
#Data preprocessing
#Encode 'Sex' numerically: female -> 0, male -> 1.
#The column is rebuilt from the raw `data` frame and assigned as a whole,
#instead of calling replace(..., inplace=True) on `x_train['Sex']`: that
#chained in-place call mutates a temporary Series, which pandas warns about
#and which leaves `x_train` unchanged under Copy-on-Write.
#Reading from `data` also keeps this cell idempotent on re-run.
x_train = x_train.assign(Sex=data['Sex'].map({'female': 0, 'male': 1}))
#Any label other than 'female'/'male' would map to NaN; fail loudly here.
assert x_train['Sex'].notna().all(), "unexpected value in training 'Sex' column"
x_train

Unnamed: 0,Pclass,Sex,Parch
0,3,1,0
1,1,0,0
2,3,0,0
3,1,0,0
4,3,1,0
...,...,...,...
886,2,1,0
887,1,0,0
888,3,0,2
889,1,1,0


In [6]:
#Test data preparation
#Load the test CSV, drop the columns not used as features and encode 'Sex'
#(female -> 0, male -> 1).
#The encoding is an explicit column assignment rather than
#x_test['Sex'].replace(..., inplace=True): the chained in-place call mutates
#a temporary Series, which pandas warns about and which leaves `x_test`
#unchanged under Copy-on-Write.
data2=pd.read_csv('titanic_test.csv')
x_test = (
    data2
    .drop(columns=['PassengerId','Name','Ticket','Fare','SibSp','Cabin','Embarked','Age'])
    .assign(Sex=data2['Sex'].map({'female': 0, 'male': 1}))
)
#Any label other than 'female'/'male' would map to NaN; fail loudly here.
assert x_test['Sex'].notna().all(), "unexpected value in test 'Sex' column"
x_test

Unnamed: 0,Pclass,Sex,Parch
0,3,1,0
1,3,0,0
2,2,1,0
3,3,1,0
4,3,0,1
...,...,...,...
413,3,1,0
414,1,0,0
415,3,1,0
416,3,1,0


In [7]:
#Display the training labels (the 'Survived' column selected earlier)
y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [8]:
#Classifier, model training and prediction

#Build the random forest (fixed seed, depth capped at 2) and train it in one
#expression; fit() returns the fitted estimator itself.
classifier = RandomForestClassifier(max_depth=2, random_state=0).fit(x_train, y_train)

#Predict labels for the prepared test features and display them.
y_pred = classifier.predict(x_test)
y_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [9]:
#Number of predictions returned by classifier.predict(x_test)
len(y_pred)

418

In [10]:
#Writing to csv file

#Assemble the two output columns in a single constructor call: passenger ids
#from the loaded test frame, paired with the predicted labels.
df = pd.DataFrame({
    "PassengerId": data2["PassengerId"],
    "Survived": y_pred,
})
#index=False keeps the DataFrame index out of the written file.
df.to_csv('myfile.csv', index=False)