In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer

In [2]:
# Read the labelled training table; read_csv already returns a DataFrame.
train = pd.read_csv('train.csv')
# Target vector, taken before the label column is removed from the features.
y = train['Survived']
# Feature frames: the label (train only) and the Name/Ticket/Cabin columns
# are dropped from both splits.
non_imputed_train = train.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])
non_imputed_test = pd.read_csv('test.csv').drop(columns=['Name', 'Ticket', 'Cabin'])
# Example submission file, loaded for reference.
sample = pd.read_csv('gender_submission.csv')

In [3]:
# Work on copies so the frames loaded above stay untouched.
train = non_imputed_train.copy()
test = non_imputed_test.copy()

# Names of the training columns that contain at least one missing value.
# Only the training frame decides which columns get an indicator.
train_null_mask = non_imputed_train.isnull()
cols_with_missing = (
    name for name, has_gap in train_null_mask.any().items() if has_gap
)

# For each such column, record per row whether the value was missing, in
# both splits, so that information is still available after imputation.
for col in cols_with_missing:
    flag_name = col + '_was_missing'
    train[flag_name] = train_null_mask[col]
    test[flag_name] = test[col].isnull()

In [4]:
# Preview the first rows of the training features, including the
# *_was_missing indicator columns added in the previous cell.
train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_was_missing,Embarked_was_missing
0,1,3,male,22.0,1,0,7.25,S,False,False
1,2,1,female,38.0,1,0,71.2833,C,False,False
2,3,3,female,26.0,0,0,7.925,S,False,False
3,4,1,female,35.0,1,0,53.1,S,False,False
4,5,3,male,35.0,0,0,8.05,S,False,False


In [5]:
# Preview the first rows of the test features; the columns should mirror
# the training preview.
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_was_missing,Embarked_was_missing
0,892,3,male,34.5,0,0,7.8292,Q,False,False
1,893,3,female,47.0,1,0,7.0,S,False,False
2,894,2,male,62.0,0,0,9.6875,Q,False,False
3,895,3,male,27.0,0,0,8.6625,S,False,False
4,896,3,female,22.0,1,1,12.2875,S,False,False


In [6]:
# Preview the example submission to see the expected output layout.
sample.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [7]:
# One-hot encode the remaining non-numeric columns of each split.
train = pd.get_dummies(train)
test = pd.get_dummies(test)
# Give the test matrix exactly the training columns, in the training order.
# Columns that exist only in test are dropped. A column that exists only in
# train (a category value never seen in test) is added as 0, i.e. "this
# category is absent". Previously such a column was created as NaN and the
# imputer in the next cell replaced it with the training mean, producing a
# fractional one-hot value.
test = test.reindex(columns=train.columns, fill_value=0)

In [8]:
# Learn the fill values from the training features only (SimpleImputer with
# default settings), then apply the same fitted imputer to both splits so
# test rows are filled with training statistics.
my_imputer = SimpleImputer().fit(train)
train = my_imputer.transform(train)
test = my_imputer.transform(test)

In [9]:
# Logistic regression with library defaults; fit() returns the estimator.
# NOTE(review): the features are not scaled before fitting - check whether
# the solver reports a convergence warning here.
model_lr = LogisticRegression().fit(train, y)
predict_lr = model_lr.predict(test)



In [10]:
# Gaussian naive Bayes with library defaults, trained on the imputed
# training matrix and applied to the imputed test matrix.
model_nb = GaussianNB().fit(train, y)
predict_nb = model_nb.predict(test)

In [11]:
# Linear classifier trained by stochastic gradient descent with the
# modified Huber loss; the fixed random_state keeps the shuffling repeatable.
# NOTE(review): SGD is sensitive to feature scale and the inputs are
# unscaled - confirm this is intended.
model_sgd = SGDClassifier(
    loss='modified_huber',
    shuffle=True,
    random_state=101,
).fit(train, y)
predict_sgd = model_sgd.predict(test)



In [12]:
# k-nearest-neighbours classifier that votes over the 2 closest training rows.
# NOTE(review): distances are computed on unscaled columns - confirm this
# is intended.
model_knn = KNeighborsClassifier(n_neighbors=2).fit(train, y)
predict_knn = model_knn.predict(test)

In [13]:
# Single decision tree, limited to depth 10 with at least 2 samples per
# leaf; the fixed random_state makes the fitted tree repeatable.
model_dt = DecisionTreeClassifier(
    max_depth=10,
    min_samples_leaf=2,
    max_features=None,
    random_state=101,
).fit(train, y)
predict_dt = model_dt.predict(test)

In [14]:
# Support vector classifier with a linear kernel and a small C (0.025).
model_svc = SVC(kernel='linear', C=0.025, random_state=101).fit(train, y)
predict_svc = model_svc.predict(test)

In [15]:
# Gradient-boosted trees (XGBoost) with library defaults.
model_xg = XGBClassifier().fit(train, y)
predict_xg = model_xg.predict(test)

In [16]:
# Passenger ids for the submission files, read from the loaded test table
# instead of being rebuilt as 892, 893, ... That way the ids stay correct
# even if the test file's ids are not one contiguous run starting at 892.
# Row order matches the prediction arrays: no cell reorders or drops test rows.
indices = non_imputed_test['PassengerId'].to_numpy()

In [17]:
# Two-column table (PassengerId, Survived) holding the logistic-regression
# predictions, in the same layout as gender_submission.csv.
put = pd.DataFrame({'PassengerId': indices, 'Survived': predict_lr})
# Write in the default overwrite mode. With mode='a', every re-run of the
# notebook appended another header line and a duplicate set of rows.
put.to_csv('predict_lr.csv', index=False)

In [18]:
# Two-column table (PassengerId, Survived) holding the naive-Bayes
# predictions, in the same layout as gender_submission.csv.
put = pd.DataFrame({'PassengerId': indices, 'Survived': predict_nb})
# Write in the default overwrite mode. With mode='a', every re-run of the
# notebook appended another header line and a duplicate set of rows.
put.to_csv('predict_nb.csv', index=False)

In [19]:
# Two-column table (PassengerId, Survived) holding the SGD classifier
# predictions, in the same layout as gender_submission.csv.
put = pd.DataFrame({'PassengerId': indices, 'Survived': predict_sgd})
# Write in the default overwrite mode. With mode='a', every re-run of the
# notebook appended another header line and a duplicate set of rows.
put.to_csv('predict_sgd.csv', index=False)

In [20]:
# Two-column table (PassengerId, Survived) holding the k-nearest-neighbours
# predictions, in the same layout as gender_submission.csv.
put = pd.DataFrame({'PassengerId': indices, 'Survived': predict_knn})
# Write in the default overwrite mode. With mode='a', every re-run of the
# notebook appended another header line and a duplicate set of rows.
put.to_csv('predict_knn.csv', index=False)

In [21]:
# Two-column table (PassengerId, Survived) holding the decision-tree
# predictions, in the same layout as gender_submission.csv.
put = pd.DataFrame({'PassengerId': indices, 'Survived': predict_dt})
# Write in the default overwrite mode. With mode='a', every re-run of the
# notebook appended another header line and a duplicate set of rows.
put.to_csv('predict_dt.csv', index=False)

In [22]:
# Two-column table (PassengerId, Survived) holding the linear-SVC
# predictions, in the same layout as gender_submission.csv.
put = pd.DataFrame({'PassengerId': indices, 'Survived': predict_svc})
# Write in the default overwrite mode. With mode='a', every re-run of the
# notebook appended another header line and a duplicate set of rows.
put.to_csv('predict_svc.csv', index=False)

In [23]:
# Two-column table (PassengerId, Survived) holding the XGBoost
# predictions, in the same layout as gender_submission.csv.
put = pd.DataFrame({'PassengerId': indices, 'Survived': predict_xg})
# Write in the default overwrite mode. With mode='a', every re-run of the
# notebook appended another header line and a duplicate set of rows.
put.to_csv('predict_xg.csv', index=False)