In [1]:
import numpy as np
import pandas as pd
from scipy.stats import mode
# Load the raw tables and keep only the training columns that feed the pipeline.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train = train[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]

# One-hot encode Sex and Embarked, then drop the original string columns.
# (Non-in-place drop keeps the cell idempotent when re-run.)
train = pd.concat([train, pd.get_dummies(train["Sex"]), pd.get_dummies(train["Embarked"])], axis = 1).drop(columns = ["Sex", "Embarked"])
test = pd.concat([test, pd.get_dummies(test["Sex"]), pd.get_dummies(test["Embarked"])], axis = 1).drop(columns = ["Sex", "Embarked"])

# Impute missing Age with the most frequent training age. Series.mode() ignores
# NaN and returns the modes sorted ascending, so .iloc[0] is the smallest mode.
# This replaces scipy.stats.mode(...)[0][0], whose result shape and NaN handling
# differ between scipy releases.
age_modes = train["Age"].mode()
if not age_modes.empty:  # empty only when every training Age is missing
    train["Age"] = train["Age"].fillna(age_modes.iloc[0])
    test["Age"] = test["Age"].fillna(age_modes.iloc[0])

# Impute missing Fare with the training mean. The same training-derived value is
# applied to the test frame so both frames are transformed identically.
fare_mean = train["Fare"].mean()
train["Fare"] = train["Fare"].fillna(fare_mean)
test["Fare"] = test["Fare"].fillna(fare_mean)

# Final feature set. The third Embarked dummy and the "female" dummy are left out.
train = train[["Survived", "Pclass", "Fare", "male", "C", "S"]]
test = test[["Pclass", "Fare", "male", "C", "S"]]

In [2]:
from sklearn.model_selection import cross_val_score as cvs
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import KFold as KF
# Shuffled 5-fold splitter shared by every model's cross-validation.
# A fixed random_state makes the fold assignment, and therefore the CV scores,
# repeatable across "Restart & Run All".
kf = KF(shuffle = True, n_splits = 5, random_state = 0)

# Feature matrix / target for training; the test frame already holds features only.
X_train = train.drop(columns = "Survived")
X_test = test
y_train = train["Survived"]

In [3]:
# Fit every candidate on the full training set; these fitted estimators are the
# ones used for test-set prediction further down.
# The decision tree is seeded so its fitted structure is repeatable across runs.
dtc = DTC(random_state = 0).fit(X_train, y_train)
gnb = GNB().fit(X_train, y_train)
svc = SVC().fit(X_train, y_train)
lr = LR().fit(X_train, y_train)

# Per-fold cross-validation scores for each model, keyed by model name.
# cross_val_score clones the estimator for each fold, so the fitted models above
# are not modified by this step.
scores = {
    name: cvs(model, X_train, y_train, cv = kf)
    for name, model in (("dtc", dtc), ("gnb", gnb), ("svc", svc), ("lr", lr))
}

In [4]:
# Mean cross-validation score per model (indexed by model name); used as vote weights.
df_scores = pd.DataFrame(scores).mean(axis = 0)

# One column of test-set predictions per fitted model.
df = pd.DataFrame({"dtc": dtc.predict(X_test), "gnb": gnb.predict(X_test), "svc": svc.predict(X_test), "lr": lr.predict(X_test)})

# Scale each model's predictions by its weight. `mul(..., axis = 1)` aligns the
# weights to the columns by name, so the result does not depend on column order.
df = df.mul(df_scores, axis = 1)

# Baseline prediction: the decision tree alone.
y_test = pd.Series(dtc.predict(X_test))

# Ensemble prediction: weighted votes averaged over the number of models, rounded to 0/1.
y_test2 = (df.sum(axis = 1) / df.shape[1]).round()

# Positions where the ensemble and the decision tree disagree.
y_diff = (y_test2 - y_test).abs()
diff_index = np.flatnonzero((y_diff == 1).to_numpy())

# Disagreements are resolved by a fair coin flip. A locally seeded generator keeps
# the submission file identical between runs.
# NOTE(review): this discards the weighted vote on exactly the rows where it
# differs from the tree; using y_test2 there may be what was intended - confirm.
rng = np.random.default_rng(0)
y_test.iloc[diff_index] = rng.integers(0, 2, size = len(diff_index))

# Build the submission table.
# Assumes the test rows are in file order with PassengerId starting at 892 - TODO
# confirm, or carry the PassengerId column through preprocessing instead.
y_pred = pd.DataFrame({"PassengerId": y_test.index + 892, "Survived": y_test.values}).set_index("PassengerId")
y_pred["Survived"] = y_pred["Survived"].astype(int)
y_pred.to_csv("pred.csv")