In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import pipeline, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection, metrics

from df_transformers import SelectColumnsTransfomer

%matplotlib inline

In [2]:
train_data = pd.read_csv("data/preprocessed_train_data.csv", index_col="PassengerId")
test_data = pd.read_csv("data/preprocessed_test_data.csv", index_col="PassengerId")
print(f"Shape of train data: {train_data.shape}. Shape of test data: {test_data.shape}")

Shape of train data: (891, 201). Shape of test data: (418, 200)


### Разделяем выборку на трейн и тест

In [3]:
X = train_data.drop(['Survived'], axis=1)
y = train_data['Survived']
X.shape

(891, 200)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### Grid search

In [5]:
# parameteres = {'SVM__C':[0.001,0.1,10,100,10e5], 'SVM__gamma':[0.1,0.01]}
# grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
# grid.fit(X_train, y_train)
# print "score = %3.2f" %(grid.score(X_test,y_test))
# print grid.best_params_

### Define a cross validation strategy

In [6]:
def accuracy_cv(model):
    accuracies = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=5)
    return(accuracies)

## Models

### Логистическая регрессия

In [7]:
lr = LogisticRegression()

In [8]:
print(f"The mean accuracy is {accuracy_cv(lr).mean()}, std: {accuracy_cv(lr).std()}")

The mean accuracy is 0.8042642729134665, std: 0.022745580430136615




Last result: The accuracy is 0.7930265232974911, std: 0.0174907006254278

### Random Forest classifaer

In [9]:
rfc = RandomForestClassifier(n_estimators = 300)

In [10]:
print(f"The mean accuracy is {accuracy_cv(rfc).mean()}, std: {accuracy_cv(rfc).std()}")

The mean accuracy is 0.797761456733231, std: 0.03843498556403983


Last result: The mean accuracy is 0.7865105990783411, std: 0.019310961490566207

### SVC

In [11]:
svc = SVC(gamma='scale')

In [12]:
print(f"The mean accuracy is {accuracy_cv(svc).mean()}, std: {accuracy_cv(svc).std()}")

The mean accuracy is 0.6486719150025602, std: 0.037719993098365955


### KNeighborsClassifier

In [13]:
knn = KNeighborsClassifier(n_neighbors = 3)

In [14]:
print(f"The mean accuracy is {accuracy_cv(knn).mean()}, std: {accuracy_cv(knn).std()}")

The mean accuracy is 0.6999815988223247, std: 0.03793851991670292


## Test data submission

In [15]:
test_data.shape

(418, 200)

In [16]:
lr.fit(X_train, y_train)
predicted = lr.predict(test_data)
test_data["Survived"] = predicted
test_data["Survived"].to_csv('logistic_regression.csv', header=True)



In [17]:
test_data["Survived"].head()

PassengerId
892    0
893    0
894    0
895    0
896    1
Name: Survived, dtype: int64