In [1]:
import numpy as np
import pandas as pd
from sklearn import pipeline, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection, metrics

from df_transformers import SelectColumnsTransfomer

In [2]:
# Read the train and test CSV files, using PassengerId as the row index of each frame.
train_data, test_data = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in ("data/train.csv", "data/test.csv")
)
print(f"Shape of train data: {train_data.shape}. Shape of test data: {test_data.shape}")

Shape of train data: (891, 11). Shape of test data: (418, 10)


In [3]:
# Columns dropped from both frames before modelling.
columns_to_remove = ['Name', 'Ticket']

# errors="ignore" keeps this cell idempotent: the frames are rebound to the
# reduced versions, so re-running the cell would otherwise raise KeyError
# because the columns are already gone.
train_data = train_data.drop(columns=columns_to_remove, errors="ignore")
test_data = test_data.drop(columns=columns_to_remove, errors="ignore")

In [4]:
# Missing categorical values become an explicit "NAN" category in both frames.
train_data[['Cabin', 'Embarked']] = train_data[['Cabin', 'Embarked']].fillna("NAN")
test_data[['Cabin', 'Embarked']] = test_data[['Cabin', 'Embarked']].fillna("NAN")

# Age: report mean and median of the training data, then impute with the
# training median in both frames.
# NOTE(review): the median is computed on the full training frame, i.e. before
# the later train/validation split, so validation rows contribute to it.
mean_age = train_data['Age'].mean()
median_age = train_data['Age'].median()
print(f'Среднее значение возраста: {mean_age}. Медиана: {median_age}')

train_data['Age'] = train_data['Age'].fillna(median_age)
test_data['Age'] = test_data['Age'].fillna(median_age)

# Fare: impute from the training data as well, consistent with Age above.
# Imputation statistics should not be derived from the test set itself.
mean_fare = train_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(mean_fare)

Среднее значение возраста: 29.69911764705882. Медиана: 28.0


In [5]:
# Separate the target column from the feature columns.
y = train_data['Survived']
X = train_data.drop(columns=['Survived'])
X.shape

(891, 8)

In [6]:
# Hold out 30% of the labelled rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [7]:
# Columns passed through as numeric features.
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare']
# Every remaining column of X is treated as categorical.
cat_features = list(X.columns.drop(numerical_features))

### Собираем пайплайн

In [8]:
# ColumnTransformer could be used for the column selection instead.

# Numeric branch: select the numeric columns only.
# Scaling is currently disabled: preprocessing.StandardScaler(with_mean = 0)
num_pipeline = make_pipeline(SelectColumnsTransfomer(numerical_features))

# Categorical branch: select the categorical columns and one-hot encode them;
# handle_unknown='ignore' is passed so unseen categories do not raise at transform time.
cat_pipeline = make_pipeline(
    SelectColumnsTransfomer(cat_features),
    preprocessing.OneHotEncoder(handle_unknown='ignore'),
)

# Combine the outputs of both branches into one feature matrix.
preprocessing_features = pipeline.FeatureUnion(
    transformer_list=[
        ('numeric_variables_processing', num_pipeline),
        ('categorical_variables_processing', cat_pipeline),
    ]
)

### Grid search

In [9]:
# NOTE(review): disabled grid-search draft, kept for reference only.
# As written it would not run in this notebook: GridSearchCV is not among the
# imports in the first cell, `pipeline` is bound to the sklearn.pipeline module
# rather than an estimator, the 'SVM__' parameter prefix does not match any step
# built in the visible cells, and the print statements use Python 2 syntax.
# parameteres = {'SVM__C':[0.001,0.1,10,100,10e5], 'SVM__gamma':[0.1,0.01]}
# grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
# grid.fit(X_train, y_train)
# print "score = %3.2f" %(grid.score(X_test,y_test))
# print grid.best_params_

### Define a cross-validation strategy

In [10]:
def accuracy_cv(model, cv=5):
    """Cross-validate ``model`` on the training split and return per-fold accuracy.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) passed to ``cross_val_score``.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``cross_val_score``; the default reproduces the
        previously hard-coded 5 folds.

    Returns
    -------
    The value returned by ``cross_val_score`` (one accuracy score per fold).

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train``, so the cell that
    creates the train/test split must have been executed first.
    """
    return cross_val_score(model, X_train, y_train, scoring="accuracy", cv=cv)

## Models

### Логистическая регрессия

In [11]:
# Logistic regression classifier created with the library's default hyperparameters.
lr = LogisticRegression()

In [12]:
# Chain the feature preprocessing with the classifier and fit on the training split.
# fit() returns the pipeline itself, so the fitted object is bound directly.
lr_pipeline = make_pipeline(preprocessing_features, lr).fit(X_train, y_train)
# Display the fitted pipeline as the cell output.
lr_pipeline



Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('numeric_variables_processing', Pipeline(memory=None,
     steps=[('selectcolumnstransfomer', SelectColumnsTransfomer(columns=['Age', 'SibSp', 'Parch', 'Fare']))])), ('categorical_variables_processing', Pipeline(memory=None,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [13]:
# Run cross-validation once and reuse the fold scores for both statistics;
# calling accuracy_cv twice would repeat the whole cross-validation.
cv_accuracies = accuracy_cv(lr_pipeline)
print(f"The accuracy is {cv_accuracies.mean()}, std: {cv_accuracies.std()}")



The accuracy is 0.7930265232974911, std: 0.0174907006254278




Last result: The accuracy is 0.7930265232974911, std: 0.0174907006254278

## Test data submission

In [14]:
# Predict survival for the test set with the fitted pipeline.
predicted = lr_pipeline.predict(test_data)
# Attach the predictions as a "Survived" column.
test_data = test_data.assign(Survived=predicted)
# Write only the PassengerId index and the prediction column as the submission file.
test_data["Survived"].to_csv('logistic_regression.csv', header=True)

In [15]:
# Preview the first predicted labels; relies on the "Survived" column added in the previous cell.
test_data["Survived"].head()

PassengerId
892    0
893    0
894    0
895    0
896    1
Name: Survived, dtype: int64