In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import joblib

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

In [2]:
# Load the dataset, print its (rows, columns) shape, then preview the first rows.
data_path = 'data/data.csv'
df = pd.read_csv(data_path)

print(df.shape)
df.head(n=5)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Split data into train and test

In [3]:
# Columns removed before modelling; they are not used as features below.
drop_var = ['PassengerId', 'Name', 'Cabin', 'Ticket']
df = df.drop(columns=drop_var)

# Separate the `Survived` target from the remaining predictor columns.
y = df['Survived']
X = df.drop(columns='Survived')

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

X_train.shape, X_test.shape

((712, 7), (179, 7))

# Missing Data

In [5]:
# Categorical columns whose missing values are filled with the most frequent value.
cat_na_with_mode = ['Embarked']

for var in cat_na_with_mode:
    # The fill value is learned from the training split only and reused for test.
    most_common = X_train[var].mode()[0]
    print(var, most_common)

    for split in (X_train, X_test):
        split[var] = split[var].fillna(most_common)

Embarked S


In [6]:
# Numeric columns whose missing values are filled with the median.
num_na_with_median = ['Age']

for var in num_na_with_median:
    # The fill value is learned from the training split only and reused for test.
    train_median = X_train[var].median()
    print(var, train_median)

    for split in (X_train, X_test):
        split[var] = split[var].fillna(train_median)

Age 29.0


# Bin skewed data

In [7]:
# Columns that are discretised into ordered bins.
cut_vars = ['Fare']

# Bin edges. The interior edges are presumably fare quartiles -- TODO confirm.
# The top edge is open-ended so a value above the largest fare seen so far still
# lands in the last bin instead of becoming NaN.
bins = [0, 7.9104, 14.4542, 31.0, np.inf]
labels = ['Low', 'Medium', 'High', 'Very High']

for var in cut_vars:
    # Use the loop variable (not a hard-coded column name) so every entry of
    # `cut_vars` is actually binned. `include_lowest=True` keeps values equal to
    # the first edge (0) inside the first bin.
    X_train[var] = pd.cut(X_train[var], bins=bins, labels=labels, include_lowest=True)
    X_test[var] = pd.cut(X_test[var], bins=bins, labels=labels, include_lowest=True)

# Apply mapping

In [8]:
# Binary encoding of the `Sex` column.
sex_map = {'male': 1, 'female': 0}
var = 'Sex'

# Apply the same mapping to both splits.
for split in (X_train, X_test):
    split[var] = split[var].map(sex_map)

In [9]:
# Ordinal encoding of the fare bins (low -> high).
fare_map = {'Low': 0, 'Medium': 1, 'High': 2, 'Very High': 3}
var = 'Fare'

# `Fare` is categorical after `pd.cut`, and `.map` on a categorical returns
# another categorical. Cast to float so the column is a plain numeric feature;
# float (rather than int) keeps any missing value as NaN instead of raising.
X_train[var] = X_train[var].map(fare_map).astype(float)
X_test[var] = X_test[var].map(fare_map).astype(float)

# Apply dummies

In [11]:
dummies_var = 'Embarked'

# Fix the set of levels from the training split and give both splits the same
# categorical dtype. `get_dummies` then emits one column per declared level (even
# if a level is absent from a split) and `drop_first=True` drops the same baseline
# level in both, so train and test always end up with identical columns.
# A level seen only in the test split becomes NaN and is encoded as all zeros.
dummy_dtype = pd.CategoricalDtype(
    categories=sorted(X_train[dummies_var].dropna().unique())
)
X_train[dummies_var] = X_train[dummies_var].astype(dummy_dtype)
X_test[dummies_var] = X_test[dummies_var].astype(dummy_dtype)

X_train = pd.get_dummies(X_train, columns=[dummies_var], dtype=int, drop_first=True)
X_test = pd.get_dummies(X_test, columns=[dummies_var], dtype=int, drop_first=True)

# Feature Scaling

In [13]:
# Learn the per-feature min/max from the training split only (`fit` returns the scaler).
scaler = MinMaxScaler().fit(X_train)

# Rescale both splits with the fitted scaler and rebuild DataFrames that keep the
# original column names (the row index is reset to the default range index).
X_train, X_test = (
    pd.DataFrame(scaler.transform(split), columns=split.columns)
    for split in (X_train, X_test)
)

# Save

In [14]:
# Preview the first rows of the scaled training features before saving.
X_train.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,1.0,0.0,0.357116,0.0,0.333333,0.666667,0.0,0.0
1,0.5,1.0,0.382327,0.0,0.0,0.333333,0.0,1.0
2,0.5,1.0,0.382327,0.125,0.166667,1.0,0.0,0.0
3,1.0,1.0,0.243666,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.256271,0.0,0.0,0.0,0.0,1.0


In [16]:
# Write each split to disk without the index column.
outputs = [
    (X_train, 'data/X_train.csv'),
    (X_test, 'data/X_test.csv'),
    (y_train, 'data/y_train.csv'),
    (y_test, 'data/y_test.csv'),
]

for frame, path in outputs:
    frame.to_csv(path, index=False)

In [17]:
# Persist the fitted scaler so the same scaling can be re-applied later.
joblib.dump(value=scaler, filename='model/minmaxscaler.joblib')

['model/minmaxscaler.joblib']