Import packages

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split


Import dataset and separate features from labels

In [2]:
path = r"C:\Users\User\Kaggle\Kaggle-main\Titanic\titanic" + "\\"
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")

train.describe()
test.describe()
print(train.isnull().sum())
print(test.isnull().sum())

train.info()
test.info()
  
X = train.iloc[:, 0]
X = pd.concat([X, train.iloc[:, 2:]], axis = 1)
y_train = train.iloc[:, 1].values

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64



Preprocessing datasets function

In [3]:
def preprocessing(train):
    """
    Clearing 'Ticket' column
    """
    res = []
    for i in range(0,train['Ticket'].size):
        if [int(j) for j in train.loc[i,'Ticket'].split() if j.isdigit()] == []:
            res.append(None)
        else:
            res.append([int(j) for j in train.loc[i,'Ticket'].split() if j.isdigit()][0])
    train["clear_ticket"] = res
    
    """
    Clearing passenger's Status
    """
    status = []
    for i in range(0,train['Ticket'].size):
        status.append((train.loc[i,'Name'].split(", "))[1].split(".")[0])
    train["clear_status"] = status


    train = train.fillna(value = {'clear_ticket': 0, 'Embarked': 0})


    train = remove_nan_from_age(train, 3, 4, 5, 6, 7)
    train = train.fillna(train.mean())
    
    """
    Get dummies from Categorical variables
    """
    from sklearn.preprocessing import LabelEncoder
    import pandas as pd

    lbe = LabelEncoder()
    train['Sex'] = lbe.fit_transform(train['Sex'])
    dummies = pd.get_dummies(train.loc[:, ['Embarked', 'clear_status']], prefix = ['Embarked', 'Status'])
    train = pd.concat([train, dummies], axis = 1)
    train = train[['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'clear_ticket', 'Embarked_C', 'Embarked_Q', 'Status_Col', 'Status_Dr',
       'Status_Master', 'Status_Miss', 'Status_Mr', 'Status_Mrs',
       'Status_Ms', 'Status_Rev']]
    return train
    
    
 


We can use Decision tree or Random Forest algorithm to replace NaNs in Age column

In [4]:
def remove_nan_from_age(frame_orig, name_col_numb, sex_col_numb, Age_col_numb, sibsp_col_numb, parch_col_numb):
    frame = frame_orig
    cluster = []
    for i in range(0,frame['Ticket'].size):
        if frame.iloc[i, sibsp_col_numb] == 0 and frame.iloc[i, parch_col_numb] == 0:
            cluster.append(0)
        elif frame.iloc[i, sex_col_numb] == 'female' and '(' in frame.iloc[i, name_col_numb] and frame.iloc[i, sibsp_col_numb] > 0 and frame.iloc[i, parch_col_numb] > 0:
            cluster.append(1)
        elif frame.iloc[i, sex_col_numb] == 'female' and '(' not in frame.iloc[i, name_col_numb] and frame.iloc[i, parch_col_numb] > 1:
            cluster.append(2)
        elif frame.iloc[i, sex_col_numb] == 'female' and '(' not in frame.iloc[i, name_col_numb] and frame.iloc[i, parch_col_numb] == 1:
            cluster.append(3)
        elif frame.iloc[i, sex_col_numb] == 'female' and '(' not in frame.iloc[i, name_col_numb] and frame.iloc[i, parch_col_numb] == 0 and frame.iloc[i, sibsp_col_numb] > 0:
            cluster.append(4)
        elif frame.iloc[i, sex_col_numb] == 'male' and frame.iloc[i, sibsp_col_numb] == 0 and frame.iloc[i, parch_col_numb] == 1:
            cluster.append(5)
        elif frame.iloc[i, sex_col_numb] == 'male' and frame.iloc[i, sibsp_col_numb] == 0 and frame.iloc[i, parch_col_numb] > 1:
            cluster.append(6)
        elif frame.iloc[i, sex_col_numb] == 'male' and frame.iloc[i, sibsp_col_numb] > 0 and frame.iloc[i, parch_col_numb] == 0:
            cluster.append(7)
        elif frame.iloc[i, sex_col_numb] == 'male' and frame.iloc[i, sibsp_col_numb] == 1 and frame.iloc[i, parch_col_numb] > 0:
            cluster.append(8)
        elif frame.iloc[i, sex_col_numb] == 'male' and frame.iloc[i, sibsp_col_numb] > 1 and frame.iloc[i, parch_col_numb] > 0:
            cluster.append(9)
        else:
            cluster.append(10)
    frame['cluster'] = cluster
    cluster_median = frame.groupby('cluster')['Age'].median().reset_index() 
    for i in range(0,frame['Ticket'].size):
        if not frame.iloc[i,Age_col_numb] > -1:
            frame.iloc[i,Age_col_numb] = cluster_median.iloc[frame.iloc[i,frame.shape[1] - 1],1]
    return frame
            

Split dataframe on test and train datasets

In [5]:
X_train = preprocessing(X)
X_test = preprocessing(test)

print(X_train.isnull().sum())
print(X_test.isnull().sum())
X_train_model, X_test_model, y_train_model, y_test_model = train_test_split(X_train, y_train, test_size = 0.2, random_state = 0)




"""
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)X_test = sc_X.transform(X_test)sc_y = StandardScaler()y_train = sc_y.fit_transform(y_train)
res = [int(i) for i in test_string.split() if i.isdigit()]
imputer = Imputer(missing_values = "NaN", strategy = "mean", axis = 0)
imputer = imputer.fit(train.iloc[:, [5, 9]])
train.iloc[:, [5, 9]] = imputer.transform(train.iloc[:, [5, 9]])
"""

PassengerId      0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
clear_ticket     0
Embarked_C       0
Embarked_Q       0
Status_Col       0
Status_Dr        0
Status_Master    0
Status_Miss      0
Status_Mr        0
Status_Mrs       0
Status_Ms        0
Status_Rev       0
dtype: int64
PassengerId      0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
clear_ticket     0
Embarked_C       0
Embarked_Q       0
Status_Col       0
Status_Dr        0
Status_Master    0
Status_Miss      0
Status_Mr        0
Status_Mrs       0
Status_Ms        0
Status_Rev       0
dtype: int64


'\nsc_X = StandardScaler()\nX_train = sc_X.fit_transform(X_train)X_test = sc_X.transform(X_test)sc_y = StandardScaler()y_train = sc_y.fit_transform(y_train)\nres = [int(i) for i in test_string.split() if i.isdigit()]\nimputer = Imputer(missing_values = "NaN", strategy = "mean", axis = 0)\nimputer = imputer.fit(train.iloc[:, [5, 9]])\ntrain.iloc[:, [5, 9]] = imputer.transform(train.iloc[:, [5, 9]])\n'

Build a model

In [6]:
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train_model, y_train_model)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test_model, classifier.predict(X_test_model))
print(cm)

[[101   9]
 [ 44  25]]
