# Import libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Import data

In [2]:
# Read the Titanic training set from the project's Data directory and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer="Data/train.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Transform "Name" to "Title"

In [3]:
# Recode the passenger name as a numeric title code. The "Name" column is
# renamed to "Title" and every row whose text matches a key of `titles` is
# overwritten with that key's code (patterns are tried in dict order, so the
# earliest matching pattern wins).
titles = {
    'Mr.': 0, 'Mrs.': 1, 'Miss': 2, 'Don.': 3, 'Master.': 4, 'Rev.': 5,
    'Dr.': 6, 'Mme.': 7, 'Ms.': 8, 'Major': 9, 'Mlle.': 10, 'Col.': 11,
    'Capt.': 11, 'Jonkheer. ': 0, 'the Countess': 0,
}
df = df.rename(columns={"Name": "Title"})

# NOTE(review): str.contains interprets each key as a regular expression, so
# the "." in 'Mr.' matches any character and "Mrs" rows are caught by the
# first pattern -- the preview below this cell shows "Mrs." passengers coded
# 0, not 1. Also 'Col.' and 'Capt.' share code 11, and 'Jonkheer. ' /
# 'the Countess' share code 0 with 'Mr.' -- confirm whether that is intended.
# Any change here must be mirrored in the test-data cell, which repeats this
# logic, or the model will see different encodings at prediction time.
for pattern, code in titles.items():
    has_title = df['Title'].str.contains(pattern)
    df.loc[has_title, 'Title'] = str(code)

# Every name must have been replaced by a digit string for this cast to work.
df = df.astype({"Title": int})

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Title,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,0,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,0,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,2,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,0,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,0,male,35.0,0,0,373450,8.05,,S


# Transform "Cabin" to "Deck"

In [4]:
# Turn "Cabin" into a numeric "Deck" code; passengers without a cabin get 0.
decks = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 0}
df = df.rename(columns={"Cabin": "Deck"})

# Work on text so the letter search below can run on every row; rows whose
# text contains "nan" are treated as "no cabin".
df = df.astype({"Deck": str})
no_cabin = df['Deck'].str.contains("nan")
df.loc[no_cabin, 'Deck'] = "0"

# Letters are tried in dict order and a matched row is overwritten at once,
# so a cabin string holding several letters takes the code of the earliest
# letter in `decks`.
for letter, code in decks.items():
    on_deck = df['Deck'].str.contains(letter)
    df.loc[on_deck, 'Deck'] = str(code)

df = df.astype({"Deck": float})
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Title,Sex,Age,SibSp,Parch,Ticket,Fare,Deck,Embarked
0,1,0,3,0,male,22.0,1,0,A/5 21171,7.25,0.0,S
1,2,1,1,0,female,38.0,1,0,PC 17599,71.2833,3.0,C
2,3,1,3,2,female,26.0,0,0,STON/O2. 3101282,7.925,0.0,S
3,4,1,1,0,female,35.0,1,0,113803,53.1,3.0,S
4,5,0,3,0,male,35.0,0,0,373450,8.05,0.0,S


# Dataframe optimization

In [6]:
# Baseline memory footprint, counting the real size of object columns.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Title          891 non-null int32
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Deck           891 non-null float64
Embarked       889 non-null object
dtypes: float64(3), int32(1), int64(5), object(3)
memory usage: 222.3 KB


In [12]:
# Store the two low-cardinality text columns as pandas categoricals, then
# report the memory footprint again for comparison with the previous cell.
df = df.astype({"Sex": "category", "Embarked": "category"})
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Title          891 non-null int32
Sex            891 non-null category
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Deck           891 non-null float64
Embarked       889 non-null category
dtypes: category(2), float64(3), int32(1), int64(5), object(1)
memory usage: 116.9 KB


# Explore DataFrame #todo

# Create numpy objects

In [None]:
# Column positions follow the df.info() listing above:
# features = Pclass, Title, Sex, Age, SibSp, Parch, Fare, Deck, Embarked
# (PassengerId and Ticket are left out); target = Survived.
feature_positions = [2, 3, 4, 5, 6, 7, 9, 10, 11]
target_position = 1

x = np.array(df.iloc[:, feature_positions])
y = np.array(df.iloc[:, target_position])

# Fill blanks 

In [None]:
# Columns of x that are filled with their most frequent value.
mode_columns = [0, 1, 2, 4, 5, 7, 8]
imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
x[:, mode_columns] = imputer.fit(x[:, mode_columns]).transform(x[:, mode_columns])

# Columns of x that are filled with the column mean (positions 3 and 6 --
# Age and Fare, given the feature order used when x was built).
mean_columns = [3, 6]
imputer2 = SimpleImputer(missing_values=np.nan, strategy="mean")
x[:, mean_columns] = imputer2.fit(x[:, mean_columns]).transform(x[:, mean_columns])

# Change categorical variables to dummy variables

In [None]:
# One-hot encode the two text features of x (positions 2 and 8) and pass the
# remaining columns through unchanged.
encoded_positions = [2, 8]
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), encoded_positions)],
    remainder='passthrough',
)
x = np.array(ct.fit_transform(x))

# Keep a subset of the transformed columns.
# NOTE(review): presumably this drops one dummy column per encoded feature;
# it also appears to leave out the last two passthrough columns (Fare and
# Deck, if the encoder emits 2 + 3 dummies) -- confirm that is intended.
kept_columns = [1, 3, 4, 5, 6, 7, 8, 9]
x = x[:, kept_columns]

# Standardization

In [None]:
# Fit the scaler on the full feature matrix, then replace x with its
# standardized version. `ss` is kept so the same scaling can be applied to
# the Kaggle test data later.
ss = StandardScaler()
ss.fit(x)
x = ss.transform(x)

# Split data into train and test

In [None]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split -- and therefore every accuracy reported below -- reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Create the models

In [None]:
# Candidate classifiers to compare. The tree-based models are seeded so that
# their fitted state, their accuracy, and the model picked as "best" do not
# change from run to run.
models = [
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(max_depth=7, random_state=42),
    RandomForestClassifier(max_depth=5, n_estimators=15, max_features=1,
                           random_state=42),
    SVC(kernel="rbf", gamma="auto"),
]

# Fit all models

In [None]:
# Train every candidate on the training split and record its accuracy on the
# held-out split; accs[i] belongs to models[i].
accs = []
for candidate in models:
    candidate.fit(x_train, y_train)
    held_out_predictions = candidate.predict(x_test)
    accs.append(accuracy_score(y_test, held_out_predictions))

# Choose best accuracy

In [None]:
# Pick the candidate with the highest held-out accuracy (the first one on
# ties) and compute its confusion matrix on the same held-out split.
# NOTE(review): the model is selected on the split it is then scored on, so
# the reported accuracy is likely optimistic.
accuracy = max(accs)
best_position = accs.index(accuracy)
model = models[best_position]

y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)

# Show results

In [None]:
# Report the best held-out accuracy as a percentage with one decimal place.
accuracy_percent = 100 * accuracy
print("accuracy: {:.1f}% ".format(accuracy_percent))

In [None]:
# Show the confusion matrix of the selected model on the held-out split.
print("Confusion matrix:")
print(cm)

# Generate results for Kaggle competition

# Load new data

In [None]:
# Load the Kaggle test set; from here on `df` refers to the test frame and
# no longer to the training frame.
df = pd.read_csv(filepath_or_buffer="Data/test.csv")

# Change data for testing

In [None]:
# Apply the Name -> Title and Cabin -> Deck recodings to the test frame.
# This repeats the logic used on the training frame; the two copies must
# stay in sync so the model sees the same encodings at prediction time.
df = df.rename(columns={"Name": "Title", "Cabin": "Deck"})

titles = {
    'Mr.': 0, 'Mrs.': 1, 'Miss': 2, 'Don.': 3, 'Master.': 4, 'Rev.': 5,
    'Dr.': 6, 'Mme.': 7, 'Ms.': 8, 'Major': 9, 'Mlle.': 10, 'Col.': 11,
    'Capt.': 11, 'Jonkheer. ': 0, 'the Countess': 0,
}

# NOTE(review): str.contains treats each key as a regular expression, so the
# "." matches any character (e.g. 'Mr.' also matches "Mrs"); patterns are
# tried in dict order and the earliest match wins. Confirm this is intended
# before changing it, and change the training cell identically.
for pattern, code in titles.items():
    has_title = df['Title'].str.contains(pattern)
    df.loc[has_title, 'Title'] = str(code)

df = df.astype({"Title": float})

# Deck code from the cabin text; rows without a cabin get 0.
decks = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 0}
df = df.astype({"Deck": str})

no_cabin = df['Deck'].str.contains("nan")
df.loc[no_cabin, 'Deck'] = "0"

for letter, code in decks.items():
    on_deck = df['Deck'].str.contains(letter)
    df.loc[on_deck, 'Deck'] = str(code)

df = df.astype({"Deck": float})

# Create and modify numpy object

In [None]:
# Build the feature matrix for the Kaggle test frame. Each feature sits one
# position earlier than in the training frame (presumably because test.csv
# has no "Survived" column -- confirm against the file).
x = np.array(df.iloc[:,[1,2,3,4,5,6,8,9,10]])

# Two-column result holder: column 0 is PassengerId, column 1 is only a
# placeholder that is overwritten with the predictions before saving.
y = np.array(df.iloc[:,[0,1]])

# Reuse the imputers fitted on the training data.
x[:,[0,1,2,4,5,7,8]] = imputer.transform(x[:,[0,1,2,4,5,7,8]])
x[:,[3,6]] = imputer2.transform(x[:,[3,6]])

# Reuse the one-hot encoder fitted on the training data. Calling
# fit_transform here would re-learn the categories from the test set, which
# can silently change the column layout the model was trained on.
x = np.array(ct.transform(x))
x = x[:,[1,3,4,5,6,7,8,9]]

# Reuse the scaler fitted on the training data.
x = ss.transform(x)

# Make predictions

In [None]:
# Predict survival with the selected model and store the result in the
# second (last) column of the two-column result holder.
preds = model.predict(x)
y[:, -1] = preds

# Save predictions to file

In [None]:
# Wrap the id/prediction pairs in a frame with the two submission column
# names and write it out without the index column.
submission_columns = ["PassengerId", "Survived"]
df = pd.DataFrame(data=y, columns=submission_columns)
df.to_csv(path_or_buf="output.csv", index=False)