In [29]:
import pandas as pd
import os
import torch
import numpy as np
import random
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [30]:
# Seed every random number generator used in this notebook so that a
# Restart & Run All reproduces the same split and model
SEED = 0
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    seed_rng(SEED)

In [31]:
# Preprocess and clean data
def _cabin_to_deck_id(cabin, deck_ids):
    """Map a raw cabin value to the id of the last deck letter it contains.

    Non-string values (e.g. NaN) and strings without a known deck letter are
    returned unchanged.
    """
    if not isinstance(cabin, str):
        return cabin
    # Scan from the end so that e.g. 'F G63' resolves to deck 'G'
    for char in reversed(cabin):
        if char in deck_ids:
            return deck_ids[char]
    return cabin


def _ticket_to_number(ticket):
    """Reduce a raw ticket value to the part after its last space.

    'LINE' becomes 0, a prefixed ticket such as 'PC 17599' becomes '17599',
    a ticket without a space is kept as is and non-string values are returned
    unchanged.
    """
    if not isinstance(ticket, str):
        return ticket
    # Strip first so trailing whitespace cannot hide the number
    ticket = ticket.strip()
    if ticket == 'LINE':
        return 0
    return ticket.rpartition(' ')[2]


def clean_df(df, test=False):
    """Return a cleaned, min-max scaled copy of a passenger DataFrame.

    The input frame is not modified and its index may be arbitrary (rows are
    never addressed by position).

    Steps:
      * drop 'PassengerId' and 'Name';
      * training mode only: encode 'Cabin' as the id of its deck letter;
      * reduce 'Ticket' to its numeric part ('LINE' -> 0);
      * one-hot encode 'Embarked' into 'Embarked_C/Q/S';
      * encode 'Sex' as 0 (male) / 1 (female);
      * fill missing 'Age' and 'Fare' with the column mean;
      * drop rows whose ticket is empty;
      * scale every feature column to the range [0, 1]
        (constant columns become 0 instead of NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. Must contain 'PassengerId', 'Name', 'Ticket',
        'Embarked', 'Sex', 'Age' and 'Fare'; in training mode also 'Cabin'.
        All remaining columns must be numeric.
    test : bool, default False
        False: training mode, the encoded (unscaled) 'Cabin' target is
        appended as the LAST column of the result.
        True: prediction mode, the result has no 'Cabin' column (one present
        in the input is dropped).

    Returns
    -------
    pandas.DataFrame
        New frame with the scaled features (plus 'Cabin' in training mode).

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If a cabin has no known deck letter or a ticket is not numeric.
    """
    # Cabin dictionary, char to int
    cabin_dict = {
        'A': 0,
        'B': 1,
        'C': 2,
        'D': 3,
        'E': 4,
        'F': 5,
        'G': 6,
        'T': 7,
    }

    # Sex dictionary, str to int
    sex_dict = {
        'male': 0,
        'female': 1,
    }

    # drop() returns a new frame, so the caller's DataFrame stays untouched
    df = df.drop(columns=['PassengerId', 'Name'])

    if test:
        # Cabin is the value to predict; never feed it in as a feature
        df = df.drop(columns=['Cabin'], errors='ignore')
    else:
        # Change cabin value just to char ids, ex.  B36 -> 1
        df['Cabin'] = df['Cabin'].map(
            lambda cabin: _cabin_to_deck_id(cabin, cabin_dict)
        )

    # Change ticket value just to numbers
    df['Ticket'] = df['Ticket'].map(_ticket_to_number)

    # Create column for each embarked, then delete the old column
    for port in ('C', 'Q', 'S'):
        df['Embarked_' + port] = (df['Embarked'] == port).astype(int)
    df = df.drop(columns=['Embarked'])

    # Encode sex; values that are not in the dictionary are left as they are
    df['Sex'] = df['Sex'].map(lambda sex: sex_dict.get(sex, sex))

    # Fill na values with mean
    df = df.fillna({
        'Age': df['Age'].mean(),
        'Fare': df['Fare'].mean(),
    })

    # If training data, set the target aside so that it is not scaled
    if not test:
        df_y = df['Cabin'].astype(float)
        df = df.drop(columns=['Cabin'])

    # Drop rows with empty Ticket value, then change Ticket datatype
    df = df[df['Ticket'] != ''].copy()
    df['Ticket'] = df['Ticket'].astype(float)

    # Change dataframe values range from 0 to 1; a zero range (constant
    # column) is replaced by 1 to avoid a 0/0 division
    col_min = df.min()
    col_range = (df.max() - col_min).replace(0, 1)
    df = (df - col_min) / col_range

    # If training, join the Cabin column back (aligned on the index)
    if not test:
        df = df.join(df_y)

    return df

In [32]:
# load data
DATA_PATH = 'data'
# share of the complete rows that is held out for validation
VAL_FRACTION = 0.2
# this notebook fills the missing cabins of test.csv, so the rows used for
# training are the complete rows of that same file
train_data_path = os.path.join(DATA_PATH,'test.csv')
raw_df = pd.read_csv(train_data_path, sep=',')

# extract rows with null values to predict
# NOTE(review): any(axis=1) also selects rows whose Cabin is known but whose
# Age/Fare is missing -- confirm that overwriting those cabins is intended
has_null = raw_df.isnull().any(axis=1)
# reset_index() returns a new frame and keeps the old row labels in 'index'
df_to_predict = raw_df[has_null].reset_index()

# save list of row indexes for prediction
null_indexes = df_to_predict['index']
# drop index and Cabin column
df_to_predict = df_to_predict.drop(columns=['index', 'Cabin'])
# preprocess data for prediction
df_to_predict = clean_df(df_to_predict, test=True)

# complete rows (no null values), renumbered from 0
train_df = raw_df[~has_null].reset_index(drop=True)
# preprocess training data
train_df = clean_df(train_df)

# change df to np
train_array = train_df.to_numpy()
data_to_predict = df_to_predict.to_numpy()

# split data; sizes follow the number of rows instead of being hard-coded
print(len(train_array))
n_val = int(round(len(train_array) * VAL_FRACTION))
n_train = len(train_array) - n_val
train_subset, val_subset = torch.utils.data.random_split(train_array, [n_train, n_val])

# materialise the random subsets as plain arrays
train_set = train_array[train_subset.indices]
val_set = train_array[val_subset.indices]
# clean_df puts the Cabin target in the last column
X_train,y_train = train_set[:,:-1],train_set[:,-1]
X_val,y_val = val_set[:,:-1],val_set[:,-1]

87


In [33]:
# create model
clf = SVC(random_state=SEED)

# train model
clf.fit(X_train, y_train)

# score model on the held-out validation split; the value is printed because
# only the last expression of a cell is displayed automatically
val_score = clf.score(X_val, y_val)
print(f'validation accuracy: {val_score:.3f}')

# cross validation on the training split (4 folds)
scores = cross_val_score(clf, X_train, y_train, cv = 4)
scores

array([0.38888889, 0.38888889, 0.41176471, 0.35294118])

In [34]:
# cabin dictionary
cabin_dict = {
'A':0,
'B':1,
'C':2,
'D':3,
'E':4,
'F':5,
'G':6,
'T':7
}
# inverse lookup (deck id -> deck letter), built once instead of per row
cabin_letters = {deck_id: letter for letter, deck_id in cabin_dict.items()}

# predict cabin letter
preds = clf.predict(data_to_predict)
# zip() would silently misalign rows and predictions if their counts differed
assert len(preds) == len(null_indexes), 'number of predictions differs from number of rows to fill'

# Create a list of PassengerId: prediction
# NOTE: 'PassengerId' holds the row position in the loaded CSV (null_indexes),
# not the PassengerId column of the data
filled_null_cabin_chars = [
    {
        'PassengerId': index,
        'value': cabin_letters[int(pred)]
    }
    for index, pred in zip(null_indexes, preds)
]
filled_null_cabin_chars

[{'PassengerId': 0, 'value': 'C'},
 {'PassengerId': 1, 'value': 'F'},
 {'PassengerId': 2, 'value': 'C'},
 {'PassengerId': 3, 'value': 'F'},
 {'PassengerId': 4, 'value': 'F'},
 {'PassengerId': 5, 'value': 'F'},
 {'PassengerId': 6, 'value': 'C'},
 {'PassengerId': 7, 'value': 'B'},
 {'PassengerId': 8, 'value': 'C'},
 {'PassengerId': 9, 'value': 'F'},
 {'PassengerId': 10, 'value': 'F'},
 {'PassengerId': 11, 'value': 'C'},
 {'PassengerId': 13, 'value': 'C'},
 {'PassengerId': 15, 'value': 'C'},
 {'PassengerId': 16, 'value': 'C'},
 {'PassengerId': 17, 'value': 'C'},
 {'PassengerId': 18, 'value': 'F'},
 {'PassengerId': 19, 'value': 'C'},
 {'PassengerId': 20, 'value': 'C'},
 {'PassengerId': 21, 'value': 'F'},
 {'PassengerId': 22, 'value': 'C'},
 {'PassengerId': 23, 'value': 'C'},
 {'PassengerId': 25, 'value': 'F'},
 {'PassengerId': 27, 'value': 'C'},
 {'PassengerId': 29, 'value': 'C'},
 {'PassengerId': 30, 'value': 'C'},
 {'PassengerId': 31, 'value': 'B'},
 {'PassengerId': 32, 'value': 'F'},
 {

In [35]:
# save predictions to a CSV file (row labels are not written)
OUTPUT_FILE = 'filled_null_cabin_chars_test.csv'
output = pd.DataFrame(data=filled_null_cabin_chars)
output.to_csv(OUTPUT_FILE, index=False)