In [27]:
import pandas as pd
import os
import torch
import numpy as np
import random
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
import warnings
warnings.filterwarnings('ignore')

In [28]:
# fix seed for reproducibility
SEED = 0
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [29]:
# Preprocess and clean data
def clean_df(df, test=False):
    
    # Drop PassengerId and Name
    df.drop('PassengerId', axis='columns', inplace=True)
    df.drop('Name', axis='columns', inplace=True)

    # Cabin dictionary, char to int
    cabin_dict = {
    'A':0,
    'B':1,
    'C':2,
    'D':3,
    'E':4,
    'F':5,
    'G':6,
    'T':7
    }

    # Change cabin value just to char ids, ex.  B36 -> 1
    # Column for training data
    if not test:
        # Iterate by rows
        for c_i, cabin in enumerate(df['Cabin']):
            # Change only value with str datatype
            if not isinstance(cabin,str):
                continue
            # Iterate by row value
            for n_i ,char in enumerate(cabin):
                # Start iterating from last to first char
                index = -1 - n_i
                # If given char in cabin dictionary
                if cabin[index] in cabin_dict.keys():
                    # Cabin number
                    cabin_number = df['Cabin'][c_i][index+1:]
                    # If cabin number == cabin letter put 0
                    if cabin_number in cabin_dict.keys(): cabin_number = 0
                    # Change given row value to cabin number
                    df.loc[c_i, 'Cabin'] =   int(cabin_number)
                    break

    # Change ticket value just to numbers
    # Iterate by rows
    for t_i, ticket in enumerate(df['Ticket']):
        # If ticket value == LINE change to 0
        if ticket == 'LINE':
            df.loc[t_i, 'Ticket'] = 0
        # Iterate by row value
        for c_i ,char in enumerate(ticket):
            # Start iterating from last to first char
            index = -1 - c_i
            # If whitespace found 
            if ticket[index] == ' ':
                # Change value to numbers after whitespace
                df.loc[t_i, 'Ticket'] = df['Ticket'][t_i][index+1:]   
                break
        continue

        # Reset dataframe indexing
    df.reset_index()

    # Create column for each embarked
    df['Embarked_C'] = [1 if embarked == 'C' else 0 for embarked in df['Embarked']]
    df['Embarked_Q'] = [1 if embarked == 'Q' else 0 for embarked in df['Embarked']]
    df['Embarked_S'] = [1 if embarked == 'S' else 0 for embarked in df['Embarked']]
    
    # delete old embarked column
    df.drop('Embarked', axis='columns', inplace=True)

    # Sex dictionary, str to int
    sex_dict = {
        'male':0,
        'female':1
    }

    # Replace values in dataframe with dictionary
    df.replace({'Sex':sex_dict}, inplace=True)

    # Fill na values with mean
    df.fillna(
                {
                'Age': df['Age'].mean(),
                'Fare': df['Fare'].mean(),
                },
                inplace=True
                ) 

    # If training data save temporary column
    if not test:
        df_y = df['Cabin']
        df_y= (df_y).astype(float)
        # drop Cabin from dataframe
        df.drop('Cabin', axis='columns', inplace=True)
    
    # Drop rows with empty Ticket value
    df = df[df.Ticket != '']
    # Change Ticket datatype
    df['Ticket'] = (df['Ticket']).astype(float)

    # If training join Cabin column
    if not test:
        df = df.join(df_y)
        # Print cabin max and min values
        df_cabin_max = df['Cabin'].max()
        print('max',df['Cabin'].max(),'min',df['Cabin'].min())

    # Change dataframe values range from 0 to 1
    df=(df-df.min())/(df.max()-df.min())

    # If training return dataframe and max cabin value
    if not test:
        return df, df_cabin_max
    else:
        return df

In [30]:
# load data
DATA_PATH = 'data'
train_data_path = os.path.join(DATA_PATH,'test.csv')
train_df = pd.read_csv(train_data_path, sep=',')

# extract null values to predict
df_to_predict = train_df[train_df.isnull().any(axis=1)]
df_to_predict.reset_index(inplace=True)

# save list of row indexes for prediction
null_indexes = df_to_predict['index']
# drop index and Cabin column
df_to_predict.drop('index', axis='columns', inplace=True)
df_to_predict.drop('Cabin', axis='columns', inplace=True)
# preprocess data for prediction
df_to_predict = clean_df(df_to_predict, test=True)
# drop null values
train_df = train_df.dropna()
train_df.reset_index(inplace=True)
# drop index
train_df.drop('index', axis='columns', inplace=True)
# preprocess training data
train_df, df_cabin_max = clean_df(train_df)
# change df to np
train_set = train_df.to_numpy()
data_to_predict = df_to_predict.to_numpy()

#split data
print(len(train_set))
train_set, val_set = torch.utils.data.random_split(train_set, [70, 17])

# change np to array | to avoid slice errors
train_set = np.array(train_set)
val_set = np.array(val_set)
X_train,y_train = train_set[:,:-1],train_set[:,-1]
X_val,y_val = val_set[:,:-1],val_set[:,-1]

max 132.0 min 0.0
87


In [31]:
# set params
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}

# create model
reg = ensemble.GradientBoostingRegressor(**params)

# train model
reg.fit(X_train, y_train)

# score model
reg.score(X_val, y_val)

# cross validation
scores = cross_val_score(reg, X_train, y_train, cv = 4)
scores

array([-0.01935327, -0.83645562,  0.09830709, -0.50642099])

In [32]:
# predict cabin number
pred = reg.predict(data_to_predict)
# create list for predictions
filled_null_cabin_numbers = []
# Create a list of PassengerId: prediction
for index, value in zip(null_indexes,pred):
    filled_null_cabin_numbers.append({
        'PassengerId':index,
        'value':int(round(value * df_cabin_max,0))
    })
filled_null_cabin_numbers


[{'PassengerId': 0, 'value': 23},
 {'PassengerId': 1, 'value': 43},
 {'PassengerId': 2, 'value': 40},
 {'PassengerId': 3, 'value': 22},
 {'PassengerId': 4, 'value': 27},
 {'PassengerId': 5, 'value': 15},
 {'PassengerId': 6, 'value': 23},
 {'PassengerId': 7, 'value': 26},
 {'PassengerId': 8, 'value': 33},
 {'PassengerId': 9, 'value': 26},
 {'PassengerId': 10, 'value': 22},
 {'PassengerId': 11, 'value': 42},
 {'PassengerId': 13, 'value': 44},
 {'PassengerId': 15, 'value': 30},
 {'PassengerId': 16, 'value': 21},
 {'PassengerId': 17, 'value': 33},
 {'PassengerId': 18, 'value': 26},
 {'PassengerId': 19, 'value': 55},
 {'PassengerId': 20, 'value': 62},
 {'PassengerId': 21, 'value': 36},
 {'PassengerId': 22, 'value': 12},
 {'PassengerId': 23, 'value': 44},
 {'PassengerId': 25, 'value': 48},
 {'PassengerId': 27, 'value': 33},
 {'PassengerId': 29, 'value': 47},
 {'PassengerId': 30, 'value': 42},
 {'PassengerId': 31, 'value': 37},
 {'PassengerId': 32, 'value': 13},
 {'PassengerId': 33, 'value': 

In [33]:
# save predition to file
output = pd.DataFrame(filled_null_cabin_numbers)
output.to_csv('filled_null_cabin_numbers_test.csv', index=False)