In [39]:
import pandas as pd
import os
import torch
import numpy as np
import random
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
import warnings
warnings.filterwarnings('ignore')

In [40]:
# Seed every random number generator the notebook touches so that a
# Restart & Run All produces the same split and the same model.
SEED = 0
for _seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    _seed_rng(SEED)

In [41]:
# Preprocess and clean data
def clean_df(df, test=False):
    """Turn a raw passenger frame into a numeric feature frame scaled to [0, 1].

    Steps, in order:
      * drop ``PassengerId`` and ``Name``;
      * split ``Cabin`` into ``Cabin_char_id`` (deck letter mapped to an int)
        and ``Cabin_number`` (digits after the last deck letter, 0 if none);
      * reduce ``Ticket`` to the text after its last space (``'LINE'`` -> 0),
        drop rows whose ticket ends up empty, and cast it to float;
      * one-hot encode ``Embarked`` into ``Embarked_C/Q/S``;
      * encode ``Sex`` (male=0, female=1);
      * fill missing ``Fare`` with the column mean;
      * min-max scale every feature column (constant columns become 0
        instead of NaN).

    The input frame is not modified. Row labels of the returned frame are the
    labels of the surviving input rows (the index is not reset).

    Args:
        df: raw frame with at least the columns PassengerId, Name, Cabin,
            Ticket, Embarked, Sex and Fare, plus Age when ``test`` is False.
        test: when False, ``Age`` is treated as the regression target: it is
            moved to the last column and divided by its maximum, so that
            ``prediction * max_age`` recovers an age in years.

    Returns:
        ``(features_with_age, max_age)`` when ``test`` is False, otherwise just
        the scaled feature frame.

    Raises:
        ValueError: if the text after the deck letter of a cabin, or a reduced
            ticket value, is not numeric.

    Note:
        Features are scaled with the min/max of the frame passed in, so two
        calls on different frames do not share a scale.
    """
    # Cabin dictionary, char to int
    cabin_dict = {
        'A': 0,
        'B': 1,
        'C': 2,
        'D': 3,
        'E': 4,
        'F': 5,
        'G': 6,
        'T': 7,
    }

    # Sex dictionary, str to int
    sex_dict = {
        'male': 0,
        'female': 1,
    }

    missing = float('nan')

    def _split_cabin(cabin):
        """Return (deck id, cabin number) for the last deck letter in `cabin`."""
        if not isinstance(cabin, str):
            return missing, missing
        # Scan from the end so that 'C23 C25' yields the last cabin listed.
        for pos in range(len(cabin) - 1, -1, -1):
            if cabin[pos] in cabin_dict:
                tail = cabin[pos + 1:]
                # A bare deck letter (e.g. 'D') carries no number: use 0.
                return cabin_dict[cabin[pos]], (int(tail) if tail.strip() else 0)
        return missing, missing

    def _ticket_number(ticket):
        """Return the part of `ticket` after its last space ('LINE' -> 0)."""
        if not isinstance(ticket, str):
            return ticket
        if ticket == 'LINE':
            return 0
        return ticket.rsplit(' ', 1)[-1]

    # drop() returns a new frame, so the caller's frame is left untouched.
    df = df.drop(columns=['PassengerId', 'Name'])

    # Build both cabin columns row by row so they always stay aligned with
    # the frame; rows without a usable cabin get NaN in both columns.
    parsed_cabins = [_split_cabin(cabin) for cabin in df['Cabin']]
    df['Cabin_char_id'] = [deck for deck, _ in parsed_cabins]
    df['Cabin_number'] = [number for _, number in parsed_cabins]
    df = df.drop(columns='Cabin')

    # Element-wise map: does not rely on the index being 0..n-1.
    df['Ticket'] = df['Ticket'].map(_ticket_number)

    # Create column for each embarked
    for port in ('C', 'Q', 'S'):
        df['Embarked_' + port] = (df['Embarked'] == port).astype(int)
    df = df.drop(columns='Embarked')

    # Values outside sex_dict are kept as they are.
    df['Sex'] = df['Sex'].map(lambda sex: sex_dict.get(sex, sex))

    # Fill na values with mean (computed before any row is dropped)
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # The target is set aside so it is not scaled like a feature and ends up
    # as the last column.
    age = None
    if not test:
        age = df.pop('Age').astype(float)

    # drop rows with empty Ticket value
    has_ticket = df['Ticket'] != ''
    df = df[has_ticket]
    df = df.assign(Ticket=df['Ticket'].astype(float))

    # Change feature values range from 0 to 1; a constant column has a zero
    # span, which would give 0/0 = NaN, so divide by 1 there instead.
    lowest = df.min()
    span = df.max() - lowest
    span = span.where(span != 0, 1)
    scaled = (df - lowest) / span

    if test:
        return scaled

    age = age[has_ticket]
    max_age = age.max()
    # Divide by the maximum only, so `prediction * max_age` is the exact inverse.
    scaled['Age'] = age / (max_age if max_age else 1.0)
    return scaled, max_age

In [42]:
# load data
# The frame is read from test.csv: the rows with no missing value are used to
# fit the age model, the rows with a missing Age are the ones to impute.
DATA_PATH = 'data'
train_data_path = os.path.join(DATA_PATH,'test.csv')
train_df = pd.read_csv(train_data_path, sep=',')

# load predicted features
cabin_chars_path = 'filled_null_cabin_chars_test.csv'
cabin_chars = pd.read_csv(cabin_chars_path, sep=',')
cabin_chars['value'] = (cabin_chars['value']).astype(str)
cabin_chars.set_index('PassengerId',inplace=True)

cabin_numbers_path = 'filled_null_cabin_numbers_test.csv'
cabin_numbers = pd.read_csv(cabin_numbers_path, sep=',')
cabin_numbers['value'] = (cabin_numbers['value']).astype(str)
cabin_numbers.set_index('PassengerId',inplace=True)

#fill null values
# Cabin = predicted deck letter followed by predicted cabin number.
train_df.loc[cabin_chars.index, 'Cabin'] = cabin_chars['value'] + cabin_numbers['value']

# preprocess data
# Only rows whose Age is missing need a prediction; a row that lacks some
# other field but has a known Age must not get its Age overwritten.
test_df = train_df[train_df['Age'].isnull()].reset_index()
null_indexes = test_df['index']
test_df = test_df.drop(columns=['index', 'Age'])
test_df = clean_df(test_df, test=True)
# clean_df can drop rows (empty Ticket) while keeping the row labels; keep the
# ids aligned with the rows that will actually be predicted.
null_indexes = null_indexes.loc[test_df.index]

train_df = train_df.dropna().reset_index(drop=True)
train_df, df_max_age = clean_df(train_df)

# info() prints its report itself and returns None, so it is not wrapped in print()
train_df.info()

# change df to np
train_set = train_df.to_numpy()
test_set = test_df.to_numpy()

#split data
print(len(train_set))
# Hold out a fixed number of rows and derive the training size from the data,
# so the split does not break when the number of usable rows changes.
VAL_SIZE = 51
train_set, val_set = torch.utils.data.random_split(
    train_set, [len(train_set) - VAL_SIZE, VAL_SIZE]
)

# change np to array | to avoid slice errors
train_set = np.array(train_set)
val_set = np.array(val_set)
# Age is the last column: everything before it is a feature.
X_train,y_train = train_set[:,:-1],train_set[:,-1]
X_val,y_val = val_set[:,:-1],val_set[:,-1]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331 entries, 0 to 330
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pclass         331 non-null    float64
 1   Sex            331 non-null    float64
 2   SibSp          331 non-null    float64
 3   Parch          331 non-null    float64
 4   Ticket         331 non-null    float64
 5   Fare           331 non-null    float64
 6   Cabin_char_id  331 non-null    float64
 7   Cabin_number   331 non-null    float64
 8   Embarked_C     331 non-null    float64
 9   Embarked_Q     331 non-null    float64
 10  Embarked_S     331 non-null    float64
 11  Age            331 non-null    float64
dtypes: float64(12)
memory usage: 31.2 KB
None
331


In [43]:
# set params
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
    # Pin the estimator's own RNG so re-running this cell alone gives the
    # same model, independent of the global NumPy RNG state.
    "random_state": SEED,
}

# create model
reg = ensemble.GradientBoostingRegressor(**params)

# train model
reg.fit(X_train, y_train)

# score model on the held-out rows (R^2); keep the value and show it, since
# only the last expression of a cell is displayed automatically
val_score = reg.score(X_val, y_val)
print(f"validation R^2: {val_score:.4f}")

# cross validation on the training rows (cross_val_score fits fresh clones of reg)
scores = cross_val_score(reg, X_train, y_train, cv = 4)
scores

array([-0.15475087,  0.39903485,  0.36887231,  0.34963474])

In [44]:
# Predict the scaled age of every row whose Age is missing.
pred = reg.predict(test_set)

# One record per imputed row. The id stored under 'PassengerId' is the value
# taken from null_indexes. The age is scaled back to whole years with
# df_max_age.
# NOTE(review): this assumes multiplying by df_max_age inverts the scaling
# applied to Age in clean_df - confirm against clean_df.
filled_null_ages = [
    {
        'PassengerId': row_id,
        'value': int(round(scaled_age * df_max_age,0)),
    }
    for row_id, scaled_age in zip(null_indexes,pred)
]
filled_null_ages


[{'PassengerId': 10, 'value': 13},
 {'PassengerId': 22, 'value': 35},
 {'PassengerId': 29, 'value': 21},
 {'PassengerId': 33, 'value': 8},
 {'PassengerId': 36, 'value': 13},
 {'PassengerId': 39, 'value': 24},
 {'PassengerId': 41, 'value': 31},
 {'PassengerId': 47, 'value': 21},
 {'PassengerId': 54, 'value': 18},
 {'PassengerId': 58, 'value': 8},
 {'PassengerId': 65, 'value': 22},
 {'PassengerId': 76, 'value': 13},
 {'PassengerId': 83, 'value': 13},
 {'PassengerId': 84, 'value': 27},
 {'PassengerId': 85, 'value': 19},
 {'PassengerId': 88, 'value': 22},
 {'PassengerId': 91, 'value': 13},
 {'PassengerId': 93, 'value': 9},
 {'PassengerId': 102, 'value': 22},
 {'PassengerId': 107, 'value': 21},
 {'PassengerId': 108, 'value': 19},
 {'PassengerId': 111, 'value': 22},
 {'PassengerId': 116, 'value': 18},
 {'PassengerId': 121, 'value': 22},
 {'PassengerId': 124, 'value': 22},
 {'PassengerId': 127, 'value': 26},
 {'PassengerId': 132, 'value': 25},
 {'PassengerId': 133, 'value': 18},
 {'PassengerI

In [45]:
# Write the imputed ages (columns: PassengerId, value) to a CSV file in the
# working directory, without the DataFrame index.
output = pd.DataFrame(data=filled_null_ages)
output.to_csv(path_or_buf='filled_null_ages_test.csv', index=False)