In [1]:
import pandas as pd
import os
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
import random

In [2]:
# One shared seed for every random number generator used in this notebook
SEED = 0
# Seed torch, the stdlib `random` module and numpy's global generator, in that order
for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    seed_fn(SEED)

In [3]:
# Preprocess and clean data
# Deck letter -> numeric id used as the leading digit of the encoded cabin
_CABIN_DECK_IDS = {
    'A': 1,
    'B': 2,
    'C': 3,
    'D': 4,
    'E': 5,
    'F': 6,
    'G': 7,
    'H': 8,
    'T': 9,
}

# Port of embarkation -> numeric id
_EMBARKED_IDS = {'C': 0, 'Q': 1, 'S': 2}

# Sex -> numeric id
_SEX_IDS = {'male': 0, 'female': 1}


def _encode_cabin(cabin):
    """Encode one cabin label as an int: deck id followed by the trailing cabin number.

    Only the last deck letter in the string counts, e.g. 'B36' -> 236,
    'C23 C25 C27' -> 327, and a bare deck letter 'D' -> 4.
    Non-string values (missing cabins) and strings that contain no known deck
    letter are returned unchanged.
    """
    if not isinstance(cabin, str):
        return cabin
    # Walk from the last character to the first using non-negative positions,
    # so the slice after the letter is '' (not the whole string) when the
    # deck letter is the final character.
    for pos in range(len(cabin) - 1, -1, -1):
        deck_id = _CABIN_DECK_IDS.get(cabin[pos])
        if deck_id is not None:
            return int(str(deck_id) + cabin[pos + 1:])
    return cabin


def _ticket_number(ticket):
    """Reduce one ticket label to its numeric part.

    'LINE' becomes 0; otherwise the text after the last space is kept
    (the whole string when it contains no space). A ticket that ends with a
    space yields '', which clean_df treats as "no ticket number".
    Non-string values are returned unchanged.
    """
    if not isinstance(ticket, str):
        return ticket
    if ticket == 'LINE':
        return 0
    return ticket.rsplit(' ', 1)[-1]


def clean_df(df, test=False):
    """Turn a raw Titanic passenger table into min-max scaled numeric features.

    Processing steps:
      * drop 'PassengerId' and 'Name';
      * encode 'Cabin' (deck id + cabin number), 'Ticket' (numeric part),
        'Sex' and 'Embarked' (numeric ids; unknown categories become NaN);
      * fill missing 'Age', 'Cabin', 'Embarked' and 'Fare' with the column mean;
      * drop rows whose ticket number is empty;
      * rescale every feature column to the range 0..1;
      * unless `test` is true, append the unscaled 'Survived' column (as float)
        as the LAST column.

    Args:
        df: raw passenger frame. It must contain 'PassengerId', 'Name', 'Sex',
            'Ticket', 'Cabin' and 'Embarked' (plus 'Survived' unless `test`);
            every remaining column must already be numeric.
        test: pass True for data without a 'Survived' column.

    Returns:
        A new DataFrame. The input frame is not modified, and the index labels
        of the kept rows are preserved.

    Raises:
        KeyError: if a required column is missing.
        ValueError: if a cabin or ticket cannot be converted to a number.
    """
    # drop()/assign() build new frames, so the caller's frame is never mutated
    # and the function can be called again on the same input.
    features = df.drop(columns=['PassengerId', 'Name']).assign(
        Cabin=lambda d: d['Cabin'].map(_encode_cabin).astype(float),
        Ticket=lambda d: d['Ticket'].map(_ticket_number),
        Sex=lambda d: d['Sex'].map(_SEX_IDS),
        Embarked=lambda d: d['Embarked'].map(_EMBARKED_IDS),
    )

    # Fill missing values with the column mean
    fill_columns = [c for c in ('Age', 'Cabin', 'Embarked', 'Fare') if c in features.columns]
    features = features.fillna({c: features[c].mean() for c in fill_columns})

    # Keep the label aside so it is not rescaled together with the features
    if not test:
        survived = features.pop('Survived').astype(float)

    # Drop rows with an empty ticket number, then make the column numeric
    has_ticket = features['Ticket'] != ''
    features = features.loc[has_ticket].assign(
        Ticket=lambda d: d['Ticket'].astype(float)
    )

    # Min-max scale to 0..1; a constant column maps to 0 instead of 0/0 = NaN
    lowest = features.min()
    value_range = (features.max() - lowest).replace(0, 1)
    features = (features - lowest) / value_range

    # Re-attach the label positionally for the rows that were kept
    if not test:
        features['Survived'] = survived[has_ticket].to_numpy()

    return features

In [None]:
# Load data
DATA_PATH = 'data'
# Number of rows used for fitting; every remaining row goes to the validation set
TRAIN_SIZE = 700

# Train
train_data_path = os.path.join(DATA_PATH, 'train.csv')
train_df = pd.read_csv(train_data_path, sep=',')

# Test
test_data_path = os.path.join(DATA_PATH, 'test.csv')
test_df = pd.read_csv(test_data_path, sep=',')

# Preprocess data and convert to numpy arrays (label is the last train column)
train_df = clean_df(train_df).to_numpy()
test_df = clean_df(test_df, test=True).to_numpy()

# Split data
# The validation size is derived from the actual row count, so the split keeps
# working if preprocessing drops rows or the input file changes.
split_lengths = [TRAIN_SIZE, len(train_df) - TRAIN_SIZE]
train_set, val_set = torch.utils.data.random_split(train_df, split_lengths)

# Materialise the random subsets as plain arrays so they can be sliced
train_set = train_df[train_set.indices]
val_set = train_df[val_set.indices]

# NOTE: X_test / y_test hold the validation split, not the Kaggle test data
X_train, y_train = train_set[:, :-1], train_set[:, -1]
X_test, y_test = val_set[:, :-1], val_set[:, -1]

In [5]:
# Build the logistic-regression classifier and fit it on the training split
# (fit() returns the estimator itself, so `clf` is the trained model)
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

# Mean accuracy on the held-out validation split
val_accuracy = clf.score(X_test, y_test)
val_accuracy

0.8115183246073299