# Data formatting

In [None]:
import dataset as ds

import pandas as pd

# Load the dataset, then report its row count and the dtype of every column.
dataset_df = ds.get_dataset()
print('Dataset size: {}'.format(len(dataset_df)))
print('Attributes:')
print('\n'.join(
    '{}: {}'.format(column, dtype)
    for column, dtype in dataset_df.dtypes.items()
))

In [None]:
def convert_attribute_to_numeric(df, attribute_name):
    """Replace the values of one column with integer codes, in place.

    Each distinct value of ``df[attribute_name]`` is numbered in order of
    first appearance (0, 1, 2, ...) and the column is overwritten with those
    numbers.

    Args:
        df: DataFrame to modify; it is mutated in place.
        attribute_name: Name of the column to encode.

    Returns:
        The same ``df`` object, mirroring ``convert_attribute_to_categorical``
        so both helpers can be used as ``df = convert_...(df, name)``.
    """
    column = df[attribute_name]
    mapping = {value: code for code, value in enumerate(column.unique())}
    # Series.map looks every original value up exactly once, so codes that
    # coincide with original values (e.g. an integer column) are never
    # re-mapped, and the column receives the integer codes directly.
    df[attribute_name] = column.map(mapping)
    return df

def convert_attribute_to_categorical(df, attribute_name):
    """Turn one column of ``df`` into an unordered categorical, in place.

    The categories are the distinct non-missing values of the column, in
    order of first appearance; missing values stay missing.

    Args:
        df: DataFrame to modify; it is mutated in place.
        attribute_name: Name of the column to convert.

    Returns:
        The same ``df`` object that was passed in.
    """
    column = df[attribute_name]
    observed_values = column.dropna().unique()
    df[attribute_name] = pd.Categorical(
        column,
        categories=observed_values,
        ordered=False,
    )
    return df

# Re-encode the listed attributes as unordered categoricals, one at a time.
attributes_to_convert = ['Sex', 'Embarked', 'Pclass']
for attribute_name in attributes_to_convert:
    dataset_df = convert_attribute_to_categorical(dataset_df, attribute_name)

# Show the refreshed dtypes followed by a preview of the frame.
print('Attributes:')
print('\n'.join(
    '{}: {}'.format(column, dtype)
    for column, dtype in dataset_df.dtypes.items()
))
dataset_df.head()

Can we do anything with names, tickets and cabins?

In [None]:
# Extract the title from the names.

import re
from collections import Counter

# The pattern reads: everything up to the first comma, then ", ", then the
# title (any run of non-dot characters), then a dot and one whitespace.
name_template = r'[^,]+, (?P<title>[^\.]+)\.\s'
name_pattern = re.compile(name_template)

titles_list = []
for name in dataset_df['Name']:
    # Missing names are not strings; treat them like names that do not parse.
    match = name_pattern.match(name) if isinstance(name, str) else None
    if not match:
        print('!!! -> {}'.format(name))
        titles_list.append(None)
        continue
    # Multi-word titles keep only their last word.
    titles_list.append(match.group('title').split(' ')[-1])

titles_counter = Counter(titles_list)

# Unparsed names were recorded as None. pandas rejects null categories, so
# None is left out of the category list and those rows become missing values.
title_categories = [title for title in titles_counter if title is not None]
dataset_df['Title'] = pd.Categorical(titles_list, categories=title_categories, ordered=False)

print('Attributes:')
print('\n'.join('{}: {}'.format(n, t) for n, t in zip(dataset_df.keys(), dataset_df.dtypes)))
dataset_df.head()

In [None]:
# Extract the ticket number from the tickets.

# The pattern is applied to the REVERSED ticket string, so it reads as: a run
# of digits (the trailing number of the original ticket), optionally followed
# by a space and the remaining text (the original prefix), up to the end.
ticket_template = r'(?P<number>\d+)( (?P<add_info>.*))?$'
ticket_pattern = re.compile(ticket_template)

ticket_numbers = []
ticket_additional_infos = set()
for ticket in dataset_df['Ticket']:
    # Missing tickets are not strings and can be neither reversed nor matched.
    match = ticket_pattern.match(ticket[::-1]) if isinstance(ticket, str) else None
    if not match:
        # No trailing number could be found for this ticket.
        ticket_numbers.append(None)
        continue
    # Undo the reversal before converting the digits.
    ticket_numbers.append(int(match.group('number')[::-1]))

    reversed_add_info = match.group('add_info')
    if reversed_add_info:
        ticket_additional_infos.add(reversed_add_info[::-1])

dataset_df = dataset_df.assign(TicketNumber=ticket_numbers)
dataset_df.head()

In [None]:
# Extract the floor from the cabin number.

# Accepts one or more cabins ("A123", "A123 B456 C78"), optionally preceded
# by a lone letter ("Q A123 B456 C78").
cabin_full_template = r'^([A-Z] )?([A-Z]\d+\s?)+$'
cabin_full_pattern = re.compile(cabin_full_template)

# Matches a single cabin and captures its leading letter as the floor.
cabin_template = r'(?P<floor>[A-Z])\d+\s?'
cabin_pattern = re.compile(cabin_template)

floors = []
for cabin in dataset_df['Cabin']:
    # Missing value (NaN or None): there is nothing to parse.
    if pd.isna(cabin):
        floors.append(None)
        continue

    # Check if the cabin format is atypical.
    if not cabin_full_pattern.match(cabin):
        floors.append(None)
        continue

    # If you get here the cabin field is in the form:
    #  A123
    #  or: A123 B456 C78
    #  or: Q A123 B456 C78

    cabin_floors = {
        match.group('floor')
        for match in cabin_pattern.finditer(cabin)
    }

    # Cabins spread over more than one floor are ambiguous; leave them unset.
    if len(cabin_floors) != 1:
        floors.append(None)
        continue

    floors.append(cabin_floors.pop())

# Sort the categories so their order (and therefore the category codes) is
# the same on every run; set iteration order is not stable across kernels.
floor_categories = sorted({floor for floor in floors if floor is not None})
dataset_df['Floor'] = pd.Categorical(floors, categories=floor_categories, ordered=False)

print('Attributes:')
print('\n'.join('{}: {}'.format(n, t) for n, t in zip(dataset_df.keys(), dataset_df.dtypes)))
dataset_df.head()

Drop the raw text columns (`Name`, `Ticket`, `Cabin`) now that the title, ticket number and floor have been extracted from them.

In [None]:
non_numerical_columns = ['Name', 'Ticket', 'Cabin']
# Rebind instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
dataset_df = dataset_df.drop(columns=non_numerical_columns, errors='ignore')

print('Attributes:')
print('\n'.join('{}: {}'.format(n, t) for n, t in zip(dataset_df.keys(), dataset_df.dtypes)))
dataset_df.head()