# Merge
This notebook merges the original Kaggle dataset with the extra passenger data we downloaded (crawled).
Where both sources provide the same information, we generally trust the downloaded data over the Kaggle dataset.

In [None]:
# Read the Kaggle training and test sets and format them as a single frame.
import pandas as pd
import dataset as ds
import preprocessing as pp

X_dataset, y_dataset = ds.load_training_set()
X_testset = ds.load_test_set()

# Remember which ids belong to which set; later cells use these lists to
# label the rows again after everything has been merged.
training_set_ids = X_dataset.index.tolist()
test_set_ids = X_testset.index.tolist()

# No id may appear in both sets.
assert set(training_set_ids).isdisjoint(test_set_ids)

# Outer-merge the two sets (with no `on=` the join uses every shared column)
# and put the id column back as the index.
df = (
    X_dataset.reset_index()
    .merge(X_testset.reset_index(), how='outer')
    .set_index(ds.ID_COLUMN_NAME)
)

# Run the preprocessing helpers in sequence, then turn the id index back into
# a regular column.
df = pp.add_ticket_number_column(pp.format_name(pp.manual_fixes(df))).reset_index()

# Missing values per column.
df.isna().sum()

In [None]:
# Import the crawled extra data and run it through the post-processing step.
import data.integration.merge as merge

edf = merge.apply_post_processing(merge.import_extra_data())

# Missing values per column.
edf.isna().sum()

In [None]:
# Load the mapping from Kaggle to extra.
import os
import json

# Resolve the home directory with expanduser('~') rather than os.environ['HOME']:
# HOME is not guaranteed to be set (e.g. on Windows), and a missing variable
# would surface here as a bare KeyError. When HOME is set the result is the same.
matches_filepath = os.path.join(
    os.path.expanduser('~'), 'kaggle/titanic/data/integration', 'matches.json')

# Read the file as UTF-8 explicitly instead of relying on the platform default.
with open(matches_filepath, 'r', encoding='utf-8') as f:
    matches = json.load(f)

# Number of entries in the mapping.
print(len(matches))

In [None]:
# Add to the extra data the information only provided by Kaggle.

# Rename to avoid collision with the 'PassengerId' column of the extra data.
# The rename happens *before* the id lookup so that this cell can be re-run:
# `rename` silently ignores a label that is no longer present, whereas reading
# df['PassengerId'] after an earlier run had renamed it would raise a KeyError.
df = df.rename(columns={'PassengerId': 'KPassengerId'})

# Map each Kaggle id to its id in the extra data. The lookup uses str(x)
# because JSON object keys are always strings.
df['EPassengerId'] = df['KPassengerId'].apply(lambda x: matches[str(x)]).astype(int)

# Select the columns to merge: in general we will trust the downloaded data more than Kaggle.
# Kaggle columns deliberately left out of the merge:
#     Pclass, Name, Sex, Age, Ticket, Fare, Cabin, Embarked, LastName, FirstName,
#     Title, UnmarriedFirstName, UnmarriedLastName, TicketNumber
columns_to_merge = [
    'KPassengerId',
    'SibSp',
    'Parch',
    'EPassengerId',
]

# Outer merge: rows of the extra data without a Kaggle counterpart are kept
# (their Kaggle columns are left empty).
mdf = edf.merge(df[columns_to_merge], how='outer', left_on='PassengerId', right_on='EPassengerId')
print('Before the merge: {}'.format(len(edf)))
print('After the merge: {}'.format(len(mdf)))
mdf.shape

In [None]:
# Split training and test sets.

# Every row starts as 'Extra'; rows whose Kaggle id belongs to the training or
# test set are then relabelled accordingly.
mdf['Split'] = 'Extra'
mdf.loc[mdf['KPassengerId'].isin(training_set_ids), ['Split']] = 'Training'
mdf.loc[mdf['KPassengerId'].isin(test_set_ids), ['Split']] = 'Test'

# Row count per split. Previously this expression sat in the middle of the cell,
# so its value was computed and thrown away; it is now kept, used in the checks
# below and displayed as the cell's output.
split_counts = mdf.groupby('Split')['Split'].count()

# Sanity checks: every Kaggle row must be found again, and the remaining rows
# are the extra records that have no Kaggle counterpart.
assert split_counts.get('Training', 0) == len(training_set_ids), split_counts.to_dict()
assert split_counts.get('Test', 0) == len(test_set_ids), split_counts.to_dict()
assert split_counts.get('Extra', 0) == len(edf) - len(df), split_counts.to_dict()

split_counts

In [None]:
import os

# Save the eXtra-datasets.
# expanduser('~') is used instead of os.environ['HOME'] because HOME is not
# guaranteed to be set (e.g. on Windows); when HOME is set the result is the same.
output_dir = os.path.join(os.path.expanduser('~'), 'kaggle/titanic/data')

# NOTE(review): to_csv writes the frame's index as the first (unnamed) column by
# default; confirm that whatever loads these files expects that column.

# Augmented training set.
xtrain_df = mdf.loc[mdf['Split'].isin(('Extra', 'Training'))]
xtrain_df.to_csv(os.path.join(output_dir, 'xtrain.csv'))

# Augmented test set.
xtest_df = mdf.loc[mdf['Split'] == 'Test']
xtest_df.to_csv(os.path.join(output_dir, 'xtest.csv'))