# Merge
This notebook merges the original Kaggle dataset with the extra passenger data we crawled and downloaded (called the *extra* data below).
Where both sources provide the same information, we generally trust the extra data over the Kaggle dataset.

In [16]:
# Load and format the data provided by Kaggle.
import pandas as pd
import dataset as ds
import preprocessing as pp

X_dataset, y_dataset = ds.load_training_set()
X_testset = ds.load_test_set()

# Combine the training and test passengers into one frame, then run the
# preprocessing steps in order. No `on` is passed, so pandas joins on every
# column the two frames have in common.
df = (
    pd.merge(X_dataset.reset_index(), X_testset.reset_index(), how='outer')
    .set_index(ds.ID_COLUMN_NAME)
    .pipe(pp.manual_fixes)
    .pipe(pp.format_name)
    .pipe(pp.add_ticket_number_column)
    .reset_index()  # expose the id as a regular column again
)

# Count of missing values per column.
df.isna().sum()

PassengerId              0
Pclass                   0
Name                     0
Sex                      0
Age                    263
SibSp                    0
Parch                    0
Ticket                   0
Fare                     1
Cabin                 1014
Embarked                 2
LastName                 0
FirstName                0
Title                    0
UnmarriedFirstName       0
UnmarriedLastName        0
TicketNumber             4
dtype: int64

In [17]:
# Load and format the extra data we crawled.
import data.integration.merge as merge

# Import the extra data and post-process it in a single expression.
# NOTE(review): the summary below lists an 'Unnamed: 0' column; that name
# usually comes from a saved CSV index being read back as data -- confirm
# whether it should be dropped upstream.
edf = merge.apply_post_processing(merge.import_extra_data())

# Count of missing values per column.
edf.isna().sum()

Unnamed: 0               0
Age                      2
BirthDate                2
BirthPlace             479
Cabin                 1900
Destination           1304
Embarked                 0
FirstName                0
Job                    606
LastName                 0
MaritalStatus          785
Nationality              0
Pclass                   0
Relationships            0
Residence              110
Sex                      0
Ticket                 891
Title                    0
UrlId                    0
Survived                 0
TicketPrice              0
TicketNumber           891
BirthPlaceCountry      481
BirthPlaceCity         479
BirthPlaceRegion       775
ResidenceCountry       110
ResidenceCity          110
ResidenceRegion        599
DestinationCountry    1304
DestinationCity       1304
DestinationRegion     1339
CabinDeck             1900
AgeInDays                2
PassengerId              0
dtype: int64

In [18]:
# Load the mapping from Kaggle passenger ids to the passenger ids used in the extra data.
import os
import json

# Resolve the home directory with expanduser instead of os.environ['HOME']:
# HOME is not guaranteed to be set (e.g. on Windows), and the direct lookup
# raised KeyError in that case.
matches_filepath = os.path.join(os.path.expanduser('~'), 'kaggle/titanic/data/integration', 'matches.json')

# Read the file as UTF-8 explicitly rather than relying on the locale's
# default encoding.
with open(matches_filepath, 'r', encoding='utf-8') as f:
    matches = json.load(f)

# Number of entries in the mapping.
print(len(matches))

1309


In [19]:
# Add to the extra data the information only provided by Kaggle.

# Rename first to avoid a collision with the extra data's own 'PassengerId'.
# `rename` ignores labels that are not present, so running this cell a second
# time is harmless; looking the ids up through 'PassengerId' and renaming
# afterwards raised KeyError on a re-run.
df = df.rename(columns={'PassengerId': 'KPassengerId'})

# Translate every Kaggle id into the id of its match in the extra data.
# `matches` was parsed from JSON, whose object keys are always strings,
# hence the str() around the lookup key.
df['EPassengerId'] = df['KPassengerId'].apply(lambda x: matches[str(x)]).astype(int)

# Select the columns to merge: in general we will trust the downloaded data more than Kaggle.
columns_to_merge = [
    'KPassengerId',
#     'Pclass',
#     'Name',
#     'Sex',
#     'Age',
    'SibSp',
    'Parch',
#     'Ticket',
#     'Fare',
#     'Cabin',
#     'Embarked',
#     'LastName',
#     'FirstName',
#     'Title',
#     'UnmarriedFirstName',
#     'UnmarriedLastName',
#     'TicketNumber',
    'EPassengerId',
]

mdf = edf.merge(df[columns_to_merge], how='outer', left_on='PassengerId', right_on='EPassengerId')
print('Before the merge: {}'.format(len(edf)))
print('After the merge: {}'.format(len(mdf)))

# An outer merge grows the frame when a Kaggle row has no counterpart in the
# extra data or when several Kaggle rows point at the same extra row. Fail
# loudly instead of relying on someone comparing the two printed numbers.
assert len(mdf) == len(edf), 'merge changed the number of rows: {} -> {}'.format(len(edf), len(mdf))

mdf.shape

Before the merge: 2208
After the merge: 2208


(2208, 38)