# Data cleaning

Load the training and test data sets from `raw_data/`, apply transformations, and write the cleaned data sets to `clean_data/`.

In [1]:
import pandas as pd

In [2]:
# Columns read from the raw CSV files (the training set additionally reads `Survived`).
raw_columns = [
    'PassengerId',
    'Embarked',
    'Pclass',
    'Fare',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
]

In [3]:
def transform_data(df):
    """Reshape a raw passenger frame in place.

    The frame passed in is modified directly and nothing is returned:

    * ``SibSp`` and ``Parch`` are summed into a new ``Relatives`` column and
      then removed.
    * ``Pclass`` (1/2/3) is replaced by a ``Class`` column holding
      ``'first'``/``'second'``/``'third'``.
    * ``Embarked`` codes (``C``/``Q``/``S``) are replaced by the labels in the
      mapping below.

    Values that are absent from a mapping become NaN, as ``Series.map`` does
    for unmatched keys.
    """
    class_labels = {1: 'first', 2: 'second', 3: 'third'}
    port_labels = {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}

    # Merge the two family-count columns into one.
    df['Relatives'] = df['SibSp'].add(df['Parch'])
    df.drop(columns=['SibSp', 'Parch'], inplace=True)

    # Swap the numeric class code for a readable label.
    df['Class'] = df['Pclass'].map(class_labels)
    df.drop(columns='Pclass', inplace=True)

    # Expand the single-letter port codes.
    df['Embarked'] = df['Embarked'].map(port_labels)

In [4]:
# Columns, in output order, kept in the cleaned data sets.
clean_columns = [
    'PassengerId',
    'Embarked',
    'Class',
    'Fare',
    'Sex',
    'Age',
    'Relatives',
]

## Training data set

In [5]:
# Read the raw training file, limited to the raw columns plus the `Survived` column.
train = pd.read_csv(
    'raw_data/train.csv',
    usecols=[*raw_columns, 'Survived'],
)

Discard observations where `Embarked` is missing.

In [6]:
# Keep only the rows whose `Embarked` value is present. The explicit .copy()
# makes `train` an independent frame, so the in-place column assignments and
# drops performed later by transform_data() do not raise pandas'
# SettingWithCopyWarning on a filtered selection.
train = train[train['Embarked'].notnull()].copy()

Apply transformations.

In [7]:
# transform_data has no return value; it modifies `train` in place.
transform_data(train)

Save the data set.

In [8]:
# Select the cleaned columns followed by `Survived`, then write the frame
# to CSV without the row index.
train = train.loc[:, [*clean_columns, 'Survived']]
train.to_csv('clean_data/train.csv', index=False)

## Test data set

In [9]:
# Read the raw test file, limited to the raw columns.
test = pd.read_csv(
    'raw_data/test.csv',
    usecols=raw_columns,
)

Apply transformations.

In [10]:
# transform_data has no return value; it modifies `test` in place.
transform_data(test)

Save the data set.

In [11]:
# Select the cleaned columns in their output order, then write the frame
# to CSV without the row index.
test = test.loc[:, clean_columns]
test.to_csv('clean_data/test.csv', index=False)