# Data cleaning

Load the training and test data sets from `raw_data/`, apply the same transformations to both, and write the cleaned data sets to `clean_data/`.

In [1]:
import pandas as pd

In [2]:
# Columns read from the raw CSV files (passed as `usecols` to `pd.read_csv`)
raw_columns = [
    'PassengerId',
    'Embarked',
    'Pclass',
    'Fare',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
]

In [3]:
def transform_data(df):
    """Clean a raw passenger data frame in place.

    Rows with a missing ``Embarked`` value are dropped, missing ``Age`` and
    ``Fare`` values become -1, ``SibSp`` and ``Parch`` are summed into a new
    ``Relatives`` column, ``Pclass`` is replaced by a labelled ``Class``
    column, and the ``Embarked`` codes are expanded to full names.

    The frame passed in is modified directly; nothing is returned.
    """
    class_labels = {1: 'first', 2: 'second', 3: 'third'}
    embarked_names = {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}

    # Observations without an Embarked value are removed outright
    df.dropna(subset=['Embarked'], inplace=True)

    # -1 stands in for missing Age / Fare (easier to pass them through patsy)
    for column in ('Age', 'Fare'):
        df.loc[df[column].isna(), column] = -1

    # Relatives = SibSp + Parch; pop() removes the two source columns
    df['Relatives'] = df.pop('SibSp') + df.pop('Parch')

    # Pclass codes become readable labels in Class; Embarked is relabelled in place
    df['Class'] = df.pop('Pclass').map(class_labels)
    df['Embarked'] = df['Embarked'].map(embarked_names)

In [4]:
# Columns, in output order, kept in the cleaned data sets
clean_columns = [
    'PassengerId',
    'Embarked',
    'Class',
    'Fare',
    'Sex',
    'Age',
    'Relatives',
]

## Training data set

In [5]:
# Read only the raw columns we need, plus the `Survived` column
train_data = pd.read_csv('raw_data/train.csv', usecols=[*raw_columns, 'Survived'])

# transform_data modifies the frame in place and returns nothing
transform_data(train_data)

# Select the cleaned columns in their final order (Survived last), then save
train_data = train_data.loc[:, [*clean_columns, 'Survived']]
train_data.to_csv('clean_data/train.csv', index=False)

## Test data set

In [6]:
# Read only the raw columns we need (no `Survived` column is requested here)
test_data = pd.read_csv('raw_data/test.csv', usecols=list(raw_columns))

# transform_data modifies the frame in place and returns nothing
transform_data(test_data)

# Select the cleaned columns in their final order, then save
test_data = test_data.loc[:, list(clean_columns)]
test_data.to_csv('clean_data/test.csv', index=False)