In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
# Read the raw train and test splits (train first, then test) from the data directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [3]:
# Summarize the raw training frame: column dtypes and non-null counts per column.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   PassengerId  100000 non-null  int64  
 1   Survived     100000 non-null  int64  
 2   Pclass       100000 non-null  int64  
 3   Name         100000 non-null  object 
 4   Sex          100000 non-null  object 
 5   Age          96708 non-null   float64
 6   SibSp        100000 non-null  int64  
 7   Parch        100000 non-null  int64  
 8   Ticket       95377 non-null   object 
 9   Fare         99866 non-null   float64
 10  Cabin        32134 non-null   object 
 11  Embarked     99750 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 9.2+ MB


In [4]:
# Same summary for the raw test frame, to compare its missing-value pattern with train.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   PassengerId  100000 non-null  int64  
 1   Pclass       100000 non-null  int64  
 2   Name         100000 non-null  object 
 3   Sex          100000 non-null  object 
 4   Age          96513 non-null   float64
 5   SibSp        100000 non-null  int64  
 6   Parch        100000 non-null  int64  
 7   Ticket       94819 non-null   object 
 8   Fare         99867 non-null   float64
 9   Cabin        29169 non-null   object 
 10  Embarked     99723 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 8.4+ MB


In [25]:
_CABIN_DIGITS = re.compile(r'\d+')


def _cabin_number(cabin):
    """Return the first run of digits in a cabin label as an int, or 0 when there is none."""
    if pd.isna(cabin):
        return 0
    found = _CABIN_DIGITS.search(str(cabin))
    return int(found.group()) if found else 0


def _cabin_deck(cabin):
    """Return the first non-blank character of a cabin label, or 'n' when it is missing/blank."""
    if pd.isna(cabin):
        return 'n'
    return str(cabin).strip()[:1] or 'n'


def prepare_data(data):
    """Impute missing values and one-hot encode the categorical columns.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      * ``Age`` and ``Fare``: missing values are replaced by the mean of the
        same column within the row's ``Pclass`` (computed from ``data`` itself).
      * ``Cabin`` is split into ``Cabin number`` (first run of digits, 0 when
        the cabin is missing or has no digits) and a one-character deck code.
        A missing cabin gets the deck code ``'n'``, so the resulting dummy
        column is ``cab_n``.
      * ``Ticket`` and ``Embarked``: missing values become ``'X'``.
      * ``Sex``, ``Cabin`` (deck) and ``Embarked`` are one-hot encoded with the
        prefixes ``sex``, ``cab`` and ``emb``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``Age``, ``Fare``, ``Pclass``,
        ``Cabin``, ``Ticket``, ``Embarked`` and ``Sex``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed columns, the extra ``Cabin number``
        column, and dummy columns in place of ``Sex``, ``Cabin``, ``Embarked``.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # Work on a copy so the caller's frame is not silently modified.
    prepared = data.copy()

    for column in ('Age', 'Fare'):
        # groupby().mean() skips NaN, so the means come from observed values only.
        class_means = prepared.groupby('Pclass')[column].mean()
        prepared[column] = prepared[column].fillna(prepared['Pclass'].map(class_means))

    # Derive the number before overwriting 'Cabin' with the deck code.
    prepared['Cabin number'] = prepared['Cabin'].map(_cabin_number).astype(int)
    prepared['Cabin'] = prepared['Cabin'].map(_cabin_deck)

    prepared['Ticket'] = prepared['Ticket'].fillna('X')
    prepared['Embarked'] = prepared['Embarked'].fillna('X')

    return pd.get_dummies(prepared, columns=['Sex', 'Cabin', 'Embarked'], prefix=['sex', 'cab', 'emb'])

In [26]:
# Run the same preparation on both splits; each split is imputed from its own class means.
# NOTE: the raw names are rebound to the prepared frames, which no longer contain the
# 'Sex', 'Cabin' and 'Embarked' columns that prepare_data requires. Re-running this cell
# therefore needs the load cell to be re-run first.
train_data = prepare_data(train_data)
test_data = prepare_data(test_data)

In [27]:
# Inspect the prepared training frame: dtypes, non-null counts and the generated dummy columns.
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 25 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   PassengerId   100000 non-null  int64  
 1   Survived      100000 non-null  int64  
 2   Pclass        100000 non-null  int64  
 3   Name          100000 non-null  object 
 4   Age           100000 non-null  float64
 5   SibSp         100000 non-null  int64  
 6   Parch         100000 non-null  int64  
 7   Ticket        100000 non-null  object 
 8   Fare          100000 non-null  float64
 9   Cabin number  100000 non-null  int32  
 10  sex_female    100000 non-null  uint8  
 11  sex_male      100000 non-null  uint8  
 12  cab_A         100000 non-null  uint8  
 13  cab_B         100000 non-null  uint8  
 14  cab_C         100000 non-null  uint8  
 15  cab_D         100000 non-null  uint8  
 16  cab_E         100000 non-null  uint8  
 17  cab_F         100000 non-null  uint8  
 18  cab_G

In [28]:
# Inspect the prepared test frame; its feature columns should line up with the training frame's.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   PassengerId   100000 non-null  int64  
 1   Pclass        100000 non-null  int64  
 2   Name          100000 non-null  object 
 3   Age           100000 non-null  float64
 4   SibSp         100000 non-null  int64  
 5   Parch         100000 non-null  int64  
 6   Ticket        100000 non-null  object 
 7   Fare          100000 non-null  float64
 8   Cabin number  100000 non-null  int32  
 9   sex_female    100000 non-null  uint8  
 10  sex_male      100000 non-null  uint8  
 11  cab_A         100000 non-null  uint8  
 12  cab_B         100000 non-null  uint8  
 13  cab_C         100000 non-null  uint8  
 14  cab_D         100000 non-null  uint8  
 15  cab_E         100000 non-null  uint8  
 16  cab_F         100000 non-null  uint8  
 17  cab_G         100000 non-null  uint8  
 18  cab_T

In [29]:
# Persist both prepared splits next to the raw files; index=False keeps the row index out of the CSV.
train_data.to_csv('../data/train_preprocessed.csv', index=False)
test_data.to_csv('../data/test_preprocessed.csv', index=False)