## Load Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = pd.read_csv('/content/train_fe.csv')
dt = pd.read_csv('/content/test_fe.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_Cat,Fare_Cat,Title,Family,Family_Cat
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,GrownUp,Low,Mr,2,Small
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,GrownUp,High,Mrs,2,Small
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,GrownUp,Low,Miss,1,Single
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,GrownUp,High,Mrs,2,Small
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,GrownUp,Low,Mr,1,Single


In [4]:
dt.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_Cat,Fare_Cat,Title,Family,Family_Cat
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,GrownUp,Low,Mr,1,Single
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,GrownUp,Low,Mrs,2,Small
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Elder,Low,Mr,1,Single
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,GrownUp,Low,Mr,1,Single
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,GrownUp,Low,Mrs,3,Small


In [5]:
df.shape, dt.shape

((891, 17), (418, 16))

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  Age_Cat      714 non-null    object 
 13  Fare_Cat     891 non-null    object 
 14  Title        891 non-null    object 
 15  Family       891 non-null    int64  
 16  Family_Cat   891 non-null    object 
dtypes: float64(2), int64(6), object(9)
memory usage: 118.5+ KB


## Drop Unnecessary Columns

In [7]:
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
dt = dt.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [8]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_Cat,Fare_Cat,Title,Family,Family_Cat
0,0,3,male,22.0,1,0,7.25,S,GrownUp,Low,Mr,2,Small
1,1,1,female,38.0,1,0,71.2833,C,GrownUp,High,Mrs,2,Small
2,1,3,female,26.0,0,0,7.925,S,GrownUp,Low,Miss,1,Single
3,1,1,female,35.0,1,0,53.1,S,GrownUp,High,Mrs,2,Small
4,0,3,male,35.0,0,0,8.05,S,GrownUp,Low,Mr,1,Single


In [9]:
dt.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_Cat,Fare_Cat,Title,Family,Family_Cat
0,3,male,34.5,0,0,7.8292,Q,GrownUp,Low,Mr,1,Single
1,3,female,47.0,1,0,7.0,S,GrownUp,Low,Mrs,2,Small
2,2,male,62.0,0,0,9.6875,Q,Elder,Low,Mr,1,Single
3,3,male,27.0,0,0,8.6625,S,GrownUp,Low,Mr,1,Single
4,3,female,22.0,1,1,12.2875,S,GrownUp,Low,Mrs,3,Small


## Fill Missing Value

In [10]:
# display number of missing values
def count_missing_values(data):
    missing = data.isna().sum()
    df = pd.DataFrame({'Count':missing, 'Percentage':np.round(missing/len(data)*100, 2)})
    return df[df['Count'] > 0]

In [11]:
count_missing_values(df)

Unnamed: 0,Count,Percentage
Age,177,19.87
Embarked,2,0.22
Age_Cat,177,19.87


In [12]:
count_missing_values(dt)

Unnamed: 0,Count,Percentage
Age,86,20.57
Fare,1,0.24
Age_Cat,86,20.57
Fare_Cat,1,0.24


In [13]:
#Fill missing value
mean_value1 = df['Age'].mean()
df['Age'].fillna(value=mean_value1, inplace=True)

mean_value2 = dt['Age'].mean()
dt['Age'].fillna(value=mean_value2, inplace=True)

mean_value3 = dt['Fare'].mean()
dt['Fare'].fillna(value=mean_value2, inplace=True)

In [14]:
# group age based on bins division
bins1 = [-np.Inf, 1, 6, 14, 19, 55, np.Inf]
labels1 = ["Baby","Toddler","Kid","Teenage","GrownUp","Elder"]
df["Age_Cat"] = pd.cut(x=df["Age"], bins=bins1, labels=labels1, include_lowest=True)
dt["Age_Cat"] = pd.cut(x=dt["Age"], bins=bins1, labels=labels1, include_lowest=True)

# group fare based on bins division
bins2 = [-np.Inf, 20, 50, np.Inf]
labels2 = ["Low","Medium","High"]
df["Fare_Cat"] = pd.cut(x=df["Fare"], bins=bins2, labels=labels2, include_lowest=True)
dt["Fare_Cat"] = pd.cut(x=dt["Fare"], bins=bins2, labels=labels2, include_lowest=True)

In [15]:
# drop rows with NaN value in column "Embarked"
df=df.dropna(subset=['Embarked'])

In [16]:
# Check
count_missing_values(df)

Unnamed: 0,Count,Percentage


In [17]:
count_missing_values(dt)

Unnamed: 0,Count,Percentage


## Label Encoding: Categorical to Numerical Value

In [18]:
sex_uni = df["Sex"].unique()
sex_uni

array(['male', 'female'], dtype=object)

In [19]:
mapping_dict = {val: idx for idx, val in enumerate(sex_uni)}
df["Sex"] = df["Sex"].replace(mapping_dict)
dt["Sex"] = dt["Sex"].replace(mapping_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Sex"] = df["Sex"].replace(mapping_dict)


In [20]:
embarked_uni = df["Embarked"].unique()
embarked_uni

array(['S', 'C', 'Q'], dtype=object)

In [21]:
mapping_dict1 = {val: idx for idx, val in enumerate(embarked_uni)}
df["Embarked"] = df["Embarked"].replace(mapping_dict1)
dt["Embarked"] = dt["Embarked"].replace(mapping_dict1)

In [22]:
age_uni = df["Age_Cat"].unique()
age_uni

['GrownUp', 'Toddler', 'Kid', 'Elder', 'Teenage', 'Baby']
Categories (6, object): ['Baby' < 'Toddler' < 'Kid' < 'Teenage' < 'GrownUp' < 'Elder']

In [23]:
mapping_dict2 = {val: idx for idx, val in enumerate(age_uni)}
df["Age_Cat"] = df["Age_Cat"].replace(mapping_dict2)
dt["Age_Cat"] = dt["Age_Cat"].replace(mapping_dict2)

In [24]:
fare_uni = df["Fare_Cat"].unique()
fare_uni

['Low', 'High', 'Medium']
Categories (3, object): ['Low' < 'Medium' < 'High']

In [25]:
mapping_dict3 = {val: idx for idx, val in enumerate(fare_uni)}
df["Fare_Cat"] = df["Fare_Cat"].replace(mapping_dict3)
dt["Fare_Cat"] = dt["Fare_Cat"].replace(mapping_dict3)

In [26]:
title_uni =  df["Title"].unique()
title_uni

array(['Mr', 'Mrs', 'Miss', 'Master', 'Others'], dtype=object)

In [27]:
mapping_dict4 = {val: idx for idx, val in enumerate(title_uni)}
df["Title"] = df["Title"].replace(mapping_dict4)
dt["Title"] = dt["Title"].replace(mapping_dict4)

In [28]:
family_uni =  df["Family_Cat"].unique()
family_uni

array(['Small', 'Single', 'Big'], dtype=object)

In [29]:
mapping_dict5 = {val: idx for idx, val in enumerate(family_uni)}
df["Family_Cat"] = df["Family_Cat"].replace(mapping_dict5)
dt["Family_Cat"] = dt["Family_Cat"].replace(mapping_dict5)

In [30]:
# Check
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Survived    889 non-null    int64   
 1   Pclass      889 non-null    int64   
 2   Sex         889 non-null    int64   
 3   Age         889 non-null    float64 
 4   SibSp       889 non-null    int64   
 5   Parch       889 non-null    int64   
 6   Fare        889 non-null    float64 
 7   Embarked    889 non-null    int64   
 8   Age_Cat     889 non-null    category
 9   Fare_Cat    889 non-null    category
 10  Title       889 non-null    int64   
 11  Family      889 non-null    int64   
 12  Family_Cat  889 non-null    int64   
dtypes: category(2), float64(2), int64(9)
memory usage: 85.4 KB


In [31]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Pclass      418 non-null    int64   
 1   Sex         418 non-null    int64   
 2   Age         418 non-null    float64 
 3   SibSp       418 non-null    int64   
 4   Parch       418 non-null    int64   
 5   Fare        418 non-null    float64 
 6   Embarked    418 non-null    int64   
 7   Age_Cat     418 non-null    category
 8   Fare_Cat    418 non-null    category
 9   Title       418 non-null    int64   
 10  Family      418 non-null    int64   
 11  Family_Cat  418 non-null    int64   
dtypes: category(2), float64(2), int64(8)
memory usage: 33.9 KB


In [32]:
df.to_csv('train_clean.csv', index=False)
dt.to_csv('test_clean.csv', index=False)