# Importing Required Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Apply seaborn's default plot styling.
sns.set()

# Reading Datasets

In [2]:
# Load the already-preprocessed training set. The frame read from disk is kept
# as-is in `train_backup`; `train_df` is an independent working copy.
train_backup = pd.read_csv(filepath_or_buffer="../Data/preprocessed_train_df.csv")
train_df = train_backup.copy(deep=True)

In [3]:
# Load the raw test set. The frame read from disk is kept as-is in
# `test_backup`; `test_df` is the working copy that the cells below modify.
test_backup = pd.read_csv(filepath_or_buffer="../Data/test.csv")
test_df = test_backup.copy(deep=True)

# Preprocessing Test Set

**Null Values**

In [4]:
# Print dtypes and non-null counts per column to see which columns have gaps.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


## Adding New Columns

### Title

In [5]:
# Extract the title from each name: the capitalised token that follows ", " and
# ends with a period (e.g. "Mr.", "Miss."). Names that do not match yield NaN.
# - Raw string: "\," and "\." are invalid escape sequences in a normal Python
#   string literal and trigger a DeprecationWarning/SyntaxWarning.
# - expand=False: with a single capture group this returns a Series rather
#   than a one-column DataFrame, so the column assignment is unambiguous.
test_df["Title"] = test_df["Name"].str.extract(r'\, ([A-Z][^ ]*\.)', expand=False)

In [6]:
# Frequency of each extracted title, before any grouping of uncommon ones.
test_df["Title"].value_counts()

Mr.        240
Miss.       78
Mrs.        72
Master.     21
Col.         2
Rev.         2
Dona.        1
Dr.          1
Ms.          1
Name: Title, dtype: int64

In [7]:
# Collapse uncommon titles into broader groups:
#   "Ms."                           -> "Miss."
#   "Col.", "Rev.", "Dr.", "Dona."  -> "Rare."
# Every other value (including missing titles) is left untouched.
titles = test_df["Title"]
titles = titles.mask(titles == "Ms.", "Miss.")

rare_titles = ["Col.", "Rev.", "Dr.", "Dona."]
titles = titles.mask(titles.isin(rare_titles), "Rare.")

test_df["Title"] = titles

In [8]:
# Frequency of each title after "Ms." and the uncommon titles have been regrouped.
test_df["Title"].value_counts()

Mr.        240
Miss.       79
Mrs.        72
Master.     21
Rare.        6
Name: Title, dtype: int64

### Family

In [9]:
# Family = SibSp + Parch, computed row by row.
test_df["Family"] = test_df["SibSp"].add(test_df["Parch"])

### Cabin_Names

In [10]:
# Keep the first capital letter found in each cabin string; rows with no cabin
# value (or no capital letter) come out as NaN.
test_df["Cabin_Names"] = test_df["Cabin"].str.extract(pat="([A-Z])", expand=False)

## Filling Null Values

### Age

In [11]:
# Impute missing ages with the median age per title, where the medians are
# computed on the training set (assumes train_df carries "Title" and "Age"
# columns — TODO confirm against the training preprocessing notebook).
#
# Uses .loc rather than .at: .at is a scalar-only accessor and rejects a
# multi-row key with a Series value in current pandas.
#
# NOTE(review): a title with no entry in the training medians maps to NaN, so
# that row's Age would remain missing.
null_indices = test_df[test_df["Age"].isnull()].index
median_age_by_title = train_df.groupby(by = "Title")["Age"].median()
test_df.loc[null_indices, "Age"] = test_df.loc[null_indices, "Title"].map(median_age_by_title)

### Fare

In [12]:
# Impute missing fares with the median fare per Pclass, where the medians are
# computed on the training set (assumes train_df carries "Pclass" and "Fare"
# columns — TODO confirm against the training preprocessing notebook).
#
# Uses .loc rather than .at: .at is a scalar-only accessor and rejects a
# multi-row key with a Series value in current pandas.
null_indices = test_df[test_df["Fare"].isnull()].index
median_fare_by_pclass = train_df.groupby(by = "Pclass")["Fare"].median()
test_df.loc[null_indices, "Fare"] = test_df.loc[null_indices, "Pclass"].map(median_fare_by_pclass)

### Cabin_Names

In [13]:
# Rows without a cabin letter get the explicit placeholder "Missing".
cabin_letters = test_df["Cabin_Names"]
test_df["Cabin_Names"] = cabin_letters.where(cabin_letters.notna(), "Missing")

# Exporting Preprocessed Test Set

In [14]:
# Persist the preprocessed test set; the row index is not written.
test_df.to_csv(path_or_buf="../Data/preprocessed_test_df.csv", index=False)

# Feature Engineering Test Set

**Test Features Dataframe**

In [15]:
# Start an empty feature frame that shares test_df's row index; the cells
# below add one engineered column at a time.
test_features_df = pd.DataFrame(data=None, index=test_df.index)

## Sex, Pclass, and Embarked

In [16]:
# 0/1 indicator columns. Each entry is (feature column, source column, level):
# the feature is 1 where the source column equals the level and 0 otherwise.
# Only the levels listed here get a column.
indicator_specs = [
    ("Sex:female", "Sex", "female"),
    ("Pclass:1", "Pclass", 1),
    ("Pclass:3", "Pclass", 3),
    ("Embarked:S", "Embarked", "S"),
    ("Embarked:C", "Embarked", "C"),
]

for feature_name, source_col, level in indicator_specs:
    test_features_df[feature_name] = np.where(test_df[source_col] == level, 1, 0)

## Cabin_Names_label_enc

In [17]:
# Integer code per cabin letter, with 0 reserved for rows that had no cabin.
# Any value not listed here maps to NaN.
cabin_codes = {
    "Missing": 0,
    "A": 1,
    "B": 2,
    "C": 3,
    "D": 4,
    "E": 5,
    "F": 6,
    "G": 7,
    "T": 8,
}
test_features_df["Cabin_Names_label_enc"] = test_df["Cabin_Names"].map(cabin_codes)

## Title_label_enc

In [18]:
# Integer code per grouped title. Any title not listed here maps to NaN.
title_codes = {
    "Mr.": 0,
    "Rare.": 1,
    "Master.": 2,
    "Miss.": 3,
    "Mrs.": 4,
}
test_features_df["Title_label_enc"] = test_df["Title"].map(title_codes)

## Family

In [19]:
# Two 0/1 indicators derived from Family:
#   Family:No    -> 1 when Family is exactly 0
#   Family:Small -> 1 when Family is 1, 2 or 3
# Rows with any other Family value get 0 in both columns.
family_size = test_df["Family"]
test_features_df["Family:No"] = family_size.eq(0).astype(int)
test_features_df["Family:Small"] = family_size.isin([1, 2, 3]).astype(int)

## Age_label_enc

In [20]:
# Bucket Age into named bands (right-inclusive upper bounds):
#   <=10 Infant | (10,18] Kid | (18,25] Young_Adult | (25,40] Adult
#   (40,60] Old_Adult | >60 Old
#
# A single pd.cut replaces six chained np.where passes. Those passes mixed
# string labels with the float Age column, which made numpy coerce every
# value to text, so a missing age ended up as the literal string "nan".
# With pd.cut a missing age stays NaN.
# .astype(object) turns the categorical result into plain string labels so a
# later dict-based .map() yields an ordinary numeric column.
age_labels = pd.DataFrame(index = test_df.index)

age_bin_edges = [-np.inf, 10, 18, 25, 40, 60, np.inf]
age_bin_names = ["Infant", "Kid", "Young_Adult", "Adult", "Old_Adult", "Old"]

age_labels["labels"] = pd.cut(
    test_df["Age"],
    bins=age_bin_edges,
    labels=age_bin_names,
    right=True,
).astype(object)

In [21]:
# Integer code per age band. The codes do not follow age order
# (presumably they mirror the encoding used for the training features —
# verify against the training notebook). Unlisted labels map to NaN.
age_band_codes = {
    "Old": 0,
    "Adult": 1,
    "Young_Adult": 2,
    "Old_Adult": 3,
    "Kid": 4,
    "Infant": 5,
}
test_features_df["Age_label_enc"] = age_labels["labels"].map(age_band_codes)

## Age

In [22]:
# Carry Age across unchanged, alongside its encoded band.
test_features_df["Age"] = test_df.loc[:, "Age"]

## Fare_label_enc

In [23]:
# Bucket Fare into named bands (right-inclusive upper bounds):
#   <=50 Cheap | (50,100] Mid | >100 Costly
#
# A single pd.cut replaces three chained np.where passes. Those passes mixed
# string labels with the float Fare column, which made numpy coerce every
# value to text, so a missing fare ended up as the literal string "nan".
# With pd.cut a missing fare stays NaN.
# .astype(object) turns the categorical result into plain string labels so a
# later dict-based .map() yields an ordinary numeric column.
fare_labels = pd.DataFrame(index = test_df.index)

fare_bin_edges = [-np.inf, 50, 100, np.inf]
fare_bin_names = ["Cheap", "Mid", "Costly"]

fare_labels["labels"] = pd.cut(
    test_df["Fare"],
    bins=fare_bin_edges,
    labels=fare_bin_names,
    right=True,
).astype(object)

In [24]:
# Integer code per fare band, increasing with price. Unlisted labels map to NaN.
fare_band_codes = {
    "Cheap": 0,
    "Mid": 1,
    "Costly": 2,
}
test_features_df["Fare_label_enc"] = fare_labels["labels"].map(fare_band_codes)

## Fare

In [25]:
# Carry Fare across unchanged, alongside its encoded band.
test_features_df["Fare"] = test_df.loc[:, "Fare"]

# Exporting Test Set After Feature Engineering 

In [26]:
# Persist the engineered test features; the row index is not written.
test_features_df.to_csv(path_or_buf="../Data/test_features_df.csv", index=False)