# Basic Data Preprocessing

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

In [2]:
# Load the Titanic training data from its CSV file
titanic_csv = "titanic_train.csv"
df = pd.read_csv(titanic_csv)

# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Rename dataframe columns to more readable labels
df = df.rename(columns={"PassengerId": "Passenger ID", "Pclass": "Passenger Class"})

# The index is intentionally left as the default integer labels created when
# the CSV was loaded. DataFrame.set_index returns a new dataframe instead of
# modifying df, so a bare df.set_index("Passenger ID") has no effect, and the
# label-based lookups in later cells (e.g. df.loc[309]) rely on the default
# labels.

# Sort data by ascending order of Passenger class
df = df.sort_values("Passenger Class", ascending=True)

# Compute the average age over the passengers whose age is known.
# mean() skips NaN by default, so it must run before the missing ages are
# overwritten with 0; counting zeros after the fill would also exclude any
# genuine 0.0 age from the denominator.
avg_age = df["Age"].mean()

# Replace NaN values in the Age column with 0
df["Age"] = df["Age"].fillna(0.0)

print(avg_age)
print(len(df.columns))
print(len(df))
df

29.69911764705882
12
891


Unnamed: 0,Passenger ID,Survived,Passenger Class,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
445,446,1,1,"Dodge, Master. Washington",male,4.0,0,2,33638,81.8583,A34,S
310,311,1,1,"Hays, Miss. Margaret Bechstein",female,24.0,0,0,11767,83.1583,C54,C
309,310,1,1,"Francatelli, Miss. Laura Mabel",female,30.0,0,0,PC 17485,56.9292,E36,C
307,308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (M...",female,17.0,1,0,PC 17758,108.9000,C65,C
306,307,1,1,"Fleming, Miss. Margaret",female,0.0,0,0,17421,110.8833,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
379,380,0,3,"Gustafsson, Mr. Karl Gideon",male,19.0,0,0,347069,7.7750,,S
381,382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1.0,0,2,2653,15.7417,,C
382,383,0,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.9250,,S
371,372,0,3,"Wiklund, Mr. Jakob Alfred",male,18.0,1,0,3101267,6.4958,,S


In [4]:
# Selecting a single row returns a pandas Series: a one-dimensional object holding that row's values,
# labelled by the dataframe's column names.
# iloc picks the row by position (0-based), loc picks it by index label.
third_row = df.iloc[2]
print(third_row)
df.loc[309]

Passenger ID                                  310
Survived                                        1
Passenger Class                                 1
Name               Francatelli, Miss. Laura Mabel
Sex                                        female
Age                                          30.0
SibSp                                           0
Parch                                           0
Ticket                                   PC 17485
Fare                                      56.9292
Cabin                                         E36
Embarked                                        C
Name: 309, dtype: object


Passenger ID                                  310
Survived                                        1
Passenger Class                                 1
Name               Francatelli, Miss. Laura Mabel
Sex                                        female
Age                                          30.0
SibSp                                           0
Parch                                           0
Ticket                                   PC 17485
Fare                                      56.9292
Cabin                                         E36
Embarked                                        C
Name: 309, dtype: object

In [5]:
# Pick the value at a specific row and column in a single indexing step.
# Chaining two lookups (row first, then column) builds the whole row as an
# intermediate Series before selecting from it; passing row and column together avoids that.
# iloc is purely positional, so the column name is translated to its position with get_loc.
print(df.iloc[2, df.columns.get_loc("Age")])
print(df.iloc[2, df.columns.get_loc("Name")])
df.loc[309, "Name"]

30.0
Francatelli, Miss. Laura Mabel


'Francatelli, Miss. Laura Mabel'

In [6]:
# Use the .at accessor to read and assign a single value in the dataframe.
# Note that iloc uses the row position (starting from 0) while .at and .loc use the index label; here the labels are
# the default integers assigned when the CSV was loaded, which stay attached to their rows through the sort.
age_before = df.at[309, "Age"]
print(age_before)
print(df.at[309, "Name"])

# Scale the stored age by 1.25; re-running this cell scales the already-updated value again.
df.at[309, "Age"] = age_before * 1.25
df.at[309, "Age"]

30.0
Francatelli, Miss. Laura Mabel


37.5

In [8]:
# Derive a new column from an existing one: every value of Age shifted up by 100
df["New column"] = df["Age"].add(100)

# Preview the first five rows, now including the new column
df.head()

Unnamed: 0,Passenger ID,Survived,Passenger Class,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,New column
445,446,1,1,"Dodge, Master. Washington",male,4.0,0,2,33638,81.8583,A34,S,104.0
310,311,1,1,"Hays, Miss. Margaret Bechstein",female,24.0,0,0,11767,83.1583,C54,C,124.0
309,310,1,1,"Francatelli, Miss. Laura Mabel",female,37.5,0,0,PC 17485,56.9292,E36,C,137.5
307,308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (M...",female,17.0,1,0,PC 17758,108.9,C65,C,117.0
306,307,1,1,"Fleming, Miss. Margaret",female,0.0,0,0,17421,110.8833,,C,100.0
