In [102]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [157]:
# Load the Titanic passenger table; the path is relative to the working directory.
csv_path = "titanic.csv"
df = pd.read_csv(csv_path)

In [128]:
df.info() # summarises each column's dtype and non-null count, which shows where values are missing

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [129]:
# Number of missing values in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [57]:
# We have identified the categorical columns (Name, Sex, Ticket, Cabin, Embarked).
# What does encoding do? It converts categorical data into numerical data.
# Why do we need to convert categorical data into numerical data?
# Machine-learning algorithms are mathematical functions that accept only numerical input, which is why we convert it.
# This is done with an encoding technique: one-hot encoding, label encoding, or frequency encoding.

In [83]:
# Before encoding we need to handle missing values, either by removing them or by filling them in.
# Here we have null values in Age, Cabin and Embarked.
# We will fill the categorical columns first.

In [130]:
# Fill missing values in every text (object-dtype) column with that column's
# most frequent value. The result is assigned back to the frame instead of
# calling fillna(inplace=True) on df[col]: the in-place form acts on an
# intermediate object, emits a FutureWarning in pandas 2.x and leaves df
# unchanged once copy-on-write is enabled.
# NOTE(review): Cabin has 687 of 891 values missing, so mode-filling it makes a
# single cabin label dominate the column -- consider a sentinel such as
# "Unknown" instead.
for col in df.select_dtypes(include = 'object').columns:
    modes = df[col].mode()
    # mode() is empty when the whole column is null; leave such a column as is.
    if not modes.empty:
        df[col] = df[col].fillna(modes[0])

In [131]:
# Re-check missing values after filling the text columns.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [62]:
# Now we can see that no categorical column has missing values.
# Next we have to fill the missing Age values, which we can do with the help of the scikit-learn library.

In [132]:
# Fill the missing Age values with the mean of the observed ages.
# SimpleImputer is imported from sklearn.impute in the imports cell.
si = SimpleImputer(strategy = 'mean')
# fit_transform returns a 2-D array with one column; flatten it so that a
# 1-D array is assigned to the single Age column.
df['Age'] = si.fit_transform(df[['Age']]).ravel()

In [133]:
# Re-check missing values after imputing Age.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [65]:
# Now there are no null values in our dataset.
# Next we have to convert the categorical data to numerical data using encoding.

In [134]:
# Preview the first five rows (head() defaults to 5).
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S


In [67]:
# We use a label encoder for binary columns such as yes/no or male/female, so we use it for Sex.
# We use a one-hot encoder (or pd.get_dummies) for low-cardinality columns, i.e. columns with few distinct values, so we use it for Embarked.
# We use frequency encoding for high-cardinality columns, i.e. columns with many distinct values, so we use it for Ticket and Cabin.
# We usually don't encode the whole dataset, because that can create a very large dataset and slow down training.

In [135]:
# Label-encode the binary Sex column into a new Sex_encoded column.
# LabelEncoder (imported in the imports cell) numbers the classes in sorted
# order, so 'female' becomes 0 and 'male' becomes 1.
le = LabelEncoder()
df['Sex_encoded'] = le.fit_transform(df['Sex'])

In [136]:
# Preview the first five rows to inspect the new Sex_encoded column.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encoded
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S,1


In [None]:
# From the output above we can see that Sex is encoded as male = 1, female = 0.

In [137]:
#from sklearn.preprocessing import OneHotEncoder
df = pd.get_dummies(df, columns=['Embarked'] , dtype = int)
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_encoded,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,1,0,0,1


In [140]:
# Frequency-encode Cabin: each cabin label is replaced by the number of rows
# that carry that label.
cabin_counts = df['Cabin'].value_counts()
df['Cabin_encoded'] = df['Cabin'].map(cabin_counts)

In [144]:
# Embarked has been encoded into three indicator columns.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_encoded,Embarked_C,Embarked_Q,Embarked_S,Cabin_encoded,ticket_encoded
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,1,0,0,1,691,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,0,1,0,0,1,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,0,0,0,1,691,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,0,0,0,1,2,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,1,0,0,1,691,


In [148]:
# Frequency-encode Ticket: each ticket value is replaced by the number of rows
# that share it.
ticket_counts = df['Ticket'].value_counts()
df['Ticket_encoded'] = df['Ticket'].map(ticket_counts)

In [149]:
# Ticket has now been encoded into the Ticket_encoded column.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_encoded,Embarked_C,Embarked_Q,Embarked_S,Cabin_encoded,Ticket_encoded
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,1,0,0,1,691,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,0,1,0,0,1,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,0,0,0,1,691,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,0,0,0,1,2,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,1,0,0,1,691,1


In [None]:
# Now, to avoid clutter, we can drop the original columns that have been encoded.

In [152]:
# Drop the original text columns now that their encoded counterparts exist.
# `columns=` already identifies the axis, so the redundant `axis=1` is omitted,
# and the result is reassigned rather than mutating the frame in place.
df = df.drop(columns = ['Ticket','Sex','Cabin'])

In [154]:
# Final check of the remaining columns, their dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PassengerId     891 non-null    int64  
 1   Survived        891 non-null    int64  
 2   Pclass          891 non-null    int64  
 3   Name            891 non-null    object 
 4   Age             891 non-null    float64
 5   SibSp           891 non-null    int64  
 6   Parch           891 non-null    int64  
 7   Fare            891 non-null    float64
 8   Sex_encoded     891 non-null    int32  
 9   Embarked_C      891 non-null    int32  
 10  Embarked_Q      891 non-null    int32  
 11  Embarked_S      891 non-null    int32  
 12  Cabin_encoded   891 non-null    int64  
 13  Ticket_encoded  891 non-null    int64  
dtypes: float64(2), int32(4), int64(7), object(1)
memory usage: 83.7+ KB
