In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Load a dataset from a CSV file.
def load_dataset(filepath):
    """Read the CSV at ``filepath`` and return its contents as a DataFrame.

    ``filepath`` is handed straight to ``pd.read_csv``, so anything that
    function accepts (a path string, a file-like object, ...) works here.
    """
    frame = pd.read_csv(filepath)
    return frame

In [9]:
# Show the dtype, non-null count and memory usage of every column.
def show_info(df):
    """Print the ``DataFrame.info()`` summary of ``df``.

    ``DataFrame.info()`` writes its report to stdout itself and returns
    ``None``; wrapping it in ``print`` would add a stray "None" line after the
    report, so it is called directly.
    """
    df.info()

In [10]:
# Show how many null values each column contains.
def show_missing_values(df):
    """Print the per-column count of null cells in ``df``."""
    null_counts = df.isna().sum()
    print(null_counts)

In [11]:
# Fill the null cells of every categorical (object-dtype) column with that
# column's mode, i.e. its most frequent value.
def fill_categorical_nulls(df):
    """Impute missing values in the object-dtype columns of ``df`` with the mode.

    The frame is modified in place and also returned, so both
    ``fill_categorical_nulls(df)`` and ``df = fill_categorical_nulls(df)`` work.

    Columns that contain only nulls have no mode and are left untouched
    instead of raising.
    """
    for col in df.select_dtypes(include='object').columns:
        if not df[col].isnull().any():
            continue
        modes = df[col].mode()
        if modes.empty:
            # Every value is null: there is nothing to impute with.
            continue
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form operates on a temporary and does not update
        # `df` when pandas copy-on-write is active.
        df[col] = df[col].fillna(modes.iloc[0])
    return df

In [12]:
# Fill the null cells of every numeric column with that column's mean, median
# or mode, depending on `strategy`.
def fill_numerical_nulls(df, strategy='mean'):
    """Impute missing values in the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    strategy : {'mean', 'median', 'mode'}, default 'mean'
        Statistic of each column used as its fill value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with numeric nulls filled where possible.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported names.

    Notes
    -----
    A column that is entirely null has no usable statistic and is left as is.
    """
    if strategy not in ('mean', 'median', 'mode'):
        # Fail loudly on a typo instead of silently leaving the nulls in place.
        raise ValueError("Strategy must be one of 'mean', 'median' or 'mode'")

    for col in df.select_dtypes(include=np.number).columns:
        series = df[col]
        if not series.isnull().any():
            continue

        if strategy == 'mean':
            fill_value = series.mean()
        elif strategy == 'median':
            fill_value = series.median()
        else:
            modes = series.mode()
            if modes.empty:
                # All values are null, so the column has no mode.
                continue
            fill_value = modes.iloc[0]

        if pd.isna(fill_value):
            # Mean/median of an all-null column is NaN; nothing to fill with.
            continue

        # Assign back rather than using a chained `fillna(..., inplace=True)`,
        # which does not update `df` when pandas copy-on-write is active.
        df[col] = series.fillna(fill_value)
    return df

In [13]:
# After filling the null values we have to encode the categorical data.
# This is done with a one-hot encoder, a label encoder or a frequency encoder.
# Why do we need to convert categorical data into numerical data?
# Machine learning algorithms are mathematical functions that accept only numerical input, which is why we convert it.
# We use a label encoder for binary columns such as yes/no, male/female, etc.
# We use a one-hot encoder (or pd.get_dummies) for low-cardinality columns, i.e. columns with few distinct values.
# We use a frequency encoder for high-cardinality columns, i.e. columns with many distinct values.

# Function to encode all categorical (object-dtype) columns.
def encode_all_categoricals(df, method='onehot'):
    """Return a copy of ``df`` with every object-dtype column encoded as numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    method : {'onehot', 'label'}, default 'onehot'
        * ``'onehot'`` - replace each categorical column with 0/1 indicator
          columns via ``pd.get_dummies`` (the first level of each column is
          dropped). Any boolean column in the result is cast to integers.
        * ``'label'`` - replace each categorical column with integer codes
          ``0..n-1`` assigned in sorted order of its distinct values. Null
          cells get the code ``-1``.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'onehot'`` nor ``'label'``.
    """
    cat_cols = df.select_dtypes(include='object').columns

    if method == 'onehot':
        df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
        # Convert boolean columns to 0/1
        bool_cols = df.select_dtypes(include='bool').columns
        if len(bool_cols) > 0:
            df[bool_cols] = df[bool_cols].astype(int)
        return df
    elif method == 'label':
        # Work on a copy so that, like the one-hot path, the caller's frame
        # is left untouched.
        df = df.copy()
        for col in cat_cols:
            # Categorical codes follow the sorted distinct values, which is the
            # numbering a label encoder produces, without an extra dependency.
            df[col] = df[col].astype('category').cat.codes.astype(int)
        return df
    else:
        raise ValueError("Method must be either 'onehot' or 'label'")


In [15]:
# Load the raw data, then inspect its column dtypes and per-column null counts.
df = load_dataset("titanic.csv")
show_info(df)
show_missing_values(df)

# Impute categorical columns with the mode and numeric columns with the
# default strategy ('mean'), then re-check the null counts.
df = fill_categorical_nulls(df)
df = fill_numerical_nulls(df)
show_missing_values(df)

print("ONE HOT ENCODING OR LABEL ENCODING OR FREQUENCY ENCODING")
# NOTE: encode_all_categoricals only accepts 'onehot' or 'label'; any other
# method name raises ValueError.
df_encoded = encode_all_categoricals(df, method='onehot')  # or 'label'
df_encoded.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,"Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)","Name_Abelson, Mr. Samuel",...,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
# Summarise the encoded frame: index range, column count, dtypes and memory usage.
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Columns: 1726 entries, PassengerId to Embarked_S
dtypes: float64(2), int32(1719), int64(5)
memory usage: 5.9 MB
