In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Load dataset
df = pd.read_csv('C:/Users/HP/Downloads/titanic.csv')  # Replace with your dataset path
print("Initial Dataset Shape:", df.shape)
df.head()


Initial Dataset Shape: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#Step 1: Transform Categorical Data into Numeric

#One-Hot Encoding for Nominal Variables

In [3]:
# One-hot encoding for 'Embarked' and 'Pclass' columns
df = pd.get_dummies(df, columns=['Embarked', 'Pclass'], drop_first=True)
print("Dataset after One-Hot Encoding:\n", df.head())


Dataset after One-Hot Encoding:
    PassengerId  Survived                                               Name  \
0            1         0                            Braund, Mr. Owen Harris   
1            2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3         1                             Heikkinen, Miss. Laina   
3            4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5         0                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin  Embarked_Q  \
0    male  22.0      1      0         A/5 21171   7.2500   NaN           0   
1  female  38.0      1      0          PC 17599  71.2833   C85           0   
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN           0   
3  female  35.0      1      0            113803  53.1000  C123           0   
4    male  35.0      0      0            373450   8.0500   NaN           0   

   Embarked_S  Pclass_2

Explanation
We are using pd.get_dummies to apply one-hot encoding to Embarked and Pclass.
Setting drop_first=True avoids the dummy variable trap by dropping the first column for each encoded feature.


In [4]:
#Step 2: Feature Engineering

In [5]:
# Creating age groups (e.g., Child, Teen, Adult, Senior)
bins = [0, 12, 18, 60, 100]
labels = ['Child', 'Teen', 'Adult', 'Senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)

# One-hot encode the new 'AgeGroup' feature
df = pd.get_dummies(df, columns=['AgeGroup'], drop_first=True)
print("Dataset after Adding AgeGroup Feature:\n", df.head())


Dataset after Adding AgeGroup Feature:
    PassengerId  Survived                                               Name  \
0            1         0                            Braund, Mr. Owen Harris   
1            2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3         1                             Heikkinen, Miss. Laina   
3            4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5         0                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin  Embarked_Q  \
0    male  22.0      1      0         A/5 21171   7.2500   NaN           0   
1  female  38.0      1      0          PC 17599  71.2833   C85           0   
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN           0   
3  female  35.0      1      0            113803  53.1000  C123           0   
4    male  35.0      0      0            373450   8.0500   NaN           0   

   Embarked_S  P

In [6]:
# Creating 'FamilySize' feature
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1  # +1 includes the passenger
print("Dataset after Adding FamilySize Feature:\n", df.head())


Dataset after Adding FamilySize Feature:
    PassengerId  Survived                                               Name  \
0            1         0                            Braund, Mr. Owen Harris   
1            2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3         1                             Heikkinen, Miss. Laina   
3            4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5         0                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin  Embarked_Q  \
0    male  22.0      1      0         A/5 21171   7.2500   NaN           0   
1  female  38.0      1      0          PC 17599  71.2833   C85           0   
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN           0   
3  female  35.0      1      0            113803  53.1000  C123           0   
4    male  35.0      0      0            373450   8.0500   NaN           0   

   Embarked_S 

In [7]:
# Creating 'IsAlone' feature
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
print("Dataset after Adding IsAlone Feature:\n", df.head())


Dataset after Adding IsAlone Feature:
    PassengerId  Survived                                               Name  \
0            1         0                            Braund, Mr. Owen Harris   
1            2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3         1                             Heikkinen, Miss. Laina   
3            4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5         0                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin  Embarked_Q  \
0    male  22.0      1      0         A/5 21171   7.2500   NaN           0   
1  female  38.0      1      0          PC 17599  71.2833   C85           0   
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN           0   
3  female  35.0      1      0            113803  53.1000  C123           0   
4    male  35.0      0      0            373450   8.0500   NaN           0   

   Embarked_S  Pc

Explanation: We engineered AgeGroup to categorize passengers by age, FamilySize to reflect traveling group size, and IsAlone to indicate if a passenger is alone. These features add meaningful insights for analysis.

In [8]:
#Step 3: Aggregation Functions

In [12]:
# Group by Pclass and calculate average Fare and Age for each class
pclass_aggregated = df.groupby('Pclass_2').agg({'Fare': 'mean', 'Age': 'mean'}).reset_index()
print("Aggregated Data by Pclass:\n", pclass_aggregated)


Aggregated Data by Pclass:
    Pclass_2       Fare        Age
0         0  35.208073  29.642033
1         1  20.662183  29.877630


Explanation: By grouping data by Pclass_2, we can observe average fare and age for each class, providing insights on fare distribution and demographics.

In [13]:
print("Final Dataset Shape:", df.shape)
df.head()


Final Dataset Shape: (891, 19)


Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,AgeGroup_Teen,AgeGroup_Adult,AgeGroup_Senior,FamilySize,IsAlone
0,1,0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,0,1,0,1,0,1,0,2,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,0,0,0,0,0,1,0,2,0
2,3,1,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,1,0,1,0,1,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,0,1,0,0,0,1,0,2,0
4,5,0,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,0,1,0,1,0,1,0,1,1
