In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Load seaborn's bundled Titanic dataset. This variant has no 'name' or
# 'ticket' column (see the printed header below).
# NOTE: reset_index() materialises the row position as a regular 'index'
# column, which then travels with the frame through every later step.
df = (
    sns.load_dataset('titanic')
    .reset_index()
)

# Quick look at the first rows, then the dtype / non-null summary.
print("First 5 rows of the dataset:", df.head(), sep="\n")
print("\nDataset Summary (info()):")
df.info()

First 5 rows of the dataset:
   index  survived  pclass     sex   age  sibsp  parch     fare embarked  \
0      0         0       3    male  22.0      1      0   7.2500        S   
1      1         1       1  female  38.0      1      0  71.2833        C   
2      2         1       3  female  26.0      0      0   7.9250        S   
3      3         1       1  female  35.0      1      0  53.1000        S   
4      4         0       3    male  35.0      0      0   8.0500        S   

   class    who  adult_male deck  embark_town alive  alone  
0  Third    man        True  NaN  Southampton    no  False  
1  First  woman       False    C    Cherbourg   yes  False  
2  Third  woman       False  NaN  Southampton   yes   True  
3  First  woman       False    C  Southampton   yes  False  
4  Third    man        True  NaN  Southampton    no   True  

Dataset Summary (info()):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column     

In [None]:
# Report the per-column null counts before any imputation.
print("\nMissing values before handling:")
print(df.isnull().sum())

# Impute by assigning the filled Series back to its column.
# The previous `df[col].fillna(value, inplace=True)` form operates on an
# intermediate object; pandas emits a FutureWarning for it and, from
# pandas 3.0 on, it no longer modifies `df` at all.
df['age'] = df['age'].fillna(df['age'].median())

# 'embarked' is categorical text, so use its most frequent value.
mode_embarked = df['embarked'].mode()[0]
df['embarked'] = df['embarked'].fillna(mode_embarked)

# 'deck' is missing for most rows, so drop it rather than impute it.
# errors='ignore' keeps this cell re-runnable once the column is gone.
df = df.drop(columns='deck', errors='ignore')

# NOTE: 'embark_town' is not handled in this cell and still reports
# missing values in the summary printed below.
print("\nMissing values after handling (before dropping original text columns):")
print(df.isnull().sum())


Missing values before handling:
index            0
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

Missing values after handling (before dropping original text columns):
index          0
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    2
alive          0
alone          0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['embarked'].fillna(mode_embarked, inplace=True)


In [None]:
# Tukey fences: anything beyond 1.5 * IQR outside the quartiles is treated
# as an outlier and capped to the nearest fence (values are kept, not dropped).
Q1, Q3 = df['fare'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Cap both tails in one pass; IQR >= 0 guarantees lower_bound <= upper_bound.
df['fare'] = df['fare'].clip(lower=lower_bound, upper=upper_bound)

print("\nFare statistics after outlier handling:")
print(df['fare'].describe())


Fare statistics after outlier handling:
count    891.000000
mean      24.046813
std       20.481625
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max       65.634400
Name: fare, dtype: float64


In [None]:
print("--- Step 4: Feature Engineering ---")

# 'title' is a straight copy of the existing 'who' column.
df['title'] = df['who']

# Family size = siblings/spouses + parents/children + the passenger.
df['familysize'] = df['sibsp'].add(df['parch']).add(1)

# 1 when the passenger travels with no relatives aboard, otherwise 0.
df['isalone'] = (df['familysize'] == 1).astype(int)

engineered_preview = df[['who', 'title', 'familysize', 'isalone']].head()
print("Engineered features (first 5 rows):")
print(engineered_preview)
print("-" * 50)

--- Step 4: Feature Engineering ---
Engineered features (first 5 rows):
     who  title  familysize  isalone
0    man    man           2        0
1  woman  woman           2        0
2  woman  woman           1        1
3  woman  woman           2        0
4    man    man           1        1
--------------------------------------------------


In [None]:
print("--- Step 5: Categorical Encoding and Column Dropping ---")

# Print columns before encoding to debug
print("Columns before encoding:", df.columns.tolist())

# 'embark_town' still contains missing values at this point. With
# drop_first=True a row whose town is NaN gets all-zero dummies, which is
# indistinguishable from the dropped baseline category. Impute with the most
# frequent town first so those rows are not silently assigned to the baseline.
df['embark_town'] = df['embark_town'].fillna(df['embark_town'].mode()[0])

# One-hot encode 'embark_town'; the prefix yields 'embarked_<Town>' columns.
df = pd.get_dummies(df, columns=['embark_town'], prefix=['embarked'], drop_first=True)

# Drop the original text columns and redundant/intermediate columns.
# Some names listed here are not present in the frame at this point;
# errors='ignore' makes the drop tolerant of that.
# NOTE(review): 'sex' is removed here without an encoded replacement, so the
# resulting frame carries no gender feature - confirm this is intended.
columns_to_drop = ['name', 'embarked', 'class', 'who', 'adult_male', 'alive', 'alone', 'ticket', 'sex', 'title', 'deck']
df = df.drop(columns=columns_to_drop, errors='ignore')

print("DataFrame columns after encoding and dropping:")
print(df.columns.tolist())
print("-" * 50)

--- Step 5: Categorical Encoding and Column Dropping ---
Columns before encoding: ['index', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive', 'alone', 'title', 'familysize', 'isalone']
DataFrame columns after encoding and dropping:
['index', 'survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'familysize', 'isalone', 'embarked_Queenstown', 'embarked_Southampton']
--------------------------------------------------
