In [2]:
import pandas as pd

# Load the Titanic passenger data from the CSV file next to the notebook
df = pd.read_csv('titanic (1).csv')

# Preview the first five rows of the raw frame
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

In [3]:
# Count missing values per column before any imputation
print(df.isnull().sum())

# Fill missing Age with the column median.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on df['Age']: that form is a chained in-place update on
# a column selection, which recent pandas 2.x releases warn about and which
# does not modify df once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill missing Embarked with the most frequent value (mode() returns a Series;
# [0] takes its first entry). Assigned back for the same reason as Age.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop the Cabin column (too many missing values)
df.drop(columns=['Cabin'], inplace=True)


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [4]:
# Extract the title from Name: the run of letters that follows a space and is
# terminated by a period (e.g. "Mr" in "Braund, Mr. Owen Harris").
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot without Python flagging it as an invalid string escape.
# expand=False returns a Series because the pattern has a single capture group.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# FamilySize = siblings/spouses + parents/children + the passenger themselves
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# IsAlone is 1 when the passenger has no family aboard (FamilySize == 1), else 0
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)


In [5]:
# Binary-encode Sex as an integer flag: male -> 0, female -> 1.
# NOTE: map() yields NaN for any value not in the mapping, so this cell is
# meant to be run once against the original string column.
sex_codes = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(sex_codes)

# Expand Embarked and Title into indicator columns; drop_first=True omits the
# first level of each variable so the remaining dummies are not redundant.
categorical_cols = ['Embarked', 'Title']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [6]:
from sklearn.preprocessing import StandardScaler

# Continuous / count features to standardize
numerical_cols = ['Age', 'Fare', 'FamilySize']

# Learn each column's mean and standard deviation from the current frame
scaler = StandardScaler()
scaler.fit(df[numerical_cols])

# Overwrite the columns with their standardized values
df[numerical_cols] = scaler.transform(df[numerical_cols])


In [7]:
# Remove the columns treated as irrelevant here: Name (Title was already
# extracted from it in an earlier cell), Ticket and PassengerId.
df.drop(['Name', 'Ticket', 'PassengerId'], axis='columns', inplace=True)

# Preview the first five rows of the processed dataframe
print(df.head(5))


   Survived  Pclass  Sex       Age  SibSp  Parch      Fare  FamilySize  \
0         0       3    0 -0.565736      1      0 -0.502445    0.059160   
1         1       1    1  0.663861      1      0  0.786845    0.059160   
2         1       3    1 -0.258337      0      0 -0.488854   -0.560975   
3         1       1    1  0.433312      1      0  0.420730    0.059160   
4         0       3    0  0.433312      0      0 -0.486337   -0.560975   

   IsAlone  Embarked_Q  ...  Title_Major  Title_Master  Title_Miss  \
0        0       False  ...        False         False       False   
1        0       False  ...        False         False       False   
2        1       False  ...        False         False        True   
3        0       False  ...        False         False       False   
4        1       False  ...        False         False       False   

   Title_Mlle  Title_Mme  Title_Mr  Title_Mrs  Title_Ms  Title_Rev  Title_Sir  
0       False      False      True      False     Fals