In [2]:
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('titanic.csv')

# Fill missing 'Age' values with the median age
df['Age'].fillna(df['Age'].median(), inplace=True)

# Fill missing 'Embarked' values with the mode (most common value)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Drop the 'Cabin' column because it has too many missing values
df.drop(columns=['Cabin'], inplace=True)

# Extract titles from names
df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

# Replace rare titles with 'Other'
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
df['Title'] = df['Title'].replace(rare_titles, 'Other')

# Replace different spellings of similar titles
df['Title'] = df['Title'].replace('Mlle', 'Miss')
df['Title'] = df['Title'].replace('Ms', 'Miss')
df['Title'] = df['Title'].replace('Mme', 'Mrs')

# Create a 'FamilySize' feature
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Create an 'IsAlone' feature
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Convert 'Sex' to numeric
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# One-hot encode 'Embarked' and 'Title'
df = pd.get_dummies(df, columns=['Embarked', 'Title'], drop_first=True)

# Drop columns that won't be used in the model
df.drop(columns=['Name', 'Ticket', 'PassengerId'], inplace=True)

# Select numeric columns for outlier detection
numeric_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']
X_numeric = df[numeric_cols]

# Apply Local Outlier Factor
lof = LocalOutlierFactor()
outliers = lof.fit_predict(X_numeric)

# Add LOF results to the DataFrame
df['LOF'] = outliers

# Remove outliers (LOF result -1 indicates outliers)
df = df[df['LOF'] != -1]

# Drop the LOF column as it's no longer needed
df.drop(columns=['LOF'], inplace=True)

# Define features and target
X = df.drop(columns=['Survived'])
y = df['Survived']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the first few rows of the cleaned and engineered DataFrame
df.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Title_the Countess
0,0,3,0,22.0,1,0,7.25,2,0,False,True,False,True,False,False,False
1,1,1,1,38.0,1,0,71.2833,2,0,False,False,False,False,True,False,False
2,1,3,1,26.0,0,0,7.925,1,1,False,True,True,False,False,False,False
3,1,1,1,35.0,1,0,53.1,2,0,False,True,False,False,True,False,False
4,0,3,0,35.0,0,0,8.05,1,1,False,True,False,True,False,False,False



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

