# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets
# Both CSVs are stacked into one frame so the imputation statistics and the
# engineered features below are computed over every available row.
# NOTE(review): test.csv presumably carries no 'Survived' column, in which case
# its rows hold NaN there after the concat -- confirm against the data files.
df1 = pd.read_csv('train.csv')
df2 = pd.read_csv('test.csv')
train = pd.concat([df1, df2], ignore_index=True)

# Inspect dataset
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
train.info()

# Impute gaps: Age gets the column median, Embarked gets the first of its
# most frequent values.
age_median = train['Age'].median()
most_common_port = train['Embarked'].mode()[0]
train.loc[:, 'Age'] = train['Age'].fillna(age_median)
train.loc[:, 'Embarked'] = train['Embarked'].fillna(most_common_port)

# Derived features:
#   FamilySize = SibSp + Parch + 1
#   IsAlone    = 1 when FamilySize is exactly 1, otherwise 0
train['FamilySize'] = train['SibSp'].add(train['Parch']).add(1)
train['IsAlone'] = train['FamilySize'].eq(1).astype(int)

# Explore how each candidate feature relates to the 'Survived' column
features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
    'Fare', 'Embarked', 'FamilySize', 'IsAlone',
]

for feature in features:
    print(f"\nAnalysis of {feature} vs Survival:\n")

    # A column is handled as numerical only when it is not object-typed AND
    # has more than 10 distinct values; everything else is grouped by value.
    column = train[feature]
    is_numerical = column.dtype != 'object' and len(column.unique()) > 10

    if is_numerical:
        # Histogram (with KDE) split by the 'Survived' value, followed by the
        # pairwise correlation matrix of the feature and 'Survived'.
        plt.figure(figsize=(8, 4))
        sns.histplot(
            data=train,
            x=feature,
            hue='Survived',
            kde=True,
            element='step',
        )
        plt.title(f'{feature} Distribution by Survival')
        plt.xlabel(feature)
        plt.ylabel('Frequency')
        plt.show()
        print(train[[feature, 'Survived']].corr())
        continue

    # Mean of 'Survived' per distinct feature value, printed and drawn as bars.
    survival_rate = train.groupby(feature)['Survived'].mean()
    print(survival_rate)
    survival_rate.plot(kind='bar', title=f'{feature} vs Survival Rate')
    plt.xlabel(feature)
    plt.ylabel('Survival Rate')
    plt.show()

# Conclusion: Identify top 3 features based on correlation
# The ranking is computed from the data instead of being hard-coded, so the
# printed conclusion always matches the frame that was actually analysed.
# Only rows that have a 'Survived' value can contribute to a correlation.
labelled_rows = train.dropna(subset=['Survived'])

correlation_strength = {}
for candidate in features:
    candidate_column = labelled_rows[candidate]
    if pd.api.types.is_numeric_dtype(candidate_column):
        indicator_columns = candidate_column.to_frame()
    else:
        # Non-numeric columns have no meaningful numeric order, so each
        # distinct value becomes its own 0/1 indicator column and the
        # strongest indicator stands in for the feature.
        indicator_columns = pd.get_dummies(candidate_column, dtype=float)
    correlation_strength[candidate] = (
        indicator_columns.corrwith(labelled_rows['Survived']).abs().max()
    )

# Rank by absolute correlation; undefined values (e.g. from a constant
# column) are dropped so they can never appear in the top 3.
top_factors = (
    pd.Series(correlation_strength, dtype=float)
    .dropna()
    .sort_values(ascending=False)
    .head(3)
)

print("\nTop 3 factors most correlated with survival:")
for factor_name, factor_strength in top_factors.items():
    print(f"{factor_name} (|r| = {factor_strength:.3f})")
