In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tabpfn import TabPFNClassifier
import matplotlib.pyplot as plt
from scipy.stats import randint
import pandas as pd

# Load the passenger data into the working frame; the path is relative to the
# notebook's working directory.
df = pd.read_csv("../titanic/Titanic-Dataset.csv")

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Feature engineering on the working frame. Every step is written so that the
# cell can be executed more than once and still produce the same columns.

# Impute missing ages with the median age.
df['Age'] = df['Age'].fillna(df['Age'].median())

# 1 when a cabin value is recorded, 0 otherwise.
df['HasCabin'] = df['Cabin'].notna().astype(int)

# SibSp + Parch + 1 (the +1 presumably counts the passenger themselves).
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Encode sex as male -> 0, female -> 1. Guarded: Series.map turns every value
# that is not a key of the mapping into NaN, so applying it to the already
# encoded 0/1 column (second run of this cell) would wipe the feature.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Treat a missing port as its own category, then label-encode the ports in
# order of first appearance.
df['Embarked'] = df['Embarked'].fillna('Missing')
df['EmbarkedCat'] = pd.factorize(df['Embarked'])[0]

In [None]:
# Survival counts per embarkation port, drawn as side-by-side horizontal bars.
# `labels[i]` is the port name that was assigned code i in 'EmbarkedCat'.
labels = pd.factorize(df['Embarked'])[1]
all_codes = list(range(len(labels)))

# Re-index both outcomes on the full set of codes so a port without survivors
# (or without deaths) shows up as a zero-length bar instead of shifting the
# tick positions out of step with the labels.
location_lived = (
    df[df['Survived'] == 1].groupby('EmbarkedCat').size().reindex(all_codes, fill_value=0)
)
location_death = (
    df[df['Survived'] == 0].groupby('EmbarkedCat').size().reindex(all_codes, fill_value=0)
)

# Offset the two series around each tick; drawing both at the same y position
# would make them overlap and hide the shorter bar behind the longer one.
bar_height = 0.4
positions = location_lived.index.to_numpy()

plt.figure()
plt.barh(positions - bar_height / 2, location_lived.values, height=bar_height, label="Survived", alpha=0.7)
plt.barh(positions + bar_height / 2, location_death.values, height=bar_height, label="Died", alpha=0.7)
plt.yticks(all_codes, labels)
plt.xlabel('Number of Passengers')
plt.ylabel('Embarked Category')
plt.title('Survival Count by Embarked Category')
plt.legend()
plt.show()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Deaths per gender. The rows are restricted to non-survivors first: counting
# the 'Survived' column over every row would give the number of passengers of
# each sex, not the number of deaths the title and y-label promise.
age_gender = (
    df.loc[df['Survived'] == 0, ['Survived', 'Sex']]
    .groupby('Sex')
    .count()
)

# Show readable names on the x axis; df['Sex'] holds the codes male -> 0,
# female -> 1. Any other value is displayed as-is.
sex_names = {0: 'male', 1: 'female'}
gender_labels = [sex_names.get(code, str(code)) for code in age_gender.index]

plt.figure(figsize=(12, 6))
plt.bar(gender_labels, age_gender['Survived'])
plt.title('Deaths per Gender')
plt.ylabel('Number of Deaths')
plt.xlabel('Gender')
plt.show()



In [None]:
# Deaths per passenger class, split by sex. Only non-survivors are counted;
# grouping every row would chart the passenger head-count per class instead of
# the deaths named in the title.
deaths_per_class = (
    df.loc[df['Survived'] == 0, ['Survived', 'Sex', 'Pclass']]
    .groupby(['Pclass', 'Sex'])
    .size()
    .unstack(fill_value=0)
)

# Rename the 0/1 sex codes (male -> 0, female -> 1) for the legend only; the
# stored table keeps the encoded column labels.
deaths_per_class.rename(columns={0: 'male', 1: 'female'}).plot(kind='bar')
plt.title('Deaths per Class by Sex')
plt.xlabel('Passenger Class')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.legend(title='Sex')
plt.show()

In [None]:
# Age distribution of survivors vs. non-survivors, one figure per sex.
# df['Sex'] is encoded as male -> 0, female -> 1, so each subset must be
# selected with the matching code or the two figures end up under the wrong
# title.
male = df[df['Sex'] == 0]
female = df[df['Sex'] == 1]

for subset, title in (
    (male, "Male Distribution by age"),
    (female, "Female Distribution by age"),
):
    plt.figure()
    plt.hist(subset.loc[subset["Survived"] == 1, "Age"], bins=30, alpha=0.3, label="Survived", color='blue')
    plt.hist(subset.loc[subset["Survived"] == 0, "Age"], bins=30, alpha=0.5, label="Died", color='red')
    plt.xlabel("Age")
    plt.ylabel("Count")
    plt.title(title)
    plt.legend()
    plt.show()


In [None]:
# Passenger counts for every (family size, survival flag) combination, shown
# as one group of bars per family size.
family_matters = pd.crosstab(index=df['FamilySize'], columns=df['Survived'])

family_axes = family_matters.plot.bar(figsize=(10, 6))
family_axes.set_xlabel("Family Size")
family_axes.set_ylabel("Number of Passengers")
family_axes.set_title("Survival Counts by Family Size")
# Crosstab columns are sorted, so the first series is Survived == 0.
family_axes.legend(["Died", "Survived"])
plt.show()


In [None]:
# Model inputs. 'PassengerId' is deliberately not a feature: it is a row
# identifier rather than a property of the passenger, so a tree model can only
# use it to memorise individual training rows.
features = ['Pclass', 'Sex', 'Age', 'Fare', 'HasCabin', 'FamilySize', 'IsAlone', 'EmbarkedCat']

X = df[features]
y = df['Survived']

# Hold out 20% of the rows for evaluation. Stratifying on the target keeps the
# survival rate the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
def _neighbourhood(center, step, lower):
    """Return the candidates around `center` for a refined grid search.

    Produces `center - step`, `center` and `center + step`, clamps each value
    to be at least `lower`, removes duplicates created by the clamping and
    returns the result as a sorted list of plain ints.
    """
    center = int(center)
    return sorted({max(lower, center - step), max(lower, center), max(lower, center + step)})


# Seeded so that repeated runs of the search evaluate the same forests.
clf = RandomForestClassifier(random_state=42)

# Stage 1: coarse randomized search over wide ranges.
param_dist = {
    'n_estimators': randint(100, 1000),      # Number of trees
    'max_depth': randint(2, 20),             # Max depth of each tree
    'min_samples_split': randint(2, 20),     # Minimum samples required to split a node
    'min_samples_leaf': randint(1, 20),      # Minimum samples required at a leaf node
    'max_features': ['sqrt', 'log2', None],  # Number of features to consider at each split
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=200,          # Number of parameter settings sampled
    cv=5,                # 5-fold cross-validation
    scoring='accuracy',
    random_state=42,
    n_jobs=-1            # Use all cores
)

random_search.fit(X_train, y_train)
best = random_search.best_params_

# Stage 2: exhaustive search in a small neighbourhood of the best random
# candidate. Each neighbourhood is clamped to the smallest value the forest
# accepts for that parameter; without the clamp a best value at the low end of
# its range yields invalid candidates (e.g. max_depth 0, min_samples_split 1,
# min_samples_leaf 0, n_estimators 0) whose fits fail.
param_grid = {
    'n_estimators': _neighbourhood(best['n_estimators'], 100, lower=1),
    'max_depth': _neighbourhood(best['max_depth'], 2, lower=1),
    'min_samples_split': _neighbourhood(best['min_samples_split'], 1, lower=2),
    'min_samples_leaf': _neighbourhood(best['min_samples_leaf'], 1, lower=1),
    'max_features': [best['max_features']],          # keep fixed
    'bootstrap': [best['bootstrap']]                 # keep fixed
}

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1            # Same parallelism as the randomized search
)

grid_search.fit(X_train, y_train)

In [None]:
# Accuracy of the tuned random forest on the hold-out split.
y_pred = grid_search.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred)
print(holdout_accuracy)

I had used the wrong dataset for training/testing, so below I repeat the same process on the correct one:

In [None]:
# Build the submission file: give the submission data the same preprocessing
# as the training frame `df`, predict with the tuned model and write the
# PassengerId / Survived pairs to disk.
submission_df = pd.read_csv("test.csv")

# Impute with the statistics of the training frame so that both frames are
# transformed by the same rule. The Fare fill is a no-op when the column has
# no gaps.
submission_df['Age'] = submission_df['Age'].fillna(df['Age'].median())
submission_df['Fare'] = submission_df['Fare'].fillna(df['Fare'].median())

submission_df['HasCabin'] = submission_df['Cabin'].notna().astype(int)
submission_df['FamilySize'] = submission_df['SibSp'] + submission_df['Parch'] + 1
submission_df['IsAlone'] = (submission_df['FamilySize'] == 1).astype(int)
submission_df['Sex'] = submission_df['Sex'].map({'male': 0, 'female': 1})
submission_df['Embarked'] = submission_df['Embarked'].fillna('Missing')

# Reuse the port -> code assignment of the training frame. pd.factorize
# numbers values in order of first appearance, so factorizing this frame on
# its own would hand out different codes for the same ports than the ones the
# model was trained on. A port that never occurs in `df` gets the code -1.
embarked_categories = pd.factorize(df['Embarked'])[1]
submission_df['EmbarkedCat'] = pd.Categorical(
    submission_df['Embarked'], categories=embarked_categories
).codes

y_pred = grid_search.predict(submission_df[features])

pred_df = pd.DataFrame({
    'PassengerId': submission_df['PassengerId'],
    'Survived': y_pred
})

pred_df.to_csv('predictions.csv', index=False)


In [None]:
# accuracy_score needs both the true and the predicted labels; calling it with
# a single argument raises a TypeError. The labelled data available here is
# the hold-out split, so report the tuned model's accuracy on that.
# NOTE(review): assumes the submission frame has no ground-truth labels to
# score against - confirm.
accuracy_score(y_test, grid_search.predict(X_test))

### Testing TabPFN

Reference: https://github.com/PriorLabs/TabPFN

In [None]:
# Fit a TabPFN classifier with its default settings on the training split.
# Note: this rebinds `clf`, which earlier in the notebook referred to the
# random forest estimator.
clf = TabPFNClassifier()    
clf.fit(X_train, y_train)

In [None]:
# Score the fitted TabPFN model on the hold-out split: overall accuracy plus
# the confusion matrix.
y_pred = clf.predict(X_test)

tabpfn_accuracy = accuracy_score(y_test, y_pred)
tabpfn_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", tabpfn_accuracy)
print("Confusion Matrix:\n", tabpfn_confusion)