In [None]:
import os
from urllib.parse import quote

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sqlalchemy import create_engine


In [None]:
# Read the uploaded training set ('train.csv') into the working DataFrame.
train_csv_path = '/mnt/data/train.csv'  # Adjust path if needed
df = pd.read_csv(train_csv_path)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Drop the columns that are not used as model features.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Fill missing values: median for Age, most frequent value for Embarked.
# The filled column is assigned back instead of calling
# df[col].fillna(..., inplace=True): that chained in-place form acts on an
# intermediate object, emits a FutureWarning in pandas 2.2+ and no longer
# updates df once Copy-on-Write is the default (pandas 3.0).
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Encode categorical features as integer codes (a fresh encoder per column).
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])

# Normalize numerical columns to zero mean / unit variance.
# NOTE(review): the scaler is fit on every row of df, i.e. before the
# train/test split performed in the next cell, so the held-out rows
# contribute to the scaling statistics. Left unchanged here because the
# scaled df is also what gets exported later; consider fitting on the
# training split only.
scaler = StandardScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

In [None]:
# Separate the target column from the feature matrix.
y = df['Survived']
X = df.drop(columns=['Survived'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
def _fit_and_record(label, params, estimator, results):
    """Fit one classifier, score it on the held-out split and log the outcome.

    Fits on the notebook-level X_train / y_train and scores on
    X_test / y_test.

    Parameters
    ----------
    label : str
        Display name stored under the 'Model' key.
    params : str
        Human-readable hyperparameter summary stored under the 'Params' key.
    estimator : object
        Unfitted estimator exposing fit(X, y) and predict(X); it is fitted
        in place.
    results : list
        List that receives one dict per evaluated model.

    Returns
    -------
    tuple
        (predictions, accuracy), where accuracy is the raw fraction returned
        by accuracy_score (not the rounded percentage stored in `results`).
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    results.append({
        'Model': label,
        'Params': params,
        # Stored as a percentage with two decimals for the comparison table.
        'Accuracy': round(accuracy * 100, 2)
    })
    return predictions, accuracy


# Reset on every run so re-executing the cell does not duplicate rows.
model_results = []

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_preds, log_acc = _fit_and_record(
    'Logistic Regression', 'max_iter=1000', log_reg, model_results
)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf_preds, rf_acc = _fit_and_record(
    'Random Forest', 'n_estimators=100, max_depth=5', rf, model_results
)

# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=3)
knn_preds, knn_acc = _fit_and_record(
    'K-Nearest Neighbors', 'n_neighbors=3', knn, model_results
)


In [None]:
# Turn the collected per-model records into a table and write it to a CSV
# file in the current working directory (without the index column).
results_df = pd.DataFrame(data=model_results)
results_df.to_csv(path_or_buf='model_results.csv', index=False)

print("✅ Model comparisons saved to model_results.csv")

# Show the comparison table as the cell output.
results_df


In [None]:
# ⚠️ Provide real credentials before running, preferably through the
# environment variables DB_USER, DB_PASSWORD, DB_HOST, DB_PORT and DB_NAME so
# that secrets are not stored in the notebook. The literals below are only
# placeholder fallbacks used when a variable is not set.
db_user = os.environ.get('DB_USER', 'your_username')
db_password = os.environ.get('DB_PASSWORD', 'your_password')
db_host = os.environ.get('DB_HOST', 'localhost')
db_port = os.environ.get('DB_PORT', '5432')
db_name = os.environ.get('DB_NAME', 'your_database_name')

# Percent-encode the user name and password so that characters such as
# '@', ':' or '/' inside them are not mistaken for URL delimiters.
# safe='' makes quote() escape '/' as well.
connection_string = 'postgresql://{}:{}@{}:{}/{}'.format(
    quote(db_user, safe=''), quote(db_password, safe=''), db_host, db_port, db_name
)
engine = create_engine(connection_string)

try:
    # Replace any existing 'titanic_data' table with the preprocessed frame.
    df.to_sql('titanic_data', engine, if_exists='replace', index=False)
    print("✅ Cleaned data saved to PostgreSQL.")
finally:
    # Always release the engine's connections, even if the export failed.
    engine.dispose()
    print("✅ Connection closed.")