In [None]:
# 1. IMPORT LIBRARIES
# ======================================================
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


# ======================================================
# 2. LOAD DATASET
# ======================================================
# Name of the label column; it is dropped from the feature set further down.
TARGET = "Survived"

# Read both competition CSVs in one pass: first the labelled rows, then the
# rows that have to be scored.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/yashdata01/train.csv",
        "/kaggle/input/yashdata01/test.csv",
    )
)



In [None]:
# A notebook cell only renders its LAST bare expression, so a bare
# `train.head()` followed by `test.head()` would silently discard the first
# preview. `display` (injected into the namespace by the IPython kernel)
# renders both frames.
display(train.head())
display(test.head())

In [None]:
# ======================================================
# 3. DATA CLEANING & PREPROCESSING
# ======================================================
# Remove exact duplicate rows from the training data (once). Rows are never
# dropped from `test`: every test row has to receive a prediction.
train = train.drop_duplicates()

# Drop high-cardinality / irrelevant columns.
# errors="ignore" keeps the cell re-runnable after the columns are gone.
drop_cols = ["Name", "Ticket", "Cabin"]
train = train.drop(columns=drop_cols, errors="ignore")
test = test.drop(columns=drop_cols, errors="ignore")

# Separate numerical & categorical columns (the label is not a feature).
num_cols = train.select_dtypes(include=["int64", "float64"]).columns.drop(TARGET)
cat_cols = train.select_dtypes(include=["object"]).columns

# Handle missing values.
# The fill values are learned from the TRAINING data only and reused for the
# test data, so both frames are imputed with the same statistics and nothing
# is estimated from the rows we are going to score.
num_fill = train[num_cols].median()
train[num_cols] = train[num_cols].fillna(num_fill)
test[num_cols] = test[num_cols].fillna(num_fill)

# Guarded because `.mode().iloc[0]` raises IndexError when there are no
# object columns (e.g. when this cell is re-run after label encoding).
if len(cat_cols) > 0:
    cat_fill = train[cat_cols].mode().iloc[0]
    train[cat_cols] = train[cat_cols].fillna(cat_fill)
    test[cat_cols] = test[cat_cols].fillna(cat_fill)


In [None]:
# ======================================================
# 4. DATA VISUALIZATION & OUTLIER ANALYSIS
# ======================================================
# Both figures look at the same numeric slice of the training data.
numeric_view = train[num_cols]

# Figure 1: one histogram panel per numeric feature.
numeric_view.hist(figsize=(12, 8))
plt.suptitle("Feature Distributions")
plt.show()

# Figure 2: side-by-side box plots to eyeball outliers.
box_fig, box_ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=numeric_view, ax=box_ax)
box_ax.set_title("Outlier Analysis")
plt.setp(box_ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# ======================================================
# 5. ENCODING CATEGORICAL VARIABLES
# ======================================================
# OrdinalEncoder is the feature-side counterpart of LabelEncoder (which is
# designed for target vectors). Categories are learned from `train` only;
# a category that appears in `test` but not in `train` is mapped to -1
# instead of raising ValueError, as LabelEncoder.transform would.
if len(cat_cols) > 0:
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    train_codes = encoder.fit_transform(train[cat_cols]).astype(int)
    test_codes = encoder.transform(test[cat_cols]).astype(int)

    # Assign column by column so each column gets a real integer dtype.
    for position, col in enumerate(cat_cols):
        train[col] = train_codes[:, position]
        test[col] = test_codes[:, position]


In [None]:
# ======================================================
# 6. CORRELATION ANALYSIS
# ======================================================
# Pairwise correlations over every column of the training frame,
# including the target.
corr_matrix = train.corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, ax=corr_ax)
corr_ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# ======================================================
# 7. FEATURE SCALING
# ======================================================
# Feature matrix = every column except the label; target = the label column.
X = train.drop(columns=[TARGET])
y = train[TARGET]

# NOTE(review): this scaler sees every labelled row, so any validation split
# carved out of X_scaled is scaled with statistics that include the
# validation rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Select the test columns explicitly, in training order, so the scaler is
# always applied to exactly the features it was fitted on. A missing column
# fails loudly with a KeyError here.
test_scaled = scaler.transform(test[X.columns])


In [None]:
# ======================================================
# 8. TRAIN–TEST SPLIT
# ======================================================
# Split the UNSCALED features first. Stratifying on y keeps the class ratio
# equal in both folds; the fixed random_state makes the partition repeatable.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training fold only, then apply it to the validation
# fold and the test set. Fitting on all rows (as X_scaled does) would let the
# validation rows influence the mean/std and make validation scores
# optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
test_scaled = scaler.transform(test[X.columns])

In [None]:
# ======================================================
# 9. MODEL TRAINING
# ======================================================
# Candidate classifiers, keyed by the display name used in the evaluation
# printout. Every estimator that accepts a seed is seeded so that
# Restart & Run All reproduces the same scores: DecisionTreeClassifier
# shuffles features when searching for splits, and SVC(probability=True)
# runs an internal randomised cross-validation to calibrate probabilities.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True, random_state=42)
}


In [None]:
# ======================================================
# 10. MODEL EVALUATION
# ======================================================
# Fit each candidate on the training fold and report hold-out metrics.
# zero_division=0 keeps precision / F1 defined (as 0) for a class the model
# never predicts, instead of emitting UndefinedMetricWarning.
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    # The bullet was stored as mis-decoded UTF-8 bytes; restored to the
    # intended character.
    print(f"\n🔹 {name}")
    print("Accuracy :", accuracy_score(y_val, preds))
    print("Precision:", precision_score(y_val, preds, average="weighted", zero_division=0))
    print("Recall   :", recall_score(y_val, preds, average="weighted", zero_division=0))
    print("F1 Score :", f1_score(y_val, preds, average="weighted", zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_val, preds))
    print("Classification Report:\n", classification_report(y_val, preds, zero_division=0))


In [None]:
# ======================================================
# 11. HYPER-PARAMETER TUNING (RANDOM FOREST)
# ======================================================
# Search space: 2 x 3 x 2 = 12 parameter combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# Exhaustive search with 3-fold cross-validation, ranked by weighted F1 and
# run on all available cores.
rf_base = RandomForestClassifier(random_state=42)
grid = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    scoring="f1_weighted",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Best estimator found by the search, as exposed by GridSearchCV.
best_rf = grid.best_estimator_
print("\nBest Random Forest Parameters:", grid.best_params_)


In [None]:
# ======================================================
# 12. FINAL MODEL EVALUATION
# ======================================================
# Score the tuned forest on the validation fold and print the per-class
# report.
final_preds = best_rf.predict(X_val)
final_report = classification_report(y_val, final_preds)

print("\nFinal Tuned Random Forest Performance")
print(final_report)


In [None]:
# Rule-based baseline submission.
# NOTE(review): this submission is produced by the gender rule alone; no
# fitted model is involved in this cell.

# Reload the raw test data: the rule needs the original string values of
# "Sex".
test = pd.read_csv("/kaggle/input/yashdata01/test.csv")

# Predict 1 (survived) for females and 0 for everyone else. The comparison is
# vectorised; missing values compare unequal to "female" and therefore map
# to 0, exactly like the previous row-wise lambda.
test["Survived"] = (test["Sex"] == "female").astype(int)

# Create submission file
gender_submission = test[["PassengerId", "Survived"]]

# Save to CSV (no index column)
gender_submission.to_csv("gender_submission.csv", index=False)

# The check mark was stored as mis-decoded UTF-8 bytes; restored to the
# intended character.
print("✅ gender_submission.csv created successfully")
