In [2]:
! pip install -q kaggle

In [None]:
from google.colab import files

# Opens the Colab upload widget; the next cell expects a file named kaggle.json.
# The result is bound to a name on purpose: files.upload() returns the uploaded
# content, and leaving the call as the cell's last expression would echo that
# return value (the API token) into the saved notebook output.
uploaded = files.upload()

In [4]:
# Put the uploaded kaggle.json under ~/.kaggle/.
# Order matters: the directory must exist before the move, and the chmod
# targets the file at its final location.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Download the data for the "titanic" competition via the Kaggle CLI
# (the following cell unpacks titanic.zip, presumably produced here).
!kaggle competitions download -c titanic

In [6]:
!unzip -q titanic.zip

In [48]:
import pandas as pd

# Read the three extracted CSVs in one pass; the tuple order fixes which file
# lands in which name (gender_submission -> gender_df, test -> test_df,
# train -> train_df).
gender_df, test_df, train_df = (
    pd.read_csv(csv_name)
    for csv_name in ('gender_submission.csv', 'test.csv', 'train.csv')
)

In [49]:
# Preview the first rows of the training frame (rendered as the cell output).
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
"""
Titanic Survival Prediction Script

This script performs the following steps:
1. Loads data
2. Handles missing values (Embarked, Fare, Age, Cabin)
3. Performs feature engineering (Title, FamilySize, Deck, etc.)
4. Encodes categorical features
5. Selects final features
6. Evaluates multiple models using cross-validation
7. Tunes hyperparameters for a Random Forest (tuned regardless of its cross-validation rank)
8. Trains the final model
9. Generates predictions and a submission file.
"""

import pandas as pd
import numpy as np
import re
import warnings
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Silence FutureWarning/UserWarning output and pandas' chained-assignment
# warning for the rest of the session.
# NOTE(review): this also hides library deprecation notices, which can mask
# real behaviour changes after an upgrade.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
pd.options.mode.chained_assignment = None

print("Loading data...")
try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    print("Data loaded successfully.")
except FileNotFoundError:
    print("Error: train.csv or test.csv not found. Make sure they are in the same directory.")
    # Re-raise instead of calling exit(): exit() is an interactive-shell helper
    # that, in a notebook, acts on the kernel session rather than just stopping
    # this cell. Propagating the error halts execution with a traceback that
    # names the missing file.
    raise

# Keep the test ids for the submission file and remember how many leading rows
# of the combined frame belong to the training set.
test_passenger_ids = test_df['PassengerId']
train_original_len = len(train_df)

print("Combining train and test data for preprocessing...")
# Test rows carry the sentinel Survived == -1 so they can be split off again
# after the shared preprocessing.
test_df['Survived'] = -1
combined_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)

print("Handling missing Embarked and Fare...")
# Embarked: impute with the mode (most frequent value in the *training* data).
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on the temporary returned by combined_df['Embarked'] is a chained in-place
# update, which does not reach combined_df under pandas Copy-on-Write.
embarked_mode = train_df['Embarked'].mode()[0]
combined_df['Embarked'] = combined_df['Embarked'].fillna(embarked_mode)

# Fare: impute with the median (calculated from the *training* data).
fare_median = train_df['Fare'].median()
combined_df['Fare'] = combined_df['Fare'].fillna(fare_median)

# --- 4. Feature Engineering (Round 1) ---
print("Performing initial feature engineering (Family, Title, Deck)...")
# FamilySize counts siblings/spouses + parents/children + the passenger.
combined_df['FamilySize'] = combined_df['SibSp'] + combined_df['Parch'] + 1
combined_df['IsAlone'] = (combined_df['FamilySize'] == 1).astype(int)

# Title: the alphabetic word that follows a space and ends with a period in
# Name. The pattern is a raw string so that `\.` reaches the regex engine
# as-is instead of relying on an invalid string escape.
combined_df['Title'] = combined_df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Collapse infrequent titles into 'Rare' and fold spelling variants together.
combined_df['Title'] = combined_df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
combined_df['Title'] = combined_df['Title'].replace(['Mlle', 'Ms'], 'Miss')
combined_df['Title'] = combined_df['Title'].replace('Mme', 'Mrs')

# Deck: first character of Cabin. Missing cabins become the literal 'Unknown'
# first, so their deck letter is 'U'. The column is assigned back rather than
# filled through a chained inplace=True call (see pandas Copy-on-Write).
combined_df['Cabin'] = combined_df['Cabin'].fillna('Unknown')
combined_df['Deck'] = combined_df['Cabin'].str[0]
# Fold 'T' and 'X' into the same 'U' (Unknown) bucket.
combined_df['Deck'] = combined_df['Deck'].replace(['T', 'X'], 'U')

# Cabin_Known: 1 when a cabin value was present before the fill above.
combined_df['Cabin_Known'] = (combined_df['Cabin'] != 'Unknown').astype(int)

# Log-transform Fare; log1p keeps Fare == 0 finite.
combined_df['Fare_Log'] = np.log1p(combined_df['Fare'])

# --- 5. Handle Missing Age (Grouped Imputation) ---
print("Handling missing Age using grouped medians...")
# Only the first `train_original_len` rows (the original training rows) feed
# the medians. Title must already exist on combined_df at this point.
training_rows = combined_df.iloc[:train_original_len]
age_by_class_and_title = training_rows.groupby(['Pclass', 'Title'])['Age']
grouped_age_medians = age_by_class_and_title.median()

# --- Impute Age using calculated medians ---
def impute_age(row, medians_dict):
    """Return the age to use for one passenger row.

    Args:
        row: Record read through the keys 'Age', 'Pclass' and 'Title'.
        medians_dict: Despite the name, an object looked up with
            ``.loc[(Pclass, Title)]``; in this script it is the grouped
            median Series.

    Returns:
        The row's own 'Age' when it is not null. Otherwise the value stored
        for the row's (Pclass, Title) pair, or, if that lookup raises
        KeyError, the median 'Age' over the first ``train_original_len`` rows
        of the module-level ``combined_df``.
    """
    # Known ages pass through untouched.
    if not pd.isnull(row['Age']):
        return row['Age']

    try:
        # The key is built inside the try so that any KeyError raised while
        # reading the row fields also takes the fallback path.
        group_key = (row['Pclass'], row['Title'])
        return medians_dict.loc[group_key]
    except KeyError:
        # Pair absent from the grouped medians: use the global training median.
        return combined_df['Age'].iloc[:train_original_len].median()

# Row-wise imputation: rows with a known Age keep it, the rest receive the
# median of their (Pclass, Title) group.
combined_df['Age'] = combined_df.apply(lambda row: impute_age(row, grouped_age_medians), axis=1)

# --- Final Fallback ---
# A group median can itself be NaN, so any Age still missing is filled with the
# median of the (now imputed) training rows. The result is assigned back
# because a chained fillna(inplace=True) on combined_df['Age'] does not update
# combined_df under pandas Copy-on-Write.
overall_train_age_median = combined_df['Age'].iloc[:train_original_len].median()
combined_df['Age'] = combined_df['Age'].fillna(overall_train_age_median)

# --- 6. Feature Engineering (Round 2 - Based on Imputed Age/Fare) ---
print("Performing final feature engineering (Age interactions/bins)...")
# Interaction term: element-wise product of Age and passenger class.
combined_df['Age_Pclass'] = combined_df['Age'].mul(combined_df['Pclass'])

# Age bands are left-closed / right-open (right=False):
# [0, 12), [12, 18), [18, 35), [35, 60), [60, 100).
labels_age = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
bins_age = [0, 12, 18, 35, 60, 100]
combined_df['AgeGroup'] = pd.cut(
    combined_df['Age'],
    bins=bins_age,
    labels=labels_age,
    right=False,
    ordered=False,  # plain (unordered) categories
)


# --- 7. Drop Unnecessary Columns ---
print("Dropping unnecessary original columns...")
# Raw columns that were replaced by engineered features or are not used as
# model inputs.
cols_to_drop = ['Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'PassengerId']
combined_df_processed = combined_df.drop(cols_to_drop, axis='columns')

# --- 8. Categorical Encoding ---
print("Encoding categorical features...")
# Columns expanded into indicator columns below.
categorical_features = ['Embarked', 'Sex', 'Title', 'AgeGroup', 'Deck']
# Cast AgeGroup (produced by pd.cut) to plain object dtype before encoding.
age_group_as_object = combined_df_processed['AgeGroup'].astype(object)
combined_df_processed['AgeGroup'] = age_group_as_object

# drop_first=True omits one indicator per categorical column.
combined_df_encoded = pd.get_dummies(
    combined_df_processed,
    columns=categorical_features,
    drop_first=True,
)


# --- 9. Separate Data back into Train and Test ---
print("Separating processed data back into train and test sets...")
# Test rows were tagged with Survived == -1 when the frames were combined.
train_final_df = combined_df_encoded[combined_df_encoded['Survived'] != -1]
test_final_df = combined_df_encoded[combined_df_encoded['Survived'] == -1]

# Define final X, y and X_test
y = train_final_df['Survived']
X = train_final_df.drop('Survived', axis=1)
X_test_final = test_final_df.drop('Survived', axis=1)

# --- Ensure columns are aligned ---
print("Aligning columns between train and test sets...")
train_cols = X.columns
test_cols = X_test_final.columns
# Kept as diagnostics: columns present on only one side.
missing_in_test = set(train_cols) - set(test_cols)
missing_in_train = set(test_cols) - set(train_cols)
# A single reindex gives the test frame exactly the training columns, in
# training order: columns absent from test are created filled with 0 and
# columns absent from train are dropped. X is left untouched, so the two
# frames cannot end up with different column sets, and no column is written
# into a slice of combined_df_encoded.
X_test_final = X_test_final.reindex(columns=train_cols, fill_value=0)


# Summarise the final matrices; the lines are joined with newlines so the
# printed text matches one print call per line.
shape_report = [
    "\n--- Final Data Shapes ---",
    f"Training features (X) shape: {X.shape}",
    f"Target (y) shape: {y.shape}",
    f"Test features (X_test_final) shape: {X_test_final.shape}",
]
print("\n".join(shape_report))

# --- 10. Model Training Pipeline Setup ---
print("\nSetting up models and cross-validation...")
# One shared seed for every estimator (and the CV splitter) that accepts one.
seeded = dict(random_state=42)
# Insertion order is the order in which the models are evaluated and reported.
models = dict([
    ("Logistic Regression", LogisticRegression(solver='liblinear', max_iter=1000, **seeded)),
    ("KNN", KNeighborsClassifier()),
    ("SVC", SVC(probability=True, **seeded)),
    ("Decision Tree", DecisionTreeClassifier(**seeded)),
    ("Random Forest", RandomForestClassifier(**seeded)),
    ("Gradient Boosting", GradientBoostingClassifier(**seeded)),
    ("AdaBoost", AdaBoostClassifier(**seeded)),
    ("GaussianNB", GaussianNB()),
])
# Shuffled, stratified 10-fold split reused by every evaluation below.
cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, **seeded)


# --- 11. Cross-Validation ---
print("\n--- Running Cross-Validation ---")
# Per-model arrays of fold accuracies, keyed by display name.
results = {}
for name, estimator in models.items():
    # Scaling sits inside the pipeline, so it is fitted within each CV fit.
    scaled_model = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('classifier', estimator),
    ])
    try:
        scores = cross_val_score(
            scaled_model,
            X,
            y,
            scoring='accuracy',
            cv=cv_strategy,
            n_jobs=-1,
        )
        results[name] = scores
        print(f"{name}: Mean Accuracy = {scores.mean():.4f} (Std = {scores.std():.4f})")
    except Exception as e:
        # Best effort: report the failing model and continue with the next one.
        print(f"Could not evaluate {name}. Error: {e}")


# --- 12. Hyperparameter Tuning (Example: Random Forest) ---
# Random Forest is tuned as the example model, whatever the CV ranking above.
print("\n--- Running Hyperparameter Tuning for Random Forest ---")


def _make_rf_pipeline():
    """Build an unfitted scaler + seeded Random Forest pipeline."""
    return Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])


# Keys use the `classifier__` prefix to address the pipeline's classifier step.
# 3 * 4 * 3 * 3 * 2 * 2 = 432 candidate settings, each scored with cv_strategy.
param_grid_rf = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[5, 8, 12, 15],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 3, 5],
    classifier__max_features=['sqrt', 'log2'],
    classifier__class_weight=['balanced', None],
)
pipeline_rf_tune = _make_rf_pipeline()
grid_search_rf = GridSearchCV(
    estimator=pipeline_rf_tune,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=0,  # set verbose=1 to see progress
)

try:
    grid_search_rf.fit(X, y)
    print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")
    print(f"Best cross-validation accuracy for Random Forest: {grid_search_rf.best_score_:.4f}")
    best_model_pipeline = grid_search_rf.best_estimator_
except Exception as e:
    print(f"GridSearchCV failed. Error: {e}")
    # Fallback: fit an untuned Random Forest pipeline so prediction can proceed.
    print("Using default Random Forest model as fallback.")
    best_model_pipeline = _make_rf_pipeline()
    best_model_pipeline.fit(X, y)


# --- 13. Final Prediction ---
print("\n--- Making final predictions on test data ---")
final_predictions = best_model_pipeline.predict(X_test_final)


# --- 14. Submission File Generation ---
print("Generating submission file...")
submission_filename = 'titanic_submission.csv'
# Two columns: the test ids captured at load time and the integer predictions.
submission_df = pd.DataFrame({'PassengerId': test_passenger_ids}).assign(
    Survived=final_predictions.astype(int)
)
# index=False keeps the row index out of the CSV.
submission_df.to_csv(submission_filename, index=False)
print(f"\nSubmission file created successfully: {submission_filename}")
print(submission_df.head())

print("\n--- Script finished ---")

Loading data...
Data loaded successfully.
Combining train and test data for preprocessing...
Handling missing Embarked and Fare...
Performing initial feature engineering (Family, Title, Deck)...
Handling missing Age using grouped medians...
Performing final feature engineering (Age interactions/bins)...
Dropping unnecessary original columns...
Encoding categorical features...
Separating processed data back into train and test sets...
Aligning columns between train and test sets...

--- Final Data Shapes ---
Training features (X) shape: (891, 24)
Target (y) shape: (891,)
Test features (X_test_final) shape: (418, 24)

Setting up models and cross-validation...

--- Running Cross-Validation ---
Logistic Regression: Mean Accuracy = 0.8293 (Std = 0.0342)
KNN: Mean Accuracy = 0.8159 (Std = 0.0305)
SVC: Mean Accuracy = 0.8260 (Std = 0.0301)
Decision Tree: Mean Accuracy = 0.7811 (Std = 0.0306)
Random Forest: Mean Accuracy = 0.8204 (Std = 0.0316)
Gradient Boosting: Mean Accuracy = 0.8237 (Std = 

In [None]:
# Submit titanic_submission.csv (the file written by the modelling cell) to the
# "titanic" competition via the Kaggle CLI, with the given submission message.
!kaggle competitions submit -c titanic -f titanic_submission.csv -m "This is my Submission for this task"