In [15]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [16]:
# 2. Load Data
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')
passenger_ids = test_df['PassengerId']

In [17]:
# 3. Separate features and target BEFORE preprocessing
X = train_df.drop("Transported", axis=1)
y = train_df["Transported"].astype(int)  # Convert boolean to int for modeling

In [18]:
# 4. Identify feature types
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()


In [19]:
# 5. Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [20]:
# 6. Split data BEFORE preprocessing
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [21]:
# 7. Preprocess data using pipeline
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)
X_test_preprocessed = preprocessor.transform(test_df)


In [22]:
# 8. Train Model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_preprocessed, y_train)


In [23]:
# 9. Evaluate on Training Set
y_pred_train = model.predict(X_train_preprocessed)
train_acc = accuracy_score(y_train, y_pred_train)
print("="*60)
print(f"TRAIN ACCURACY: {train_acc:.4f}")
print("Classification Report (TRAIN):")
print(classification_report(y_train, y_pred_train))
print("Confusion Matrix (TRAIN):")
print(confusion_matrix(y_train, y_pred_train))

TRAIN ACCURACY: 0.9518
Classification Report (TRAIN):
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      3454
           1       0.96      0.94      0.95      3500

    accuracy                           0.95      6954
   macro avg       0.95      0.95      0.95      6954
weighted avg       0.95      0.95      0.95      6954

Confusion Matrix (TRAIN):
[[3316  138]
 [ 197 3303]]


In [24]:
# 10. Evaluate on Validation Set
y_pred_val = model.predict(X_val_preprocessed)
val_acc = accuracy_score(y_val, y_pred_val)
print("\n" + "="*60)
print(f"VALIDATION ACCURACY: {val_acc:.4f}")
print("Classification Report (VALIDATION):")
print(classification_report(y_val, y_pred_val))
print("Confusion Matrix (VALIDATION):")
print(confusion_matrix(y_val, y_pred_val))
print("="*60 + "\n")



VALIDATION ACCURACY: 0.7740
Classification Report (VALIDATION):
              precision    recall  f1-score   support

           0       0.78      0.75      0.77       861
           1       0.77      0.79      0.78       878

    accuracy                           0.77      1739
   macro avg       0.77      0.77      0.77      1739
weighted avg       0.77      0.77      0.77      1739

Confusion Matrix (VALIDATION):
[[649 212]
 [181 697]]



In [25]:
# 11. Predict Test Set
y_pred_test = model.predict(X_test_preprocessed)


In [26]:
# 12. Submission
submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Transported": y_pred_test.astype(bool)
})
submission.to_csv("submission_logistic.csv", index=False)

In [27]:
# 13. Final Model Evaluation Summary
print("\n" + "="*60)
print("FINAL MODEL PERFORMANCE SUMMARY")
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print("="*60)


FINAL MODEL PERFORMANCE SUMMARY
Train Accuracy: 0.9518
Validation Accuracy: 0.7740
