# Analysis Summary

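This notebook reproduces the analysis performed in the Streamlit app. It contains the code for data loading, preprocessing, model training, and evaluation: the Titanic dataset is read from a CSV file, and a logistic-regression classifier is trained to predict `survived` from `pclass` and `sex`, then scored on a held-out test split.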

## 1. Data Loading

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import is_classifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, precision_score, recall_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

# --- IMPORTANT: point `file_path` at your local copy of the dataset ---
file_path = 'titanic.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Keep `df` bound (to None) so the `if df is not None:` guards in the
    # later cells skip their work instead of failing on an undefined name.
    df = None
    print(f"Error: The file at {file_path} was not found. Please update the path.")


## 2. Data Exploration

In [2]:
# Quick look at the loaded frame: first rows, column dtypes / non-null counts,
# and summary statistics. Skipped entirely when loading failed (df is None).
if df is not None:
    # Each heading and its table are emitted by a single print call; sep="\n"
    # puts the table on the line after the heading.
    print("Data Head:", df.head(), sep="\n")

    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called bare rather than wrapped in print().
    print("\nData Info:")
    df.info()

    # include='all' summarises the non-numeric columns as well as the numeric ones.
    print("\nDescriptive Statistics:", df.describe(include='all'), sep="\n")


Data Head:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   su

## 3. Machine Learning Pipeline

In [3]:
# Build the preprocessing + model pipeline. Skipped when loading failed (df is None).
if df is not None:
    # --- Choices carried over from the app; edit these to change the run ---
    target_col = 'survived'
    selected_features = ['pclass', 'sex']
    # 'StandardScaler' or 'MinMaxScaler'; any other value means "no scaling step".
    scaler_choice = 'StandardScaler'
    # True  -> categories unseen during fit are encoded as all-zero rows;
    # False -> OneHotEncoder raises on unseen categories.
    ignore_unknown_categories = True
    # Must be a key of `model_factories` below.
    model_choice = 'Logistic Regression'

    X = df[selected_features]
    y = df[target_col]

    # Split the selected columns by dtype so each group gets its own transformer.
    # NOTE: a column that is neither numeric nor object/category (e.g. bool) lands
    # in neither list and is dropped by ColumnTransformer's default remainder='drop'.
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Numeric columns: median imputation, then the optional scaler.
    scaler_classes = {
        'StandardScaler': StandardScaler,
        'MinMaxScaler': MinMaxScaler,
    }
    numerical_transformer_steps = [('imputer', SimpleImputer(strategy='median'))]
    scaler_class = scaler_classes.get(scaler_choice)
    if scaler_class is not None:
        numerical_transformer_steps.append(('scaler', scaler_class()))

    numerical_transformer = Pipeline(steps=numerical_transformer_steps)

    # Categorical columns: fill gaps with the most frequent value, then one-hot encode.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore' if ignore_unknown_categories else 'error'))
    ])

    # Bundle preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Model selection. Factories (rather than instances) so only the chosen
    # estimator is constructed. Add other models here.
    model_factories = {
        'Logistic Regression': lambda: LogisticRegression(),
        'K-Nearest Neighbors Classifier': lambda: KNeighborsClassifier(n_neighbors=5),
    }
    if model_choice not in model_factories:
        # Fail here with a clear message instead of a NameError on `model` below.
        raise ValueError(
            f"Unknown model choice {model_choice!r}; expected one of {sorted(model_factories)}"
        )
    model = model_factories[model_choice]()

    # Create the full pipeline
    full_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    print("Pipeline built successfully!")


Pipeline built successfully!


## 4. Model Training and Evaluation

In [5]:
# Train the pipeline on a train split and report hold-out metrics.
# Skipped when loading failed (df is None).
if df is not None:
    # Hold out 20% of the rows; the fixed random_state keeps the split, and
    # therefore the reported metrics, the same from run to run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Pipeline.fit learns the preprocessing (imputation, scaling, encoding) from the
    # training rows only; predict() then applies those fitted steps to the test rows.
    full_pipeline.fit(X_train, y_train)
    y_pred = full_pipeline.predict(X_test)

    print("Model Training Complete!")

    # Choose the metric family from the estimator that was actually built, so the
    # report cannot drift out of sync with the model selected in the pipeline cell.
    if is_classifier(full_pipeline):
        # NOTE: precision/recall/F1 use scikit-learn's default binary averaging with
        # positive label 1; a multiclass or non-0/1 target needs explicit
        # `average=` / `pos_label=` arguments.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print("\n--- Classification Results ---")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Precision: {precision:.2f}")
        print(f"Recall: {recall:.2f}")
        print(f"F1 Score: {f1:.2f}")
    else:
        # RMSE is the square root of the mean squared error, in the target's units.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        print("\n--- Regression Results ---")
        print(f"RMSE: {rmse:.2f}")
        print(f"R^2 Score: {r2:.2f}")


Model Training Complete!

--- Classification Results ---
Accuracy: 0.78
Precision: 0.75
Recall: 0.70
F1 Score: 0.73
