In [None]:
#Downloading datasets and fairlearn

!gdown 120Q64To3z6yqp6T9M-q365UUR4-55LyX

!gdown 1dZT7wMzJqRPzNRSNpcBcpf59wEc33Nd3

%pip install fairlearn

# Model Training and Fairness evaluation Pre-intervention

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from fairlearn.metrics import MetricFrame, demographic_parity_difference, equalized_odds_difference

def main():
    """
    Train a decision-tree classifier on 'data.csv' and print its performance
    and fairness metrics (pre-intervention baseline).

    Steps:
      1. Load 'data.csv' (semicolon separated). If the file is missing, an
         error is printed and the function returns early.
      2. Fit a pipeline (ColumnTransformer + DecisionTreeClassifier) on a
         stratified 80/20 split.
      3. Print accuracy, the classification report and the confusion matrix.
      4. For every sensitive feature and every target class (one-vs-rest),
         print per-group accuracy/precision/recall plus the demographic
         parity difference and the equalized odds difference.

    Returns:
        None. All results are printed.
    """
    # Keep the try body minimal: only the read can raise FileNotFoundError.
    try:
        df = pd.read_csv('data.csv', sep=';', quotechar='"')
    except FileNotFoundError:
        # The message names the file that is actually opened above.
        print("Error: 'data.csv' not found. Make sure the dataset file is in the same directory.")
        return
    print("DataFrame columns:", df.columns)

    # Data preprocessing
    X = df.drop('Target', axis=1)
    y = df['Target']

    # The list of sensitive features to evaluate
    sensitive_features_list = ['Marital status', 'Application mode', 'Course',
                               'Previous qualification', 'Nacionality',
                               "Mother's qualification",
                               'Educational special needs',
                               'Tuition fees up to date', 'Gender',
                               'Age at enrollment', 'International']

    # Only non-sensitive columns are scaled / one-hot encoded.
    categorical_features = X.select_dtypes(include=['object']).columns.difference(sensitive_features_list)
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.difference(sensitive_features_list)

    numerical_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    # NOTE: remainder='passthrough' forwards every column not listed above
    # (i.e. the sensitive features) to the classifier untransformed.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ], remainder='passthrough')

    # Model training
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(random_state=42))
    ])

    print("Training the classification model...")
    model_pipeline.fit(X_train, y_train)
    print("Model training complete.\n")

    # Model evaluation
    y_pred = model_pipeline.predict(X_test)

    print("--- Model Performance Evaluation ---")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 35 + "\n")

    # Fairness evaluation (one-vs-rest)
    unique_classes = y.unique()

    # The metric callables do not depend on the feature or class being
    # evaluated, so they are built once instead of on every iteration.
    metrics_binary = {
        'accuracy': accuracy_score,
        'precision': lambda y_true_b, y_pred_b: classification_report(y_true_b, y_pred_b, output_dict=True, zero_division=0)['weighted avg']['precision'],
        'recall': lambda y_true_b, y_pred_b: classification_report(y_true_b, y_pred_b, output_dict=True, zero_division=0)['weighted avg']['recall']
    }

    for sensitive_feature in sensitive_features_list:
        if sensitive_feature not in X_test.columns:
            print(f"Warning: Sensitive feature '{sensitive_feature}' not found in the test set. Skipping.")
            continue

        sensitive_test = X_test[sensitive_feature]

        print(f"\n--- Fairness Evaluation (by '{sensitive_feature}' - One-vs-Rest) ---")

        for target_class in unique_classes:
            print(f"\nEvaluating fairness for class: '{target_class}' (One-vs-Rest)")

            # Binary view of the problem: 1 for the current class, 0 for the rest.
            y_test_binary = (y_test == target_class).astype(int)
            y_pred_binary = (y_pred == target_class).astype(int)

            try:
                grouped_on_sensitive = MetricFrame(metrics=metrics_binary,
                                                   y_true=y_test_binary,
                                                   y_pred=y_pred_binary,
                                                   sensitive_features=sensitive_test)

                print(f"Metrics grouped by '{sensitive_feature}':")
                print(grouped_on_sensitive.by_group)

                print("Fairness Metrics for this class:")
                dpd_binary = demographic_parity_difference(y_test_binary, y_pred_binary, sensitive_features=sensitive_test)
                print(f"Demographic Parity Difference: {dpd_binary:.4f}")

                eod_binary = equalized_odds_difference(y_test_binary, y_pred_binary, sensitive_features=sensitive_test)
                print(f"Equalized Odds Difference: {eod_binary:.4f}")
                print("-" * 35)

            except ValueError as e:
                print(f"Could not calculate fairness metrics for class '{target_class}' with sensitive feature '{sensitive_feature}': {e}")
                print("This might happen if one of the sensitive feature groups has no samples for this class in the test set.")
                print("-" * 35)


if __name__ == '__main__':
    main()

# Visualisations to gauge biases in each feature

import pandas as pd
import plotly.express as px

# Load the dataset
try:
    df = pd.read_csv('data.csv', sep=';', quotechar='"')
except FileNotFoundError:
    print("Error: 'data.csv' not found. Make sure the dataset file is in the same directory.")
    exit()  # Nothing to plot without the dataset

# One bar chart per column: how often each distinct value occurs
for column in df.columns:
    # value -> number of rows holding that value, as a two-column frame
    value_counts = df[column].value_counts()
    freq_dist = value_counts.reset_index()
    freq_dist.columns = [column, 'Frequency']

    chart_title = f'Frequency Distribution of {column}'
    fig = px.bar(freq_dist, x=column, y='Frequency', title=chart_title)
    fig.show()

## ANOVA analysis to identify the most significant features

from scipy import stats

# Identify numerical columns
# Column holding the class label
target_column = 'Target'

# Numerical columns, with the target removed if it happens to be numeric
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
numerical_features = numerical_cols.drop(target_column, errors='ignore')

# Distinct target classes, in order of first appearance
unique_classes = df[target_column].unique()

# feature name -> ANOVA p-value
anova_results = {}

# One-way ANOVA per feature, comparing its distribution across target classes
for feature in numerical_features:
    # One sample per class; classes with fewer than two observations are left out
    per_class_samples = (
        df[df[target_column] == class_value][feature].dropna()
        for class_value in unique_classes
    )
    data_for_anova = [sample for sample in per_class_samples if len(sample) >= 2]

    if len(data_for_anova) < 2:
        print(f"Skipping ANOVA for feature '{feature}': Insufficient data in groups.")
        continue

    try:
        f_statistic, p_value = stats.f_oneway(*data_for_anova)
        anova_results[feature] = p_value
    except ValueError as e:
        print(f"Could not perform ANOVA for feature '{feature}': {e}")
        print("This might happen if a group has constant values or insufficient variance.")

# Report the collected p-values
print("\nANOVA Test p-values:")
for feature in anova_results:
    print(f"{feature}: {anova_results[feature]:.4f}")

# Model Training and Fairness evaluation Post-intervention

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from fairlearn.metrics import MetricFrame, demographic_parity_difference, equalized_odds_difference

# Names used to share the fairness results with the later comparison cells.
# NOTE: at module level a `global` statement neither creates nor assigns a
# variable; each name only exists once some code (e.g. main()) assigns it,
# which is why the later cells test `'<name>' in globals()` before use.
global dpd_results_data1
global eod_results_data1
global dpd_results_data2
global eod_results_data2

def perform_fairness_evaluation(df, dataset_name):
    """
    Train a decision tree on ``df`` and evaluate its fairness per sensitive feature.

    Args:
        df: DataFrame containing the feature columns and a 'Target' column.
        dataset_name: Label used in the printed output. When it equals
            'data2 - Sheet1.csv' the 'Nacionality' column is dropped before
            training.

    Returns:
        A tuple ``(dpd, eod)`` of DataFrames indexed by sensitive feature with
        one column per target class, holding the demographic parity difference
        and the equalized odds difference respectively. Combinations that
        raised a ValueError are absent from the underlying dicts.
    """
    print(f"\n--- Performing fairness evaluation for: {dataset_name} ---")

    # Split the frame into features and the 'Target' label
    y = df['Target']
    X = df.drop(columns='Target')
    if dataset_name == 'data2 - Sheet1.csv' and 'Nacionality' in X.columns:
        print(f"Eliminating 'Nacionality' feature from {dataset_name}.")
        X = X.drop(columns='Nacionality')

    # Sensitive features to audit; 'Nacionality' is only kept when X still has it
    candidate_sensitive = ['Marital status', 'Application mode', 'Course',
                           'Previous qualification', 'Nacionality',
                           "Mother's qualification",
                           'Educational special needs',
                           'Tuition fees up to date', 'Gender',
                           'Age at enrollment', 'International']
    sensitive_features_list = [
        name for name in candidate_sensitive
        if name != 'Nacionality' or 'Nacionality' in X.columns
    ]

    # Non-sensitive columns get scaled / one-hot encoded; everything else
    # (the sensitive columns) is passed through so it stays available in X_test.
    object_columns = X.select_dtypes(include=['object']).columns
    number_columns = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = object_columns.difference(sensitive_features_list)
    numerical_features = number_columns.difference(sensitive_features_list)
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ],
        remainder='passthrough',
    )

    # Stratified 80/20 split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Preprocessing and classifier chained into one estimator
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(random_state=42)),
    ])

    print(f"Training the classification model for {dataset_name}...")
    model_pipeline.fit(X_train, y_train)
    print(f"Model training complete for {dataset_name}.\n")

    # Overall performance on the held-out split
    y_pred = model_pipeline.predict(X_test)

    print(f"--- Model Performance Evaluation for {dataset_name} ---")
    print(f"Model Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 35 + "\n")

    # Fairness evaluation, one-vs-rest for every target class
    def _weighted_avg(field):
        # Metric callable returning the weighted-average `field` of the report
        return lambda y_true_b, y_pred_b: classification_report(
            y_true_b, y_pred_b, output_dict=True, zero_division=0
        )['weighted avg'][field]

    metrics_binary = {
        'accuracy': accuracy_score,
        'precision': _weighted_avg('precision'),
        'recall': _weighted_avg('recall'),
    }

    unique_classes = y.unique()
    dpd_results = {}
    eod_results = {}
    divider = "-" * 35

    for sensitive_feature in sensitive_features_list:
        if sensitive_feature not in X_test.columns:
            print(f"Warning: Sensitive feature '{sensitive_feature}' not found in the test set for {dataset_name}. Skipping.")
            continue

        group_labels = X_test[sensitive_feature]

        print(f"\n--- Fairness Evaluation (by '{sensitive_feature}' - One-vs-Rest) for {dataset_name} ---")

        dpd_for_feature = dpd_results[sensitive_feature] = {}
        eod_for_feature = eod_results[sensitive_feature] = {}

        for target_class in unique_classes:
            print(f"\nEvaluating fairness for class: '{target_class}' (One-vs-Rest) for {dataset_name}")

            # 1 where the label equals the current class, 0 otherwise
            truth = (y_test == target_class).astype(int)
            guess = (y_pred == target_class).astype(int)

            try:
                per_group = MetricFrame(metrics=metrics_binary,
                                        y_true=truth,
                                        y_pred=guess,
                                        sensitive_features=group_labels)

                print(f"Metrics grouped by '{sensitive_feature}':")
                print(per_group.by_group)

                print("Fairness Metrics for this class:")
                dpd_value = demographic_parity_difference(truth, guess, sensitive_features=group_labels)
                print(f"Demographic Parity Difference: {dpd_value:.4f}")
                dpd_for_feature[target_class] = dpd_value

                eod_value = equalized_odds_difference(truth, guess, sensitive_features=group_labels)
                print(f"Equalized Odds Difference: {eod_value:.4f}")
                eod_for_feature[target_class] = eod_value

                print(divider)

            except ValueError as e:
                print(f"Could not calculate fairness metrics for class '{target_class}' with sensitive feature '{sensitive_feature}' for {dataset_name}: {e}")
                print("This might happen if one of the sensitive feature groups has no samples for this class in the test set.")
                print(divider)

    # Rows = sensitive features, columns = target classes
    return pd.DataFrame(dpd_results).T, pd.DataFrame(eod_results).T


def main():
    """
    Run the fairness evaluation for both datasets and publish the results
    as module-level globals for the comparison cells.

    'data.csv' (semicolon separated) populates ``dpd_results_data1`` and
    ``eod_results_data1``; 'data2 - Sheet1.csv' populates
    ``dpd_results_data2`` and ``eod_results_data2``. When a file is missing
    an error is printed and the corresponding globals are set to empty
    DataFrames, so the later cells can detect the absence with ``.empty``.

    Returns:
        None.
    """
    global dpd_results_data1
    global eod_results_data1
    global dpd_results_data2
    global eod_results_data2

    # Baseline dataset. Without this evaluation the *_data1 globals are never
    # assigned and every downstream comparison reports the results as missing.
    try:
        df1 = pd.read_csv('data.csv', sep=';', quotechar='"')
    except FileNotFoundError:
        print("Error: 'data.csv' not found. Make sure the dataset file is in the same directory.")
        dpd_results_data1, eod_results_data1 = pd.DataFrame(), pd.DataFrame()  # Assign empty dataframes on error
    else:
        print("\nDataFrame columns for data.csv:", df1.columns)
        dpd_results_data1, eod_results_data1 = perform_fairness_evaluation(df1, 'data.csv')

    # Post-intervention dataset.
    try:
        df2 = pd.read_csv('data2 - Sheet1.csv')
    except FileNotFoundError:
        print("Error: 'data2 - Sheet1.csv' not found. Make sure the dataset file is in the same directory.")
        dpd_results_data2, eod_results_data2 = pd.DataFrame(), pd.DataFrame()  # Assign empty dataframes on error
    else:
        print("\nDataFrame columns for data2 - Sheet1.csv:", df2.columns)
        dpd_results_data2, eod_results_data2 = perform_fairness_evaluation(df2, 'data2 - Sheet1.csv')


if __name__ == '__main__':
    main()

#Comparative fairness evaluation

import pandas as pd

# Identify features with potentially significant bias (e.g., DPD or EOD > 0.1)
bias_threshold = 0.1

print(f"Features with potentially significant bias (DPD or EOD > {bias_threshold}):")


def _flag_biased_features(results_name, metric_title, metric_tag, dataset_label, flagged):
    # Print every feature whose metric exceeds the threshold for at least one
    # class and add it to `flagged`; report when the results are unavailable.
    if results_name in globals() and not globals()[results_name].empty:
        results = globals()[results_name]
        print(f"\nAnalyzing {metric_title} ({metric_tag}) for {dataset_label}:")
        for feature in results.index:
            per_class = results.loc[feature]
            if any(per_class > bias_threshold):
                flagged.add(feature)
                offending = [cls for cls, val in per_class.items() if val > bias_threshold]
                print(f"- {feature} ({metric_tag} bias detected for classes: {offending})")
    else:
        print(f"Fairness evaluation results for {dataset_label} not found or empty.")


def _summarise_biased_features(flagged, dataset_label):
    # Print the union of features flagged by either metric.
    if flagged:
        print(f"\nSummary of features with potentially significant bias for {dataset_label}:")
        for feature in flagged:
            print(f"- {feature}")
    else:
        print(f"\nNo features found with bias above the threshold for {dataset_label}.")


# Results for data.csv
print("\n--- Bias Analysis for data.csv ---")
biased_features_data1 = set()
_flag_biased_features('dpd_results_data1', 'Demographic Parity Difference', 'DPD', 'data.csv', biased_features_data1)
_flag_biased_features('eod_results_data1', 'Equalized Odds Difference', 'EOD', 'data.csv', biased_features_data1)
_summarise_biased_features(biased_features_data1, 'data.csv')

# Results for data2 - Sheet1.csv
print("\n--- Bias Analysis for data2 - Sheet1.csv ---")
biased_features_data2 = set()
_flag_biased_features('dpd_results_data2', 'Demographic Parity Difference', 'DPD', 'data2 - Sheet1.csv', biased_features_data2)
_flag_biased_features('eod_results_data2', 'Equalized Odds Difference', 'EOD', 'data2 - Sheet1.csv', biased_features_data2)
_summarise_biased_features(biased_features_data2, 'data2 - Sheet1.csv')

import pandas as pd
import plotly.express as px
# All four result frames must exist and be non-empty before they can be compared
_result_names = ('dpd_results_data1', 'eod_results_data1',
                 'dpd_results_data2', 'eod_results_data2')
_results_ready = (all(name in globals() for name in _result_names)
                  and not any(globals()[name].empty for name in _result_names))

if _results_ready:
    # Align the rows (sensitive features) of both datasets, then subtract:
    # positive values mean the metric is larger in data2 than in data1.
    dpd_after, dpd_before = dpd_results_data2.align(dpd_results_data1, join='outer', axis=0)
    eod_after, eod_before = eod_results_data2.align(eod_results_data1, join='outer', axis=0)
    dpd_diff = dpd_after - dpd_before
    eod_diff = eod_after - eod_before

    # Entries present in only one dataset come out as NaN; show them as 0
    dpd_diff = dpd_diff.fillna(0)
    eod_diff = eod_diff.fillna(0)

    print("Difference in Demographic Parity Difference (data2 - data1):")
    display(dpd_diff)

    print("\nDifference in Equalized Odds Difference (data2 - data1):")
    display(eod_diff)

    # Long format for plotting: one row per (sensitive feature, target class)
    feature_label = {'index': 'Sensitive Feature'}
    dpd_diff_melted = (
        dpd_diff.reset_index()
        .melt(id_vars='index', var_name='Target Class', value_name='DPD Difference')
        .rename(columns=feature_label)
    )
    eod_diff_melted = (
        eod_diff.reset_index()
        .melt(id_vars='index', var_name='Target Class', value_name='EOD Difference')
        .rename(columns=feature_label)
    )

    fig_dpd = px.bar(
        dpd_diff_melted,
        x='Sensitive Feature',
        y='DPD Difference',
        color='Target Class',
        title='Difference in Demographic Parity Difference (data2 - data1) by Sensitive Feature and Target Class',
        barmode='group',
    )
    fig_dpd.update_layout(xaxis={'categoryorder': 'total descending'})
    fig_dpd.show()

    fig_eod = px.bar(
        eod_diff_melted,
        x='Sensitive Feature',
        y='EOD Difference',
        color='Target Class',
        title='Difference in Equalized Odds Difference (data2 - data1) by Sensitive Feature and Target Class',
        barmode='group',
    )
    fig_eod.update_layout(xaxis={'categoryorder': 'total descending'})
    fig_eod.show()
else:
    print("Fairness evaluation results for both datasets are required. Please run the fairness evaluation cell first.")

import pandas as pd

def _describe_metric_changes(diff, metric_tag, metric_title):
    # Print the largest change and every change above 0.2 (in absolute value)
    # for one difference frame (rows: sensitive features, columns: classes).
    print(f"\nAnalyzing {metric_title} ({metric_tag}) changes:")
    if diff.empty:
        print(f"{metric_tag} difference dataframe is empty.")
        return

    magnitudes = diff.abs()
    if not magnitudes.max().max() > 0:
        print(f"No change in {metric_tag} observed between the datasets.")
        return

    # Location and signed value of the largest absolute change
    top_feature, top_class = magnitudes.stack().idxmax()
    top_value = diff.loc[top_feature, top_class]
    change_direction = "increase" if top_value > 0 else "decrease"
    print(f"The largest absolute {metric_tag} change ({abs(top_value):.4f}) is for '{top_feature}' in the '{top_class}' class. The bias {change_direction}d in data2 compared to data1.")

    # Signed changes whose magnitude exceeds 0.2, largest first
    notable_changes = diff.stack()[(magnitudes.stack() > 0.2)].sort_values(ascending=False)
    if notable_changes.empty:
        print(f"\nNo features show a notable {metric_tag} change (absolute difference > 0.2).")
        return

    print(f"\nSensitive features and target classes with notable {metric_tag} changes (absolute difference > 0.2):")
    for (feature, target_class), value in notable_changes.items():
        change_direction = "increased" if value > 0 else "decreased"
        print(f"- '{feature}' ('{target_class}' class): Change of {value:.4f}. Bias {change_direction}.")


if 'dpd_diff' in globals() and 'eod_diff' in globals() and not dpd_diff.empty and not eod_diff.empty:
    print("--- Analysis of Bias Differences (data2 - data1) ---")
    _describe_metric_changes(dpd_diff, 'DPD', 'Demographic Parity Difference')
    _describe_metric_changes(eod_diff, 'EOD', 'Equalized Odds Difference')
else:
    print("Difference dataframes (dpd_diff, eod_diff) not found or are empty. Please run the previous cell to calculate them.")

# Task
Generate a Flask API that takes data as input, preprocesses it using the previously trained preprocessor, and returns predictions using the previously trained model. The API should be deployable and accessible from a web application.

## Setup flask and dependencies

### Subtask:
Install necessary libraries like Flask and gunicorn.


**Reasoning**:
Install Flask and gunicorn using pip.



%pip install Flask gunicorn

## Save the trained model and preprocessor

### Subtask:
Save the trained model pipeline and the preprocessor to disk so they can be loaded by the API.


%pip install fairlearn

**Reasoning**:
Now that fairlearn is installed, I can re-run the code to train the model and preprocessor and save them to disk.



!gdown 1dZT7wMzJqRPzNRSNpcBcpf59wEc33Nd3

**Reasoning**:
Now that the data file is downloaded, I can re-run the code to load the data, train the model and preprocessor, and then save them to disk.



import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from fairlearn.metrics import MetricFrame, demographic_parity_difference, equalized_odds_difference
import joblib

# Load the dataset
try:
    df = pd.read_csv('data.csv', sep=';', quotechar='"')
except FileNotFoundError:
    print("Error: 'data.csv' not found. Make sure the dataset file is in the same directory.")
    raise FileNotFoundError("data.csv not found")


# --- Data preprocessing ---
y = df['Target']
X = df.drop(columns='Target')

# Sensitive features: kept in X (needed for fairness metrics) but not transformed
sensitive_features_list = [
    'Marital status', 'Application mode', 'Course',
    'Previous qualification', 'Nacionality',
    "Mother's qualification",
    'Educational special needs',
    'Tuition fees up to date', 'Gender',
    'Age at enrollment', 'International',
]

# Columns to transform: every non-sensitive column, split by dtype
object_columns = X.select_dtypes(include=['object']).columns
number_columns = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = object_columns.difference(sensitive_features_list)
numerical_features = number_columns.difference(sensitive_features_list)

# Scaling for numerical columns, one-hot encoding for categorical ones
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# remainder='passthrough' keeps the columns that are not listed in a
# transformer, which includes the sensitive features.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)


# --- Model training ---

# Stratified 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing followed by the classifier, as a single estimator
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
]
model_pipeline = Pipeline(steps=pipeline_steps)

print("Training the classification model...")
model_pipeline.fit(X_train, y_train)
print("Model training complete.")

# Persist both artifacts for the API: the full pipeline and the preprocessor
joblib.dump(model_pipeline, 'model_pipeline.joblib')
joblib.dump(preprocessor, 'preprocessor.joblib')

print("Model pipeline and preprocessor saved successfully.")

## Create the flask application

### Subtask:
Write the code for the Flask app, including loading the saved model and preprocessor.


**Reasoning**:
Write the Flask application code to load the model and preprocessor and define the necessary routes.



!pip install flask_cors pyngrok

import os
from flask import Flask, request, jsonify
import joblib
import pandas as pd
from flask_cors import CORS # Import CORS
from pyngrok import ngrok
import threading
from google.colab import userdata

# Locations of the artifacts written by the training cell
MODEL_PATH = 'model_pipeline.joblib'
PREPROCESSOR_PATH = 'preprocessor.joblib'

# Both artifacts are required; stop if either file is missing
try:
    model_pipeline = joblib.load(MODEL_PATH)
    preprocessor = joblib.load(PREPROCESSOR_PATH)
except FileNotFoundError:
    print(f"Error: Model or preprocessor file not found. Ensure '{MODEL_PATH}' and '{PREPROCESSOR_PATH}' exist.")
    exit()  # Exit if files are not found
else:
    print("Model pipeline and preprocessor loaded successfully.")

# Flask application with CORS enabled for all origins
app = Flask(__name__)
CORS(app)

# Define the root route
@app.route('/')
def index():
    """Root route: return a fixed plain-text message showing the app is up."""
    return "Flask app is running"

# Define the prediction route
@app.route('/predict', methods=['POST'])
def predict():
    """
    Predict the target class for the records posted as JSON.

    Request body:
        Either a list of JSON objects (one per record) or a single JSON
        object describing one record. Keys are the original, untransformed
        feature names used for training. Missing features are filled with
        NaN and unknown keys are dropped.

    Returns:
        200 with a JSON list of predictions (one per record),
        400 when the body contains no records,
        500 when the expected columns cannot be determined or prediction fails.
    """
    try:
        data = request.get_json()

        if not data:
            return jsonify({'error': 'Request body must contain at least one record.'}), 400

        # A single record posted as a flat JSON object cannot be turned into a
        # DataFrame directly (all-scalar dict), so wrap it in a list.
        # Column-oriented payloads (dict of lists) are left untouched.
        if isinstance(data, dict) and not any(isinstance(value, (list, tuple, dict)) for value in data.values()):
            data = [data]

        input_df = pd.DataFrame(data)

        # Column names seen when the preprocessor was fitted. Recent
        # scikit-learn versions record them on the fitted transformer;
        # fall back to the header of the training file otherwise.
        expected_columns = getattr(preprocessor, 'feature_names_in_', None)
        if expected_columns is None:
            try:
                expected_columns = pd.read_csv('data.csv', sep=';', quotechar='"', nrows=0).drop('Target', axis=1).columns
            except FileNotFoundError:
                print("Error: 'data.csv' not found. Cannot align input data columns.")
                return jsonify({'error': 'Internal server error: Original data columns not found.'}), 500
            except Exception as e:
                print(f"Error aligning input data columns: {e}")
                return jsonify({'error': f'Internal server error: Could not align input data columns: {e}'}), 500

        # Adds missing columns as NaN, drops extras, restores training order.
        input_df = input_df.reindex(columns=list(expected_columns))

        # The pipeline already contains the preprocessor as its first step,
        # so it must receive the raw DataFrame. Transforming first and then
        # calling the pipeline would preprocess twice and hand the
        # ColumnTransformer an array without the column names it selects by.
        predictions = model_pipeline.predict(input_df)

        return jsonify(predictions.tolist())

    except Exception as e:
        # Log with traceback for debugging; keep the client response generic.
        app.logger.exception('An error occurred during prediction: %s', e)
        return jsonify({'error': 'An internal error occurred. Please try again later.'}), 500

if __name__ == '__main__':
    # Get ngrok authtoken from Colab secrets
    NGROK_AUTH_TOKEN = userdata.get('NGROK_AUTH_TOKEN')

    # Set ngrok authtoken
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)

    # Start ngrok tunnel to the port the Flask app listens on
    public_url = ngrok.connect(5000).public_url
    print(f" * ngrok tunnel is live at {public_url}")

    # Start Flask app in a separate thread so the notebook cell returns.
    # debug is off on purpose: the server is reachable through a public
    # tunnel, and debug mode enables the interactive debugger, which must
    # never be exposed to untrusted clients.
    threading.Thread(
        target=app.run,
        kwargs={'host': '0.0.0.0', 'port': 5000, 'debug': False, 'use_reloader': False},
    ).start()