In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
import random
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

def define_files():
    """Return the mapping from dataset name to its CSV path under ``data/``.

    Returns:
        dict: Keys are short dataset identifiers, values are relative file
        paths, in the fixed order listed below.
    """
    # (name, path) pairs; insertion order is kept by the resulting dict.
    dataset_paths = (
        ("air_pollution_death", "data/air_pollution_death.csv"),
        ("transportation", "data/road-transportation_country_emissions.csv"),
        ("coal", "data/coal-mining_country_emissions.csv"),
        ("cropland", "data/cropland-fires_country_emissions.csv"),
        (
            "residential_commercial",
            "data/residential-and-commercial-onsite-fuel-usage_country_emissions.csv",
        ),
        ("forest_clearing", "data/forest-land-clearing_country_emissions.csv"),
        ("petrochemicals", "data/petrochemicals_country_emissions.csv"),
        ("electricity_generation", "data/electricity-generation_country_emissions.csv"),
        (
            "incineration_open_burning",
            "data/incineration-and-open-burning-of-waste_country_emissions.csv",
        ),
        ("health_expenditure", "data/health-expenditure.csv"),
        ("urban_population", "data/urban-population.csv"),
    )
    return dict(dataset_paths)

# Fix one seed for both Python's and NumPy's global random generators so
# repeated runs draw the same random numbers; the same value is also passed
# as ``random_state`` to the split and the model further down.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Load datasets with adjusted parameters for health_expenditure
def _strip_stray_quotes(frame):
    """Return a copy of *frame* with literal double quotes removed from cells.

    Parsing with ``csv.QUOTE_NONE`` keeps the quote characters as part of
    every value (e.g. ``'"ABW"'``), which breaks key matching and leaves
    numeric columns as strings. Empty strings become NaN, and a column is
    converted to numbers only when every non-missing value converts.
    """
    cleaned = frame.copy()
    for column in cleaned.columns:
        if pd.api.types.is_numeric_dtype(cleaned[column]):
            continue
        values = cleaned[column].map(
            lambda cell: cell.replace('"', '').strip() if isinstance(cell, str) else cell
        )
        values = values.replace('', np.nan)
        as_numbers = pd.to_numeric(values, errors='coerce')
        # Keep text columns (names, codes) as text; only promote a column
        # when nothing would be lost by the numeric conversion.
        if as_numbers.notna().sum() == values.notna().sum():
            values = as_numbers
        cleaned[column] = values
    return cleaned


def load_data():
    """Load the air-pollution, transportation and health-expenditure tables.

    The health-expenditure file is read with the first three rows skipped.
    If the strict parse fails, the lines containing an odd number of double
    quotes are printed for diagnosis and the file is re-read with quoting
    disabled (malformed rows are skipped). After such a relaxed read the
    quote characters that remain inside the values are stripped and numeric
    columns are restored, so the table stays usable for joins.

    Returns:
        tuple: ``(air_pollution_death, transportation, health_expenditure)``.
        ``health_expenditure`` is ``None`` when both read attempts fail.
    """
    import csv

    files = define_files()
    air_pollution_death = pd.read_csv(files['air_pollution_death'])
    transportation = pd.read_csv(files['transportation'])

    health_expenditure = None  # Stays None if every read attempt fails
    loaded_relaxed = False

    try:
        health_expenditure = pd.read_csv(
            files['health_expenditure'],
            skiprows=3,
            engine='python',
            sep=',',
            quotechar='"',
            skipinitialspace=True,
            on_bad_lines='warn',  # Warn instead of skipping silently
            quoting=csv.QUOTE_MINIMAL  # Standard CSV quoting
        )
    except Exception as e:
        # Deliberately broad: any failure of the strict parse triggers the
        # best-effort fallback below.
        print(f"Error occurred while reading health_expenditure.csv: {e}")

        # Read the file line-by-line to identify the problematic row
        with open(files['health_expenditure'], 'r', encoding='utf-8') as file:
            for i, line in enumerate(file, 1):
                if '"' in line and line.count('"') % 2 != 0:  # Unmatched quotes
                    print(f"Issue found on line {i}: {line.strip()}")

        # Attempt to load again with relaxed CSV parsing
        try:
            health_expenditure = pd.read_csv(
                files['health_expenditure'],
                skiprows=3,
                engine='python',
                sep=',',
                quotechar='"',
                skipinitialspace=True,
                on_bad_lines='skip',
                quoting=csv.QUOTE_NONE,  # Ignore quote issues
                escapechar='\\'
            )
            loaded_relaxed = True
            print("Loaded with relaxed parsing rules.")
        except Exception as e2:
            print(f"Failed again after relaxing CSV rules: {e2}")
            return air_pollution_death, transportation, None  # Gracefully handle failure

    if health_expenditure is not None:
        # Clean column names
        health_expenditure.columns = health_expenditure.columns.str.replace('"', '').str.strip()
        if loaded_relaxed:
            # QUOTE_NONE left the quotes inside the values too, not only in
            # the header; without this the country codes never match.
            health_expenditure = _strip_stray_quotes(health_expenditure)

    return air_pollution_death, transportation, health_expenditure

# Data preprocessing
def preprocess_data(air_pollution_death, transportation, health_expenditure):
    """Join the three tables on their country-code columns.

    Column names of the three input frames are stripped of surrounding
    whitespace in place. The join keys (``SpatialDimValueCode``,
    ``iso3_country`` and ``Country Code``) are normalized on copies of the
    frames — surrounding whitespace and literal double quotes are removed —
    so that codes such as ``'"USA"'`` or ``' USA'`` still match ``'USA'``.

    Args:
        air_pollution_death: DataFrame with a ``SpatialDimValueCode`` column.
        transportation: DataFrame with an ``iso3_country`` column.
        health_expenditure: DataFrame with a ``Country Code`` column.

    Returns:
        pandas.DataFrame: Inner join of the three tables.

    Raises:
        ValueError: If ``health_expenditure`` is ``None``, if the tables
            share no country code, or if the merged frame is empty.
    """
    if health_expenditure is None:
        raise ValueError(
            "health_expenditure is None (loading failed); cannot preprocess without it."
        )

    def _with_clean_key(frame, key):
        """Return a copy of *frame* whose *key* column has tidy string codes."""
        tidy = frame[key].map(
            lambda code: code.replace('"', '').strip() if isinstance(code, str) else code
        )
        return frame.assign(**{key: tidy})

    # Clean column names
    air_pollution_death.columns = air_pollution_death.columns.str.strip()
    transportation.columns = transportation.columns.str.strip()
    health_expenditure.columns = health_expenditure.columns.str.strip()

    # Check available columns
    print("Health Expenditure Columns:", health_expenditure.columns.tolist())

    # Normalize the join keys so formatting noise does not empty the merge
    air = _with_clean_key(air_pollution_death, 'SpatialDimValueCode')
    transport = _with_clean_key(transportation, 'iso3_country')
    health = _with_clean_key(health_expenditure, 'Country Code')

    # Identify common countries (missing codes never count as a match)
    common_countries = (
        set(air['SpatialDimValueCode'].dropna())
        & set(transport['iso3_country'].dropna())
        & set(health['Country Code'].dropna())
    )
    print(f"Common countries found: {len(common_countries)}")

    if not common_countries:
        raise ValueError("No common countries found across datasets. Check for mismatched country codes.")

    # Inner joins keep only the countries present in all three tables
    df = air.merge(
        transport, left_on='SpatialDimValueCode', right_on='iso3_country'
    ).merge(
        health, left_on='SpatialDimValueCode', right_on='Country Code'
    )

    # Check dataset size after merging
    print(f"Dataset size after merging: {df.shape}")

    if df.empty:
        raise ValueError("Merged DataFrame is empty. Check data consistency across datasets.")

    return df

# Model training and evaluation
def train_model(df):
    """Fit an XGBoost regressor predicting ``Air Pollution Deaths``.

    Features are every numeric column of *df* except the target and the
    ``SpatialDimValueCode`` key. The data is split 80/20 with the module's
    ``random_seed``, features are standardized with a scaler fitted on the
    training part only, and train/test R^2 scores are printed.

    Args:
        df: Merged DataFrame containing ``SpatialDimValueCode`` and
            ``Air Pollution Deaths`` columns.

    Returns:
        tuple: ``(model, X_train_val, X_test, y_train_val, y_test)`` where the
        two feature matrices are the scaled arrays produced by the scaler.

    Raises:
        ValueError: If *df* is empty, has no numeric feature column, or has
            fewer than two rows (an 80/20 split would leave one side empty).
    """
    if df.empty:
        raise ValueError("Cannot train model on an empty DataFrame.")

    # Drop the key columns and the target column, then select only numeric features.
    X = df.drop(['SpatialDimValueCode', 'Air Pollution Deaths'], axis=1)
    X = X.select_dtypes(include=[np.number])
    y = df['Air Pollution Deaths'].astype(float)

    print(f"Training samples: {X.shape[0]}")

    # Fail early with a readable message instead of an error from deep
    # inside scikit-learn.
    if X.shape[1] == 0:
        raise ValueError("No numeric feature columns available for training.")
    if X.shape[0] < 2:
        raise ValueError(
            f"Need at least 2 samples to create a train/test split, got {X.shape[0]}."
        )

    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_seed
    )

    # Fit the scaler on the training part only so no test statistics leak in.
    scaler = StandardScaler()
    X_train_val = scaler.fit_transform(X_train_val)
    X_test = scaler.transform(X_test)

    xgb_model = xgb.XGBRegressor(
        subsample=0.7,
        colsample_bytree=0.7,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=random_seed,
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6
    )

    xgb_model.fit(X_train_val, y_train_val)
    y_pred_train = xgb_model.predict(X_train_val)
    y_pred_test = xgb_model.predict(X_test)

    train_r2 = r2_score(y_train_val, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)

    print(f'Train R^2 Score: {train_r2}')
    print(f'Test R^2 Score: {test_r2}')

    return xgb_model, X_train_val, X_test, y_train_val, y_test

# SHAP beeswarm plot
def generate_beeswarm_plot(model, X_train_val):
    """Draw a SHAP beeswarm summary for *model* and save it as a PNG.

    Args:
        model: Fitted model accepted by ``shap.Explainer``.
        X_train_val: Feature matrix the SHAP values are computed on.

    The figure is written to ``beeswarm_plot.png`` in the working directory
    and then displayed.
    """
    explainer = shap.Explainer(model)
    shap_values = explainer(X_train_val)
    # "dot" is summary_plot's beeswarm-style type; "bee swarm" is not a
    # recognized plot_type. show=False keeps the figure open so that
    # savefig below captures the plot rather than an empty canvas.
    shap.summary_plot(shap_values, X_train_val, plot_type="dot", show=False)
    plt.savefig("beeswarm_plot.png", bbox_inches='tight')
    # Display only after saving; showing first would leave nothing to save.
    plt.show()

# HTML generation
def generate_html(df):
    """Write two interactive Plotly charts for *df* as standalone HTML files.

    ``scatter_plot.html`` plots ``Transportation Emissions`` against
    ``Air Pollution Deaths``, colored by country code;
    ``choropleth_map.html`` colors a map by ``Air Pollution Deaths``.

    Args:
        df: DataFrame providing the three columns named above.
            NOTE(review): nothing in this file creates a
            ``Transportation Emissions`` column — confirm the merged data
            actually has it.
    """
    country_col = 'SpatialDimValueCode'
    deaths_col = 'Air Pollution Deaths'

    scatter = px.scatter(
        df,
        x='Transportation Emissions',
        y=deaths_col,
        color=country_col,
    )
    scatter.write_html('scatter_plot.html')

    world_map = px.choropleth(
        df,
        locations=country_col,
        color=deaths_col,
        hover_name=country_col,
        hover_data=[deaths_col],
    )
    world_map.write_html('choropleth_map.html')

# Main execution
if __name__ == "__main__":
    # Pipeline: load -> merge -> train -> explain -> export charts.
    air_pollution_death, transportation, health_expenditure = load_data()
    # load_data() returns None for the health table when it cannot be
    # parsed; stop here with a clear message rather than failing later on
    # an attribute access.
    if health_expenditure is None:
        raise SystemExit("Could not load health_expenditure data; aborting.")
    df = preprocess_data(air_pollution_death, transportation, health_expenditure)
    model, X_train_val, X_test, y_train_val, y_test = train_model(df)
    generate_beeswarm_plot(model, X_train_val)
    generate_html(df)


Error occurred while reading health_expenditure.csv: ',' expected after '"'
Loaded with relaxed parsing rules.
Health Expenditure Columns: ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', 'Unnamed: 68']


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.