In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


In [2]:
# Analysis window and outlier cut-off, named so they can be tuned in one place.
YEAR_MIN, YEAR_MAX = 1970, 2022
CO2_PER_CAPITA_MAX = 50  # example threshold

df = pd.read_csv('data/processed/global_panel.csv')

# Non-numeric years become NaN here and are removed by the dropna() below.
df['year'] = pd.to_numeric(df['year'], errors='coerce')

df_ekc = df[['country', 'year', 'gdp', 'co2_per_capita', 'population']].dropna()

# A zero population would turn the ratio below into inf, which survives a
# later "> 0" filter and then breaks bubble sizes and the regression.
df_ekc = df_ekc[df_ekc['population'] > 0].copy()
df_ekc['gdp_per_capita'] = df_ekc['gdp'] / df_ekc['population']

# Keep the analysis window (both ends inclusive) and drop extreme emitters.
df_ekc = df_ekc[(df_ekc['year'] >= YEAR_MIN) & (df_ekc['year'] <= YEAR_MAX)]
df_ekc = df_ekc[df_ekc['co2_per_capita'] < CO2_PER_CAPITA_MAX]

In [3]:
# Snapshot of the most recent year in the filtered panel.
# Cast to int so the title reads "2022" rather than "2022.0" if the year
# column is float (pd.to_numeric with errors='coerce' yields float whenever a
# value was coerced to NaN).
latest_year = int(df_ekc['year'].max())
df_latest = df_ekc[df_ekc['year'] == latest_year]

# plotly.express is already imported as `px` in the notebook's import cell,
# so it is not re-imported here.
fig = px.scatter(
    df_latest,
    x='gdp_per_capita',
    y='co2_per_capita',
    hover_name='country',
    size='gdp_per_capita',
    log_x=True,
    title=f"CO₂ per Capita vs GDP per Capita ({latest_year})"
)
fig.show()

In [4]:
import pycountry_convert as pc

def get_continent(country):
    """Return the continent name for a country name, or None if unresolved.

    The lookup chains three pycountry_convert calls: country name -> alpha-2
    code -> continent code -> continent name.

    Parameters
    ----------
    country : the country name as it appears in the ``country`` column.

    Returns
    -------
    The continent name, or None when any step of the lookup fails.
    """
    try:
        country_code = pc.country_name_to_country_alpha2(country)
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except Exception:
        # Best-effort lookup: any library failure means "unknown continent".
        # Catching Exception instead of a bare `except:` lets
        # KeyboardInterrupt and SystemExit propagate, so a long run can still
        # be interrupted.
        return None


df_ekc['continent'] = df_ekc['country'].apply(get_continent)

In [5]:
# Keep only the modelling columns, then discard rows that are missing either
# measure, have negative emissions, or have non-positive GDP per capita.
df_ekc = (
    df_ekc[['country', 'year', 'gdp_per_capita', 'co2_per_capita', 'continent']]
    .dropna(subset=['gdp_per_capita', 'co2_per_capita'])
    .loc[lambda frame: frame['co2_per_capita'].ge(0) & frame['gdp_per_capita'].gt(0)]
)


In [6]:
# Cross-section for the analysis year (defaults to the latest year present),
# with the squared income term the quadratic EKC needs.
analysis_year = int(df_ekc['year'].max())
df_year = df_ekc.loc[df_ekc['year'] == analysis_year].assign(
    gdp_pc_sq=lambda frame: frame['gdp_per_capita'] ** 2
)

# EKC specification: co2_per_capita ~ gdp_per_capita + gdp_per_capita^2
model = smf.ols('co2_per_capita ~ gdp_per_capita + gdp_pc_sq', data=df_year).fit()
print(model.summary())

# Evaluate the fitted curve on an evenly spaced grid between the 1st and 99th
# income percentiles.
gdp_grid = np.linspace(
    df_year['gdp_per_capita'].quantile(0.01),
    df_year['gdp_per_capita'].quantile(0.99),
    200,
)
pred_df = pd.DataFrame({'gdp_per_capita': gdp_grid, 'gdp_pc_sq': gdp_grid ** 2})
pred_df['fitted'] = model.predict(pred_df)

# Observed countries as markers, fitted curve as a line.
fig = go.Figure(data=[
    go.Scatter(
        x=df_year['gdp_per_capita'],
        y=df_year['co2_per_capita'],
        mode='markers',
        name='Countries',
        hovertext=df_year['country'],
        marker={'size': 8},
    ),
    go.Scatter(
        x=pred_df['gdp_per_capita'],
        y=pred_df['fitted'],
        mode='lines',
        name='EKC (quadratic fit)',
    ),
])
fig.update_layout(
    title=f'Environmental Kuznets Curve (Quadratic) — {analysis_year}',
    xaxis_title='GDP per Capita',
    yaxis_title='CO₂ per Capita',
)
fig.update_xaxes(type='log')
fig.show()

                            OLS Regression Results                            
Dep. Variable:         co2_per_capita   R-squared:                       0.557
Model:                            OLS   Adj. R-squared:                  0.551
Method:                 Least Squares   F-statistic:                     101.2
Date:                Sat, 20 Dec 2025   Prob (F-statistic):           3.50e-29
Time:                        15:04:11   Log-Likelihood:                -447.16
No. Observations:                 164   AIC:                             900.3
Df Residuals:                     161   BIC:                             909.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.7290      0.482      1.

In [7]:
# Rows whose continent is missing, i.e. the country name was not resolved.
df_ekc.loc[lambda frame: frame['continent'].isna()]

Unnamed: 0,country,year,gdp_per_capita,co2_per_capita,continent
3245,Cote d'Ivoire,1970,2971.172857,0.414,
3246,Cote d'Ivoire,1971,2987.331983,0.434,
3247,Cote d'Ivoire,1972,3014.946978,0.459,
3248,Cote d'Ivoire,1973,3108.766930,0.465,
3249,Cote d'Ivoire,1974,3055.494256,0.501,
...,...,...,...,...,...
3737,Democratic Republic of Congo,2018,842.768703,0.042,
3738,Democratic Republic of Congo,2019,853.094574,0.044,
3739,Democratic Republic of Congo,2020,839.857837,0.045,
3740,Democratic Republic of Congo,2021,863.747533,0.043,


In [8]:
# Hand-assigned continents, keyed by country name. Written grouped by
# continent and flattened into a {country: continent} mapping.
manual_continent_fix = {
    country: continent
    for continent, countries in {
        "Africa": (
            "Democratic Republic of Congo",
            "Congo",
            "Cote d'Ivoire",
            "Ivory Coast",
            "Eswatini",
            "Cabo Verde",
            "São Tomé and Príncipe",
        ),
        "Asia": (
            "Timor",
            "State of Palestine",
        ),
        "Oceania": (
            "Micronesia (country)",
        ),
    }.items()
    for country in countries
}

In [9]:
def get_continent(country):
    """Return the continent name for a country name, or None if unresolved.

    Names listed in ``manual_continent_fix`` take precedence. Every other name
    is looked up through pycountry_convert: country name -> alpha-2 code ->
    continent code -> continent name.

    This definition supersedes the earlier ``get_continent`` in this notebook,
    which has no manual override step.

    Parameters
    ----------
    country : the country name as it appears in the ``country`` column.

    Returns
    -------
    The continent name, or None when the name is not in the manual table and
    the library lookup fails.
    """
    if country in manual_continent_fix:
        return manual_continent_fix[country]

    try:
        cc = pc.country_name_to_country_alpha2(country)
        cont = pc.country_alpha2_to_continent_code(cc)
        return pc.convert_continent_code_to_continent_name(cont)
    except Exception:
        # Best-effort lookup: any library failure means "unknown continent".
        # Catching Exception instead of a bare `except:` lets
        # KeyboardInterrupt and SystemExit propagate.
        return None

In [10]:
# Recompute the continent label for every row with the current get_continent.
df_ekc['continent'] = df_ekc['country'].map(get_continent)


In [11]:
# Minimum number of countries a continent needs in the analysis year before a
# quadratic EKC is fitted for it.
MIN_COUNTRIES_FOR_FIT = 10

# Restrict to the analysis year once, and drop rows without a continent label.
# get_continent() returns None for names it cannot resolve, and a None mixed
# in with the continent strings would make a sorted() over them raise TypeError.
df_cont_year = df_ekc[(df_ekc['year'] == analysis_year) & df_ekc['continent'].notna()]

cont_counts = df_cont_year.groupby('continent').size()
continents = [
    c for c in df_ekc['continent'].dropna().unique().tolist()
    if cont_counts.get(c, 0) >= MIN_COUNTRIES_FOR_FIT
]

fig = go.Figure()

# One marker trace per continent present in the analysis year; groupby yields
# the groups in sorted key order.
for c, sub in df_cont_year.groupby('continent'):
    fig.add_trace(go.Scatter(
        x=sub['gdp_per_capita'], y=sub['co2_per_capita'],
        mode='markers', name=str(c), hovertext=sub['country'],
        marker=dict(size=7), visible=True
    ))

# One quadratic fit per sufficiently populated continent, drawn between the
# 1st and 99th income percentiles of that continent.
for c in continents:
    sub = df_cont_year[df_cont_year['continent'] == c].copy()
    sub['gdp_pc_sq'] = sub['gdp_per_capita'] ** 2
    mod = smf.ols('co2_per_capita ~ gdp_per_capita + gdp_pc_sq', data=sub).fit()
    grid = np.linspace(sub['gdp_per_capita'].quantile(0.01), sub['gdp_per_capita'].quantile(0.99), 120)
    pred = pd.DataFrame({'gdp_per_capita': grid})
    pred['gdp_pc_sq'] = pred['gdp_per_capita'] ** 2
    pred['fitted'] = mod.predict(pred)
    fig.add_trace(go.Scatter(x=pred['gdp_per_capita'], y=pred['fitted'], mode='lines', name=f'{c} fit'))

fig.update_layout(title=f'EKC by Continent — {analysis_year}', xaxis_title='GDP per Capita', yaxis_title='CO₂ per Capita')
fig.update_xaxes(type='log')
fig.show()
