In [None]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('../../data/per-capita-ghg-emissions.csv')

# Filter the DataFrame to include only the years from 2010 onwards
df_filtered = df[df['Year'] >= 2010]

# Group the data by country code
grouped = df_filtered.groupby('Code')

# Function to calculate percentage reduction for each country
def calculate_percentage_reduction(group):
    if 2010 in group['Year'].values:
        start_year_emission = group[group['Year'] == 2010]['Per-capita greenhouse gas emissions in CO₂ equivalents'].values[0]
        end_year_emission = group[group['Year'] == group['Year'].max()]['Per-capita greenhouse gas emissions in CO₂ equivalents'].values[0]
        percentage_reduction = ((start_year_emission - end_year_emission) / start_year_emission) * 100
    else:
        percentage_reduction = float('nan')  # Assign NaN if 2010 data is not available
    return percentage_reduction

# Create a new DataFrame with unique countries and their percentage reduction
percentage_reduction_df = grouped.apply(calculate_percentage_reduction).reset_index()
percentage_reduction_df.columns = ['Code', 'Percentage Reduction']

# Optional: Merge with country names or other relevant information if needed
# Example:
# percentage_reduction_df = df[['Code', 'Country']].drop_duplicates().merge(percentage_reduction_df, on='Code', how='left')

# Output the new DataFrame
percentage_reduction_df.head()

Unnamed: 0,Code,Percentage Reduction
0,AFG,20.375948
1,AGO,52.263295
2,ALB,7.568916
3,AND,34.203987
4,ARE,-17.575822


In [None]:
import pandas as pd

# Load the dataset
file_path = '../../data/modern-renewable-energy-consumption.csv'
df = pd.read_csv(file_path)

# Filter the dataframe for the year 2022
df_2022 = df[df['Year'] == 2022].copy()

# Define the columns related to renewable energy sources
renewable_sources = [
    'Other renewables (including geothermal and biomass) electricity generation - TWh',
    'Solar generation - TWh',
    'Wind generation - TWh',
    'Hydro generation - TWh'
]

# Calculate the total renewable energy generation
df_2022['Total Renewable Energy Generation (TWh)'] = df_2022[renewable_sources].sum(axis=1)

# Calculate the adoption rate as a percentage of the total renewable energy generation
df_2022['Renewable Energy Adoption Rate (%)'] = df_2022['Total Renewable Energy Generation (TWh)'] / df_2022['Total Renewable Energy Generation (TWh)'].sum() * 100

# Select relevant columns for the new dataframe
df_adoption_rate = df_2022[['Entity', 'Code', 'Year', 'Total Renewable Energy Generation (TWh)', 'Renewable Energy Adoption Rate (%)']]

# Display the new dataframe
df_adoption_rate = df_adoption_rate.dropna()
df_adoption_rate.head()

Unnamed: 0,Entity,Code,Year,Total Renewable Energy Generation (TWh),Renewable Energy Adoption Rate (%)
150,Algeria,DZA,2022,0.681,0.001322
234,Argentina,ARG,2022,42.227462,0.081992
411,Australia,AUS,2022,87.441743,0.169784
450,Austria,AUT,2022,49.715355,0.096531
504,Azerbaijan,AZE,2022,1.84255,0.003578


In [None]:
# Load the dataset
file_path = '../../data/gdp-per-capita-worldbank.csv'
df_gdp = pd.read_csv(file_path)

# Filter the dataframe for the year 2022
df_gdp = df_gdp[df_gdp['Year'] == 2022]
df_gdp = df_gdp.rename(columns={'GDP per capita, PPP (constant 2017 international $)': 'GDP per capita, PPP ($)'})

df_region = pd.read_csv('../../data/region_mapping.csv')
df_region = df_region[['alpha-3', 'region']]

df_gdp = df_gdp.merge(df_region, left_on='Code', right_on='alpha-3')
df_gdp.head()

Unnamed: 0,Entity,Code,Year,"GDP per capita, PPP ($)",alpha-3,region
0,Albania,ALB,2022,15492.067,ALB,Europe
1,Algeria,DZA,2022,11198.233,DZA,Africa
2,Angola,AGO,2022,5906.1157,AGO,Africa
3,Antigua and Barbuda,ATG,2022,22321.87,ATG,Americas
4,Argentina,ARG,2022,22461.441,ARG,Americas


In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming df_merged is your merged DataFrame containing the necessary data
df_merged = df_merged.rename(columns={'region': 'Region'})

# Determine the top 5 highest GDP per capita
top_gdp_countries = df_merged.nlargest(5, 'GDP per capita, PPP ($)')['Entity'].tolist()

# Determine the top 5 highest percentage reduction in GHG emissions
top_reduction_countries = df_merged.nlargest(5, 'Percentage Reduction')['Entity'].tolist()

# Identify outliers using Z-score (for both GDP per capita and Percentage Reduction)
df_merged['Z_GDP'] = (df_merged['GDP per capita, PPP ($)'] - df_merged['GDP per capita, PPP ($)'].mean()) / df_merged['GDP per capita, PPP ($)'].std()
df_merged['Z_Reduction'] = (df_merged['Percentage Reduction'] - df_merged['Percentage Reduction'].mean()) / df_merged['Percentage Reduction'].std()

outliers = df_merged[(df_merged['Z_GDP'].abs() > 2) | (df_merged['Z_Reduction'].abs() > 2)]['Entity'].tolist()

# Combine all the selected countries
countries_to_label = list(set(top_gdp_countries + top_reduction_countries + outliers))

# Plotting
plt.figure(figsize=(14, 8))

# Scatter plot with trendline
sns.regplot(
    data=df_merged,
    x='GDP per capita, PPP ($)',  # X-Axis
    y='Percentage Reduction',  # Y-Axis
    scatter=False,  # Don't show scatter points here
    color='blue',  # Color of the trendline
    line_kws={'label': 'Trendline'}  # Label for the trendline
)

# Add the scatter points separately to keep the original styling
scatter = sns.scatterplot(
    data=df_merged,
    x='GDP per capita, PPP ($)',  # X-Axis
    y='Percentage Reduction',  # Y-Axis
    size='Renewable Energy Adoption Rate (%)',  # Circle Size
    hue='Region',  # Circle Color
    palette='Set1',
    sizes=(20, 200),
    alpha=0.7,
    edgecolor='w',
    linewidth=0.5
)

# Add text labels only for selected countries based on statistical criteria
for line in range(0, df_merged.shape[0]):
    if df_merged['Entity'].iloc[line] in countries_to_label:
        plt.text(
            df_merged['GDP per capita, PPP ($)'].iloc[line], 
            df_merged['Percentage Reduction'].iloc[line], 
            df_merged['Entity'].iloc[line], 
            horizontalalignment='left', 
            size='medium', 
            color='black', 
            weight='semibold'
        )

# Customize the plot
plt.title('Economic Context and Policy Effectiveness')
plt.xlabel('GDP per Capita, PPP ($)')
plt.ylabel('Percentage Reduction in GHG Emissions')

# First, create the legend for 'region' (hue)
hue_legend = scatter.legend(title='Region', bbox_to_anchor=(0.5, -0.15), loc='upper center', ncol=3)
scatter.add_artist(hue_legend)  # Add the hue legend back after customizing it

# Now create the legend for 'Renewable Energy Adoption Rate (%)' (size)
size_legend = plt.legend(title='Renewable Energy Adoption Rate (%)', loc='lower right', bbox_to_anchor=(1, -0.15), ncol=1)

# Show the trendline in the legend
plt.legend(loc='best')

plt.grid(True, linestyle='--', alpha=0.5)

plt.show()

NameError: name 'df_merged' is not defined

In [None]:
merged_df.to_csv('save.csv', index=False)