<a href="https://colab.research.google.com/github/william-toscani/Data_Visualization_Project/blob/main/Data_Viz_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Libraries

In [None]:
!pip install pandasql

In [13]:
import requests
import pandas as pd
from pandasql import sqldf
sql = lambda q: sqldf(q, globals())

import plotly.express as px

# Datasets

- World
- GDP
- CO2 Emissions
- Share energy
- Population
- Energy consumption

In [None]:
# Country reference table (ISO-3166 list with regional codes).
# The hyphenated headers are renamed right after loading so that the SQL
# below can reference them without quoting.
world_raw = pd.read_csv(
    "https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv"
).rename(columns={'alpha-3': 'alpha_3', 'sub-region': 'sub_region'})

# world_clean exposes exactly four columns: country, code, continent, sub_region.
world_clean = sql(""" SELECT name as country, alpha_3 as code, region as continent, sub_region FROM world_raw """)

display(world_clean)

#####################################################################

# GDP per country, loaded from the Our World in Data "gdp-worldbank" grapher export.
gdp_raw = pd.read_csv("https://ourworldindata.org/grapher/gdp-worldbank.csv?v=1&csvType=full&useColumnShortNames=true", storage_options = {'User-Agent': 'Our World In Data data fetch/1.0'})
#gdp_metadata = requests.get("https://ourworldindata.org/grapher/gdp-worldbank.metadata.json?v=1&csvType=full&useColumnShortNames=true").json()

# 2024 GDP for every code present in world_clean, largest first.
# world_clean has no `name` column (it exposes `country`), so the country label
# is aliased back to `name`, which is the column name the later cells read.
gdp_clean = sql("""
SELECT wc.code, wc.country AS name, ny_gdp_mktp_pp_kd AS gdp FROM gdp_raw
INNER JOIN world_clean AS wc ON gdp_raw.Code = wc.code
WHERE year = 2024
ORDER BY gdp DESC
""")

display(gdp_clean)

#####################################################################

# Annual CO2 emissions per country (Our World in Data grapher export).
# The frame is only loaded here; the SQL in later cells queries it directly.
emission_raw = pd.read_csv(
    "https://ourworldindata.org/grapher/annual-co2-emissions-per-country.csv?v=1&csvType=full&useColumnShortNames=true",
    storage_options={'User-Agent': 'Our World In Data data fetch/1.0'},
)
#emission_metadata = requests.get("https://ourworldindata.org/grapher/annual-co2-emissions-per-country.metadata.json?v=1&csvType=full&useColumnShortNames=true").json()

#####################################################################

# Share of primary energy consumption by source (Our World in Data grapher export).
share_raw = pd.read_csv("https://ourworldindata.org/grapher/share-of-primary-energy-consumption-by-source.csv?v=1&csvType=full&useColumnShortNames=true", storage_options = {'User-Agent': 'Our World In Data data fetch/1.0'})
#metadata = requests.get("https://ourworldindata.org/grapher/share-of-primary-energy-consumption-by-source.metadata.json?v=1&csvType=full&useColumnShortNames=true").json()

# Column groups used by the queries and charts below.
fossil_columns = ['gas', 'oil', 'coal', 'nuclear']
renew_cols = ['wind', 'hydro', 'other_renewables', 'solar', 'biofuels']

# Drop the common suffix so each source column is addressed by its short name.
share_suffix = '__pct_direct_primary_energy'
new_columns = {col: col.replace(share_suffix, '') for col in share_raw.columns if share_suffix in col}
share_raw = share_raw.rename(columns=new_columns)

# 2024 shares for European countries that also appear in gdp_clean.
# world_clean exposes `country` / `continent` (not `name` / `region`); they are
# aliased here so the result keeps the `name` and `region` columns the charts use.
# The literal is single-quoted: in SQL a double-quoted token is an identifier.
share_clean = sql(f"""
SELECT wc.country AS name, wc.code, wc.continent AS region, {",".join(fossil_columns)}, {",".join(renew_cols)} FROM share_raw
INNER JOIN gdp_clean ON share_raw.Code = gdp_clean.code
INNER JOIN world_clean AS wc ON share_raw.Code = wc.code
WHERE year = 2024 AND wc.continent = 'Europe'
""")

display(share_clean)

#####################################################################

# Population per country, from the OWID export that includes UN projections.
pop_raw = pd.read_csv(
    "https://ourworldindata.org/grapher/population-with-un-projections.csv?v=1&csvType=full&useColumnShortNames=true",
    storage_options={'User-Agent': 'Our World In Data data fetch/1.0'},
)
#pop_metadata = requests.get("https://ourworldindata.org/grapher/population-with-un-projections.metadata.json?v=1&csvType=full&useColumnShortNames=true").json()

# Keep the 2024 rows whose code is present in world_clean and expose the
# long source column under the short name `population`.
pop_clean = sql("""
SELECT pop_raw.Code, population__sex_all__age_all__variant_medium as population FROM pop_raw
INNER JOIN world_clean ON pop_raw.Code = world_clean.code
WHERE year = 2024
""")

display(pop_clean)

#####################################################################

# Energy consumption by source and country (Our World in Data grapher export).
energy_cons_raw = pd.read_csv(
    "https://ourworldindata.org/grapher/energy-consumption-by-source-and-country.csv?v=1&csvType=full&useColumnShortNames=true",
    storage_options={'User-Agent': 'Our World In Data data fetch/1.0'},
)
#metadata = requests.get("https://ourworldindata.org/grapher/energy-consumption-by-source-and-country.metadata.json?v=1&csvType=full&useColumnShortNames=true").json()

# Every column except the three identifier columns contributes to the row total.
columns_to_sum = energy_cons_raw.columns[
    ~energy_cons_raw.columns.isin(['Year', 'Code', 'Entity'])
].tolist()
energy_cons_raw = energy_cons_raw.assign(
    total_consumption=energy_cons_raw[columns_to_sum].sum(axis=1)
)

# 2024 totals, restricted to the codes that are present in gdp_clean.
energy_cons_clean = sql("""
SELECT energy_cons_raw.code, total_consumption FROM energy_cons_raw
INNER JOIN gdp_clean ON energy_cons_raw.Code = gdp_clean.code
WHERE year = 2024
""")

display(energy_cons_clean)

#####################################################################


# Disabled experiment, kept for reference only: an alternative Kaggle
# energy-consumption dataset. It is stored as comments rather than as a bare
# triple-quoted string, because a string that is the last expression of a cell
# is echoed as the cell's output.
# The query was never finished (its WHERE clause has no condition) and
# `kagglehub` is not imported in the notebook's import cell.
#
# path = kagglehub.dataset_download("pralabhpoudel/world-energy-consumption")
# cons_raw = pd.read_csv(f"{path}/World Energy Consumption.csv")
# #display(cons_raw)
#
# cons_clean = sql("""
# SELECT * FROM cons_raw
# INNER JOIN world_clean ON cons_raw.iso_code = world_clean.code
# WHERE year
# """)
# display(cons_clean)

# Data Analysis

## Top 10 countries for renewable energy

In [None]:
# share_clean only carries one column per energy source, so the total renewable
# share is computed here (sum of the columns listed in renew_cols) and the ten
# largest rows are kept, matching the chart title.
renewable_share_top10 = (
    share_clean
    .assign(renewable=share_clean[renew_cols].sum(axis=1))
    .nlargest(10, 'renewable')
)

fig1 = px.bar(renewable_share_top10,
             x='renewable',
             y='name',
             color='region',
             orientation='h',
             title='Top 10 countries for renewable share',
             text='renewable')

# Fixed-point format: the SI format ('.2s') would print values below 1 with a
# milli prefix (e.g. "500m") instead of a percentage.
fig1.update_traces(texttemplate='%{text:.2f}%', textposition='inside') # Position text inside the bar
fig1.update_layout(yaxis={'categoryorder':'total ascending', 'title': ''}, # Remove y-axis title
                    xaxis={'title': ''},
                    width=1000, height=1000, font=dict(size=15),
                    title_x=0.5) # Center the title
fig1.show()

## Top 10 countries for fossil energy

In [None]:
# fossil_share_clean is built here so the cell runs on a fresh kernel: the total
# is the sum of the columns listed in fossil_columns, and the ten largest rows
# are kept to match the chart title.
# NOTE(review): fossil_columns as defined in the dataset cell also contains
# 'nuclear' - confirm that it should count towards this total.
fossil_share_clean = (
    share_clean
    .assign(fossil=share_clean[fossil_columns].sum(axis=1))
    .nlargest(10, 'fossil')
)

fig2 = px.bar(fossil_share_clean,
             x='fossil',
             y='name',
             color='region',
             orientation='h',
             title='Top 10 countries for fossil share',
             text='fossil')

fig2.update_traces(texttemplate='%{text:.2f}%', textposition='inside') # Position text inside the bar
fig2.update_layout(yaxis={'categoryorder':'total ascending', 'title': ''}, # Remove y-axis title
                    xaxis={'title': ''},
                    width=1000, height=1000, font=dict(size=15),
                    title_x=0.5) # Center the title
fig2.show()

## Emissions and energy consumption per GDP (2D scatter)

In [None]:
# Scatter of emissions per GDP against consumption per GDP, coloured by region.
# NOTE(review): `merged_data` is not defined in any cell visible in this
# notebook - it must already exist in the kernel for this cell to run.
fig = px.scatter(
    merged_data,
    x='emissions_per_gdp',
    y='consumption_per_gdp',
    color='region',
    hover_data=['Code'],
)
fig.update_traces(marker={'size': 12})    # larger points
fig.update_layout(height=800, width=800)  # square canvas
fig.show()

## Top Countries for Renewable Share (by Source) v1 and v2

In [None]:
# Sources shown in the stacked chart. Note that this reassigns renew_cols with a
# list that also contains 'nuclear'.
renew_cols = ['hydro', 'nuclear', 'wind', 'solar', 'other_renewables', 'biofuels']

# Long format: one row per (country, source).
# The identifier column produced by the share query is lower-case `code`;
# asking melt for 'Code' raises a KeyError.
share_clean_melted = pd.melt(share_clean,
                             id_vars=['name', 'region', 'code'],
                             value_vars=renew_cols,
                             var_name='renewable_source',
                             value_name='share_value')

# Drop the sources that contribute exactly zero so they add no empty segments.
share_clean_melted = share_clean_melted[share_clean_melted['share_value'] != 0]

display(share_clean_melted.head())

In [None]:
# One fixed colour per energy source; the insertion order of this mapping is
# also the stacking / legend order passed to the chart.
color_map = dict(
    hydro='#1F77B4',             # Vibrant Blue
    nuclear='#9467BD',           # Vibrant Purple
    wind='#17BECF',              # Vibrant Cyan
    solar='#FFD700',             # Vibrant Gold/Yellow
    other_renewables='#2CA02C',  # Vibrant Green
    biofuels='#8C564B',          # Vibrant Brown
)

# Horizontal stacked bars: one bar per country, one segment per source.
fig3 = px.bar(
    share_clean_melted,
    x='share_value',
    y='name',
    color='renewable_source',
    orientation='h',
    title='Top Countries for Renewable Share (by Source)',
    category_orders={'renewable_source': list(color_map)},
    color_discrete_map=color_map,
)

fig3.update_traces(textposition='none')  # no labels inside the segments
fig3.update_layout(
    yaxis=dict(categoryorder='total ascending', title=''),
    xaxis=dict(title=''),
    width=1000,
    height=1000,
    font={'size': 15},
    title_x=0.5,
    legend_title_text='Renewable Sources',
)
fig3.show()

In [None]:
# Pastel background colour per region, drawn as a band behind each country's bar.
region_pastel_color_map = {
    'Europe': '#CBD5E8',  # Light Blue
    'Oceania': '#B3E2CD',  # Light Green
    'Americas': '#FDCDAC', # Light Orange
    'Asia': '#F4CAE4',    # Light Pink
    'Africa': '#E6F5C9',  # Pale Yellow Green
    None: '#CCCCCC' # Grey for any missing region
}

# Bottom-to-top order of the countries on the y-axis: ascending total share.
# It is derived here instead of being read from leftover kernel state, and then
# pinned on the axis so the band positions below are guaranteed to line up.
y_categories_ordered = (
    share_clean_melted
    .groupby('name')['share_value']
    .sum()
    .sort_values()
    .index
    .tolist()
)
fig3.update_yaxes(categoryorder='array', categoryarray=y_categories_ordered)

# One lookup table instead of filtering share_clean once per country.
region_by_country = share_clean.drop_duplicates('name').set_index('name')['region']

# One rectangle per country. On a categorical axis the i-th category sits at
# position i, so [i - 0.5, i + 0.5] covers that country's full row.
# Plain dicts are used for the shapes, so no plotly.graph_objects import is needed.
shapes_to_add = [
    dict(
        type="rect",
        xref="x",
        yref="y",
        x0=0,    # Start at the beginning of the x-axis
        x1=100,  # End at the maximum of the x-axis (total percentage)
        y0=i - 0.5,
        y1=i + 0.5,
        # Grey when the region is missing or not in the colour map.
        fillcolor=region_pastel_color_map.get(region_by_country.get(country_name), '#CCCCCC'),
        layer="below",        # Place behind the bars
        line=dict(width=0),   # No border
        opacity=0.6,
    )
    for i, country_name in enumerate(y_categories_ordered)
]

# Replace fig3's shapes with the bands and show the modified figure.
fig3.update_layout(shapes=shapes_to_add)
fig3.show()

## Top 20 countries by emissions per GDP

In [None]:
# Horizontal bars of emissions per GDP, one bar per country code.
# NOTE(review): `merged_data` is not defined in any cell visible in this
# notebook, and nothing in this cell limits it to 20 rows - confirm that the
# frame is already restricted to the intended countries.
fig = px.bar(
    merged_data,
    x='emissions_per_gdp',
    y='Code',
    color='region',
    orientation='h',
    title='Top 20 Countries by Emissions per GDP',
    text='emissions_per_gdp',
)

# Scientific notation labels, placed outside the bars.
fig.update_traces(textposition='outside', texttemplate='%{text:.2e}')
fig.update_layout(
    yaxis=dict(categoryorder='total ascending', title=''),
    xaxis=dict(title='Emissions per GDP'),
    width=1000,
    height=800,
    font={'size': 15},
    title_x=0.5,
)
fig.show()

## Emissions per GDP

In [None]:
# CO2 emissions divided by GDP for 2024, one row per country, highest ratio first.
# world_clean exposes `country` / `continent` (there is no `name` / `region`
# column), so both are aliased to the names the cells below read.
# The CAST keeps the division in floating point.
global_emissions_gdp = sql("""
SELECT
    wc.country AS name,
    wc.code,
    wc.continent AS region,
    er.emissions_total,
    gr.ny_gdp_mktp_pp_kd AS gdp,
    (CAST(er.emissions_total AS REAL) / gr.ny_gdp_mktp_pp_kd) AS emissions_per_gdp
FROM
    world_clean AS wc
INNER JOIN
    emission_raw AS er ON wc.code = er.Code
INNER JOIN
    gdp_raw AS gr ON wc.code = gr.Code
WHERE
    er.Year = 2024 AND gr.Year = 2024

ORDER BY emissions_per_gdp DESC
""")

display(global_emissions_gdp)

In [None]:
# Global range of the emissions/GDP ratio; both bounds are reused by the
# normalisation in the next cell.
min_emissions_per_gdp, max_emissions_per_gdp = (
    global_emissions_gdp['emissions_per_gdp'].min(),
    global_emissions_gdp['emissions_per_gdp'].max(),
)

print(f"Global Minimum Emissions per GDP: {min_emissions_per_gdp}")
print(f"Global Maximum Emissions per GDP: {max_emissions_per_gdp}")

In [None]:
# Min-max scale the ratio against the global bounds (computed over all
# countries), then keep the 20 rows with the highest original ratio.
plot_data = (
    global_emissions_gdp
    .assign(
        emissions_per_gdp_normalized=lambda frame: (
            (frame['emissions_per_gdp'] - min_emissions_per_gdp)
            / (max_emissions_per_gdp - min_emissions_per_gdp)
        )
    )
    .sort_values(by='emissions_per_gdp', ascending=False)
    .iloc[:20]
)

display(plot_data.head())

In [None]:
# Horizontal bars of the normalised ratio for the rows kept in plot_data.
fig = px.bar(
    plot_data,
    x='emissions_per_gdp_normalized',
    y='name',
    color='region',
    orientation='h',
    title='Top 20 Countries by Normalized CO2 Emissions per GDP (2024)',
    text='emissions_per_gdp_normalized',
)

# Two-decimal labels, placed outside the bars.
fig.update_traces(textposition='outside', texttemplate='%{text:.2f}')
fig.update_layout(
    yaxis=dict(categoryorder='total ascending', title=''),
    xaxis=dict(title='Normalized Emissions per GDP'),
    width=1000,
    height=800,
    font={'size': 15},
    title_x=0.5,
)
fig.show()

## Bump plot trial

In [None]:
# Rank by GDP, largest value gets rank 1 (pandas' default 'average' tie method,
# spelled out here), stored as integers in a new `gdp_rank` column.
# NOTE(review): `gdp_top20` is not defined in any cell visible in this notebook.
gdp_top20['gdp_rank'] = (
    gdp_top20['gdp']
    .rank(method='average', ascending=False)
    .astype(int)
)
display(gdp_top20.head())

In [None]:
# Put both rankings side by side, keeping only countries present in both frames.
# NOTE(review): `emissions_rank_gdp_top20` is not defined in any cell visible
# in this notebook.
merged_ranks = gdp_top20[['name', 'region', 'gdp_rank']].merge(
    emissions_rank_gdp_top20[['name', 'emissions_per_gdp_rank']],
    how='inner',
    on='name',
)
display(merged_ranks.head())

In [None]:
# Long format: two rows per country, the GDP rank first and the
# emissions-per-GDP rank second (melt stacks value_vars in the order given).
melted_ranks = merged_ranks.melt(id_vars=['name', 'region'],
                                 value_vars=['gdp_rank', 'emissions_per_gdp_rank'],
                                 var_name='rank_type',
                                 value_name='rank_value')

display(melted_ranks.head())

melted_ranks['rank_value'] = pd.to_numeric(melted_ranks['rank_value'])
# info() prints its report itself and returns None, so it is not wrapped in display().
melted_ranks.info()

# Label placement for the two points of each line. The line chart draws one
# trace per country, and update_traces applies the same list to every trace,
# so the list must describe the points of ONE trace: left end (gdp_rank), then
# right end (emissions_per_gdp_rank). A list with one entry per melted row
# would give every trace its first two entries, i.e. 'middle left' twice.
text_positions = ['middle left', 'middle right']

fig = px.line(melted_ranks,
              x='rank_type',
              y='rank_value',
              color='region',
              line_group='name',
              text='name',  # Label each point with the country name
              hover_name='name',
              title='Shifts in Country Rankings: GDP vs. Emissions per GDP',
              labels={'rank_type': 'Rank Type', 'rank_value': 'Rank'})

fig.update_layout(yaxis={'autorange': 'reversed'}, width=800, height=800) # Rank 1 at the top, square canvas
fig.update_traces(mode='lines+markers+text', marker=dict(size=12), # Larger markers
                  line=dict(width=7), # Thicker lines
                  textposition=text_positions) # Left label on the left point, right label on the right point
fig.show()

# Scaletta Storytelling - Emissioni e Consumo Energetico

## 1. Introduzione - Il quadro globale

- Line chart emissioni globali (2004-2024)
- Treemap emissioni per continente (2024)

In [None]:

# Total 2024 CO2 emissions per continent, largest first.
# world_clean exposes the continent as `continent` (the query previously used the
# misspelt `continet` and a non-existent `region` column). It is aliased to
# `region` because the treemap cell reads emission_clean['region'].
emission_clean = sql("""
SELECT wc.continent AS region, SUM(emissions_total) AS total_emissions_per_region FROM emission_raw
INNER JOIN world_clean AS wc ON emission_raw.Code = wc.code
WHERE year = 2024 AND wc.continent IS NOT NULL
GROUP BY wc.continent
ORDER BY total_emissions_per_region DESC
""")

display(emission_clean)

# ANALISI OK

In [None]:
# Fixed colour per continent, taken from plotly's qualitative Dark2 palette.
color_map = {
    'Asia': px.colors.qualitative.Dark2[5],
    'Africa': px.colors.qualitative.Dark2[6],
    'Oceania': px.colors.qualitative.Dark2[0],
    'Americas': px.colors.qualitative.Dark2[1],
    'Europe': px.colors.qualitative.Dark2[2],
}

# Tile label: region name on the first line, total (divided by 1e9, one decimal,
# suffixed "Bt") on the second. The two parts were previously concatenated with
# an empty separator, which produced labels such as "Asia20.1 Bt".
emission_clean['region_label_with_emissions'] = (
    emission_clean['region']
    + '<br>'
    + emission_clean['total_emissions_per_region'].apply(lambda x: f'{x/1e9:.1f} Bt')
)

fig = px.treemap(emission_clean, path=['region_label_with_emissions'], # Use the combined label
                 values='total_emissions_per_region',
                 color='region',
                 title='Carbon dioxide emissions by continents (billion tonnes)',
                 color_discrete_map=color_map
                 )

fig.update_layout(
    margin = dict(t=50, l=25, r=25, b=25),
    width=800, height=800,
    title_x=0.5, # Center the title horizontally
    title_y=0.95,
    title_font_size=24 # Larger title
)

fig.update_traces(
    textfont_color='white', # White tile text
    textfont_size=24, # Larger tile text
    textposition='middle center' # Center the text in each tile
)

fig.show()

In [None]:
# Reload of the share-of-primary-energy dataset; this rebuilds share_raw,
# the two column lists and share_clean from scratch for the map below.
share_raw = pd.read_csv("https://ourworldindata.org/grapher/share-of-primary-energy-consumption-by-source.csv?v=1&csvType=full&useColumnShortNames=true", storage_options = {'User-Agent': 'Our World In Data data fetch/1.0'})
#metadata = requests.get("https://ourworldindata.org/grapher/share-of-primary-energy-consumption-by-source.metadata.json?v=1&csvType=full&useColumnShortNames=true").json()

fossil_columns = ['gas', 'oil', 'coal', 'nuclear']
renew_cols = ['wind', 'hydro', 'other_renewables', 'solar', 'biofuels']

# Drop the common suffix so each source column is addressed by its short name.
share_suffix = '__pct_direct_primary_energy'
new_columns = {col: col.replace(share_suffix, '') for col in share_raw.columns if share_suffix in col}
share_raw = share_raw.rename(columns=new_columns)

# 2024 shares for European countries that also appear in gdp_clean.
# world_clean exposes `country` / `continent` (not `name` / `region`); they are
# aliased here so the result keeps the `name` and `region` columns read below.
# The literal is single-quoted: in SQL a double-quoted token is an identifier.
share_clean = sql(f"""
SELECT wc.country AS name, wc.code, wc.continent AS region, {",".join(fossil_columns)}, {",".join(renew_cols)} FROM share_raw
INNER JOIN gdp_clean ON share_raw.Code = gdp_clean.code
INNER JOIN world_clean AS wc ON share_raw.Code = wc.code
WHERE year = 2024 AND wc.continent = 'Europe'
""")

display(share_clean)

In [None]:
# For every country: which renewable source has the largest share, and how large
# that share is. Computed column-wise instead of row by row with iterrows().

# Numeric view of the renewable columns only; anything non-numeric becomes NaN.
renewable_shares = share_clean[renew_cols].apply(pd.to_numeric, errors='coerce')

# idxmax is only evaluated on rows that have at least one value; rows without
# any value get NaN for both the source and the share after the reindex.
has_data = renewable_shares.notna().any(axis=1)
primary_renewable_source = (
    renewable_shares[has_data]
    .idxmax(axis=1)
    .reindex(renewable_shares.index)
)
share_value = renewable_shares.max(axis=1)

# Identifier columns from share_clean plus the two derived columns.
renewable_ranking = share_clean[['name', 'code', 'region']].assign(
    primary_renewable_source=primary_renewable_source,
    share_value=share_value,
)

# Preview of the five countries with the largest leading share.
display(renewable_ranking.sort_values(by='share_value', ascending=False).head())

In [None]:
# Define a color map for the primary renewable sources for better visualization
# You can customize these colors further if needed
# Colour assigned to each energy source label on the map.
energy_source_color_map = dict(
    hydro='steelblue',
    nuclear='darkorchid',
    wind='mediumseagreen',
    solar='gold',
    other_renewables='darkorange',
    biofuels='saddlebrown',
    gas='lightgray',
    oil='dimgray',
    coal='black',
)

# Choropleth: each country in renewable_ranking is coloured by its leading source.
fig_renewable_choropleth = px.choropleth(
    renewable_ranking,
    locations='code',
    color='primary_renewable_source',
    hover_name='name',
    # Hover shows the share with two decimals plus the source label.
    hover_data={'share_value': ':.2f', 'primary_renewable_source': True},
    color_discrete_map=energy_source_color_map,
    title='Mostly used renewable energy source in European Countries (2024)',
    scope='europe',
    projection="natural earth",
    fitbounds='locations',
)

# Horizontal, untitled legend placed near the bottom of the figure.
legend_style = dict(
    orientation="h",
    yanchor="middle",
    y=0.075,
    xanchor="center",
    x=0.5,
    title_text='',
    font=dict(size=18),
)

fig_renewable_choropleth.update_layout(
    width=1000,
    height=800,
    title_x=0.5,
    title_y=0.9,
    title_font_size=24,
    margin=dict(l=10, r=10, t=50, b=10),  # little whitespace around the map
    legend=legend_style,
)
fig_renewable_choropleth.show()