In [48]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import matplotlib.pyplot as plt
import seaborn as sns


In [49]:
data = pd.read_csv('raw_data/VMcn4Vml.csv')

# Display the first few rows of the dataset
print(data.head())

  IndicatorCode                                          Indicator ValueType  \
0       SDGPM25  Concentrations of fine particulate matter (PM2.5)      text   
1       SDGPM25  Concentrations of fine particulate matter (PM2.5)      text   
2       SDGPM25  Concentrations of fine particulate matter (PM2.5)      text   
3       SDGPM25  Concentrations of fine particulate matter (PM2.5)      text   
4       SDGPM25  Concentrations of fine particulate matter (PM2.5)      text   

  ParentLocationCode ParentLocation Location type SpatialDimValueCode  \
0                AFR         Africa       Country                 KEN   
1                AMR       Americas       Country                 TTO   
2                EUR         Europe       Country                 GBR   
3                AMR       Americas       Country                 GRD   
4                AMR       Americas       Country                 BRA   

                                            Location Period type  Period  ...  \

In [50]:
# Filter the data for 2010 and 2019
data_filtered = data[data['Period'].isin([2010, 2019])]

# Select relevant columns: Location, Period, and PM2.5 (FactValueNumericLow and FactValueNumericHigh)
data_filtered = data_filtered[['Location', 'Period', 'FactValueNumericLow', 'FactValueNumericHigh']]

# Compute the average PM2.5 from the low and high values
data_filtered['PM2.5_Avg'] = data_filtered[['FactValueNumericLow', 'FactValueNumericHigh']].mean(axis=1)


# Remove duplicates by averaging the PM2.5 values for each location and year
data_filtered = data_filtered.groupby(['Location', 'Period'], as_index=False).agg({
    'FactValueNumericLow': 'mean',
    'FactValueNumericHigh': 'mean'
})

# Recompute the average PM2.5 after handling duplicates
data_filtered['PM2.5_Avg'] = data_filtered[['FactValueNumericLow', 'FactValueNumericHigh']].mean(axis=1)

# Pivot the data again
data_pivot = data_filtered.pivot(index='Location', columns='Period', values='PM2.5_Avg').reset_index()

# Calculate the average change in PM2.5 from 2010 to 2019
data_pivot['PM2.5_Change_2010_2019'] = data_pivot[2019] - data_pivot[2010]

# Keep only relevant columns
result = data_pivot[['Location', 2019, 'PM2.5_Change_2010_2019']]

# Rename columns for clarity
result.columns = ['Location', 'PM2.5_Avg_2019', 'PM2.5_Change_2010_2019']

# Display the cleaned result
print(result.head())

      Location  PM2.5_Avg_2019  PM2.5_Change_2010_2019
0  Afghanistan         71.2370                -8.45300
1      Albania         16.5160                -5.75000
2      Algeria         24.8720                 1.67400
3      Andorra          8.7375                -2.93625
4       Angola         54.6220                13.43100


In [51]:
scaler = MinMaxScaler()

columns_to_scale: list[str] = ['PM2.5_Avg_2019', 'PM2.5_Change_2010_2019']


result[columns_to_scale] = scaler.fit_transform(result[columns_to_scale])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result[columns_to_scale] = scaler.fit_transform(result[columns_to_scale])


In [52]:
# Save the preprocessed data to a new CSV file
result.to_csv('preprocessed_data/preprocessed_pm25_data.csv', index=False)
