# Import Required Libraries and Modules
* **pandas**: used for data manipulation  
* **DataCleaner**: contains some helpful methods for data manipulation
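* **sys**: used to add the project root to the module search path (`sys.path`) so that `data_cleaning` can be imported
* **os**: used to resolve the project root to an absolute file system path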


In [11]:
import sys
import os

# Root of the project checkout. Defaults to the original author's local path;
# set the ML_CLIMATE_PROJECT_ROOT environment variable to run this notebook on
# another machine without editing the cell. Kept as a plain str because later
# cells build file paths by string concatenation.
project_root = os.environ.get("ML_CLIMATE_PROJECT_ROOT", "/Users/tyler/ML-Climate-Project")

# Make the project's packages (e.g. data_cleaning) importable. The membership
# check keeps re-runs of this cell from appending duplicate sys.path entries.
_project_root_abs = os.path.abspath(project_root)
if _project_root_abs not in sys.path:
    sys.path.append(_project_root_abs)

import pandas as pd
from data_cleaning.data_cleaner import DataCleaner

# Create a DataCleaner instance
This reads the data into a pandas DataFrame.

In [12]:
# Location of the raw sea-level dataset inside the project checkout.
raw_csv_path = project_root + '/raw-data/Sea_level_1_raw.csv'

# Build the cleaner for that file, then preview 10 rows as a sanity check.
dc: DataCleaner = DataCleaner(raw_csv_path)
dc.preview(n=10)

Unnamed: 0,Year,Country,Avg Temperature (°C),CO2 Emissions (Tons/Capita),Sea Level Rise (mm),Rainfall (mm),Population,Renewable Energy (%),Extreme Weather Events,Forest Area (%)
0,2006,UK,8.9,9.3,3.1,1441,530911230,20.4,14,59.8
1,2019,USA,31.0,4.8,4.2,2407,107364344,49.2,8,31.0
2,2014,France,33.9,2.8,2.2,1241,441101758,33.3,9,35.5
3,2010,Argentina,5.9,1.8,3.2,1892,1069669579,23.7,7,17.7
4,2007,Germany,26.9,5.6,2.4,1743,124079175,12.5,4,17.4
5,2020,China,32.3,1.4,2.7,2100,1202028857,49.4,12,47.2
6,2006,Argentina,30.7,11.6,3.9,1755,586706107,41.9,10,50.5
7,2018,South Africa,33.9,6.0,4.5,827,83947380,17.7,1,56.6
8,2022,UK,27.8,16.6,1.5,1966,980305187,8.2,4,43.4
9,2010,Australia,18.3,1.9,3.5,2599,849496137,7.5,5,48.7


# Check for missing values

In [13]:
# Report the number of missing values per column; as the last expression in
# the cell, the result is rendered as the cell output.
dc.count_missing_values()

Year                           0
Country                        0
Avg Temperature (°C)           0
CO2 Emissions (Tons/Capita)    0
Sea Level Rise (mm)            0
Rainfall (mm)                  0
Population                     0
Renewable Energy (%)           0
Extreme Weather Events         0
Forest Area (%)                0
dtype: int64

# Keep only relevant columns
The columns `CO2 Emissions (Tons/Capita)`, `Population`, `Renewable Energy (%)`, `Extreme Weather Events`, and `Forest Area (%)` aren't needed for the regression, so they are dropped. The remaining data is then sorted by `Year`.

In [14]:
# Columns that are not used as regression inputs.
columns_to_drop = [
    'CO2 Emissions (Tons/Capita)',
    'Population',
    'Renewable Energy (%)',
    'Extreme Weather Events',
    'Forest Area (%)',
]

# Remove those columns, order the rows by year, and preview the result.
dc.drop(columns_to_drop)
dc.sort_column('Year')
dc.preview(n=100)

Unnamed: 0,Year,Country,Avg Temperature (°C),Sea Level Rise (mm),Rainfall (mm)
0,2000,UK,21.8,2.2,1273
1,2000,India,20.9,2.6,1100
2,2000,Canada,22.8,2.5,720
3,2000,Canada,16.5,2.2,2279
4,2000,UK,22.9,4.7,1255
...,...,...,...,...,...
95,2002,Russia,27.2,4.8,1729
96,2002,Canada,21.7,3.0,1831
97,2002,Russia,23.1,4.7,1484
98,2002,France,9.4,3.7,2202


# Aggregate the data
Group the data by Year and Country and calculate the mean for each group.

In [15]:
# Keys that define one group: a single country in a single year.
group_keys = ['Year', 'Country']

# Collapse each group to its mean, then preview the aggregated data.
dc.aggregate_data(group_keys, 'mean')
dc.preview(n=50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Avg Temperature (°C),Sea Level Rise (mm),Rainfall (mm)
Year,Country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000,Argentina,16.9,4.0,2047.0
2000,Australia,11.933333,2.266667,2033.333333
2000,Brazil,31.2,3.7,803.0
2000,Canada,19.3,2.65,1383.0
2000,China,26.2,2.2,1849.0
2000,France,16.6,2.8,1819.666667
2000,Germany,9.75,1.45,2641.0
2000,India,21.25,3.25,1124.5
2000,Indonesia,23.585714,3.242857,1781.428571
2000,Mexico,16.9,1.2,1974.5


# Save cleaned data to new file

In [None]:
# Destination for the cleaned dataset, under the project's clean-data directory.
output_path = project_root + '/clean-data/processed_Sea_level_1_data.csv'
# Write the cleaned data to output_path.
dc.save_data(output_path)

Data successfully saved to /Users/tyler/ML-Climate-Project/clean-data/processed_Sea_level_1_data.csv
