# Import Required Libraries and Modules
* **pandas**: used for data manipulation (not imported directly in this notebook; `DataCleaner` loads the data into a pandas DataFrame)
* **DataCleaner**: contains some helpful methods for data manipulation
* **sys**: used for system-specific parameters
* **os**: used for accessing file system paths


In [2]:
import sys
import os
from data_cleaning.data_cleaner import DataCleaner

In [12]:
# Use the kernel's working directory as the project root; the data paths in
# later cells are built relative to it. Normalising once here keeps the value
# stored in `project_root` identical to the entry placed on sys.path.
project_root = os.path.abspath(os.getcwd())

# Make the project root importable. The membership check keeps this cell
# idempotent, so re-running it does not stack duplicate entries on sys.path.
# NOTE(review): the DataCleaner import in the cell above executes before this
# line; presumably it only resolves because the working directory is already
# importable -- confirm, or move this setup ahead of that import.
if project_root not in sys.path:
    sys.path.append(project_root)

# Create a DataCleaner instance
This reads the raw CSV into a pandas DataFrame and previews the first 10 rows.

In [28]:
# Build the input path with os.path.join so the separators match the host OS
# (plain '/' concatenation produces mixed separators on Windows).
raw_csv_path = os.path.join(project_root, 'raw-data', 'Sea_level_2_raw.csv')

# Constructing the cleaner loads the CSV; preview(10) then shows the first rows.
dc: DataCleaner = DataCleaner(raw_csv_path)
dc.preview(10)

Unnamed: 0,Date,Location,Country,Temperature,CO2 Emissions,Sea Level Rise,Precipitation,Humidity,Wind Speed
0,2000-01-01 00:00:00.000000000,New Williamtown,Latvia,10.688986,403.118903,0.717506,13.835237,23.631256,18.492026
1,2000-01-01 20:09:43.258325832,North Rachel,South Africa,13.81443,396.663499,1.205715,40.974084,43.982946,34.2493
2,2000-01-02 16:19:26.516651665,West Williamland,French Guiana,27.323718,451.553155,-0.160783,42.697931,96.6526,34.124261
3,2000-01-03 12:29:09.774977497,South David,Vietnam,12.309581,422.404983,-0.475931,5.193341,47.467938,8.554563
4,2000-01-04 08:38:53.033303330,New Scottburgh,Moldova,13.210885,410.472999,1.135757,78.69528,61.789672,8.001164
5,2000-01-05 04:48:36.291629162,South Nathan,Saint Helena,6.229326,392.473317,1.12221,76.368331,48.973886,30.398908
6,2000-01-06 00:58:19.549954995,Port Richardfurt,Tuvalu,21.646738,387.648437,0.058471,9.650389,11.402284,15.720944
7,2000-01-06 21:08:02.808280828,Adambury,Australia,19.7308,448.180275,0.001415,93.360755,21.52635,29.993495
8,2000-01-07 17:17:46.066606660,Williamsonberg,Qatar,19.858114,379.61882,0.584881,6.218846,30.861949,37.519472
9,2000-01-08 13:27:29.324932493,North Thomas,Chad,14.121563,410.517072,-1.712224,15.351583,88.422794,47.922521


# Check for missing values

In [29]:
# Tally the missing entries per column; the bare name on the last line lets
# the notebook render whatever the call returned.
missing_per_column = dc.count_missing_values()
missing_per_column

Date              0
Location          0
Country           0
Temperature       0
CO2 Emissions     0
Sea Level Rise    0
Precipitation     0
Humidity          0
Wind Speed        0
dtype: int64

# Keep only relevant columns

In [30]:
# Discard the measurements this analysis does not use; what remains is
# Date, Country, Temperature and Sea Level Rise.
columns_to_drop = [
    'Location',
    'Precipitation',
    'CO2 Emissions',
    'Humidity',
    'Wind Speed',
]
dc.drop(columns_to_drop)

In [31]:
# Truncate every timestamp to its 'YYYY-MM' prefix so rows can later be
# aggregated per month and country.
# Caveat: this discards a lot of detail -- readings are taken at irregular
# times and in different locations -- so an alternative such as restricting
# the data to a single location may be the better strategy.
# The pattern matches the trailing '-DD HH:MM:SS' part, including optional
# fractional seconds.
regex_pattern = r'-\d{2}\s+\d{2}:\d{2}:\d{2}(?:\.\d+)?$'
dc.remove_text(text=regex_pattern, column='Date', regex=True)

# Order the rows by the shortened date, then inspect the result.
dc.sort_column('Date')
dc.preview(n=100)

Unnamed: 0,Date,Country,Temperature,Sea Level Rise
0,2000-01,Latvia,10.688986,0.717506
1,2000-01,Bahrain,10.026932,0.655938
2,2000-01,French Southern Territories,21.082149,0.236963
3,2000-01,Iceland,3.940121,0.798083
4,2000-01,Bahrain,12.942743,0.396008
...,...,...,...,...
95,2000-03,Costa Rica,13.463885,-0.734010
96,2000-03,Vietnam,11.522797,0.763616
97,2000-03,Denmark,11.347275,0.603301
98,2000-03,Svalbard & Jan Mayen Islands,14.637082,-0.630046


# Aggregate the data

In [32]:
# Collapse rows that share the same Date and Country using the 'mean'
# aggregation, then look at the first rows of the result.
group_keys = ['Date', 'Country']
dc.aggregate_data(group_keys, 'mean')
dc.preview(50)

Unnamed: 0,Date,Country,Temperature,Sea Level Rise
0,2000-01,Andorra,14.112278,-0.304872
1,2000-01,Australia,19.7308,0.001415
2,2000-01,Bahrain,11.484837,0.525973
3,2000-01,Bosnia and Herzegovina,15.556628,-1.535181
4,2000-01,Chad,14.121563,-1.712224
5,2000-01,Cocos (Keeling) Islands,13.818428,0.54177
6,2000-01,Comoros,17.937819,-0.024024
7,2000-01,Egypt,22.324533,-1.028802
8,2000-01,Equatorial Guinea,14.799344,-1.08679
9,2000-01,Ethiopia,17.120783,-0.269191


# Filter to a single country

In [33]:
# Narrow the data to one country's rows and display what is left.
country_of_interest = 'Libyan Arab Jamahiriya'
dc.filter_by_column('Country', country_of_interest)
dc.preview(100)

Unnamed: 0,Date,Country,Temperature,Sea Level Rise
18,2000-01,Libyan Arab Jamahiriya,12.72996,0.148418
253,2000-08,Libyan Arab Jamahiriya,22.390582,-1.04239
468,2001-02,Libyan Arab Jamahiriya,8.576927,-0.211266
1423,2003-07,Libyan Arab Jamahiriya,14.287083,1.04563
2065,2005-02,Libyan Arab Jamahiriya,3.295588,0.657817
2262,2005-08,Libyan Arab Jamahiriya,18.522576,0.716663
2734,2006-10,Libyan Arab Jamahiriya,29.760923,-2.54709
2933,2007-04,Libyan Arab Jamahiriya,6.72206,-1.611455
3067,2007-08,Libyan Arab Jamahiriya,6.859091,0.668781
3100,2007-09,Libyan Arab Jamahiriya,15.323936,1.063279


# Save cleaned data to new file

In [34]:
# Join the path components with os.path.join so the destination (and the
# confirmation message shown after save_data runs) uses one consistent
# separator instead of mixing '\' and '/' on Windows.
output_path = os.path.join(project_root, 'clean-data', 'processed_Sea_level_2_data.csv')
dc.save_data(output_path)

Data successfully saved to c:\Users\Ethan Crook\Documents\Computer Science 562\ML-Climate-Project/clean-data/processed_Sea_level_2_data.csv
