In [1]:
#Dependencies
import pandas as pd
import numpy as np
import os

## Data Cleaning
------
- Import the CSV files and inspect them as pandas DataFrames.
- Determine how to handle any NaN values in `prcp` and `tobs`.

In [2]:
# Load the raw measurement and station CSV files as two pandas DataFrames.
hawaii_measurements_df, hawaii_stations_df = map(
    pd.read_csv,
    ("Resources/hawaii_measurements.csv", "Resources/hawaii_stations.csv"),
)

In [3]:
# Preview the first five rows of the measurements table.
hawaii_measurements_df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# Display the full stations table.
hawaii_stations_df

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4


In [None]:
# Print the column dtypes and non-null counts of the measurements table.
hawaii_measurements_df.info()

In [5]:
# Count the missing (null/NaN) entries in every column of the measurements table.
hawaii_measurements_df.isna().sum()

station       0
date          0
prcp       1447
tobs          0
dtype: int64

In [6]:
# Count the missing (null/NaN) entries in every column of the stations table.
hawaii_stations_df.isna().sum()

station      0
name         0
latitude     0
longitude    0
elevation    0
dtype: int64

In [7]:
# Baseline for assessing the effect of NaN: mean `prcp` and `tobs` per weather
# station, computed on all rows (rows with a NaN `prcp` are still included, so
# their `tobs` values count towards the `tobs` mean).
station_avgs = hawaii_measurements_df.pivot_table(
    index=['station'],
    values=['prcp', 'tobs'],
    aggfunc='mean',
)
station_avgs

Unnamed: 0_level_0,prcp,tobs
station,Unnamed: 1_level_1,Unnamed: 2_level_1
USC00511918,0.047971,71.615968
USC00513117,0.141921,72.689184
USC00514830,0.121058,74.873297
USC00516128,0.429988,70.915008
USC00517948,0.063602,74.684402
USC00518838,0.207222,72.72407
USC00519281,0.212352,71.663781
USC00519397,0.04902,74.553231
USC00519523,0.114961,74.543649


In [8]:
# `prcp` has many NaN values. Dropping those rows also discards their `tobs`
# and `date` values, so first build a copy with every NaN-containing row removed
# and measure how much that changes the per-station averages.
hawaii_measurements_noNaN = hawaii_measurements_df.dropna()

# Mean `prcp` and `tobs` per weather station on the NaN-free rows only.
station_avgs_noNaN = hawaii_measurements_noNaN.pivot_table(
    index=['station'],
    values=['prcp', 'tobs'],
    aggfunc='mean',
)
station_avgs_noNaN

Unnamed: 0_level_0,prcp,tobs
station,Unnamed: 1_level_1,Unnamed: 2_level_1
USC00511918,0.047971,71.527433
USC00513117,0.141921,72.678042
USC00514830,0.121058,74.813113
USC00516128,0.429988,70.865137
USC00517948,0.063602,74.587116
USC00518838,0.207222,72.675439
USC00519281,0.212352,71.663781
USC00519397,0.04902,74.564246
USC00519523,0.114961,74.532659


### Let's look at the difference in 'tobs' when I drop the rows with NaN 'prcp'.
----
- Subtract the per-station `tobs` averages of the two DataFrames.
- Get the percentage change caused by dropping the NaN rows.

In [9]:
# Per-station change in mean `tobs`: average over all rows minus average over
# the NaN-free rows.
diff = station_avgs['tobs'].sub(station_avgs_noNaN['tobs'])
diff

station
USC00511918    0.088535
USC00513117    0.011143
USC00514830    0.060184
USC00516128    0.049871
USC00517948    0.097287
USC00518838    0.048632
USC00519281    0.000000
USC00519397   -0.011015
USC00519523    0.010990
Name: tobs, dtype: float64

In [10]:
# Express `diff` as a percentage of the all-rows mean `tobs` for each station.
percent_diff = diff.div(station_avgs['tobs']).mul(100)
percent_diff

station
USC00511918    0.123625
USC00513117    0.015329
USC00514830    0.080381
USC00516128    0.070325
USC00517948    0.130264
USC00518838    0.066872
USC00519281    0.000000
USC00519397   -0.014775
USC00519523    0.014743
Name: tobs, dtype: float64

### So, dropping the NaN rows makes little difference to the average 'tobs' (under 0.15% for every station), but to preserve the 'tobs' data I will set every NaN 'prcp' to 0.

In [11]:
# Fill only the missing `prcp` readings with 0. The other columns are left
# untouched so that an unexpected gap in `station`, `date` or `tobs` is not
# silently turned into 0 and still shows up in the check below.
hawaii_measurements_df = hawaii_measurements_df.fillna({'prcp': 0})

# Confirm that no null/NaN values remain in any column.
hawaii_measurements_df.isnull().sum()

station    0
date       0
prcp       0
tobs       0
dtype: int64

In [12]:
# Print the row count, column dtypes and non-null counts of the measurements
# table after the NaN fill.
hawaii_measurements_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
station    19550 non-null object
date       19550 non-null object
prcp       19550 non-null float64
tobs       19550 non-null int64
dtypes: float64(1), int64(1), object(2)
memory usage: 611.0+ KB


In [13]:
# Save each cleaned df to csv file (without the index column).
# Create the output folder first: to_csv does not create missing directories
# and would fail on a fresh checkout where `data/` does not exist yet.
os.makedirs('data', exist_ok=True)
hawaii_measurements_df.to_csv('data/clean_hawaii_measurements.csv', encoding='utf-8', index=False)
hawaii_stations_df.to_csv('data/clean_hawaii_stations.csv', encoding='utf-8', index=False)