In [1]:
#Imports
import pandas as pd
import numpy as np
from datetime import datetime
import csv

In [2]:
#CSV import
# Relative locations of the two raw input files.
hmeasures = "raw_data/hawaii_measurements.csv"
hstations = "raw_data/hawaii_stations.csv"
# Read both files with pandas defaults; measurements first, then stations.
hmeasures_df, hstations_df = (
    pd.read_csv(csv_path) for csv_path in (hmeasures, hstations)
)

In [3]:
#Data structure overview
# Report the column labels and dtypes of each frame, with a divider line
# printed between the two reports.
overview_targets = [
    ("hstations_df column information: \n", hstations_df),
    ("hmeasures_df column information: \n", hmeasures_df),
]
for position, (heading, frame) in enumerate(overview_targets):
    if position > 0:
        print("=============================")
    print(heading)
    print(frame.columns, frame.dtypes)

hstations_df column information: 

Index(['station', 'name', 'latitude', 'longitude', 'elevation'], dtype='object') station       object
name          object
latitude     float64
longitude    float64
elevation    float64
dtype: object
hmeasures_df column information: 

Index(['station', 'date', 'prcp', 'tobs'], dtype='object') station     object
date        object
prcp       float64
tobs         int64
dtype: object


In [4]:
# Preview the first rows of the station table (head() defaults to five rows).
hstations_df.head()

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [5]:
# Preview the first rows of the measurement table (head() defaults to five rows).
hmeasures_df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [6]:
#Data inspection (NaN, missing values)
# Evaluates to True if at least one cell anywhere in the station table is null.
hstations_df.isnull().values.any()

False

In [7]:
# Evaluates to True if at least one cell anywhere in the measurement table is null.
hmeasures_df.isnull().values.any()

True

In [13]:
# Number of null cells in each column of the measurement table.
# For a single grand total across all columns, chain a second sum
# (note the call parentheses):
#     hmeasures_df.isnull().sum().sum()
hmeasures_df.isnull().sum()

station       0
date          0
prcp       1447
tobs          0
dtype: int64

In [9]:
def missing_values_table(df):
    """Summarise missing data per column of ``df``.

    Function source: https://stackoverflow.com/questions/26

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns:
        'Missing Values' (count of null cells) and
        '% of Total Values' (that count as a percentage of ``len(df)``).
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)
    # concat of two unnamed Series yields columns 0 and 1; give them labels.
    return pd.concat([null_counts, null_share], axis=1).rename(
        columns={0: 'Missing Values', 1: '% of Total Values'}
    )

In [10]:
# Per-column missing-value counts and percentages for the measurement table.
missing_values_table(hmeasures_df)

Unnamed: 0,Missing Values,% of Total Values
station,0,0.0
date,0,0.0
prcp,1447,7.401535
tobs,0,0.0


In [11]:
#Verify missing data and store said data in another dataframe
# Row-level mask: True for every row that has at least one missing value.
# any(axis=1) replaces the earlier double-transpose (.T.any().T) idiom.
missing_row_mask = hmeasures_df.isnull().any(axis=1)

# Rows with no missing values at all.
hmeasures_clean_df = hmeasures_df.dropna(how='any')
print("Row count BEFORE missing value removal: \n")
print(hmeasures_df.count())
# Count ROWS, not individual null cells: isnull().sum().sum() would overcount
# as soon as two columns are missing in the same row.
missingrowcount = str(missing_row_mask.sum())
print("=============================")
print("Rows with missing values: "+missingrowcount)
print("=============================")
print("Row count AFTER missing value removal: \n")
print(hmeasures_clean_df.count())

#Create dataframe ONLY with rows with missing data
hmeasures_missing_df = hmeasures_df[missing_row_mask]
print("=============================")
print("Rows in hmeasures_missing_df: " + str(len(hmeasures_missing_df)))

# Sanity check: the clean and missing subsets must partition the original rows.
assert len(hmeasures_clean_df) + len(hmeasures_missing_df) == len(hmeasures_df)

Row count BEFORE missing value removal: 

station    19550
date       19550
prcp       18103
tobs       19550
dtype: int64
Rows with missing values: 1447
Row count AFTER missing value removal: 

station    18103
date       18103
prcp       18103
tobs       18103
dtype: int64
Rows in hmeasures_missing_df: 1447


In [12]:
#Output clean data to CSV, then output missing data to CSV
# (frame, destination) pairs, written in this order.
csv_exports = (
    (hstations_df, "raw_data/clean_hawaii_stations.csv"),
    (hmeasures_clean_df, "raw_data/clean_hawaii_measurements.csv"),
    (hmeasures_missing_df, "raw_data/missing_hawaii_measurements.csv"),
)
for frame, out_path in csv_exports:
    # index=True keeps the DataFrame index as the first column of each file.
    frame.to_csv(out_path, index=True, header=True)