# Data Cleaning and Visualization

There are 23 columns and 145460 rows in this time-series dataset. The goal is to predict the `RainTomorrow` variable.

In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# helper function to get quick row info
def getDfRowInfo(dataframe):
    print('DF Row Info:')
    print('\tTotal number of rows:', len(df.index))
    print('\tPercentage of original number of rows:', len(df.index) / 145460)
    print('\tTotal number of rows with the RainTomorrow = true:', len([x for x in df['RainTomorrow'] if x == 1]))
    print('\tPercentage of dataset that has RainTomorrow = true:', len([x for x in df['RainTomorrow'] if x == 1]) / len(df.index))
    print('*********************************************************************************************')

# -----------------------
# read in dataset with date column parsed
df = pd.read_csv('weatherAUS.csv',
    parse_dates=['Date'],
    index_col='Date')

# preview data
df.head()

# print out the number of non-NA values per column
print('Total number of non-NA values per column:')
print(df.count(0))

Total number of non-NA values per column:
Location         145460
MinTemp          143975
MaxTemp          144199
Rainfall         142199
Evaporation       82670
Sunshine          75625
WindGustDir      135134
WindGustSpeed    135197
WindDir9am       134894
WindDir3pm       141232
WindSpeed9am     143693
WindSpeed3pm     142398
Humidity9am      142806
Humidity3pm      140953
Pressure9am      130395
Pressure3pm      130432
Cloud9am          89572
Cloud3pm          86102
Temp9am          143693
Temp3pm          141851
RainToday        142199
RainTomorrow     142193
dtype: int64


In [25]:
# Only use rows of df that do not have NA in Rain Tomorrow or Rain Today
df = df[df['RainToday'].notna()]
df = df[df['RainTomorrow'].notna()]

# TODO: replace NAs with median or mode over the month - Manon
# note: beware of replacing ints with floats that shouldn't be floats
# replaced numerical columns with mean and categorical columns with mode

for column in ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 
               'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 
               'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 
               'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']:
    df[column] = df[column].fillna(df[column].mean())

for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
    df[column] = df[column].fillna(df[column].mode())



# transform binary string to binary 0 or 1 values
df['RainToday'] = df['RainToday'].transform(lambda x: 0 if x == 'No' else 1)
df['RainTomorrow'] = df['RainTomorrow'].transform(lambda x: 0 if x == 'No' else 1)

getDfRowInfo(df)

# ------------------------
# TODO: replace NAs with median or mode over the month - Manon
# note: beware of replacing ints with floats that shouldn't be floats

# columns with mostly NA values?: Sunshine, Evaporation, Cloud9am, Cloud3pm
# df = df.drop(['Sunshine', 'Evaporation', 'Cloud9am', 'Cloud3pm'], axis=1)
# getDfRowInfo(df)

 
# remove rows that have any NA values? - no, this removes 61% of the rows before removing the 4 columns above
#       after removing the above 4 columns, it removes 22% of the rows
# df = df.dropna()
# getDfRowInfo(df)

# -------------------------
# TODO: should we split dataset by location?

# transform WindDir and Location columns into encoded labels
la = LabelEncoder()
l = []
for i in df.columns:
    if df.dtypes[i]=='O':
        l.append(i)
        df[i] = la.fit_transform(df[i])
print('Transformed columns to encoded labels: ', l)

# -------------------------
# preview data
df.head()
print(df.describe())

DF Row Info:
	Total number of rows: 145460
	Percentage of original number of rows: 1.0
	Total number of rows with the RainTomorrow = true: 35144
	Percentage of dataset that has RainTomorrow = true: 0.24160593977725836
*********************************************************************************************
Transformed columns to encoded labels:  ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
            Location        MinTemp        MaxTemp       Rainfall  \
count  145460.000000  143975.000000  144199.000000  142199.000000   
mean       23.793524      12.194034      23.221348       2.360918   
std        14.228687       6.398495       7.119049       8.478060   
min         0.000000      -8.500000      -4.800000       0.000000   
25%        11.000000       7.600000      17.900000       0.000000   
50%        24.000000      12.000000      22.600000       0.000000   
75%        36.000000      16.900000      28.200000       0.800000   
max        48.000000      33.900000     

In [64]:
# write cleaned dataset to new file

df.to_csv('cleanedWeatherAUS.csv')

In [16]:
# do some initial analysis? data visualizations on the dataset content
# useful syntax to slice data based on time:
df["1/1/2010":"1/2/2010"].head() # gets all data in the given date range
# min, max temp over time, etc...
# - Valeria

Unnamed: 0_level_0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2010-01-01,Albury,19.4,31.9,5.0,NNE,39.0,NW,WNW,9.0,9.0,70.0,40.0,1012.2,1008.5,23.4,30.9,1,1
2010-01-02,Albury,18.6,29.1,12.4,W,56.0,S,W,6.0,28.0,88.0,48.0,1007.8,1006.2,20.6,28.0,1,0
2010-01-01,BadgerysCreek,19.6,29.1,0.0,NNW,43.0,NNE,N,11.0,20.0,83.0,56.0,1014.2,1010.6,22.4,28.2,0,0
2010-01-02,BadgerysCreek,20.3,30.3,0.0,SE,41.0,NNE,NE,9.0,9.0,72.0,64.0,1007.9,1005.5,24.5,28.6,0,1
2010-01-01,Cobar,19.3,26.4,1.6,NE,30.0,NNE,NNE,13.0,19.0,96.0,86.0,1011.1,1008.9,21.3,23.8,1,1
