# Data Cleaning and Visualization

There are 23 columns and 145460 rows in this dataset. The goal is to predict the `RainTomorrow` variable.

In [49]:
import pandas as pd
import numpy as np

# helper function to get quick row info
def getDfRowInfo(dataframe):
    print('DF Row Info:')
    print('\tTotal number of rows:', len(df.index))
    print('\tPercentage of original number of rows:', len(df.index) / 145460)
    print('\tTotal number of rows with the RainTomorrow = true:', len([x for x in df['RainTomorrow'] if x == 1]))
    print('\tPercentage of dataset that has RainTomorrow = true:', len([x for x in df['RainTomorrow'] if x == 1]) / len(df.index))

# -----------------------
# read in dataset
df = pd.read_csv('weatherAUS.csv')

# transform binary string to binary 0 or 1 values
df['RainToday'] = df['RainToday'].transform(lambda x: 0 if x == 'No' else 1)
df['RainTomorrow'] = df['RainTomorrow'].transform(lambda x: 0 if x == 'No' else 1)

getDfRowInfo(df)


# ------------------------
# print out the number of non-NA values per column
print('Total number of non-NA values per column:')
print(df.count(0))

# TODO: what to do with rows that are NA values? remove columns that are half full of NAs? replace NAs with 0 or some value?
# have two datasets to compare?

# remove columns with mostly NA values?: Sunshine, Evaporation, Cloud9am, Cloud3pm
df = df.drop(['Sunshine', 'Evaporation', 'Cloud9am', 'Cloud3pm'], axis=1)
getDfRowInfo(df)

 
# remove rows that have any NA values? - no, this removes 61% of the rows before removing the 4 columns above
# after removing the above 4 columns, it removes 22% of the rows
df = df.dropna()
getDfRowInfo(df)

# -------------------------
# TODO: what to do with location column that is string geographic data? does it matter, should we drop it?
# TODO: what to do with date column that is time series data?
# TODO: what to do with wind direction columns that are 17 enumerations of strings of cardinal directions, including NaN?
print('Unique wind direction values:', df['WindGustDir'].unique())

# -------------------------
# preview data
# df.head()
# print(df.describe())

DF Row Info:
	Total number of rows: 145460
	Percentage of original number of rows: 1.0
	Total number of rows with the RainTomorrow = true: 35144
	Percentage of dataset that has RainTomorrow = true: 0.24160593977725836
Total number of non-NA values per column:
Date             145460
Location         145460
MinTemp          143975
MaxTemp          144199
Rainfall         142199
Evaporation       82670
Sunshine          75625
WindGustDir      135134
WindGustSpeed    135197
WindDir9am       134894
WindDir3pm       141232
WindSpeed9am     143693
WindSpeed3pm     142398
Humidity9am      142806
Humidity3pm      140953
Pressure9am      130395
Pressure3pm      130432
Cloud9am          89572
Cloud3pm          86102
Temp9am          143693
Temp3pm          141851
RainToday        145460
RainTomorrow     145460
dtype: int64
DF Row Info:
	Total number of rows: 145460
	Percentage of original number of rows: 1.0
	Total number of rows with the RainTomorrow = true: 35144
	Percentage of dataset that ha

In [48]:
# write cleaned dataset to new file

df.to_csv('cleanedWeatherAUS.csv')

In [None]:
# do some initial analysis? data visualizations on the dataset content
