#### Australian Weather Insights Tracker 2007-2017


#### About Data

This dataset contains weather information from many of the weather stations around Australia. For most weather stations, we have about 365 observations per year for the years 2007 to 2017. More information about the dataset can be found on the Australian Bureau of Meteorology website, and below you can find a short description of the variables in the dataset.

#### 4.1 Load

In [None]:
import pandas as pd
import numpy as np
# Show every column when a frame is displayed.
pd.options.display.max_columns = None

# Read the raw observations, parsing the 'Date' column as datetimes.
df = pd.read_csv(
    "../datasets/files/weatherAUS.csv",
    parse_dates=['Date'],
)


#### 4.2 Inspect Data

In [None]:
# (rows, columns) of the raw frame. `shape` is an attribute, not a method.
df.shape

In [None]:
# Summary statistics, transposed so each variable is a row.
df.describe().transpose()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Double-check which years we have data for and how many observations each
# year has. sort_index() orders the result by year while keeping every year
# next to its count. The first result is printed explicitly because a
# notebook cell only echoes its last expression.
print(df['Date'].dt.year.value_counts().sort_index())

# Same check restricted to a single station (Sydney).
df.loc[df['Location'] == 'Sydney', 'Date'].dt.year.value_counts().sort_index()

4.3 Clean & Prepare

In [None]:
# Normalise the column names to lower case.
df.columns = list(map(str.lower, df.columns))
df.columns

4.3.1 Dealing with Missing Values

We will first get rid of rows with too many missing values, by keeping only the rows that have at least 80% of their values present (here, at least 18 non-missing values per row).

In [None]:
# Keep rows holding at least 18 non-missing values; copy so later edits
# do not touch the original frame.
dfmiss = df.dropna(axis='index', thresh=18).copy()

In [None]:
# Percentage of missing values left in each column after dropping sparse rows.
dfmiss.isna().mean() * 100

In [None]:
# Number of rows removed by the missing-value threshold. Printed explicitly
# because a notebook cell only echoes its last expression.
print(df.shape[0] - dfmiss.shape[0])
dfmiss.info()

We will iterate over the columns. If a column is of type float and has missing values, we will fill them in with the median of that column. Otherwise, if the column contains data of type object and has any missing values, we will convert the column to a category type and forward fill the missing values.

In [None]:

# Fill the remaining gaps column by column.
for col in dfmiss.columns:
    # Columns that are already complete need no work.
    if not dfmiss[col].isna().any():
        continue

    if dfmiss[col].dtype == 'float64':
        # Numeric column: impute with the column median. The result is
        # assigned back to the frame rather than relying on an in-place
        # fill of the selected column, which is not guaranteed to
        # propagate to `dfmiss`.
        dfmiss[col] = dfmiss[col].fillna(dfmiss[col].median())
    elif dfmiss[col].dtype == 'object':
        # Text column: store as a categorical and carry the previous
        # observation forward. A gap in the very first row has nothing
        # before it and therefore stays missing.
        dfmiss[col] = dfmiss[col].astype('category').ffill()

In [None]:
# Missing values per column after imputation.
dfmiss.isnull().sum()

Since weather is time series data, we will create additional date variables for visualisation 

In [None]:
# Derive calendar features from the observation date.
dfmiss['month'] = dfmiss['date'].dt.month
dfmiss['year'] = dfmiss['date'].dt.year
# ISO week number. `.dt.isocalendar().week` is the supported accessor;
# `.dt.week` is deprecated and no longer exists in pandas 2.x.
dfmiss['week'] = dfmiss['date'].dt.isocalendar().week
dfmiss['weekday'] = dfmiss['date'].dt.weekday
dfmiss['quarter'] = dfmiss['date'].dt.quarter
dfmiss['day_of_week'] = dfmiss['date'].dt.day_name()
# weekday is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend.
dfmiss['week_or_end'] = dfmiss['weekday'].apply(lambda x: 'weekend' if x >= 5 else 'week_day')
dfmiss.head()

We might want to represent the quarter variable as an object later on, so we will create a dictionary with the values we would like to change, and pass it to the pandas .map() method. This is a very useful function for mapping a function or a set of values onto a column or other data structure.

In [None]:
# Quarter number (1-4) -> readable label.
quarter_names = ['first_Q', 'second_Q', 'third_Q', 'fourth_Q']
mapping = dict(enumerate(quarter_names, start=1))

# Add the label as a new text column next to the numeric quarter.
dfmiss['qtr_cate'] = dfmiss['quarter'].map(mapping)

In [None]:
# Save the prepared dataset for later use, with a fresh 0..n-1 index.
df_ready = dfmiss.copy()
df_ready = df_ready.reset_index(drop=True)
df_ready.to_csv(path_or_buf='weather_ready.csv', index=False)

5. Visualisation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# The seaborn style sheets carry a 'seaborn-v0_8-' prefix in newer matplotlib
# releases; use whichever name this installation provides.
_white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(_white_style)

%matplotlib inline

In [None]:
# Reload the prepared dataset, parsing 'date' as datetimes.
df = pd.read_csv(
    'weather_ready.csv',
    parse_dates=['date'],
)
df.head(n=3)

The histogram below is a nice and informative plot: on first inspection it suggests that our data is roughly normally distributed (a histogram alone cannot confirm this).

In [None]:
# Histogram of the daily maximum temperature.
plt.hist(
    df['maxtemp'],
    bins=40,
    color='green',
    edgecolor='white',
)
plt.title('Maximum Temperature - 2007-2017')

In [None]:
# Overlay the maximum and minimum temperature distributions. Each series is
# labelled so the legend tells the two translucent histograms apart.
plt.hist(x=df['maxtemp'], color='red', bins=50, histtype='stepfilled', alpha=0.4,
         label='Maximum temperature')
plt.hist(x=df['mintemp'], color='cyan', bins=50, histtype='stepfilled', alpha=0.5,
         label='Minimum temperature')
plt.title('Maximum & Minimum Temperature - 2007-2017', fontdict={'fontsize': 15})
plt.xlabel('Temperature Distribution in Celsius')
plt.ylabel('Temperature Frequency in 10 Years')
plt.legend()
plt.show()

In [None]:
# Rainfall against the 3 pm temperature, one small dot per observation.
plt.scatter(x=df['rainfall'], y=df['temp3pm'], marker='.', color='red', alpha=0.7, s=1.5)
# Restrict the x axis to the 0-250 range.
plt.xlim(0, 250)
plt.title('Relationship Between Rain and Temperature at 3 pm from 2007-2017', fontdict={'fontsize': 10})
plt.xlabel('Rainfall')
# The y axis shows the temperature itself, not a frequency.
plt.ylabel('Temperature at 3 pm')


A very useful way of visualising data is by aggregating it first.

In [None]:
# Mean 9am / 3pm humidity for every year.
humidity_columns = ['humidity9am', 'humidity3pm']
hum_by_year = pd.pivot_table(df, index='year', values=humidity_columns, aggfunc='mean')
hum_by_year

In [None]:
# One figure with a single axes holding both yearly humidity series.
fig, ax = plt.subplots()

# (column, legend label, marker) for each line, drawn in this order.
for column, legend_label, marker_shape in (
    ('humidity9am', 'Humidity 9am', 'o'),
    ('humidity3pm', 'Humidity 3pm', 'v'),
):
    ax.plot(hum_by_year.index, hum_by_year[column], label=legend_label, marker=marker_shape)

ax.set_xlabel("Years")
ax.set_ylabel("Average Humidity")

# Legend built from the labels given above, then render the figure.
plt.legend()
plt.show()

Show the distributions of our 4 temperature variables.

In [None]:
# 2x2 grid of histograms sharing the y axis, one panel per temperature variable.
fig, ax = plt.subplots(2, 2, sharey=True, figsize=(10, 8))

# (grid position, column, colour, x-axis label) for each panel.
panels = [
    ((0, 0), 'mintemp', 'blue', "Min Temperature"),
    ((0, 1), 'maxtemp', 'red', "Max Temperature"),
    ((1, 0), 'temp9am', 'cyan', "Temp at 9am"),
    ((1, 1), 'temp3pm', 'green', "Temp at 3pm"),
]
for position, column, colour, x_label in panels:
    ax[position].hist(df[column], bins=55, color=colour, alpha=0.5)
    ax[position].set_xlabel(x_label)

# NOTE: no overall title is set. `ax` is a 2x2 array of axes here, so a
# figure-wide title would have to be set on `fig` rather than on `ax`.

# Only the left-hand column gets a y label.
for row in (0, 1):
    ax[row, 0].set_ylabel("Frequency")

fig.savefig('temp_plots.png')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Number of observations per wind-gust direction.
sns.countplot(data=df, x='windgustdir')
plt.xticks(rotation=45)
plt.title("Where does the wind come from most of the time?")
plt.show()

In [None]:
# Rain / no-rain day counts for Sydney in 2016.
in_sydney = df['location'] == 'Sydney'
in_2016 = df['year'] == 2016
sns.countplot(data=df[in_sydney & in_2016], x='raintoday')
plt.xlabel("Did it Rain Today?")
plt.title("Did it rain a lot in Sydney in 2016?")
plt.show()

In [None]:
# Independent copy of all 2016 observations, reused by the following cells.
df_2016 = df.loc[df['year'] == 2016].copy()
df_2016.shape  # not echoed: only a cell's last expression is displayed

# Count of 'raintomorrow' values across 2016.
sns.catplot(data=df_2016, x='raintomorrow', kind='count')
plt.xlabel("Did it Rain Tomorrow?")
plt.title("Did it rain a lot in 2016?")
plt.show()

In [None]:
# Bar plot of rainfall for each day of the week in 2016.
sns.catplot(x='day_of_week', y='rainfall', data=df_2016, kind='bar')
plt.xlabel("Day of the Week")
# The Bureau of Meteorology records daily rainfall in millimetres, not inches.
plt.ylabel("Rainfall (mm)")
plt.title("Rainfall in mm per day of the week in 2016")
plt.show()

In [None]:
# Box plot of the 2016 maximum temperature for each day of the week.
sns.catplot(data=df_2016, kind='box', x='day_of_week', y='maxtemp')
plt.xlabel("Day of the Week")
plt.ylabel("Max Temp")
plt.title("Distribution of the Max Temp in the weekdays of 2016")
plt.show()

In [None]:
# Fixed colour for each value of 'week_or_end'.
palette_colors = dict(weekend="green", week_day="blue")

# Sydney 2016 rain / no-rain counts, split into weekdays and weekends.
sydney_2016_rows = (df['location'] == 'Sydney') & (df['year'] == 2016)
sns.countplot(
    data=df[sydney_2016_rows],
    x='raintoday',
    hue='week_or_end',
    palette=palette_colors,
)
plt.title("Did it rain a lot in Sydney in 2016?")
plt.show()

In [None]:
# Switch seaborn to the white-grid theme for this and later plots.
sns.set_style('whitegrid')

# Maximum temperature against rainfall for every observation.
sns.scatterplot(data=df, x='maxtemp', y='rainfall')
plt.title("Are rainy days correlated with high temperatures?")
plt.show()

In [None]:
# Morning against afternoon humidity on a random 5% sample of the rows,
# coloured by 'raintomorrow'. The y axis uses the 3pm humidity so that the
# data matches the axis label and the question in the title.
sns.scatterplot(x='humidity9am', y='humidity3pm', data=df.sample(frac=.05), hue='raintomorrow')
plt.xlabel("Humidity at 9am")
plt.ylabel("Humidity at 3pm")
plt.title("If it is humid in the morning will it be humid in the afternoon?")
plt.show()

In [None]:

# Rainfall vs 9am wind speed in 2016, one column of panels per 'week_or_end' value.
rain_wind_options = dict(
    x='rainfall',
    y='windspeed9am',
    data=df_2016,
    kind='scatter',
    col='week_or_end',
)
sns.relplot(**rain_wind_options)
plt.show()

In [None]:
# Same scatter, faceted into columns by 'week_or_end' and rows by 'raintomorrow'.
sns.relplot(
    data=df_2016,
    kind='scatter',
    x='rainfall', y='windspeed9am',
    col='week_or_end', row='raintomorrow',
)
plt.show()

In [None]:
# Keep only the 2016 rows whose rainfall is below 60.
df_2016_no_outliers = df_2016.loc[df_2016['rainfall'] < 60]

# Scatter a random 20% sample; 'weekday' drives both colour and marker size.
sampled_rows = df_2016_no_outliers.sample(frac=0.2)
sns.relplot(
    data=sampled_rows,
    kind='scatter',
    x='rainfall',
    y='maxtemp',
    hue='weekday',
    size='weekday',
)
plt.show()

In [None]:

# Line plot of 2016 rainfall by month.
sns.relplot(x='month', y='rainfall', data=df_2016, kind='line')
plt.xlabel('Months of 2016')
# The Bureau of Meteorology records daily rainfall in millimetres, not inches.
plt.ylabel('Rain in mm')
plt.title("Precipitation Monthly Trend in 2016")
plt.show()

In [None]:

# Monthly 2016 rainfall with separate lines for weekdays and weekends.
# ci=None is the documented way to switch the confidence band off.
sns.relplot(x='month', y='rainfall', data=df_2016, kind='line',
            style='week_or_end', hue='week_or_end', markers=True, ci=None)
plt.xlabel('Months of 2016')
# The Bureau of Meteorology records daily rainfall in millimetres, not inches.
plt.ylabel('Rain in mm')
plt.title("Precipitation Monthly Trend by Week or W-End in 2016")
# Save before show() so the written file contains the rendered figure.
plt.savefig("more_trends", dpi=350)
plt.show()

In [None]:
from bokeh.io import output_file, show, output_notebook
from bokeh.plotting import figure
import matplotlib.pyplot as plt
import pandas as pd
output_notebook()

%matplotlib inline
# Create the figure first; glyph methods such as circle() are called on it.
plot = figure(x_axis_label='rainfall', y_axis_label='mintemp')
plot.circle(df['rainfall'], df['mintemp'])
# Register the HTML output file, then render the figure.
output_file('circle.html')
show(plot)

In [None]:
# Create the figure that `p` refers to before drawing on it.
p = figure(x_axis_label='rainfall', y_axis_label='humidity9am')
p.circle(x=df['rainfall'], y=df['humidity9am'], size=4)
show(p)

In [None]:
# Color mapping
from bokeh.models import CategoricalColorMapper

# Maps each category of a variable to a specific colour.
mapper = CategoricalColorMapper(
    factors=['first_Q', 'second_Q', 'third_Q', 'fourth_Q'],
    palette=['bisque', 'rosybrown', 'chocolate', 'maroon']
)

# Axis labels can be passed straight to figure().
plot5 = figure(x_axis_label='rainfall',
               y_axis_label='mintemp'
               )

plot5.circle('rainfall', 'mintemp',
             size=7, source=df,
             # Colour is a dict: the column to read ('field') plus the
             # mapper that turns its values into colours ('transform').
             color={'field': 'qtr_cate',
                    'transform': mapper},
             # Build the legend from the values of the 'qtr_cate' column;
             # `legend_field` is the current name for this option.
             legend_field='qtr_cate'
             )

# Move the legend to a convenient spot.
plot5.legend.location = 'top_right'

# Render the figure.
show(plot5)
