In [1]:
import pandas as pd
import numpy as np

In [71]:
# Read the raw weather observations from the CSV in the working directory,
# report the frame's (rows, columns) shape and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='weatherAUS.csv')
print('Current data size: ', data.shape)
data.head(n=5)

Current data size:  (142193, 24)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


# Data preprocessing

In [72]:
# Count the missing values in each column and list the most incomplete
# columns first.
missing_counts = data.isnull().sum()
missing_counts.sort_values(ascending=False)

Sunshine         67816
Evaporation      60843
Cloud3pm         57094
Cloud9am         53657
Pressure9am      14014
Pressure3pm      13981
WindDir9am       10013
WindGustDir       9330
WindGustSpeed     9270
WindDir3pm        3778
Humidity3pm       3610
Temp3pm           2726
WindSpeed3pm      2630
Humidity9am       1774
RainToday         1406
Rainfall          1406
WindSpeed9am      1348
Temp9am            904
MinTemp            637
MaxTemp            322
Location             0
RainTomorrow         0
RISK_MM              0
Date                 0
dtype: int64

In [73]:
# Columns removed before modelling. The first four are the columns with the
# most missing values in the count shown above.
# NOTE(review): the reason for dropping Location, RISK_MM and Date is not
# recorded in this notebook (RISK_MM presumably leaks the target) - confirm.
columns_to_drop = [
    'Sunshine',
    'Evaporation',
    'Cloud3pm',
    'Cloud9am',
    'Location',
    'RISK_MM',
    'Date',
]
data = data.drop(columns=columns_to_drop)
print('Current data size: ', data.shape)

Current data size:  (142193, 17)


In [74]:
# Keep only the rows that have a value in every remaining column.
data = data.dropna(axis='index', how='any')
print('Current data size: ', data.shape)

Current data size:  (112925, 17)


In [75]:
# Encode the two Yes/No rain flags as integers (No -> 0, Yes -> 1).
#
# The encoded column is assigned back to the frame instead of calling
# `data[col].replace(..., inplace=True)`: that chained in-place call operates
# on an intermediate Series and, with pandas Copy-on-Write enabled, leaves
# `data` itself unchanged.
yes_no_to_int = {'No': 0, 'Yes': 1}

for flag_column in ['RainToday', 'RainTomorrow']:
    # Values that are not keys of the mapping (for example 0/1 when the cell
    # is run a second time) are passed through unchanged, as `replace` did.
    data[flag_column] = data[flag_column].map(
        lambda value: yes_no_to_int.get(value, value)
    )

In [76]:
# One-hot encode the three wind-direction columns: each of them is replaced
# by one indicator column per distinct value, named `<column>_<value>`.
categorical_columns = [
    'WindGustDir',
    'WindDir3pm',
    'WindDir9am',
]
data = pd.get_dummies(data=data, columns=categorical_columns)
print('Current data size: ', data.shape)
data.head(n=5)

Current data size:  (112925, 62)


Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindDir9am_NNW,WindDir9am_NW,WindDir9am_S,WindDir9am_SE,WindDir9am_SSE,WindDir9am_SSW,WindDir9am_SW,WindDir9am_W,WindDir9am_WNW,WindDir9am_WSW
0,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,...,0,0,0,0,0,0,0,1,0,0
1,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,...,1,0,0,0,0,0,0,0,0,0
2,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,...,0,0,0,0,0,0,0,1,0,0
3,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,...,0,0,0,1,0,0,0,0,0,0
4,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
from scipy import stats

# Continuous measurements screened for outliers. They are selected by name
# rather than by position (previously `iloc[:, :12]`), so the filter does not
# silently depend on the column order left by the earlier cells. The binary
# rain flags and the one-hot wind-direction columns are not screened.
continuous_columns = [
    'MinTemp',
    'MaxTemp',
    'Rainfall',
    'WindGustSpeed',
    'WindSpeed9am',
    'WindSpeed3pm',
    'Humidity9am',
    'Humidity3pm',
    'Pressure9am',
    'Pressure3pm',
    'Temp9am',
    'Temp3pm',
]

# A row is kept only if every screened value lies strictly within this many
# standard deviations of its column mean.
Z_SCORE_LIMIT = 3

data_z = data[continuous_columns]
z = np.abs(stats.zscore(data_z))
data = data[(z < Z_SCORE_LIMIT).all(axis=1)]
print('Final data size: ', data.shape)

Final data size:  (107868, 62)


In [78]:
from sklearn import preprocessing

# Rescale every column with MinMaxScaler (default output range [0, 1]) and
# rebuild a DataFrame with the original index and column labels.
#
# The frame mixes integer and float columns, so it is cast to float64
# explicitly; this avoids the implicit dtype conversion (and the warning it
# triggers in some scikit-learn versions) inside the scaler.
#
# NOTE(review): the scaler is fitted on the whole frame, which still contains
# the RainTomorrow column. If a train/test split is added later, consider
# fitting on the training rows only - confirm the intended workflow.
scaler = preprocessing.MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data.astype('float64')),
                    index=data.index,
                    columns=data.columns)
data.head()

  return self.partial_fit(X, y)


Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindDir9am_NNW,WindDir9am_NW,WindDir9am_S,WindDir9am_SE,WindDir9am_SSE,WindDir9am_SSW,WindDir9am_SW,WindDir9am_W,WindDir9am_WNW,WindDir9am_WSW
0,0.518717,0.464198,0.021429,0.506849,0.486486,0.52381,0.674157,0.22,0.268409,0.309353,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.358289,0.518519,0.0,0.506849,0.054054,0.47619,0.370787,0.25,0.337292,0.326139,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.505348,0.533333,0.0,0.534247,0.459459,0.571429,0.303371,0.3,0.266033,0.347722,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.406417,0.590123,0.0,0.232877,0.243243,0.166667,0.382022,0.16,0.503563,0.446043,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.628342,0.696296,0.035714,0.465753,0.135135,0.428571,0.797753,0.33,0.342043,0.282974,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature selection

In [79]:
# TODO: feature selection is not implemented yet.