In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# read data from csv file
df = pd.read_csv('datasets/weatherAUS.csv', index_col='Date')

In [3]:
# drop all of rows that have NaN values in labels (RainTomorrow)
df.dropna(subset = ['RainTomorrow'], inplace=True)

# get the label and data for input
labels = df[['RainTomorrow']].copy()
df = df.iloc[:, :-1]

In [4]:
# replace values of label from Yes/No to 1/0
numeric_labels_dir = {"No":0, "Yes":1}
labels.replace(numeric_labels_dir, inplace=True)
labels.head()

Unnamed: 0_level_0,RainTomorrow
Date,Unnamed: 1_level_1
2008-12-01,0
2008-12-02,0
2008-12-03,0
2008-12-04,0
2008-12-05,0


In [5]:
# use One Hot Encodeing for nominal features
num_df = pd.get_dummies(df, dtype='float64', dummy_na=True)

# clear all of columns that have only 1 instance
for col in num_df.columns:
    if len(num_df[col].unique()) == 1:
        num_df.drop(col,inplace=True,axis=1)
        
# create Label column for dataset to group by
num_df['Label'] = labels.RainTomorrow.copy()

#fill NaN values with mean values for each group of label
filled_df = num_df.groupby(["Label"]).transform(lambda x: x.fillna(x.mean()))
filled_df.head()

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,WindDir3pm_nan,RainToday_No,RainToday_Yes,RainToday_nan
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-12-01,13.4,22.9,0.6,5.734807,8.546358,44.0,20.0,24.0,71.0,22.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2008-12-02,7.4,25.1,0.0,5.734807,8.546358,44.0,4.0,22.0,44.0,25.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2008-12-03,12.9,25.7,0.0,5.734807,8.546358,46.0,19.0,26.0,38.0,30.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2008-12-04,9.2,28.0,0.0,5.734807,8.546358,24.0,11.0,9.0,45.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2008-12-05,17.5,32.3,1.0,5.734807,8.546358,41.0,7.0,20.0,82.0,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [6]:
# use MinMaxScaler to scale the data to [1, 0] range
minmax_scaler = preprocessing.MinMaxScaler()
scaled_arr = minmax_scaler.fit_transform(filled_df)

In [7]:
scaled_df = pd.DataFrame(scaled_arr)
scaled_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,110,111,112,113,114,115,116,117,118,119
0,0.516509,0.523629,0.001617,0.03955,0.589404,0.294574,0.153846,0.275862,0.71,0.22,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.375,0.565217,0.0,0.03955,0.589404,0.294574,0.030769,0.252874,0.44,0.25,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.504717,0.57656,0.0,0.03955,0.589404,0.310078,0.146154,0.298851,0.38,0.3,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.417453,0.620038,0.0,0.03955,0.589404,0.139535,0.084615,0.103448,0.45,0.16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.613208,0.701323,0.002695,0.03955,0.589404,0.271318,0.053846,0.229885,0.82,0.33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
# split the dataset to train set and test set with 33 percent test set
X_train, X_test, y_train, y_test = train_test_split(scaled_df, np.array(labels).ravel(), shuffle=True, test_size=0.33, random_state=42)

In [9]:
# create KNN Classifier with k = 7, distance weights and fit the train set
knn_cf = KNeighborsClassifier(n_neighbors=7, weights='distance', n_jobs=2)
knn_cf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=2, n_neighbors=7, p=2,
           weights='distance')

In [10]:
# use KNN model to predict the test set and calculate the accuracy
y_pred = knn_cf.predict(X_test)
print("Accuracy: {0}".format(accuracy_score(y_test, y_pred)))

Accuracy: 0.8586863864973148


In [11]:
# change k to 21
knn_cf.n_neighbors = 21
y_pred = knn_cf.predict(X_test)
print("Accuracy: {0}".format(accuracy_score(y_test, y_pred)))

Accuracy: 0.8586650754411389


In [12]:
# change the number of test set to 20 percent
X_train, X_test, y_train, y_test = train_test_split(scaled_df, np.array(labels).ravel(), shuffle=True, test_size=0.2, random_state=42)
knn_cf.fit(X_train, y_train)
y_pred = knn_cf.predict(X_test)
print("Accuracy: {0}".format(accuracy_score(y_test, y_pred)))

Accuracy: 0.8581876999894511
