In [None]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from sklearn.metrics import mean_squared_error

# Load the labelled Beijing and Shenyang data and stack them into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# like append with its defaults, it keeps each frame's original row labels.
bj_df = pd.read_csv('Beijing_labeled.csv')
sy_df = pd.read_csv('Shenyang_labeled.csv')
merge = pd.concat([bj_df, sy_df])

#shuffle the data
#merge = merge.sample(frac=1).reset_index(drop=True)

# Drop any rows that contain missing values
merge = merge.dropna()
# Check whether the data set is balanced: sum of PM_HIGH divided by the row count
# (presumably PM_HIGH is a 0/1 label, making this the positive-class share -- confirm)
print(merge['PM_HIGH'].sum()/merge.shape[0])
merge.head()

In [None]:
# Standardize the raw feature columns with scikit-learn's StandardScaler
# NOTE(review): the scaler is fitted on the full merged frame before the
# train/validation split below, so validation rows influence the scaling -- confirm this is intended.
from sklearn.preprocessing import StandardScaler
features_normalized = [
    'season', 'DEWP', 'HUMI', 'PRES', 'TEMP', 'Iws', 'precipitation',
    'cbwd_NE', 'cbwd_NW', 'cbwd_SE',
]
merge[features_normalized] = StandardScaler().fit_transform(
    merge[features_normalized]
)
merge.head()

In [None]:
# Split the merged data into training (75%) and validation (25%) sets;
# the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
features_all = [
    'season', 'DEWP', 'HUMI', 'PRES', 'TEMP', 'Iws', 'precipitation',
    'cbwd_NE', 'cbwd_NW', 'cbwd_SE',
]
# Alternative feature set without the wind-direction columns:
#features_all = ['season', 'DEWP', 'HUMI', 'PRES', 'TEMP', 'Iws', 'precipitation']
x_train, x_test, y_train, y_test = train_test_split(
    merge[features_all],
    merge['PM_HIGH'],
    test_size=0.25,
    random_state=0,
)

In [None]:
from math import sqrt
class knn_classifier(object):
    """Brute-force k-nearest-neighbours classifier (Euclidean distance, majority vote).

    The classifier keeps no state: every method receives the training data
    explicitly. Feature inputs may be pandas objects or anything that
    ``numpy.asarray`` can turn into a numeric array; values are always read
    by position, never by label.
    """

    def __init__(self):
        pass

    def get_distance(self, row1, row2):
        """Return the Euclidean distance between two feature vectors.

        Only the first ``len(row1)`` values of ``row2`` are compared.

        Raises:
            IndexError: if ``row2`` holds fewer values than ``row1``.
        """
        a = np.asarray(row1, dtype=float).ravel()
        b = np.asarray(row2, dtype=float).ravel()
        if b.size < a.size:
            # Guard explicitly: NumPy would otherwise broadcast a length-1 row2.
            raise IndexError("row2 has fewer values than row1")
        diff = a - b[:a.size]
        return sqrt(float(np.dot(diff, diff)))

    def get_neighbors(self, x_train, y_train, x_test_row, k_max):
        """Return the labels of the ``k_max`` training rows closest to ``x_test_row``.

        Args:
            x_train: 2-D training features (rows are samples).
            y_train: training labels, aligned by position with ``x_train``.
            x_test_row: one query feature vector.
            k_max: number of neighbours wanted. Values larger than the number
                of training rows are clamped to it; values below 1 give ``[]``.

        Returns:
            A list of labels ordered nearest first. Equal distances keep the
            training-row order (stable sort).

        Raises:
            IndexError: if ``x_train`` has fewer columns than ``x_test_row``.
        """
        train = np.asarray(x_train, dtype=float)
        labels = np.asarray(y_train)
        query = np.asarray(x_test_row, dtype=float).ravel()
        if train.shape[1] < query.size:
            raise IndexError("x_train has fewer columns than x_test_row")
        # One vectorised pass over all training rows instead of a Python loop;
        # only the first len(x_test_row) columns are compared.
        diffs = train[:, :query.size] - query
        distances = np.sqrt((diffs ** 2).sum(axis=1))
        order = np.argsort(distances, kind='stable')
        k = max(0, min(k_max, order.shape[0]))
        return [labels[i] for i in order[:k]]

    def prediction_single(self, x_train, y_train, x_test_row, k_num):
        """Predict the label of one query row by majority vote of its ``k_num`` nearest neighbours.

        Ties are resolved in favour of the tied label whose first occurrence
        is closest to the query, so the result does not depend on set ordering.

        Raises:
            ValueError: if there are no neighbours to vote with.
        """
        neighbors = self.get_neighbors(x_train, y_train, x_test_row, k_num)
        if not neighbors:
            raise ValueError("no neighbours to vote with (k_num < 1 or empty training set)")
        # dict.fromkeys de-duplicates while preserving nearest-first order;
        # max() returns the first candidate among equally frequent labels.
        return max(dict.fromkeys(neighbors), key=neighbors.count)

    def prediction_all(self, x_train, y_train, x_test, k_num):
        """Predict a label for every row of ``x_test``.

        Returns:
            A list of ``(row_name, predicted_label)`` tuples in ``x_test`` row
            order. ``row_name`` is the row's index label when ``x_test`` has an
            ``index`` attribute, otherwise its position.
        """
        # Convert once up front so the per-row work is pure NumPy.
        train = np.asarray(x_train, dtype=float)
        labels = np.asarray(y_train)
        queries = np.asarray(x_test, dtype=float)
        names = getattr(x_test, 'index', range(queries.shape[0]))
        return [
            (names[i], self.prediction_single(train, labels, queries[i], k_num))
            for i in range(queries.shape[0])
        ]

    def model_accracy(self, y_test, predictions):
        """Return the fraction of ``predictions`` that match ``y_test``.

        Args:
            y_test: true labels, compared by position.
            predictions: list of ``(row_name, predicted_label)`` tuples as
                produced by ``prediction_all``.

        Returns:
            Accuracy in ``[0.0, 1.0]``; ``0.0`` when nothing matches.

        Raises:
            ValueError: if ``y_test`` is empty.
        """
        truth = np.asarray(y_test)
        total = len(truth)
        if total == 0:
            raise ValueError("cannot compute accuracy on an empty y_test")
        correct = sum(1 for i in range(total) if truth[i] == predictions[i][1])
        return correct / total

    # Correctly spelled alias; the original (misspelled) name is kept for existing callers.
    model_accuracy = model_accracy

In [None]:
# Try every odd k from 9 to 29 on the validation split and record
# (k, accuracy) pairs so the best-scoring k can be picked afterwards.
score = []
for k in range(9, 31, 2):
    model = knn_classifier()
    predictions = model.prediction_all(x_train, y_train, x_test, k_num=k)
    acc = model.model_accracy(y_test, predictions)
    score.append((k, acc))
    print(k, acc)

In [None]:
# Order the (k, accuracy) pairs by ascending accuracy, so the best k ends up last.
# Per the original author's run, k = 29 scored highest and is used for the evaluations below.
score.sort(key=lambda pair: pair[1])
score

In [None]:
# Evaluate the model using 'Shanghai_labeled.csv'
# Data preprocessing: load, drop rows with missing values, standardize the features
test_df = pd.read_csv('Shanghai_labeled.csv').dropna()
# NOTE(review): a fresh scaler is fitted on this city's own data instead of
# reusing the one fitted on the training data -- confirm this is intended.
test_df[features_normalized] = StandardScaler().fit_transform(
    test_df[features_normalized]
)
# Classify every row against the training split with k = 29 and report the accuracy
model = knn_classifier()
predictions = model.prediction_all(x_train, y_train, test_df[features_all], k_num=29)
model.model_accracy(y_test=test_df['PM_HIGH'], predictions=predictions)

In [None]:
# Evaluate the model using 'Guangzhou_labeled.csv'
# Data preprocessing: load, drop rows with missing values, standardize the features
test_df = pd.read_csv('Guangzhou_labeled.csv').dropna()
# NOTE(review): a fresh scaler is fitted on this city's own data instead of
# reusing the one fitted on the training data -- confirm this is intended.
test_df[features_normalized] = StandardScaler().fit_transform(
    test_df[features_normalized]
)
# Classify every row against the training split with k = 29 and report the accuracy
model = knn_classifier()
predictions = model.prediction_all(x_train, y_train, test_df[features_all], k_num=29)
model.model_accracy(y_test=test_df['PM_HIGH'], predictions=predictions)

In [3]:
# NOTE(review): the second import fails with ModuleNotFoundError (see the recorded
# output below). Nothing else in the visible notebook uses either module, so this
# cell looks like leftover debugging -- consider removing it.
import matplotlib.backends.backend_tkagg
import matplotlib.backends.tkagg as tkagg

ModuleNotFoundError: No module named 'matplotlib.backends.tkagg'