In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from sklearn.metrics import mean_squared_error

#Load the two labeled city data sets and stack them into a single frame
bj_df = pd.read_csv('Beijing_labeled.csv')
sy_df = pd.read_csv('Shenyang_labeled.csv')
#DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same
#way and, like append's default, keeps each frame's original row labels
merge = pd.concat([bj_df, sy_df])

#shuffle the data
#merge = merge.sample(frac=1).reset_index(drop=True)
#drop the null value if existing
merge = merge.dropna()
#check if the data set is balanced: print the fraction of rows with PM_HIGH set
print(merge['PM_HIGH'].sum()/merge.shape[0])
merge.head()

0.27495682210708117


Unnamed: 0,season,DEWP,HUMI,PRES,TEMP,Iws,precipitation,cbwd_NE,cbwd_NW,cbwd_SE,PM_HIGH
0,4,-8.0,79.0,1026.0,-5.0,23.69,0.0,0,0,1,1.0
1,4,-11.0,85.0,1021.0,-9.0,105.93,1.1,0,0,1,0.0
2,4,-21.0,43.0,1030.0,-11.0,117.55,0.0,0,1,0,0.0
3,4,-25.0,33.0,1034.0,-12.0,39.35,0.0,1,0,0,0.0
4,4,-24.0,30.0,1034.0,-10.0,59.0,0.0,1,0,0,0.0


In [2]:
#Standardize and normalize the raw data
from sklearn.preprocessing import StandardScaler
#Columns that get standardized by the scaler
features_normalized = [
    'season',
    'DEWP',
    'HUMI',
    'PRES',
    'TEMP',
    'Iws',
    'precipitation',
    'cbwd_NE',
    'cbwd_NW',
    'cbwd_SE',
]
#Fit a fresh StandardScaler on those columns and overwrite them with the scaled values
merge[features_normalized] = StandardScaler().fit_transform(
    merge[features_normalized]
)
#Preview the first rows after scaling
merge.head()

Unnamed: 0,season,DEWP,HUMI,PRES,TEMP,Iws,precipitation,cbwd_NE,cbwd_NW,cbwd_SE,PM_HIGH
0,1.345351,-0.645127,1.64108,0.93837,-1.69189,0.039829,-0.099523,-0.353141,-0.633985,1.774138,1.0
1,1.345351,-0.8574,1.911613,0.442481,-2.022982,1.866585,2.183845,-0.353141,-0.633985,1.774138,0.0
2,1.345351,-1.564975,0.01788,1.335082,-2.188529,2.124694,-0.099523,-0.353141,1.577324,-0.563654,0.0
3,1.345351,-1.848005,-0.433009,1.731793,-2.271302,0.387677,-0.099523,2.831729,-0.633985,-0.563654,0.0
4,1.345351,-1.777247,-0.568276,1.731793,-2.105756,0.824153,-0.099523,2.831729,-0.633985,-0.563654,0.0


In [3]:
#split the data set into training and validation data
from sklearn.model_selection import train_test_split
#Feature columns handed to the classifier (including the cbwd_* columns)
features_all = [
    'season', 'DEWP', 'HUMI', 'PRES', 'TEMP', 'Iws', 'precipitation',
    'cbwd_NE', 'cbwd_NW', 'cbwd_SE',
]
#Alternative feature set without the cbwd_* columns:
#features_all = ['season', 'DEWP', 'HUMI', 'PRES', 'TEMP', 'Iws', 'precipitation']

#Hold out 25% of the rows; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    merge[features_all],
    merge['PM_HIGH'],
    random_state=0,
    test_size=0.25,
)

In [4]:
from math import sqrt
class knn_classifier(object):
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    The class keeps no state: the training data is passed to every call, and
    each prediction is a majority vote over the labels of the closest
    training rows.
    """

    def __init__(self):
        pass

    def get_distance(self, row1, row2):
        """Return the Euclidean distance between two feature vectors.

        Parameters
        ----------
        row1, row2 : sequence or pandas Series of numbers
            Elements are paired by position; row2 must have at least as many
            elements as row1.

        Returns
        -------
        float
        """
        # Convert to plain arrays so that indexing is strictly positional.
        # Integer keys on a label-indexed Series only work through a
        # deprecated pandas fallback.
        values1 = np.asarray(row1, dtype=float)
        values2 = np.asarray(row2, dtype=float)
        squared_sum = 0.0
        for i in range(len(values1)):
            squared_sum += (values1[i] - values2[i]) ** 2
        return sqrt(squared_sum)

    def get_neighbors(self, x_train, y_train, x_test_row, k_max):
        """Return the labels of the k_max training rows closest to x_test_row.

        Parameters
        ----------
        x_train : DataFrame or 2-D array-like, one row per training sample
        y_train : Series or 1-D array-like, label of each training row
        x_test_row : Series or 1-D array-like, the query feature vector
        k_max : int, number of neighbours to return

        Returns
        -------
        list
            Labels ordered from nearest to farthest. Rows at equal distance
            keep their training order.

        Raises
        ------
        IndexError
            If k_max is larger than the number of training rows.
        """
        train_matrix = np.asarray(x_train, dtype=float)
        labels = np.asarray(y_train)
        query = np.asarray(x_test_row, dtype=float)
        # Compute all distances in one vectorised pass instead of a Python
        # loop over .iloc rows.
        distances = np.sqrt(np.square(train_matrix - query).sum(axis=1))
        # A stable sort breaks ties by training order, as list.sort did before.
        order = np.argsort(distances, kind='stable')
        return [labels[order[i]] for i in range(k_max)]

    def prediction_single(self, x_train, y_train, x_test_row, k_num):
        """Predict the label of one row by majority vote of its k_num neighbours."""
        neighbors = self.get_neighbors(x_train, y_train, x_test_row, k_num)
        # Pick the most frequent label among the neighbours.
        prediction = max(set(neighbors), key=neighbors.count)
        return prediction

    def prediction_all(self, x_train, y_train, x_test, k_num):
        """Predict a label for every row of x_test.

        Returns
        -------
        list of (row label, predicted label) tuples, in x_test row order.
        """
        predictions = list()
        for i in range(x_test.shape[0]):
            row = x_test.iloc[i]
            pred = self.prediction_single(x_train, y_train, row, k_num)
            predictions.append((row.name, pred))
        return predictions

    def model_accracy(self, y_test, predictions):
        """Return the fraction of predictions that match y_test.

        Parameters
        ----------
        y_test : Series or 1-D array-like of true labels
        predictions : list of (row label, predicted label) tuples, compared
            to y_test by position (the format prediction_all returns)

        Returns
        -------
        float
            Between 0.0 and 1.0. It is 0.0 when nothing matches; previously
            that case raised UnboundLocalError.

        Raises
        ------
        ValueError
            If y_test is empty, because accuracy is undefined.
        """
        truth = np.asarray(y_test)
        total = len(truth)
        if total == 0:
            raise ValueError("y_test is empty; accuracy is undefined")
        correct = 0
        for i in range(total):
            if truth[i] == predictions[i][1]:
                correct += 1
        # Compute the ratio once, after counting, so it is always defined.
        return correct / total

In [5]:
#Try every odd k from 9 to 29 and record the accuracy on the held-out split
score = []
for k in range(9, 31, 2):
    model = knn_classifier()
    predictions = model.prediction_all(x_train, y_train, x_test, k)
    accuracy = model.model_accracy(y_test, predictions)
    score += [(k, accuracy)]
    print(k, accuracy)

9 0.7845303867403315
11 0.7886740331491713
13 0.7983425414364641
15 0.7983425414364641
17 0.7997237569060773
19 0.8038674033149171
21 0.8011049723756906
23 0.8038674033149171
25 0.8052486187845304
27 0.8066298342541437
29 0.8093922651933702


In [None]:
#k = 29 gives the highest accuracy, so k = 29 is used from here on.
#Order the (k, accuracy) pairs by accuracy, lowest first, so the best k is listed last.
score = sorted(score, key=lambda pair: pair[1])
score

In [7]:
#Evaluate the model on 'Shanghai_labeled.csv'
#Data preprocessing: load the file and discard rows with missing values
test_df = pd.read_csv('Shanghai_labeled.csv').dropna()
#Standardize the feature columns with a scaler fitted on this file itself
test_df[features_normalized] = StandardScaler().fit_transform(
    test_df[features_normalized]
)
#Classify every row against the training split with k = 29, then report accuracy
model = knn_classifier()
predictions = model.prediction_all(
    x_train,
    y_train,
    test_df[features_all],
    k_num=29,
)
model.model_accracy(y_test=test_df['PM_HIGH'], predictions=predictions)

0.7586972612879349

In [8]:
#Evaluate the model on 'Guangzhou_labeled.csv'
#Data preprocessing: load the file and discard rows with missing values
test_df = pd.read_csv('Guangzhou_labeled.csv').dropna()
#Standardize the feature columns with a scaler fitted on this file itself
test_df[features_normalized] = StandardScaler().fit_transform(
    test_df[features_normalized]
)
#Classify every row against the training split with k = 29, then report accuracy
model = knn_classifier()
predictions = model.prediction_all(
    x_train,
    y_train,
    test_df[features_all],
    k_num=29,
)
model.model_accracy(y_test=test_df['PM_HIGH'], predictions=predictions)

0.7670118343195266

In [None]:
#Appendix: not part of the assignment code.
#Use sklearn's KNeighborsClassifier as a reference implementation to sanity-check our model
from sklearn.neighbors import KNeighborsClassifier
#Collect (k, train accuracy, test accuracy) for every odd k from 1 to 41
knn_score = []
for i in range(1, 43, 2):
    #build the classifier and fit it on the training split
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    #predicted labels for the held-out split
    pred = knn.predict(x_test)
    #accuracy on both splits, to compare fit against generalization
    train_score = knn.score(x_train, y_train)
    test_score = knn.score(x_test, y_test)
    knn_score += [(i, train_score, test_score)]

#Tabulate the results, one row per k
score_df = pd.DataFrame(
    knn_score,
    columns=['k', 'train score', 'test score'],
)
score_df