In [51]:
'''
1. When looking for the optimal k,random_state = 101
2. using the optimal k from step 1. get the average mean RMSE from 21 random cases, where random_state = range(1,102,5)
'''


import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error,mean_squared_error
%matplotlib inline
import math as m
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix
pd.set_option("display.precision", 8)
import scipy.stats

from sklearn import svm
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.svm import SVC

from sklearn.cluster import KMeans
from scipy.io import loadmat

def indexlist(data,checklist):
    '''use indeces to mark the 143 gps coordinates'''
    #checklist = data['ref_x'].unique()
    for i in range(len(checklist)):
        data['point_index'] = data['point_index'].replace([checklist.iloc[i,0]],i)
    return data

def rsrp_indexlist(data):
    '''use indeces to mark the 41 rsrp clusters, where the cluster is categorized by the 3 largest signals'''
    checklist = data['cluster'].unique()
    for i in range(len(checklist)):
        data['rsrp_cluster'] = data['rsrp_cluster'].replace(checklist[i],i)
    return data

def cluster(rsrp,num = 3):
    dataset = rsrp.set_axis([0,1,2,3,4,5,6,7],axis=1).to_numpy()
    #dataset = pd.read_csv('rds-SRSRP.csv').set_axis([0,1,2,3,4,5,6,7], axis=1).to_numpy()
    #pick out the 3 largest values's indeces
    cluster = dataset.argsort()[:,-num:]
    ascending_cluster = pd.DataFrame(cluster)
    #sort the indeces
    cluster.sort()
    if num == 3:
        rsrp[['aaa','bbb','ccc']]=cluster
    elif num ==2:
        rsrp[['aaa','bbb']]=cluster
    elif num == 1:
        rsrp['aaa']=cluster
    return rsrp, ascending_cluster

def knn_nothing(data,gps,n_neighbors=10):
    scaler = StandardScaler()
    scaler.fit(data)
    scaled_features = scaler.transform(data)
    df_feat = pd.DataFrame(scaled_features,columns=data.columns)
    X_train, X_test, y_train, y_test = train_test_split(scaled_features,gps['point_index'],
                                                        test_size=0.20,random_state=101)
    knn_model = KNeighborsClassifier(n_neighbors)
    knn_model.fit(X_train,y_train)
    pred = knn_model.predict(X_test)
    accuracy = np.mean(pred == y_test)
    #print(classification_report(y_test,pred))
    return np.array([pred]),np.array([y_test])

def knn_lin_data(feature_train,feature_test,k=5,clustering = True):
    '''Using Lin's clustering idea '''
    if clustering == True:
        X_train, y_train = feature_train.iloc[:,:8],feature_train.iloc[:,-4:]
        X_test, y_test = feature_test.iloc[:,:8],feature_test.iloc[:,-4:]
    else:
        X_train, y_train = feature_train.iloc[:,:8],feature_train.iloc[:,-1]
        X_test, y_test = feature_test.iloc[:,:8],feature_test.iloc[:,-1]
        
    pred_rec = []
    for i in range(len(y_test)):
        if clustering == True:
            # get the cluster index of the sample
            a,b,c = y_test.iloc[i,:3]
            #1.try to get 3 indeces
            y_train_chosen_three = y_train[(y_train['aaa'] == a) & 
                                           (y_train['bbb'] == b) & 
                                           (y_train['ccc'] == c)]
            chosen_indeces_three = y_train_chosen_three.index.values
            if len(chosen_indeces_three) >=1:
                output_indeces = chosen_indeces_three
                output_y_train_chosen = y_train_chosen_three
            else:
                #2. instead, get 2 indeces
                y_train_chosen_two = y_train[(y_train['bbb'] == b) & 
                                             (y_train['ccc'] == c)]
                chosen_indeces_two = y_train_chosen_two.index.values
                if len(chosen_indeces_two) >=1:
                    output_indeces = chosen_indeces_two
                    output_y_train_chosen = y_train_chosen_two
                else:
                    #3. instead, get 1 index
                    y_train_chosen_one = y_train[(y_train['ccc'] == c)]
                    chosen_indeces_one = y_train_chosen_one.index.values
                    if len(chosen_indeces_one) >=1:
                        output_indeces = chosen_indeces_one
                        output_y_train_chosen = y_train_chosen_one

            #get the norm-1 distance
            distances = np.dot(np.abs(X_train.loc[output_indeces] - X_test.iloc[i]),np.ones(8))
            nn_ids = distances.argsort()[:k]
            nn_index = output_y_train_chosen.iloc[nn_ids,-1]
        else:
            distances = np.dot(np.abs(X_train - X_test.iloc[i]),np.ones(8))
            nn_ids = distances.argsort()[:k]
            nn_index = y_train.iloc[nn_ids]
        
        
        index_mostcounted = np.unique(nn_index.values,return_counts=True)[1].argmax()
        predi = int(np.unique(nn_index.values,return_counts=True)[0][index_mostcounted])
        pred_rec.append(predi)
    return np.array([pred_rec]),np.array([pd.DataFrame(y_test).iloc[:,-1]])


#without clustering
def knn_reg_lin_data(feature_train,feature_test,k=19,clustering = True):
    '''in wknn, get weighted average of all k nearest nighbours is worse than average of only 4 top-count neighbours(n=4)'''
    if clustering == True:
        X_train, y_train = feature_train.iloc[:,:8],feature_train.iloc[:,-4:]
        X_test, y_test = feature_test.iloc[:,:8],feature_test.iloc[:,-4:]
        test_  = y_test.iloc[:,-1]
    else:
        X_train, y_train = feature_train.iloc[:,:8],feature_train.iloc[:,-1]
        X_test, y_test = feature_test.iloc[:,:8],feature_test.iloc[:,-1]
        test_  = y_test

    pred_rec = []
    for i in range(len(y_test)):
        if clustering == True:
            # get the cluster index of the sample
            a,b,c = y_test.iloc[i,:3]
            #1.try to get 3 indeces
            y_train_chosen_three = y_train[(y_train['aaa'] == a) & 
                                           (y_train['bbb'] == b) & 
                                           (y_train['ccc'] == c)]
            chosen_indeces_three = y_train_chosen_three.index.values
            if len(chosen_indeces_three) >=1:
                output_indeces = chosen_indeces_three
                output_y_train_chosen = y_train_chosen_three
            else:
                #2. instead, get 2 indeces
                y_train_chosen_two = y_train[(y_train['bbb'] == b) & 
                                             (y_train['ccc'] == c)]
                chosen_indeces_two = y_train_chosen_two.index.values
                if len(chosen_indeces_two) >=1:
                    output_indeces = chosen_indeces_two
                    output_y_train_chosen = y_train_chosen_two
                else:
                    #3. instead, get 1 index
                    y_train_chosen_one = y_train[(y_train['ccc'] == c)]
                    chosen_indeces_one = y_train_chosen_one.index.values
                    if len(chosen_indeces_one) >=1:
                        output_indeces = chosen_indeces_one
                        output_y_train_chosen = y_train_chosen_one
            #get the norm-1 distance
            distances = np.dot(np.abs(X_train.loc[output_indeces] - X_test.iloc[i]),np.ones(8))
            nn_ids = distances.argsort()[:k]
            nn_index = output_y_train_chosen.iloc[nn_ids,-1] 
        else:
            #get the norm-1 distance
            distances = np.dot(np.abs(X_train - X_test.iloc[i]),np.ones(8))
            nn_ids = distances.argsort()[:k]
            nn_index = y_train.iloc[nn_ids]
        #regression part
        indeces = nn_index.value_counts().index
        counts = nn_index.value_counts().values
        x_values = []
        y_values = []
        for i in range(len(counts)):
            x_values.append(np.array(checklist.iloc[int(indeces[i])][0]))
            y_values.append(np.array(checklist.iloc[int(indeces[i])][1]))
        x_values = np.array(x_values)
        y_values = np.array(y_values)
        pred_i = (np.dot(x_values.T,counts.T)/counts.sum(),
                  np.dot(y_values.T,counts.T)/counts.sum())
        pred_rec.append(pred_i)
    pred_ = np.array(pred_rec)
    # error calculation
    se_rec = []
    for i in range(len(test)):
        lon_pred, lat_pred = pred_[i]
        lon_, lat_ = checklist.iloc[int(test_[i])][0],checklist.iloc[int(test_[i])][1]
        err_lon = lon_pred - lon_
        err_lat = lat_pred -lat_
        err_x = err_lon*((40000/360)*1000*m.cos(m.degrees(lat_)))
        #err_x = err_lon*((40000/360)*1000*(m.cos(lat_)-m.cos(lat_pred)))
        err_y = err_lat*((40000/360)*1000)
        se = err_x**2 + err_y**2
        se_rec.append(se)
    return se_rec


def error_record(y_p,y_,checklist,se_rec):
    lon_pred, lat_pred = checklist.iloc[y_p][0], checklist.iloc[y_p][1]
    lon_, lat_ = checklist.iloc[y_][0],checklist.iloc[y_][1]
    err_lon = lon_pred - lon_
    err_lat = lat_pred -lat_
    err_x = err_lon*((40000/360)*1000*m.cos(m.degrees(lat_)))
    #err_x = err_lon*((40000/360)*1000*(m.cos(lat_)-m.cos(lat_pred)))
    err_y = err_lat*((40000/360)*1000)
    se = err_x**2 + err_y**2
    se_rec.append(se)

def result_report(se_rec,test,verbose = False):
    rmse = np.sqrt(sum(se_rec)/len(test))
    a = np.sqrt(se_rec)
    #reshape b from (1,1116) to (1116)
    b = np.sort(a.T).flatten()
    err_80 = b[int(len(a)*0.8)]
    if verbose == True:
        print("RMSE is {:.2f}m".format(rmse))
        print("When the accuracy is 80%,RMSE is {:.2f}m".format(b[int(len(a)*0.8)]))
    return rmse,err_80

def give_result(result,checklist):
    try:
        se_rec = []
        pred = np.array(result.pred)
        test = np.array(result.test)
        for i in range(len(result.test)):
            y_ = int(test[i])
            y_p = int(pred[i])
            if y_ == y_p:
                se = 0
                se_rec.append(se)
            else:
                error_record(y_p,y_,checklist,se_rec)
        rmse,err_80 = result_report(se_rec,test)
    except:
        rmse = np.sqrt(np.mean(result.error))
        a = np.sqrt(result)
        b = np.sort(a.T).flatten()
        err_80=b[int(len(a)*0.8)]
    return rmse,err_80

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
rsrp_train = train.iloc[:,:8]
rsrp_test = test.iloc[:,:8]
rsrp_train, ascending_cluster_train = cluster(rsrp_train,num=3)
rsrp_test, ascending_cluster_test = cluster(rsrp_test,num=3)

gps_train = train[["ref_x","ref_y"]]
gps_test  = test[["ref_x","ref_y"]]
#input indeces for each individual gps coordinates
gps_train['point_index'] = gps_train['ref_x']
gps_test['point_index'] = gps_test['ref_x']
checklist = pd.DataFrame(np.append([gps_train.ref_x.unique()],[gps_train.ref_y.unique()],axis=0).T,
                         columns = ['ref_x','ref_y'])
gps_train = indexlist(gps_train,checklist)
gps_test = indexlist(gps_test,checklist)
feature_train = pd.concat([rsrp_train, gps_train['point_index']],axis = 1)
feature_test = pd.concat([rsrp_test, gps_test['point_index']],axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gps_train['point_index'] = gps_train['ref_x']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gps_test['point_index'] = gps_test['ref_x']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['point_index'] = data['point_index'].replace([checklist.iloc[i,0]],i)


In [58]:
prediction,test = knn_lin_data(feature_train,feature_test,k=8,clustering=False)
df = np.append(prediction, test,axis=0)
result = pd.DataFrame(df.T,columns = ['pred','test'])

#report
rmse,err_80 = give_result(result,checklist)
print("The accuracy is {:.2f}%".format(np.mean(result.pred == result.test)*100))
print("RMSE is {:.2f}m".format(rmse))
print("When the accuracy is 80%,RMSE is {:.2f}m".format(err_80))

The accuracy is 46.09%
RMSE is 9.61m
When the accuracy is 80%,RMSE is 10.75m


In [57]:
#1-2 knn, find the optimal k, sorted
rmse_rec = []
err_80_rec = []
acc_rec = []
for k in range(1,21):
    prediction,test = knn_lin_data(feature_train,feature_test,k=k,clustering=False)
    df = np.append(prediction, test,axis=0)
    result = pd.DataFrame(df.T,columns = ['pred','test'])
    rmse,err_80 = give_result(result,checklist)
    acc_rec.append(np.mean(result.pred==result.test))
    rmse_rec.append(rmse)
    err_80_rec.append(err_80)
print('When k = {}, the highest accuracy is {:.2f}%'.format(np.argmax(acc_rec)+1,np.max(acc_rec)*100))

print('When k = {}, the lowest RMSE is {:.2f}m'.format(np.argmin(rmse_rec)+1,np.min(rmse_rec)))
print("When the accuracy is 80%,RSE is {:.2f}m".format(err_80_rec[np.argmin(rmse_rec)]))

When k = 8, the highest accuracy is 46.09%
When k = 13, the lowest RMSE is 8.60m
When the accuracy is 80%,RSE is 10.06m


In [55]:
#1-1 knn, grouping , k=9
prediction,test = knn_lin_data(feature_train,feature_test,k=9)
df = np.append(prediction, test,axis=0)
result = pd.DataFrame(df.T,columns = ['pred','test'])

rmse,err_80 = give_result(result,checklist)
print("The accuracy is {:.2f}%".format(np.mean(result.pred == result.test)*100))
print("RMSE is {:.2f}m".format(rmse))
print("When the accuracy is 80%,RMSE is {:.2f}m".format(err_80))

The accuracy is 44.06%
RMSE is 9.62m
When the accuracy is 80%,RMSE is 10.89m


In [89]:
#1-2 knn,grouping, find the optimal k, sorted
rmse_rec = []
err_80_rec = []
acc_rec = []
for k in range(1,21):
    prediction,test = knn_lin_data(feature_train,feature_test,k=k)
    df = np.append(prediction, test,axis=0)
    result = pd.DataFrame(df.T,columns = ['pred','test'])
    rmse,err_80 = give_result(result,checklist)
    acc_rec.append(np.mean(result.pred==result.test))
    rmse_rec.append(rmse)
    err_80_rec.append(err_80)
print('When k = {}, the highest accuracy is {:.2f}%'.format(np.argmax(acc_rec)+1,np.max(acc_rec)*100))

print('When k = {}, the lowest RMSE is {:.2f}m'.format(np.argmin(rmse_rec)+1,np.min(rmse_rec)))
print("When the accuracy is 80%,RSE is {:.2f}m".format(err_80_rec[np.argmin(rmse_rec)]))

When k = 9, the highest accuracy is 44.06%
When k = 4, the lowest RMSE is 9.01m
When the accuracy is 80%,RSE is 11.03m


In [43]:
#2-1 knn, regression, k=12
result = pd.DataFrame(knn_reg_lin_data(feature_train,feature_test,k=19,clustering = False),columns = ['error'])
rmse,err_80 = give_result(result,checklist)
print("Using manhattan, RMSE is {:.2f}m".format(rmse))
print("When the accuracy is 80%,RMSE is {:.2f}m".format(err_80))

Using manhattan, RMSE is 7.04m
When the accuracy is 80%,RMSE is 8.28m


In [45]:
#2-2 knn,regression, find the optimal k, sorted
rmse_rec = []
err_80_rec = []

for k in range(1,21):
    result = pd.DataFrame(knn_reg_lin_data(feature_train,feature_test,k=k,clustering = False),columns = ['error'])
    rmse,err_80 = give_result(result,checklist)
    rmse_rec.append(rmse)
    err_80_rec.append(err_80)
print('When k = {}, the lowest RMSE is {:.2f}m'.format(np.argmin(rmse_rec)+1,np.min(rmse_rec)))
print("When the accuracy is 80%,RSE is {:.2f}m".format(err_80_rec[np.argmin(rmse_rec)]))

When k = 5, the lowest RMSE is 6.85m
When the accuracy is 80%,RSE is 8.31m


In [27]:
#3-1 knn, regression+grouping, k=19
result = pd.DataFrame(knn_reg_lin_data(feature_train,feature_test,k=19,clustering = True),columns = ['error'])
rmse,err_80 = give_result(result,checklist)
print("Using manhattan, RMSE is {:.2f}m".format(rmse))
print("When the accuracy is 80%,RMSE is {:.2f}m".format(err_80))

Using manhattan, RMSE is 7.55m
When the accuracy is 80%,RMSE is 8.59m


In [46]:
#3-2 knn,grouping+regression, find the optimal k, sorted
rmse_rec = []
err_80_rec = []

for k in range(1,21):
    result = pd.DataFrame(knn_reg_lin_data(feature_train,feature_test,k=k,clustering = True),columns = ['error'])
    rmse,err_80 = give_result(result,checklist)
    rmse_rec.append(rmse)
    err_80_rec.append(err_80)
print('When k = {}, the lowest RMSE is {:.2f}m'.format(np.argmin(rmse_rec)+1,np.min(rmse_rec)))
print("When the accuracy is 80%,RSE is {:.2f}m".format(err_80_rec[np.argmin(rmse_rec)]))

When k = 7, the lowest RMSE is 7.06m
When the accuracy is 80%,RSE is 8.36m
