In [1]:
'''
1. When looking for the optimal k,random_state = 101
2. using the optimal k from step 1. get the average mean RMSE from 21 random cases, where random_state = range(1,102,5)
'''


import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error,mean_squared_error
%matplotlib inline
import math as m
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix
pd.set_option("display.precision", 8)
import scipy.stats

from sklearn import svm
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.svm import SVC

from sklearn.cluster import KMeans

def indexlist(data):
    '''use indeces to mark the 143 gps coordinates'''
    checklist = data['ref_x'].unique()
    for i in range(len(checklist)):
        data['point_index'] = data['point_index'].replace([checklist[i]],i)
    return data

def rsrp_indexlist(data):
    '''use indeces to mark the 41 rsrp clusters, where the cluster is categorized by the 3 largest signals'''
    checklist = data['cluster'].unique()
    for i in range(len(checklist)):
        data['rsrp_cluster'] = data['rsrp_cluster'].replace(checklist[i],i)
    return data

def cluster(rsrp,num = 3):
    dataset = pd.read_csv('rds-SRSRP.csv').set_axis([0,1,2,3,4,5,6,7], axis=1).to_numpy()
    #pick out the 3 largest values's indeces
    cluster = dataset.argsort()[:,-num:]
    ascending_cluster = pd.DataFrame(cluster)
    #sort the indeces
    cluster.sort()
    if num == 3:
        rsrp[['aaa','bbb','ccc']]=cluster
    elif num ==2:
        rsrp[['aaa','bbb']]=cluster
    elif num == 1:
        rsrp['aaa']=cluster
    return rsrp, ascending_cluster

def knn_nothing(data,gps,n_neighbors=10):
    scaler = StandardScaler()
    scaler.fit(data)
    scaled_features = scaler.transform(data)
    df_feat = pd.DataFrame(scaled_features,columns=data.columns)
    X_train, X_test, y_train, y_test = train_test_split(scaled_features,gps['point_index'],
                                                        test_size=0.20,random_state=101)
    knn_model = KNeighborsClassifier(n_neighbors)
    knn_model.fit(X_train,y_train)
    pred = knn_model.predict(X_test)
    accuracy = np.mean(pred == y_test)
    #print(classification_report(y_test,pred))
    return np.array([pred]),np.array([y_test])

def knn(data,k=5,state = 101):
    '''Using Lin's clustering idea '''
    X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:8],data.iloc[:,-4:],
                                                        test_size=0.20,random_state=state)
    pred_rec = []
    for i in range(len(y_test)):
        # get the cluster index of the sample
        a,b,c = y_test.iloc[i,:3]
        #1.try to get 3 indeces
        y_train_chosen_three = y_train[(y_train['aaa'] == a) & 
                                       (y_train['bbb'] == b) & 
                                       (y_train['ccc'] == c)]
        chosen_indeces_three = y_train_chosen_three.index.values
        if len(chosen_indeces_three) >=1:
            output_indeces = chosen_indeces_three
            output_y_train_chosen = y_train_chosen_three
        else:
            #2. instead, get 2 indeces
            y_train_chosen_two = y_train[(y_train['bbb'] == b) & 
                                         (y_train['ccc'] == c)]
            chosen_indeces_two = y_train_chosen_two.index.values
            if len(chosen_indeces_two) >=1:
                output_indeces = chosen_indeces_two
                output_y_train_chosen = y_train_chosen_two
            else:
                #3. instead, get 1 index
                y_train_chosen_one = y_train[(y_train['ccc'] == c)]
                chosen_indeces_one = y_train_chosen_one.index.values
                if len(chosen_indeces_one) >=1:
                    output_indeces = chosen_indeces_one
                    output_y_train_chosen = y_train_chosen_one

        #get the norm-1 distance
        distances = np.dot(np.abs(X_train.loc[output_indeces] - X_test.iloc[i]),np.ones(8))
        # norm-2
        #distances = np.sqrt(np.sum(np.square(X_train - X_test.iloc[i]),axis=1)).values
        
        nn_ids = distances.argsort()[:k]
        nn_index = output_y_train_chosen.iloc[nn_ids,-1]
        predi = int(nn_index.values[0])
        pred_rec.append(predi)
    return np.array([pred_rec]),np.array([y_test.iloc[:,-1]])


#without clustering
def knn_reg(data,k=19,state=101,clustering = True):
    '''in wknn, get weighted average of all k nearest nighbours is worse than average of only 4 top-count neighbours(n=4)'''
    if clustering == True:
        X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:8],data.iloc[:,-4:],
                                                        test_size=0.20,random_state=state)
    else:
        X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:8],data.iloc[:,-1],
                                                        test_size=0.20,random_state=state)
    test = gps.iloc[y_test.index]
    test_ = np.array(list(zip(test.ref_x, test.ref_y)))
    pred_rec = []
    for i in range(len(y_test)):
        if clustering == True:
            # get the cluster index of the sample
            a,b,c = y_test.iloc[i,:3]
            #1.try to get 3 indeces
            y_train_chosen_three = y_train[(y_train['aaa'] == a) & 
                                           (y_train['bbb'] == b) & 
                                           (y_train['ccc'] == c)]
            chosen_indeces_three = y_train_chosen_three.index.values
            if len(chosen_indeces_three) >=1:
                output_indeces = chosen_indeces_three
                output_y_train_chosen = y_train_chosen_three
            else:
                #2. instead, get 2 indeces
                y_train_chosen_two = y_train[(y_train['bbb'] == b) & 
                                             (y_train['ccc'] == c)]
                chosen_indeces_two = y_train_chosen_two.index.values
                if len(chosen_indeces_two) >=1:
                    output_indeces = chosen_indeces_two
                    output_y_train_chosen = y_train_chosen_two
                else:
                    #3. instead, get 1 index
                    y_train_chosen_one = y_train[(y_train['ccc'] == c)]
                    chosen_indeces_one = y_train_chosen_one.index.values
                    if len(chosen_indeces_one) >=1:
                        output_indeces = chosen_indeces_one
                        output_y_train_chosen = y_train_chosen_one
            #get the norm-1 distance
            distances = np.dot(np.abs(X_train.loc[output_indeces] - X_test.iloc[i]),np.ones(8))
            nn_ids = distances.argsort()[:k]
            nn_index = output_y_train_chosen.iloc[nn_ids,-1] 
        else:
            #get the norm-1 distance
            distances = np.dot(np.abs(X_train - X_test.iloc[i]),np.ones(8))
            nn_ids = distances.argsort()[:k]
            nn_index = y_train.iloc[nn_ids]
        
        #regression part
        indeces = nn_index.value_counts().index
        counts = nn_index.value_counts().values
        values = []
        for i in range(len(counts)):
            values.append(np.array(checklist.iloc[int(indeces[i])][0]))
        values = np.array(values)
        pred_i = np.dot(values.T,counts.T)/counts.sum()
        pred_rec.append(pred_i)
    pred_ = np.array(pred_rec)
    se_rec = []
    for i in range(len(test)):
        lon_pred, lat_pred = pred_[i]
        lon_, lat_ = test_[i]
        err_lon = lon_pred - lon_
        err_lat = lat_pred -lat_
        err_x = err_lon*((40000/360)*1000*m.cos(m.degrees(lat_)))
        #err_x = err_lon*((40000/360)*1000*(m.cos(lat_)-m.cos(lat_pred)))
        err_y = err_lat*((40000/360)*1000)
        se = err_x**2 + err_y**2
        se_rec.append(se)
    return se_rec


def error_record(y_p,y_,checklist,se_rec):
    lon_pred, lat_pred = checklist.iloc[y_p][0]
    lon_, lat_ = checklist.iloc[y_][0]
    err_lon = lon_pred - lon_
    err_lat = lat_pred -lat_
    err_x = err_lon*((40000/360)*1000*m.cos(m.degrees(lat_)))
    #err_x = err_lon*((40000/360)*1000*(m.cos(lat_)-m.cos(lat_pred)))
    err_y = err_lat*((40000/360)*1000)
    se = err_x**2 + err_y**2
    se_rec.append(se)

def result_report(se_rec,test,verbose = False):
    rmse = np.sqrt(sum(se_rec)/len(test))
    a = np.sqrt(se_rec)
    #reshape b from (1,1116) to (1116)
    b = np.sort(a.T).flatten()
    err_80 = b[int(len(a)*0.8)]
    if verbose == True:
        print("RMSE is {:.2f}m".format(rmse))
        print("When the accuracy is 80%,RMSE is {:.2f}m".format(b[int(len(a)*0.8)]))
    return rmse,err_80

def give_result(result,checklist):
    try:
        se_rec = []
        pred = np.array(result.pred)
        test = np.array(result.test)
        for i in range(len(result.test)):
            y_ = int(test[i])
            y_p = int(pred[i])
            if y_ == y_p:
                se = 0
                se_rec.append(se)
            else:
                error_record(y_p,y_,checklist,se_rec)
        rmse,err_80 = result_report(se_rec,test)
    except:
        rmse = np.sqrt(np.mean(result.error))
        a = np.sqrt(result)
        b = np.sort(a.T).flatten()
        err_80=b[int(len(a)*0.8)]
    return rmse,err_80

In [4]:
rsrp = pd.read_csv('rds-SRSRP.csv')
gps = pd.read_csv("gps.csv")
gps['point_index'] = gps['ref_x']
gps['coordinate'] =  list(zip(gps.ref_x, gps.ref_y))
#input indeces for each individual gps coordinates
gps['point_index'] = gps['ref_x']
gps = indexlist(gps)
checklist = pd.DataFrame(gps.coordinate.unique(),columns = ['coordinate'])

#clustering
rsrp, ascending_cluster = cluster(rsrp,num=3)
feature = pd.concat([rsrp, gps['point_index']],axis = 1)

In [5]:
LoS = pd.read_csv('LoS.csv')
feature_los = pd.concat([feature, LoS['LoS/nLos']],axis = 1)
only_los = feature_los[feature_los['LoS/nLos']=='LoS'].drop(columns='LoS/nLos')
only_nlos = feature_los[feature_los['LoS/nLos']=='nLoS'].drop(columns='LoS/nLos')

In [10]:
feature_los

Unnamed: 0,rds-SRSRP1,rds-SRSRP2,rds-SRSRP3,rds-SRSRP4,rds-SRSRP5,rds-SRSRP6,rds-SRSRP7,rds-SRSRP8,aaa,bbb,ccc,point_index,LoS/nLos
0,-79,-98,-105,-102,-105,-103,-106,-107,0,1,3,0.0,LoS
1,-82,-98,-103,-102,-105,-103,-105,-109,0,1,3,0.0,LoS
2,-82,-99,-104,-102,-105,-103,-105,-107,0,1,3,0.0,LoS
3,-81,-99,-105,-101,-104,-103,-103,-106,0,1,3,0.0,LoS
4,-81,-98,-103,-103,-104,-106,-106,-107,0,1,3,0.0,LoS
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5566,-105,-106,-105,-103,-104,-99,-96,-105,3,5,6,142.0,nLoS
5567,-104,-108,-105,-103,-104,-97,-96,-106,3,5,6,142.0,nLoS
5568,-104,-107,-105,-103,-105,-101,-98,-107,3,5,6,142.0,nLoS
5569,-105,-107,-106,-103,-104,-95,-96,-106,3,5,6,142.0,nLoS


In [None]:
X_train, X_test, y_train, y_test = train_test_split(feature_los.iloc[:,:8],feature_los.iloc[:,-1],
                                                        test_size=0.20,random_state=state)

In [20]:
#1-1-1 knn, clustering , k=5,sorted, only LoS
prediction,test = knn(only_los,k=5,state = 101)
df = np.append(prediction, test,axis=0)
result = pd.DataFrame(df.T,columns = ['pred','test'])

#report
rmse,err_80 = give_result(result,checklist)
print("The accuracy is {:.2f}%".format(np.mean(result.pred == result.test)*100))
print("RMSE is {:.2f}m".format(rmse))
print("When the accuracy is 80%,RMSE is {:.2f}m".format(err_80))

The accuracy is 59.06%
RMSE is 6.76m
When the accuracy is 80%,RMSE is 9.15m


In [31]:
#1-1-2 knn, clustering , k=5,sorted, only nLoS
prediction,test = knn(only_nlos,k=5,state = 101)
df = np.append(prediction, test,axis=0)
result = pd.DataFrame(df.T,columns = ['pred','test'])

#report
rmse,err_80 = give_result(result,checklist)
print("The accuracy is {:.2f}%".format(np.mean(result.pred == result.test)*100))
print("RMSE is {:.2f}m".format(rmse))
print("When the accuracy is 80%,RMSE is {:.2f}m".format(err_80))

The accuracy is 42.43%
RMSE is 8.82m
When the accuracy is 80%,RMSE is 9.24m


In [77]:
#1-2 knn,clustering, find the optimal k, sorted
rmse_rec = []
err_80_rec = []
acc_rec = []
for k in range(1,21):
    prediction,test = knn(feature,k=k,state = 101)
    df = np.append(prediction, test,axis=0)
    result = pd.DataFrame(df.T,columns = ['pred','test'])
    rmse,err_80 = give_result(result,checklist)
    acc_rec.append(np.mean(result.pred==result.test))
    rmse_rec.append(rmse)
    err_80_rec.append(err_80)
print('When k = {}, the highest accuracy is {:.2f}%'.format(np.argmax(acc_rec)+1,np.max(acc_rec)*100))

print('When k = {}, the lowest RMSE is {:.2f}m'.format(np.argmin(rmse_rec)+1,np.min(rmse_rec)))
print("When the accuracy is 80%,RSE is {:.2f}m".format(err_80_rec[np.argmin(rmse_rec)]))

When k = 1, the highest accuracy is 37.40%
When k = 1, the lowest RMSE is 9.00m
When the accuracy is 80%,RSE is 11.32m


In [32]:
#2-1-1 knn, regression, k=12, only LoS
result = pd.DataFrame(knn_reg(only_los,k=12,clustering = False),columns = ['error'])
rmse,err_80 = give_result(result,checklist)
print("Using manhattan, RMSE is {:.2f}m".format(rmse))
print("When the accuracy is 80%,RMSE is {:.2f}m".format(err_80))

Using manhattan, RMSE is 4.99m
When the accuracy is 80%,RMSE is 6.11m


In [33]:
#2-1-2 knn, regression, k=12, only nLoS
result = pd.DataFrame(knn_reg(only_nlos,k=12,clustering = False),columns = ['error'])
rmse,err_80 = give_result(result,checklist)
print("Using manhattan, RMSE is {:.2f}m".format(rmse))
print("When the accuracy is 80%,RMSE is {:.2f}m".format(err_80))

Using manhattan, RMSE is 6.73m
When the accuracy is 80%,RMSE is 7.43m


In [35]:
#2-2 knn,regression, find the optimal k, sorted, only nLos
rmse_rec = []
err_80_rec = []

for k in range(1,21):
    result = pd.DataFrame(knn_reg(only_nlos,k=k,clustering = False),columns = ['error'])
    rmse,err_80 = give_result(result,checklist)
    rmse_rec.append(rmse)
    err_80_rec.append(err_80)
print('When k = {}, the lowest RMSE is {:.2f}m'.format(np.argmin(rmse_rec)+1,np.min(rmse_rec)))
print("When the accuracy is 80%,RSE is {:.2f}m".format(err_80_rec[np.argmin(rmse_rec)]))

When k = 8, the lowest RMSE is 6.67m
When the accuracy is 80%,RSE is 7.37m


In [36]:
#3-1 knn,clustering+regression,k=11, sorted
result = pd.DataFrame(knn_reg(only_los,k=6),columns = ['error'])
rmse,err_80 = give_result(result,checklist)
print("Using manhattan, RMSE is {:.2f}m".format(rmse))
print("When the accuracy is 80%,RMSE is {:.2f}m".format(err_80))

Using manhattan, RMSE is 5.56m
When the accuracy is 80%,RMSE is 6.64m


In [8]:
#3-2 knn,clustering+regression, find the optimal k, sorted
rmse_rec = []
err_80_rec = []

for k in range(1,21):
    result = pd.DataFrame(knn_reg(feature,k=k),columns = ['error'])
    rmse,err_80 = give_result(result,checklist)
    rmse_rec.append(rmse)
    err_80_rec.append(err_80)
print('When k = {}, the lowest RMSE is {:.2f}m'.format(np.argmin(rmse_rec)+1,np.min(rmse_rec)))
print("When the accuracy is 80%,RSE is {:.2f}m".format(err_80_rec[np.argmin(rmse_rec)]))

When k = 9, the lowest RMSE is 7.65m
When the accuracy is 80%,RSE is 8.60m
