In [2]:
import pandas as pd
from pandasql import sqldf
import datetime
from datetime import timedelta
import numpy as np
import copy
import import_ipynb
from Join_Functions import *

importing Jupyter notebook from Join_Functions.ipynb


In [3]:
'''
FUNCTIONS:

calculate_mu
calculate_mu_thresh
calculate_mu_modified
calculate_mu_modified_thresh
euc_distance
calculate_mu_universal
calculate_mu_universal_thresh
euc_distance_norm_1
euc_distance_norm_2
euc_distance_norm_3
euc_distance_penalty_avg
euc_distance_penalty_min
euc_distance_std_dev_thresh
euc_distance_topn
euc_distance_rank
euc_distance_BSS_thresh
cos_sim
ed_1
ed_2
ed_3
ed_4
ed_5
ed_6
ed_hybrid
'''

'\nFUNCTIONS:\n\ncalculate_mu\ncalculate_mu_thresh\ncalculate_mu_modified\ncalculate_mu_modified_thresh\neuc_distance\ncalculate_mu_universal\ncalculate_mu_universal_thresh\neuc_distance_norm_1\neuc_distance_norm_2\neuc_distance_norm_3\neuc_distance_penalty_avg\neuc_distance_penalty_min\neuc_distance_std_dev_thresh\neuc_distance_topn\neuc_distance_rank\neuc_distance_BSS_thresh\ncos_sim\ned_1\ned_2\ned_3\ned_4\ned_5\ned_6\ned_hybrid\n'

In [4]:
#1
def calculate_mu(joined_table, primary_observer):
    '''
    Returns the match factor value from the primary observer's point of view

            Parameters:
                    joined_table (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            The columns Phone_A, Count_A and Count_B are read from its first row only.
                    primary_observer (str): Initials of primary user

            Returns:
                    mu_0 (float): match factor, or 0 when joined_table has no rows

        NOTES: Mu calculation from paper: (magnitude of intersecting access points between two scans) divided by
        (magnitude of how many access points that user observed).
    '''
    #len(joined_table) is the number of intersecting networks
    shared_networks = len(joined_table)

    #nothing in common -> no overlap; this also avoids reading a first row that does not exist
    if shared_networks == 0:
        return 0

    #read the first row by position so the lookup does not depend on the index labels
    if joined_table['Phone_A'].iloc[0] == primary_observer:
        observed_networks = joined_table['Count_A'].iloc[0]
    else:
        observed_networks = joined_table['Count_B'].iloc[0]

    mu_0 = shared_networks / observed_networks
    return mu_0

In [5]:
#1A
def calculate_mu_thresh(joined_table, primary_observer,RSSI_min):
    '''
    Returns the match factor value after discarding networks that are weak on both devices

            Parameters:
                    joined_table (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared
                    primary_observer (str): Initials of primary user
                    RSSI_min (int): Threshold given as a positive number; a row is kept when RSSI_A or RSSI_B is greater than -RSSI_min

            Returns:
                    mu_0 (float): match factor (0 when no row survives the filter)

        NOTES: Mu calculation from paper: (number of networks left after filtering) divided by
        (Count_A or Count_B of the first remaining row, depending on who the primary observer is).
    '''
    #strongest reading across both devices
    strongest = np.max([np.max(joined_table.RSSI_A), np.max(joined_table.RSSI_B)])

    #loosen the threshold when it would otherwise cut out even the strongest network
    if RSSI_min < abs(strongest):
        RSSI_min = abs(strongest) + 1

    #the threshold is passed as a magnitude; RSSI readings are compared against its negative
    cutoff = RSSI_min*-1

    #keep a network when at least one of the two devices reads above the cutoff
    strong_enough = (joined_table.RSSI_A > cutoff) | (joined_table.RSSI_B > cutoff)
    kept = joined_table.loc[strong_enough].copy().reset_index(drop=True)

    if len(kept) == 0:
        return 0

    #the denominator is the primary observer's own network count
    if kept['Phone_A'][0] == primary_observer:
        observed = kept['Count_A'][0]
    else:
        observed = kept['Count_B'][0]

    return len(kept)/observed

In [6]:
#2
def calculate_mu_modified(joined_table, primary_observer):
    '''
    Returns the modified match factor value

            Parameters:
                    joined_table (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            The columns Phone_A, Count_A and Count_B are read from its first row only.
                    primary_observer (str): Initials of primary user

            Returns:
                    modified_0 (float): modified match factor, or 0 when joined_table has no rows

        NOTES: Modified Mu calculation from paper: mu * (1 - 1/(max(Count_A, Count_B) + 1)), where mu is
        (number of intersecting networks) / (the primary observer's network count).
    '''
    shared_networks = len(joined_table)

    #nothing in common -> no overlap; this also avoids reading a first row that does not exist
    if shared_networks == 0:
        return 0

    #read the first row by position so the lookup does not depend on the index labels
    count_a = joined_table['Count_A'].iloc[0]
    count_b = joined_table['Count_B'].iloc[0]

    if joined_table['Phone_A'].iloc[0] == primary_observer:
        mu_0 = shared_networks/count_a
    else:
        mu_0 = shared_networks/count_b

    #the correction term is the same for both observers: it depends only on the larger scan
    modified_0 = mu_0*(1-(1/(max(count_a, count_b) + 1)))

    return modified_0

In [7]:
#2A
def calculate_mu_modified_thresh(joined_table, primary_observer, RSSI_min):
    '''
    Returns the modified match factor value after discarding networks that are weak on both devices

            Parameters:
                    joined_table (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared
                    primary_observer (str): Initials of primary user
                    RSSI_min (int): Threshold given as a positive number; a row is kept when RSSI_A or RSSI_B is greater than -RSSI_min

            Returns:
                    modified_0 (float): modified match factor (0 when no row survives the filter)

        NOTES: Modified Mu calculation from paper, evaluated on the filtered table.
    '''
    #strongest reading across both devices
    strongest = np.max([np.max(joined_table.RSSI_A), np.max(joined_table.RSSI_B)])

    #loosen the threshold when it would otherwise cut out even the strongest network
    if RSSI_min < abs(strongest):
        RSSI_min = abs(strongest) + 1

    #keep a network when at least one of the two devices reads above -RSSI_min
    strong_enough = (joined_table.RSSI_A > -1*RSSI_min) | (joined_table.RSSI_B > -1*RSSI_min)
    kept = joined_table.loc[strong_enough, :].copy().reset_index(drop=True)

    if len(kept) == 0:
        return 0

    #mu uses the primary observer's own network count as denominator
    if kept['Phone_A'][0] == primary_observer:
        mu_0 = len(kept)/kept['Count_A'][0]
    else:
        mu_0 = len(kept)/kept['Count_B'][0]

    #the correction term depends only on the larger of the two scans
    larger_scan = max(kept['Count_A'][0], kept['Count_B'][0])
    return mu_0*(1-(1/(larger_scan + 1)))

In [8]:
#4
def euc_distance(table_join):
    '''
    Returns the Euclidean Distance between two Wi-Fi Scans

            Parameters:
                    table_join (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Only the RSSI_A and RSSI_B columns are read.

            Returns:
                    euclidean_distance (float): Euclidean Distance (0.0 for a table with no rows)

        NOTES: Calculates Euclidean distance: For each row, take the difference in RSSI values and square this difference. Then, sum up these values across the entire table and take the square root of this number to get overall Euclidean distance.
    '''
    #work on plain arrays: rows are paired by position, so the index labels of the df do not matter
    rssi_a = table_join['RSSI_A'].to_numpy(dtype=float)
    rssi_b = table_join['RSSI_B'].to_numpy(dtype=float)

    gaps = rssi_a - rssi_b

    #dot(gaps, gaps) is the sum of squared differences; a NaN in either column propagates to the result
    euclidean_distance = float(np.sqrt(np.dot(gaps, gaps)))

    return euclidean_distance

In [9]:
#3
def calculate_mu_universal(table_join):
    '''
    Returns the universal match factor value

            Parameters:
                    table_join (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Count_A and Count_B are read from its first row only.

            Returns:
                    mu (float): universal match factor, or 0 when table_join has no rows

        NOTES: Calculates the mu factor where the value would be the same for both users, no matter who is the 'primary' observer. This is done by having a uniform denominator of the total number of access points seen across both scans of both users.

    '''
    length = len(table_join)

    #nothing in common -> no overlap; this also avoids reading a first row that does not exist
    if length == 0:
        return 0

    #read the first row by position so the lookup does not depend on the index labels
    count_a = table_join['Count_A'].iloc[0]
    count_b = table_join['Count_B'].iloc[0]

    #(only A) + (only B) + (both) = number of distinct access points across the two scans
    mu = length/(count_a-length+count_b-length+length)
    return mu

In [10]:
#3A
def calculate_mu_universal_thresh(table_join,RSSI_min):
    '''
    Returns the universal match factor value after discarding networks that are weak on both devices

            Parameters:
                    table_join (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared
                    RSSI_min (int): Threshold given as a positive number; a row is kept when RSSI_A or RSSI_B is greater than -RSSI_min

            Returns:
                    mu (float): universal match factor (0 when no row survives the filter)

        NOTES: Calculates the mu factor where the value would be the same for both users, no matter who is the 'primary' observer. The denominator is Count_A + Count_B minus the number of rows left after filtering.

    '''
    #strongest reading across both devices
    strongest = np.max([np.max(table_join.RSSI_A), np.max(table_join.RSSI_B)])

    #loosen the threshold when it would otherwise cut out even the strongest network
    if RSSI_min < abs(strongest):
        RSSI_min = abs(strongest) + 1

    #keep a network when at least one of the two devices reads above -RSSI_min
    strong_enough = (table_join.RSSI_A > -1*RSSI_min) | (table_join.RSSI_B > -1*RSSI_min)
    kept = table_join.loc[strong_enough, :].copy().reset_index(drop=True)

    shared = len(kept)
    if shared == 0:
        return 0

    count_a = kept['Count_A'][0]
    count_b = kept['Count_B'][0]

    #(only A) + (only B) + (both)
    return shared/(count_a-shared+count_b-shared+shared)

In [11]:
#5
def euc_distance_norm_1(table_join):
    '''
    Returns the Euclidean Distance between two Wi-Fi Scans / n

            Parameters:
                    table_join (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Only the RSSI_A and RSSI_B columns are read.

            Returns:
                    euclidean_distance (float): Euclidean Distance divided by the number of rows (0.0 for a table with no rows)

        NOTES: Normalization testing, /n
    '''
    n_rows = len(table_join)

    #an empty table has no distance to normalize; avoids dividing by zero
    if n_rows == 0:
        return 0.0

    #work on plain arrays: rows are paired by position, so the index labels of the df do not matter
    gaps = table_join['RSSI_A'].to_numpy(dtype=float) - table_join['RSSI_B'].to_numpy(dtype=float)

    euclidean_distance = float(np.sqrt(np.dot(gaps, gaps)))/n_rows

    return euclidean_distance
        

In [12]:
#6
def euc_distance_norm_2(table_join):
    '''
    Returns the Euclidean Distance between two Wi-Fi Scans / n^2

            Parameters:
                    table_join (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Only the RSSI_A and RSSI_B columns are read.

            Returns:
                    euclidean_distance (float): Euclidean Distance divided by the squared number of rows (0.0 for a table with no rows)

        NOTES: Normalization testing, /n^2
    '''
    n_rows = len(table_join)

    #an empty table has no distance to normalize; avoids dividing by zero
    if n_rows == 0:
        return 0.0

    #work on plain arrays: rows are paired by position, so the index labels of the df do not matter
    gaps = table_join['RSSI_A'].to_numpy(dtype=float) - table_join['RSSI_B'].to_numpy(dtype=float)

    euclidean_distance = float(np.sqrt(np.dot(gaps, gaps)))/(n_rows**2)

    return euclidean_distance
        

In [13]:
#7
def euc_distance_norm_3(table_join):
    '''
    Returns the Euclidean Distance between two Wi-Fi Scans / sqrt(n)

            Parameters:
                    table_join (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Only the RSSI_A and RSSI_B columns are read.

            Returns:
                    euclidean_distance (float): square root of the mean squared RSSI difference (0.0 for a table with no rows)

        NOTES: Normalization testing, /sqrt(n)
    '''
    n_rows = len(table_join)

    #an empty table has no distance to normalize; avoids dividing by zero
    if n_rows == 0:
        return 0.0

    #work on plain arrays: rows are paired by position, so the index labels of the df do not matter
    gaps = table_join['RSSI_A'].to_numpy(dtype=float) - table_join['RSSI_B'].to_numpy(dtype=float)

    #sqrt(total/n) is the same as sqrt(total)/sqrt(n)
    euclidean_distance = float(np.sqrt(np.dot(gaps, gaps)/n_rows))

    return euclidean_distance
        

In [14]:
#9
def euc_distance_penalty_avg(table_join_cos):
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans where N/A values are filled with avg RSSI

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Only the RSSI_x and RSSI_y columns are read; the df is not modified.

            Returns:
                    euclidean_distance (float): Euclidean Distance divided by the number of rows (0.0 for a table with no rows)

        NOTES: This performs a Euclidean distance calculation, while also taking into account an added 'penalty' (i.e. a farther distance overall between the users) for each unmatched network - i.e. user 1 sees a certain BSS, but user 2 does not.
    '''
    n_rows = len(table_join_cos)

    #an empty table has no distance to normalize; avoids dividing by zero
    if n_rows == 0:
        return 0.0

    rssi_x = table_join_cos['RSSI_x']
    rssi_y = table_join_cos['RSSI_y']

    #penalty = average of the two per-device mean RSSI values (NaNs ignored)
    penalty = np.mean([np.nanmean(rssi_x), np.nanmean(rssi_y)])

    #fillna returns new Series, so the caller's df keeps its NaNs and can be reused by other metrics
    filled_x = rssi_x.fillna(penalty).to_numpy(dtype=float)
    filled_y = rssi_y.fillna(penalty).to_numpy(dtype=float)

    gaps = filled_x - filled_y
    euclidean_distance = float(np.sqrt(np.dot(gaps, gaps)))/n_rows

    return euclidean_distance

In [15]:

#8
def euc_distance_penalty_min(table_join_cos):
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans where N/A values are filled with min RSSI

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Only the RSSI_x and RSSI_y columns are read; the df is not modified.

            Returns:
                    euclidean_distance (float): Euclidean Distance divided by the number of rows (0.0 for a table with no rows)

        NOTES: This performs a Euclidean distance calculation, while also taking into account an added 'penalty' (i.e. a farther distance overall between the users) for each unmatched network - i.e. user 1 sees a certain BSS, but user 2 does not.
    '''
    n_rows = len(table_join_cos)

    #an empty table has no distance to normalize; avoids dividing by zero
    if n_rows == 0:
        return 0.0

    rssi_x = table_join_cos['RSSI_x'].to_numpy(dtype=float)
    rssi_y = table_join_cos['RSSI_y'].to_numpy(dtype=float)

    #Finds the global minimum RSSI value read between the two phones - this represents the lowest strength amongst the scans.
    #nanmin skips the NaNs of unmatched networks; the built-in min() gives an order-dependent answer (possibly NaN) when NaNs are present
    penalty = np.nanmin(np.concatenate([rssi_x, rssi_y]))

    #For both users, any NaN value - meaning the network was unmatched - is replaced by the determined minimum (on local arrays only)
    filled_x = np.where(np.isnan(rssi_x), penalty, rssi_x)
    filled_y = np.where(np.isnan(rssi_y), penalty, rssi_y)

    gaps = filled_x - filled_y
    euclidean_distance = float(np.sqrt(np.dot(gaps, gaps)))/n_rows

    return euclidean_distance

In [16]:
#10
def euc_distance_std_dev_thresh(table_join_cos, std_dev_max):
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans where N/A values are filled with avg RSSI and rows are filtered out based on a maximum std_dev value

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            The RSSI_x, RSSI_y, Std_dev_x and Std_dev_y columns are read; the df is not modified.
                    std_dev_max (float): A maximum acceptable standard deviation value for a BSS within the comparison interval
            Returns:
                    euclidean_distance (float): Euclidean Distance over the accepted rows divided by how many rows were accepted;
                            0.0 when no row is accepted

        NOTES: The penalty (fill value) is computed from all rows, before the std_dev filter is applied.
        NOTE(review): a result of 0.0 for "no row accepted" cannot be told apart from a perfect match - confirm this is what callers want.
    '''
    if len(table_join_cos) == 0:
        return 0.0

    rssi_x = table_join_cos['RSSI_x']
    rssi_y = table_join_cos['RSSI_y']

    #penalty = average of the two per-device mean RSSI values (NaNs ignored)
    penalty = np.mean([np.nanmean(rssi_x), np.nanmean(rssi_y)])

    #a row is accepted when, on each device, the std_dev is below the maximum or missing
    std_x = table_join_cos['Std_dev_x']
    std_y = table_join_cos['Std_dev_y']
    accepted = ((std_x < std_dev_max) | std_x.isna()) & ((std_y < std_dev_max) | std_y.isna())
    accepted = accepted.to_numpy(dtype=bool)

    counter = int(accepted.sum())
    if counter == 0:
        return 0.0

    #fillna returns new Series, so the caller's df keeps its NaNs
    filled_x = rssi_x.fillna(penalty).to_numpy(dtype=float)[accepted]
    filled_y = rssi_y.fillna(penalty).to_numpy(dtype=float)[accepted]

    gaps = filled_x - filled_y
    euclidean_distance = float(np.sqrt(np.dot(gaps, gaps)))/counter

    return euclidean_distance

In [17]:
#11
def euc_distance_topn(table_join_cos, num_rows):
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans where N/A values are filled with avg RSSI and only top n strongest BSSs are considered

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Only the RSSI_x and RSSI_y columns are read; the df is not modified.
                    num_rows (int): number of (strongest) BSSs to consider from each device
            Returns:
                    euclidean_distance (float): Euclidean Distance over the kept rows divided by how many rows were kept;
                            0.0 when no row is kept

        NOTES: A row is kept when it is among the first num_rows rows after sorting by RSSI_x descending, or after sorting
        by RSSI_y descending (NaNs sort last). The penalty (fill value) is computed from the kept rows only.
    '''
    #private copy of the two columns we need, so sorting/marking never touches the caller's df
    ranked = table_join_cos[['RSSI_x', 'RSSI_y']].copy()

    #fixes case where num_rows requested is larger than actual num rows (and guards against negatives)
    num_rows = max(0, min(num_rows, len(ranked)))

    ranked['Top'] = False
    top_col = ranked.columns.get_loc('Top')

    #sort by descending strength for each device in turn and mark the first num_rows rows to keep
    for rssi_col in ('RSSI_x', 'RSSI_y'):
        ranked = ranked.sort_values(by=rssi_col, ascending=False).reset_index(drop=True)
        ranked.iloc[:num_rows, top_col] = True

    #filter out networks not in top n for either device
    kept = ranked.loc[ranked['Top']]
    n_kept = len(kept)

    #nothing kept -> nothing to compare; avoids dividing by zero
    if n_kept == 0:
        return 0.0

    #penalty = average of the two per-device mean RSSI values of the kept rows (NaNs ignored)
    penalty = np.mean([np.nanmean(kept['RSSI_x']), np.nanmean(kept['RSSI_y'])])

    #any NaN value - meaning the network was unmatched - is replaced by the penalty
    filled_x = kept['RSSI_x'].fillna(penalty).to_numpy(dtype=float)
    filled_y = kept['RSSI_y'].fillna(penalty).to_numpy(dtype=float)

    #Calculate Euclidean Distance
    gaps = filled_x - filled_y
    euclidean_distance = float(np.sqrt(np.dot(gaps, gaps)))/n_kept

    return euclidean_distance

In [18]:
#12
def euc_distance_rank(table_join_cos):
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans where RSSI values are re-assigned based on their relative strength rating

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Only the RSSI_x and RSSI_y columns are read; the df is not modified.
            Returns:
                    euclidean_distance (float): Euclidean Distance between the two rank vectors divided by the number of rows
                            (0.0 for a table with no rows)

        NOTES: NaNs (unmatched networks) are first filled with the average of the two per-device mean RSSI values.
        Each device's readings are then replaced by rank * (1/n), where rank 1 is that device's strongest reading,
        and the distance is taken between those two rank columns. All rows take part in the ranking.
        Equal readings are ranked in the order the rows appear in the table.
    '''
    n_rows = len(table_join_cos)

    #no rows -> no step value to compute; avoids dividing by zero
    if n_rows == 0:
        return 0.0

    rssi_x = table_join_cos['RSSI_x']
    rssi_y = table_join_cos['RSSI_y']

    #penalty = average of the two per-device mean RSSI values (NaNs ignored)
    penalty = np.mean([np.nanmean(rssi_x), np.nanmean(rssi_y)])

    #calculate step value
    step = 1/n_rows

    #rank each device's (filled) readings by descending strength. The rank is computed per row from that
    #row's own value, so each row gets the rank of its own reading; na_option='bottom' only matters when
    #the penalty itself is NaN and keeps such rows ranked last
    rank_x = rssi_x.fillna(penalty).rank(method='first', ascending=False, na_option='bottom')
    rank_y = rssi_y.fillna(penalty).rank(method='first', ascending=False, na_option='bottom')

    revised_x = rank_x.to_numpy(dtype=float)*step
    revised_y = rank_y.to_numpy(dtype=float)*step

    #calculate Euclidean Distance between the revised (rank-based) values
    gaps = revised_x - revised_y
    euclidean_distance = float(np.sqrt(np.dot(gaps, gaps)))/n_rows

    return euclidean_distance

In [19]:
#13 BSS frequency #CONT HERE

def euc_distance_BSS_thresh(table_join_cos, counts_threshold):
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans where BSSs that show up less than n times are filtered out

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            The RSSI_x, RSSI_y, BSS_Count_x and BSS_Count_y columns are read; the df is not modified.
                    counts_threshold (int): minimum number of times a BSS must show up in the comparison interval to be included in the Euclidean Distance algorithm
            Returns:
                    euclidean_distance (float): Euclidean Distance over the kept rows divided by how many rows were kept;
                            0.0 when no row is kept

        NOTES: The penalty (fill value) is computed from all rows, before the count filter is applied.
        A row is dropped when BSS_Count_x or BSS_Count_y is below the threshold; a missing (NaN) count never drops a row.
        NOTE(review): a result of 0.0 for "no row kept" cannot be told apart from a perfect match - confirm this is what callers want.
    '''
    if len(table_join_cos) == 0:
        return 0.0

    rssi_x = table_join_cos['RSSI_x']
    rssi_y = table_join_cos['RSSI_y']

    #penalty = average of the two per-device mean RSSI values (NaNs ignored)
    penalty = np.mean([np.nanmean(rssi_x), np.nanmean(rssi_y)])

    #filter out rows where network is seen less than specified threshold (comparisons against NaN are False)
    too_rare = (table_join_cos['BSS_Count_x'] < counts_threshold) | (table_join_cos['BSS_Count_y'] < counts_threshold)
    keep = ~too_rare.to_numpy(dtype=bool)

    counter = int(keep.sum())
    if counter == 0:
        return 0.0

    #fillna returns new Series and the filter is a mask, so the caller's df keeps all of its rows and NaNs
    filled_x = rssi_x.fillna(penalty).to_numpy(dtype=float)[keep]
    filled_y = rssi_y.fillna(penalty).to_numpy(dtype=float)[keep]

    #calculate euclidean distance
    gaps = filled_x - filled_y
    euclidean_distance = float(np.sqrt(np.dot(gaps, gaps)))/counter

    return euclidean_distance

In [20]:
#15
def cos_sim(table_join_cos): #TODO Out of bounds error
    '''
    Returns the cosine similarity between two Wi-Fi Scans

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared
            Returns:
                    cosine (float): cosine similarity of the RSSI_x and RSSI_y vectors; 0 when either vector has zero magnitude
                            (which includes the case where no row is left)

        NOTES: Rows holding a NaN in any column (not only the RSSI columns) are dropped before the calculation.
    '''
    complete_rows = table_join_cos.dropna()
    readings_x = list(complete_rows['RSSI_x'])
    readings_y = list(complete_rows['RSSI_y'])

    #accumulate the dot product and the two squared magnitudes in one pass
    dot_product = 0
    sq_norm_x = 0
    sq_norm_y = 0
    for value_x, value_y in zip(readings_x, readings_y):
        dot_product += value_x*value_y
        sq_norm_x += (value_x)**2
        sq_norm_y += (value_y)**2

    denominator = (sq_norm_x**0.5)*(sq_norm_y**0.5)

    #a zero-length vector has no direction; report 0 instead of dividing by zero
    if denominator == 0:
        return 0

    return dot_product/denominator

In [21]:
def ed_1(table_join_cos, abs_rssi_max):
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans. Keep networks where both RSSIs are within threshold OR at least one RSSI is NaN. Fill NaNs with threshold

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Only the RSSI_x and RSSI_y columns are read; the df is not modified.
                    abs_rssi_max (int): maximum absolute value of RSSI value to be considered, ie to consider no values less than -70, input 70
            Returns:
                    euclidean_distance (float): Euclidean Distance over the kept rows divided by how many rows were kept;
                            0.0 when no row is kept

        NOTE(review): a row with one NaN is kept even when the other device's RSSI is outside the threshold - confirm
        whether such rows should be dropped instead.
        NOTE(review): NaNs are filled with +abs_rssi_max (a positive number) although the filter treats the threshold
        as -abs_rssi_max - confirm the intended sign of the fill value.
    '''
    rssi_x = table_join_cos['RSSI_x']
    rssi_y = table_join_cos['RSSI_y']

    #filter out rows based on RSSI threshold; rows with a missing reading are always kept
    both_within = (rssi_x.abs() < abs_rssi_max) & (rssi_y.abs() < abs_rssi_max)
    keep = (both_within | rssi_x.isna() | rssi_y.isna()).to_numpy(dtype=bool)

    counter = int(keep.sum())
    if counter == 0:
        return 0.0

    #fill NaNs with specified threshold. fillna's return value is used directly: an in-place fillna on a
    #column selected from the frame is not guaranteed to write back to the frame
    filled_x = rssi_x.fillna(abs_rssi_max).to_numpy(dtype=float)[keep]
    filled_y = rssi_y.fillna(abs_rssi_max).to_numpy(dtype=float)[keep]

    gaps = filled_x - filled_y
    euclidean_distance = float(np.sqrt(np.dot(gaps, gaps)))/counter

    return euclidean_distance

In [1]:
def ed_2(table_join_cos, abs_rssi_max): #FINSHED
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans. Keep networks where both RSSIs are within threshold OR at least one RSSI is NaN. Fill NaNs with average

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Only the RSSI_x and RSSI_y columns are read; the df is not modified.
                    abs_rssi_max (int): maximum absolute value of RSSI value to be considered, ie to consider no values less than -70, input 70
            Returns:
                    euclidean_distance (float): Euclidean Distance over the kept rows divided by how many rows were kept;
                            0 when, after filtering, either device has no reading left

        NOTES: The penalty (fill value) is the average of the two per-device mean RSSI values of the kept rows.
        NOTE(review): a row with one NaN is kept even when the other device's RSSI is outside the threshold - confirm
        whether such rows should be dropped instead.
    '''
    rssi_x = table_join_cos['RSSI_x']
    rssi_y = table_join_cos['RSSI_y']

    #filter out rows based on RSSI threshold; rows with a missing reading are always kept
    both_within = (rssi_x > -1*abs_rssi_max) & (rssi_y > -1*abs_rssi_max)
    keep = (both_within | rssi_x.isna() | rssi_y.isna()).to_numpy(dtype=bool)

    kept_x = rssi_x.to_numpy(dtype=float)[keep]
    kept_y = rssi_y.to_numpy(dtype=float)[keep]

    #no usable reading on one of the devices (this covers "no row kept" as well): no penalty can be computed
    if np.isnan(kept_x).all() or np.isnan(kept_y).all():
        return 0

    penalty = np.mean([np.nanmean(kept_x), np.nanmean(kept_y)])

    #fill on local arrays: an in-place fillna on a column selected from the frame is not guaranteed
    #to write back to the frame
    filled_x = np.where(np.isnan(kept_x), penalty, kept_x)
    filled_y = np.where(np.isnan(kept_y), penalty, kept_y)

    gaps = filled_x - filled_y
    total_distance = float(np.sqrt(np.dot(gaps, gaps)))/len(filled_x)

    return total_distance

In [42]:
def ed_3(table_join_cos, abs_rssi_max): #FINISHED
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans. Fill NaNs with avg --> keep networks when at least one phone's RSSI is within threshold.

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Only the RSSI_x and RSSI_y columns are read; the df is not modified.
                    abs_rssi_max (int): maximum absolute value of RSSI value to be considered, ie to consider no values less than -70, input 70
            Returns:
                    euclidean_distance (float): Euclidean Distance over the kept rows divided by how many rows were kept;
                            0.0 when no row is kept

        NOTES: The penalty (fill value) is computed from all rows, and the threshold is checked after filling,
        so a filled-in value can itself bring a row within the threshold.
    '''
    if len(table_join_cos) == 0:
        return 0.0

    rssi_x = table_join_cos['RSSI_x']
    rssi_y = table_join_cos['RSSI_y']

    #Fill NaNs w/ average (on new arrays, so the caller's df keeps its NaNs)
    penalty = np.mean([np.nanmean(rssi_x), np.nanmean(rssi_y)])
    filled_x = rssi_x.fillna(penalty).to_numpy(dtype=float)
    filled_y = rssi_y.fillna(penalty).to_numpy(dtype=float)

    #keep a network when at least one of its two (filled) readings is within the threshold
    keep = (np.abs(filled_x) < abs_rssi_max) | (np.abs(filled_y) < abs_rssi_max)

    counter = int(keep.sum())
    if counter == 0:
        return 0.0

    gaps = filled_x[keep] - filled_y[keep]
    euclidean_distance = float(np.sqrt(np.dot(gaps, gaps)))/counter

    return euclidean_distance

In [43]:
def ed_4(table_join_cos, abs_rssi_max): #FINISHED
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans. Fill NaNs with avg --> keep networks where Both RSSIs are within threshold

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Must have the columns RSSI_x and RSSI_y. It is only read; the caller's df is not modified.
                    abs_rssi_max (int): maximum absolute value of RSSI value to be considered, ie to consider no values less than -70, input 70
                            (the comparison is strict: abs(RSSI) < abs_rssi_max)
            Returns:
                    euclidean_distance (float): sqrt(sum of squared RSSI differences over the kept networks) divided by the number of kept networks.
                            0.0 when no network is kept.
    '''
    # Float copies: the NaN fill must not leak into the caller's df, and positional
    # access keeps the function working when the df index is not 0..n-1.
    rssi_x = table_join_cos['RSSI_x'].to_numpy(dtype=float)
    rssi_y = table_join_cos['RSSI_y'].to_numpy(dtype=float)

    #Fill NaNs w/ average (mean of the two per-scan means, each computed ignoring NaNs).
    #If one scan has no valid RSSI at all the penalty is NaN; such rows then fail the
    #threshold test below and are dropped.
    penalty = np.mean([np.nanmean(rssi_x), np.nanmean(rssi_y)])
    filled_x = np.where(np.isnan(rssi_x), penalty, rssi_x)
    filled_y = np.where(np.isnan(rssi_y), penalty, rssi_y)

    #keep a network only when both of its RSSIs are within threshold
    keep = (np.abs(filled_x) < abs_rssi_max) & (np.abs(filled_y) < abs_rssi_max)
    counter = int(keep.sum())

    if counter == 0:
        # Nothing to compare. 0.0 is kept for backward compatibility, although it is
        # indistinguishable from a perfect match -- TODO: decide on a dedicated sentinel.
        return 0.0

    total_sq_distance = np.sum((filled_x[keep] - filled_y[keep]) ** 2)
    euclidean_distance = float(total_sq_distance ** 0.5) / counter

    return euclidean_distance

In [1]:
def ed_5(table_join_cos, abs_rssi_max): #FINISHED
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans. Ignore any unmatched networks --> keep networks where both RSSIs are within threshold.

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Must have the columns RSSI_x and RSSI_y.
                    abs_rssi_max (int): maximum absolute value of RSSI value to be considered, ie to consider no values less than -70, input 70
                            (the comparison is strict: abs(RSSI) < abs_rssi_max)
            Returns:
                    euclidean_distance (float): sqrt(sum of squared RSSI differences over the kept networks) divided by the number of kept networks.
                            0.0 when no network is kept.
    '''
    # Positional float arrays, so the result does not depend on the df's index labels.
    rssi_x = table_join_cos['RSSI_x'].to_numpy(dtype=float)
    rssi_y = table_join_cos['RSSI_y'].to_numpy(dtype=float)

    #not including any entries w nans (network seen by only one phone)
    seen_by_both = (~np.isnan(rssi_x)) & (~np.isnan(rssi_y))
    #not including rows where either RSSI value is outside the threshold
    both_in_range = (np.abs(rssi_x) < abs_rssi_max) & (np.abs(rssi_y) < abs_rssi_max)

    keep = seen_by_both & both_in_range
    counter = int(keep.sum())

    if counter == 0:
        # Nothing to compare. 0.0 is kept for backward compatibility, although it is
        # indistinguishable from a perfect match -- TODO: decide on a dedicated sentinel.
        return 0.0

    total_sq_distance = np.sum((rssi_x[keep] - rssi_y[keep]) ** 2)
    euclidean_distance = float(total_sq_distance ** 0.5) / counter

    return euclidean_distance

In [2]:
def ed_6(table_join_cos, abs_rssi_max): #FINISHED
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans. Ignore any unmatched networks --> keep networks where at least one RSSI is within threshold.

            Parameters:
                    table_join_cos (Pandas dataframe): A df created using the join_master function that contains two WiFi scans to be compared.
                            Must have the columns RSSI_x and RSSI_y.
                    abs_rssi_max (int): maximum absolute value of RSSI value to be considered, ie to consider no values less than -70, input 70
                            (the comparison is strict: abs(RSSI) < abs_rssi_max)
            Returns:
                    euclidean_distance (float): sqrt(sum of squared RSSI differences over the kept networks) divided by the number of kept networks.
                            0.0 when no network is kept.
    '''
    # Positional float arrays, so the result does not depend on the df's index labels.
    rssi_x = table_join_cos['RSSI_x'].to_numpy(dtype=float)
    rssi_y = table_join_cos['RSSI_y'].to_numpy(dtype=float)

    #not including any entries w nans (network seen by only one phone)
    seen_by_both = (~np.isnan(rssi_x)) & (~np.isnan(rssi_y))
    #not including rows where BOTH RSSI values are outside the threshold
    #(one in-range reading is enough to keep the network)
    any_in_range = (np.abs(rssi_x) < abs_rssi_max) | (np.abs(rssi_y) < abs_rssi_max)

    keep = seen_by_both & any_in_range
    counter = int(keep.sum())

    if counter == 0:
        # Nothing to compare. 0.0 is kept for backward compatibility, although it is
        # indistinguishable from a perfect match -- TODO: decide on a dedicated sentinel.
        return 0.0

    total_sq_distance = np.sum((rssi_x[keep] - rssi_y[keep]) ** 2)
    euclidean_distance = float(total_sq_distance ** 0.5) / counter

    return euclidean_distance

In [None]:
#MAKE THIS HYBRID
def ed_hybrid(table_join_cos, abs_rssi_threshold=85, std_dev_thresh=6): #FINISHED
    '''
    Returns the normalized Euclidean Distance between two Wi-Fi Scans. Fill NaNs with avg --> keep networks where Both RSSIs are within threshold
    AND both standard deviations are below the std-dev threshold (a NaN standard deviation passes the std-dev test).

            Parameters:
                    table_join_cos (Pandas dataframe): df holding the two WiFi scans to be compared.
                            Must have the columns RSSI_x, RSSI_y, Std_dev_x and Std_dev_y. It is only read; the caller's df is not modified.
                    abs_rssi_threshold (int): maximum absolute value of RSSI to be considered (strict: abs(RSSI) < abs_rssi_threshold). Default 85.
                    std_dev_thresh (float): a network is kept only when each standard deviation is < std_dev_thresh or NaN. Default 6.
            Returns:
                    total_distance (float): sqrt(sum of squared RSSI differences over the kept networks) divided by the number of kept networks.
                            0.0 when no network is kept.
    '''
    # Float copies: the NaN fill must not leak into the caller's df, and positional
    # access keeps the function working when the df index is not 0..n-1.
    rssi_x = table_join_cos['RSSI_x'].to_numpy(dtype=float)
    rssi_y = table_join_cos['RSSI_y'].to_numpy(dtype=float)
    std_x = table_join_cos['Std_dev_x'].to_numpy(dtype=float)
    std_y = table_join_cos['Std_dev_y'].to_numpy(dtype=float)

    #Fill NaNs w/ average (mean of the two per-scan means, each computed ignoring NaNs).
    #If one scan has no valid RSSI at all the penalty is NaN; such rows then fail the
    #RSSI threshold test below and are dropped.
    penalty = np.mean([np.nanmean(rssi_x), np.nanmean(rssi_y)])
    filled_x = np.where(np.isnan(rssi_x), penalty, rssi_x)
    filled_y = np.where(np.isnan(rssi_y), penalty, rssi_y)

    #std-dev filter: a missing std dev is tolerated rather than treated as noisy
    stable_x = (std_x < std_dev_thresh) | np.isnan(std_x)
    stable_y = (std_y < std_dev_thresh) | np.isnan(std_y)
    #RSSI filter: both (filled) readings must be within threshold
    in_range = (np.abs(filled_x) < abs_rssi_threshold) & (np.abs(filled_y) < abs_rssi_threshold)

    keep = stable_x & stable_y & in_range
    counter = int(keep.sum())

    if counter == 0:
        # Nothing to compare. 0.0 is kept for backward compatibility, although it is
        # indistinguishable from a perfect match -- TODO: decide on a dedicated sentinel.
        return 0.0

    total_sq_distance = np.sum((filled_x[keep] - filled_y[keep]) ** 2)
    total_distance = float(total_sq_distance ** 0.5) / counter

    return total_distance