# Library of functions

### Import packages

In [17]:
import math
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
import numpy as np
import time
import statistics as st
from scipy.spatial import distance as dt
import distance
from operator import truediv, add, mul
from sklearn.preprocessing import MinMaxScaler
from varname import nameof

### Data loading

In [14]:
def _read_xyz(path):
    """Read a coordinates CSV and return its rows with columns 3 and 4 removed."""
    frame = pd.read_csv(path, header=None)
    # for now, drop columns of 'floor' and 'building', keep x,y,z
    frame = frame.drop(frame.columns[[3, 4]], axis=1)
    return frame.values.tolist()


def load_Data(name):
    """
    Loads data from csv datasets

    Reads 'data/<name>_trnrss.csv', 'data/<name>_trncrd.csv',
    'data/<name>_tstrss.csv' and 'data/<name>_tstcrd.csv' (no header row).

    Args: filename (dataset prefix)

    Returns: tuple of 4 lists (train rss, train coordinates, test rss, test coordinates),
             or None (after printing a message) when a file cannot be read or parsed
    """
    prefix = 'data/' + name
    try:
        # TRAIN data: fingerprints, then coordinates
        trnrss = pd.read_csv(prefix + '_trnrss.csv', header=None).values.tolist()
        trncrd = _read_xyz(prefix + '_trncrd.csv')

        # TEST data: fingerprints, then coordinates
        tstrss = pd.read_csv(prefix + '_tstrss.csv', header=None).values.tolist()
        tstcrd = _read_xyz(prefix + '_tstcrd.csv')
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # Only file/parse problems are reported here; programming errors and
        # KeyboardInterrupt are no longer swallowed by a bare except.
        print("Could not open file")
        return None

    return (trnrss, trncrd, tstrss, tstcrd)
    
def load_tstcrd(name):
    """
    Loads testcrd from csv datasets

    Reads 'data/<name>_tstcrd.csv' (no header row) and removes columns 3 and 4.

    Args: filename (dataset prefix)

    Returns: list with testcrd dataset, or None (after printing a message)
             when the file cannot be read or parsed
    """
    try:
        # load coordinates
        df_tstcrd = pd.read_csv('data/' + name + '_tstcrd.csv', header=None)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # Only file/parse problems are reported here; programming errors and
        # KeyboardInterrupt are no longer swallowed by a bare except.
        print("Could not open file")
        return None

    # for now, drop columns of 'floor' and 'building', keep x,y,z
    df_tstcrd = df_tstcrd.drop(df_tstcrd.columns[[3, 4]], axis=1)
    return df_tstcrd.values.tolist()

### Get neighbors

In [3]:
# All distance metric names accepted by get_neighbors through its `name`
# argument. The names are listed family by family, in the same order as the
# branches inside get_neighbors.
distance_metrics = ["euclidean", "minkowskiL1", "minkowskiL2", "minkowskiL3", "minkowskiL4", "minkowskiL5", "cityblock",
                    "chebyshev", "sorensen", "gower", "soergel", "kulczynski_d", "lorentzian", "canberra", "intersection",
                    "wavehedges", "czekanowski_s", "czekanowski_d", "motyka_s", "motyka_d", "kulczynski_s", "ruzicka",
                    "tanimoto", "inner", "harmonic", "cosine", "kumar", "jaccard_s", "jaccard_d", "dice_s", "dice_d",
                    "fidelity", "bhattacharrya", "hellinger", "hellinger2", "matusita", "matusita2", "squared_chord",
                    "sqeuclidean", "pearson", "neyman", "squared", "prob_sym", "divergence", "clark", "additive_sym",
                    "kullback_PQ", "jeffreys", "k_divergence", "topsoe", "jensen_shannon", "jensen_diff", "taneja",
                    "kumar_johnson", "avgL", "vicis_wave_hedges", "vicis_symm1", "vicis_symm2", "vicis_symm3", "min_symm",
                    "max_symm"]

In [86]:
# Metrics backed by math / scipy / numpy / the `distance` package. Each entry is
# a lambda so the backing module is only touched when that metric is requested.
_LIBRARY_METRICS = {
    # Lp Minkowski family
    "euclidean": lambda p, q: math.dist(p, q),
    "minkowskiL1": lambda p, q: dt.minkowski(p, q, 1),
    "minkowskiL2": lambda p, q: dt.minkowski(p, q, 2),
    "minkowskiL3": lambda p, q: dt.minkowski(p, q, 3),
    "minkowskiL4": lambda p, q: dt.minkowski(p, q, 4),
    "minkowskiL5": lambda p, q: dt.minkowski(p, q, 5),
    "cityblock": lambda p, q: dt.cityblock(p, q),
    "chebyshev": lambda p, q: dt.chebyshev(p, q),
    # L1 family
    "sorensen": lambda p, q: distance.sorensen(p, q),
    "canberra": lambda p, q: dt.canberra(p, q),
    # Inner product family
    "inner": lambda p, q: np.inner(p, q),
    "cosine": lambda p, q: dt.cosine(p, q),
    # Squared L2 family
    "sqeuclidean": lambda p, q: dt.sqeuclidean(p, q),
}

# Metrics implemented as functions of the same name in this file. They are
# looked up by name at call time, so this table does not depend on the order in
# which the functions are defined.
_LOCAL_METRIC_NAMES = frozenset([
    # L1 family
    "gower", "soergel", "kulczynski_d", "lorentzian",
    # Intersection family
    "intersection", "wavehedges", "czekanowski_s", "czekanowski_d", "motyka_s",
    "motyka_d", "kulczynski_s", "ruzicka", "tanimoto",
    # Inner product family
    "harmonic", "kumar", "jaccard_s", "jaccard_d", "dice_s", "dice_d",
    # Fidelity family or Squared-chord family
    "fidelity", "bhattacharrya", "hellinger", "hellinger2", "matusita",
    "matusita2", "squared_chord",
    # Squared L2 family
    "pearson", "neyman", "squared", "prob_sym", "divergence", "clark",
    "additive_sym",
    # Shannon's entropy family
    "kullback_PQ", "jeffreys", "k_divergence", "topsoe", "jensen_shannon",
    "jensen_diff",
    # Combinations
    "taneja", "kumar_johnson", "avgL",
    # Vicissitude
    "vicis_wave_hedges", "vicis_symm1", "vicis_symm2", "vicis_symm3",
    "min_symm", "max_symm",
])


def _resolve_metric(name):
    """Return the two-argument callable that implements the metric called `name`."""
    if name in _LIBRARY_METRICS:
        return _LIBRARY_METRICS[name]
    if name in _LOCAL_METRIC_NAMES:
        return globals()[name]
    raise ValueError("Unknown distance metric: {!r}".format(name))


def _preprocess_row(row, isAbs, incr):
    """Apply the optional +100 shift and then the optional absolute value to one row."""
    if incr:
        row = [n + 100 for n in row]
    if isAbs:
        row = [abs(n) for n in row]
    return row


# Locate the most similar neighbors and return list of indexes
def get_neighbors(train, test_row, k, name, isAbs, incr):
    """
    Locate the most similar neighbors

    The distance between test_row and every row of train is computed with the
    metric called `name`; rows are ranked by ascending distance (ties keep
    their original order because the sort is stable).

    When both incr and isAbs are set, every row (test and train alike) is
    first incremented by 100 and then made absolute, so identical raw rows
    always end up identical after preprocessing. Previously train rows used
    the opposite order from the test row.

    Args: train rss dataset, 1 row of test rss dataset, k, distance metric, abs(bool), incr(bool)

    Returns: list of indexes of the k closest train rows

    Raises: ValueError if `name` is not a known metric (previously every row
            silently received distance 0); IndexError if k exceeds the number
            of train rows
    """
    metric = _resolve_metric(name)
    query = _preprocess_row(test_row, isAbs, incr)

    distances = [
        (idx, metric(query, _preprocess_row(train_row, isAbs, incr)))
        for idx, train_row in enumerate(train)
    ]
    distances.sort(key=lambda tup: tup[1])
    return [distances[i][0] for i in range(k)]

### Knn, predict positions and calculate error

In [87]:
def knn(k, tstrss, trnrss, distance="euclidean", norm=False, isAbs=False, incr=False):
    """
    Find, for every test fingerprint, the k nearest train fingerprints

    With norm=True a MinMaxScaler is fitted on the train set and applied to
    both sets; each scaled test value is then clamped into [0, 1].

    Args: k, test rss dataset, train rss dataset, distance metric, norm(bool), abs(bool), incr(bool)

    Returns: list (one entry per test row) of lists of k train-row indexes
    """
    if norm:
        # MinMax normalization, fitted on the train data only
        scaler = MinMaxScaler()
        trnrss = scaler.fit_transform(trnrss)
        tstrss = scaler.transform(tstrss)

    def clamp_unit(value):
        # keep scaled test values inside [0, 1] to avoid sqrt errors later on
        if value < 0:
            return 0
        if value > 1:
            return 1
        return value

    neighbor_ids = []
    for fingerprint in tstrss:
        if norm:
            fingerprint = [clamp_unit(value) for value in fingerprint]
        neighbor_ids.append(get_neighbors(trnrss, fingerprint, k, distance, isAbs, incr))
    return neighbor_ids

def predict_position(all_neighbors, trncrd, k):
    """
    Predicts one position per test row from its neighbours

    For each list of neighbour indexes, the matching train coordinates are
    summed per axis and divided by k; the third coordinate is rounded to 1 decimal.

    Args: neighbours list, train coordinates dataset, k

    Returns: list of [x, y, z] predicted positions, one per entry of all_neighbors
    """
    predicted_pos = []

    for neighbor_ids in all_neighbors:
        totals = [0, 0, 0]
        for idx in neighbor_ids:
            coords = trncrd[idx]
            for axis in range(3):
                totals[axis] += coords[axis]
        # mean value (divides by k, not by the number of indexes supplied)
        predicted_pos.append([totals[0] / k, totals[1] / k, round(totals[2] / k, 1)])

    return predicted_pos

def calculate_error(predicted_pos, tstcrd):
    """
    Computes the euclidean distance between each prediction and the actual position

    Entry i of predicted_pos is compared with entry i of tstcrd.

    Args: list of predicted positions, test coordinates dataset

    Returns: list of error distances, one per predicted position
    """
    error_distances = []
    for idx, prediction in enumerate(predicted_pos):
        error_distances.append(math.dist(prediction, tstcrd[idx]))
    return error_distances

In [88]:
def knn_and_prediction(k, filename, distance="euclidean", norm=False, isAbs=False, incr=False):
    """
    Loads the dataset, runs knn on it and predicts a position for every test row

    Args: k, filename, distance metric, norm(bool), abs(bool), incr(bool)

    Returns: list of predicted positions
    """
    # the test coordinates are loaded but not needed for prediction
    train_rss, train_crd, test_rss, _test_crd = load_Data(filename)
    nearest = knn(k, test_rss, train_rss, distance, norm, isAbs, incr)
    predictions = predict_position(nearest, train_crd, k)
    return predictions

def positioning_error(k, filename, distance="euclidean", norm=False, isAbs=False, incr=False):
    """
    Loads the dataset, runs knn, predicts positions and measures how far each
    prediction is from the real test position

    Args: k, filename, distance metric, norm(bool), abs(bool), incr(bool)

    Returns: list of error distances
    """
    train_rss, train_crd, test_rss, test_crd = load_Data(filename)
    nearest = knn(k, test_rss, train_rss, distance, norm, isAbs, incr)
    return calculate_error(predict_position(nearest, train_crd, k), test_crd)

In [152]:
def avg_error(err_list):
    """
    Computes the mean of a list of positioning errors, rounded to 2 decimals

    Args: list of error distances

    Returns: average error
    """
    mean_error = st.fmean(err_list)
    return round(mean_error, 2)

def print_postitioning_errors(k, filename, metrics_list, norm=False, abs_value=False, increment=False):
    """
    Prints one "<metric>: <average error>" line per distance metric

    Args: k, filename, distance metric list, norm(bool), abs(bool), incr(bool)

    Returns: nothing
    """
    for metric_name in metrics_list:
        errors = positioning_error(k, filename, metric_name, norm, abs_value, increment)
        mean_error = avg_error(errors)
        print(f"{metric_name}: {mean_error}")

### Ensemble of different distance metrics

In [12]:
def predict_positions_dst_metrics(k, filename, metrics_list, norm=False, abs_value=False, increment=False):
    """
    Runs knn_and_prediction once per distance metric and collects the results

    Args: k, filename, distance metric list, norm(bool), abs(bool), incr(bool)

    Returns: list with one list of predicted positions per metric, in metrics_list order
    """
    return [
        knn_and_prediction(k, filename, metric_name, norm, abs_value, increment)
        for metric_name in metrics_list
    ]

def centroids_list(pos_list):
    """
    Averages, point by point, the positions predicted by several distance metrics

    pos_list[m][j] is the [x, y, z] prediction of metric m for point j. The
    number of points is taken from the first metric. x and y means are rounded
    to 2 decimals, the z mean to 1 decimal.

    Args: list of predicted positions

    Returns: list of centroid positions
    """
    n_metrics = len(pos_list)
    n_points = len(pos_list[0])
    centroids = []

    for point_idx in range(n_points):
        sum_x = sum_y = sum_z = 0
        for per_metric in pos_list:
            position = per_metric[point_idx]
            sum_x += position[0]
            sum_y += position[1]
            sum_z += position[2]
        centroid = [
            round(sum_x / n_metrics, 2),
            round(sum_y / n_metrics, 2),
            round(sum_z / n_metrics, 1),
        ]
        centroids.append(centroid)
    return centroids

# return error list from ensembled distance metrics
def error_ensemble(k, filename, metrics_list, norm=False, abs_value=False, increment=False):
    """
    Predicts position for selected distance metrics, calculates centroids and calculates error distances
    between prediction and real position

    Args: k, filename, distance metric list, norm(bool), abs(bool), incr(bool)

    Returns: list of error distances
    """
    # knn and prediction for each distance metric; the preprocessing flags are
    # forwarded (they used to be hard-coded to False, so callers' settings were ignored)
    positions_lst = predict_positions_dst_metrics(
        k, filename, metrics_list, norm=norm, abs_value=abs_value, increment=increment
    )
    centroids = centroids_list(positions_lst)
    tstcrd = load_tstcrd(filename)
    return calculate_error(centroids, tstcrd)

# return error list from already-computed predictions of several metrics
def error_ensemble2(k, filename, *argv):
    """
    Calculates centroids from the given prediction lists and their error
    distances against the test coordinates of the dataset

    Args: k (not used in the computation), filename, predictions (one list per metric)

    Returns: list of error distances
    """
    centroids = centroids_list(list(argv))
    actual_positions = load_tstcrd(filename)
    return calculate_error(centroids, actual_positions)

### Distance metrics

In [126]:
# Small constant added to denominators and to sqrt/log arguments in the distance
# metrics of this file, to avoid division by 0 and sqrt/log of non-positive
# values when an input component is 0.
FACTOR = 1e-7

# distance metrics
def gower(P, Q):
    """Return the mean absolute component-wise difference between P and Q."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    total_abs_diff = sum(np.abs(p - q))
    return total_abs_diff / len(p)

def soergel(P, Q):
    """Return sum(|P-Q|) divided by the sum of the component-wise maxima."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    total_abs_diff = sum(np.abs(p - q))
    total_max = sum(np.maximum(p, q))
    return total_abs_diff / total_max

def kulczynski_d(P, Q):
    """Return sum(|P-Q|) divided by the sum of the component-wise minima."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    total_abs_diff = sum(np.abs(p - q))
    total_min = sum(np.minimum(p, q))
    return total_abs_diff / total_min

def lorentzian(P, Q):
    """Return the sum over components of log(1 + |P-Q|) (natural logarithm)."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    log_terms = np.log(1 + np.abs(p - q))
    return sum(log_terms)

def intersection(P, Q):
    """Return half of the sum of absolute component-wise differences."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    total_abs_diff = sum(np.abs(p - q))
    return 0.5 * total_abs_diff

def wavehedges(P, Q):
    """Return sum(|P-Q| / (max(P,Q) + FACTOR)) over components."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    abs_diff = np.abs(p - q)
    # FACTOR keeps the denominator away from 0 when both components are 0
    guarded_max = np.maximum(p, q) + FACTOR
    return sum(abs_diff / guarded_max)

def czekanowski_s(P, Q):
    """Return sum(|P-Q|) / sum(P+Q).

    NOTE(review): despite the `_s` suffix this ratio is 0 for identical
    inputs, i.e. it behaves like a distance; confirm the intended naming.
    """
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    total_abs_diff = sum(np.abs(p - q))
    total_sum = sum(p + q)
    return total_abs_diff / total_sum

def czekanowski_d(P, Q):
    """Return 1 - sum(|P-Q|) / sum(P+Q), i.e. one minus the czekanowski_s value."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    ratio = sum(np.abs(p - q)) / sum(p + q)
    return 1 - ratio

def motyka_s(P, Q):
    """Return the sum of component-wise maxima divided by sum(P+Q)."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    total_max = sum(np.maximum(p, q))
    total_sum = sum(p + q)
    return total_max / total_sum

def motyka_d(P, Q):
    """Return 1 - sum(max(P,Q)) / sum(P+Q), i.e. one minus the motyka_s value."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    ratio = sum(np.maximum(p, q)) / sum(p + q)
    return 1 - ratio

def kulczynski_s(P, Q):
    """Return the reciprocal of sum(min(P,Q)) / sum(|P-Q|), guarded against zeros.

    Result by case:
      * sum(|P-Q|) == 0 (identical inputs): 0. This case used to raise
        ZeroDivisionError whenever sum(min(P,Q)) was non-zero.
      * sum(min(P,Q)) == 0: the un-inverted ratio, i.e. 0.
      * otherwise: 1 / (sum(min(P,Q)) / sum(|P-Q|)).

    NOTE(review): after the inversion the value equals what kulczynski_d
    computes (up to rounding), despite the `_s` suffix; confirm this is intended.
    """
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    min_total = sum(np.minimum(p, q))
    abs_diff_total = sum(np.abs(p - q))

    if abs_diff_total == 0:
        # identical inputs: nothing to invert, report zero instead of dividing by 0
        return 0.0

    ratio = min_total / abs_diff_total
    if min_total != 0:
        return 1 / ratio
    return ratio

def ruzicka(P, Q):
    """Return 1 - sum(min(P,Q)) / sum(max(P,Q))."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    total_max = sum(np.maximum(p, q))
    total_min = sum(np.minimum(p, q))
    return 1 - total_min / total_max
        
def tanimoto(P, Q):
    """Return sum(max(P,Q) - min(P,Q)) / sum(max(P,Q))."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    upper = np.maximum(p, q)
    lower = np.minimum(p, q)
    total_max = sum(upper)
    return sum(upper - lower) / total_max

def harmonic(P, Q):
    """Return 1 - 2 * sum(P*Q / (P+Q+FACTOR))."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    # FACTOR keeps the denominator away from 0 when P+Q is 0
    terms = p * q / (p + q + FACTOR)
    return 1 - 2. * sum(terms)

def kumar(P, Q):
    """Return sum((P-Q)^2) / (sum(P^2) + sum(Q^2) - len(P)).

    NOTE(review): the denominator subtracts len(P) (the number of
    components); the Kumar-Hassebrook / Jaccard formulas in the literature
    subtract sum(P*Q) instead. Confirm this is intended.
    """
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    diff = p - q
    numerator = sum(diff * diff)
    denominator = sum(p * p) + sum(q * q) - len(p)
    return numerator / denominator

def jaccard_s(P, Q):
    """Return the same value as kumar: sum((P-Q)^2) / (sum(P^2) + sum(Q^2) - len(P))."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    diff = p - q
    squared_diff_total = sum(diff * diff)
    return squared_diff_total / (sum(p * p) + sum(q * q) - len(p))
        
def jaccard_d(P, Q):
    """Return 1 - sum((P-Q)^2) / (sum(P^2) + sum(Q^2) - len(P)), i.e. one minus jaccard_s."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    diff = p - q
    ratio = sum(diff * diff) / (sum(p * p) + sum(q * q) - len(p))
    return 1 - ratio

def dice_s(P, Q):
    """Return sum((P-Q)^2) / (sum(P^2) + sum(Q^2)).

    NOTE(review): despite the `_s` suffix this ratio is 0 for identical
    inputs, i.e. it behaves like a distance; confirm the intended naming.
    """
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    diff = p - q
    numerator = sum(diff * diff)
    denominator = sum(p * p) + sum(q * q)
    return numerator / denominator
    
def dice_d(P, Q):
    """Return 1 - sum((P-Q)^2) / (sum(P^2) + sum(Q^2)), i.e. one minus dice_s."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    diff = p - q
    ratio = sum(diff * diff) / (sum(p * p) + sum(q * q))
    return 1 - ratio
    
def fidelity(P, Q):
    """Return sum(sqrt(P*Q + FACTOR)) over components."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    roots = np.sqrt(p * q + FACTOR)
    return sum(roots)

def bhattacharrya(P, Q):
    """Return -log(sum(sqrt(P*Q + FACTOR))) (natural logarithm)."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    coefficient = sum(np.sqrt(p * q + FACTOR))
    return -np.log(coefficient)

def hellinger(P, Q):
    """Return 2 * sqrt(1 - sum(sqrt(P*Q + FACTOR)) + FACTOR)."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    coefficient = sum(np.sqrt(p * q + FACTOR))
    radicand = 1 - coefficient + FACTOR
    return 2 * np.sqrt(radicand)

def hellinger2(P, Q):
    """Return sqrt(2 * sum((sqrt(P+FACTOR) - sqrt(Q+FACTOR))^2) + FACTOR)."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    root_gap = np.sqrt(p + FACTOR) - np.sqrt(q + FACTOR)
    radicand = 2 * sum(root_gap ** 2) + FACTOR
    return np.sqrt(radicand)

def matusita(P, Q):
    """Return sqrt(2 - 2 * sum(sqrt(P*Q + FACTOR)) + FACTOR)."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    coefficient = sum(np.sqrt(p * q + FACTOR))
    radicand = 2 - 2 * coefficient + FACTOR
    return np.sqrt(radicand)

def matusita2(P, Q):
    """Return sqrt(sum((sqrt(P+FACTOR) - sqrt(Q+FACTOR))^2) + FACTOR)."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    root_gap = np.sqrt(p + FACTOR) - np.sqrt(q + FACTOR)
    radicand = sum(root_gap ** 2) + FACTOR
    return np.sqrt(radicand)

def squared_chord(P, Q):
    """Return sum((sqrt(P+FACTOR) - sqrt(Q+FACTOR))^2) over components."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    root_gap = np.sqrt(p + FACTOR) - np.sqrt(q + FACTOR)
    return sum(root_gap ** 2)

def pearson(P, Q):
    """Return sum((P-Q)^2 / (Q + FACTOR)) over components."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    squared_diff = (p - q) ** 2
    return sum(squared_diff / (q + FACTOR))

def neyman(P, Q):
    """Return sum((P-Q)^2 / (P + FACTOR)) over components."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    squared_diff = (p - q) ** 2
    return sum(squared_diff / (p + FACTOR))

def squared(P, Q):
    """Return sum((P-Q)^2 / (P + Q + FACTOR)) over components."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    squared_diff = (p - q) ** 2
    guarded_sum = p + q + FACTOR
    return sum(squared_diff / guarded_sum)
                
def prob_sym(P, Q):
    """Return 2 * sum((P-Q)^2 / (P + Q + FACTOR))."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    squared_diff = (p - q) ** 2
    guarded_sum = p + q + FACTOR
    return 2 * sum(squared_diff / guarded_sum)

def divergence(P, Q):
    """Return 2 * sum((P-Q)^2 / (P + Q + FACTOR)^2)."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    squared_diff = (p - q) ** 2
    squared_sum = (p + q + FACTOR) ** 2
    return 2 * sum(squared_diff / squared_sum)
    
def clark(P, Q):
    """Return sqrt(sum((|P-Q| / (P + Q + FACTOR))^2))."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    relative_diff = np.abs(p - q) / (p + q + FACTOR)
    return np.sqrt(sum(relative_diff ** 2))

def additive_sym(P, Q):
    """Return sum((P-Q)^2 * (P+Q) / (P*Q + FACTOR)) over components."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    weighted_sq_diff = (p - q) ** 2 * (p + q)
    guarded_product = p * q + FACTOR
    return sum(weighted_sq_diff / guarded_product)

def kullback_PQ(P, Q):
    """Return sum(P * log(P / (Q + FACTOR) + FACTOR)) (natural logarithm)."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    # FACTOR guards both the division by Q and the log of a zero ratio
    guarded_ratio = (p / (q + FACTOR)) + FACTOR
    return sum(p * np.log(guarded_ratio))

def kullback_QP(P, Q):
    """Return sum(Q * log(Q / (P + FACTOR) + FACTOR)) (natural logarithm).

    This is kullback_PQ with the roles of P and Q swapped. The previous
    expression, P * log(Q + FACTOR / (P + FACTOR)), took the log of
    Q + (FACTOR / (P + FACTOR)) because of operator precedence, and weighted
    by P instead of Q.
    """
    P = np.array(P, dtype=float)
    Q = np.array(Q, dtype=float)
    # FACTOR guards both the division by P and the log of a zero ratio
    guarded_ratio = (Q / (P + FACTOR)) + FACTOR
    return sum(Q * np.log(guarded_ratio))

def jeffreys(P, Q):
    """Return sum((P-Q) * log((P + FACTOR) / (Q + FACTOR))) (natural logarithm)."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    log_ratio = np.log((p + FACTOR) / (q + FACTOR))
    return sum((p - q) * log_ratio)
    
def k_divergence(P, Q):
    """Return sum(P * log(2P / (P + Q + FACTOR) + FACTOR)) (natural logarithm)."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    guarded_ratio = 2 * p / (p + q + FACTOR) + FACTOR
    return sum(p * np.log(guarded_ratio))
    
def topsoe(P, Q):
    """Return sum(P*log(2P/(P+Q+FACTOR) + FACTOR) + Q*log(2Q/(P+Q+FACTOR) + FACTOR))."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    guarded_sum = p + q + FACTOR
    p_term = p * np.log(2 * p / guarded_sum + FACTOR)
    q_term = q * np.log(2 * q / guarded_sum + FACTOR)
    return sum(p_term + q_term)
    
def jensen_shannon(P, Q):
    """Return 0.5 * sum(P*log(2P/(P+Q+FACTOR) + FACTOR) + Q*log(2Q/(P+Q+FACTOR) + FACTOR))."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    guarded_sum = p + q + FACTOR
    p_term = p * np.log((2 * p / guarded_sum) + FACTOR)
    q_term = q * np.log((2 * q / guarded_sum) + FACTOR)
    return 0.5 * sum(p_term + q_term)

def jensen_diff(P, Q):
    """Return 0.5 * sum(P*log(P+F) + Q*log(Q+F) - (P+Q+F)*log((P+Q+F)/2)), with F = FACTOR."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    guarded_sum = p + q + FACTOR
    p_term = p * np.log(p + FACTOR)
    q_term = q * np.log(q + FACTOR)
    mix_term = guarded_sum * np.log(guarded_sum / 2.)
    return 0.5 * sum(p_term + q_term - mix_term)

def taneja(P, Q):
    """Return 0.5 * sum((P+Q) * log((P+Q+FACTOR) / (2*sqrt(P*Q+FACTOR)) + FACTOR))."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    doubled_root = 2. * np.sqrt(p * q + FACTOR)
    guarded_ratio = (p + q + FACTOR) / doubled_root + FACTOR
    return 0.5 * sum((p + q) * np.log(guarded_ratio))

def kumar_johnson(P, Q):
    """Return sum(((P^2 - Q^2) + FACTOR)^2 / (2 * (P*Q + FACTOR)^1.5))."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    numerator = ((p * p - q * q) + FACTOR) ** 2
    denominator = 2 * (p * q + FACTOR) ** (1.5)
    return sum(numerator / denominator)

def avgL(P, Q):
    """Return the average of sum(|P-Q|) and max(|P-Q|)."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    abs_diff = np.abs(p - q)
    return 0.5 * (sum(abs_diff) + max(abs_diff))
                   
def vicis_wave_hedges(P, Q):
    """Return sum(|P-Q| / (min(P,Q) + FACTOR)) over components."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    abs_diff = np.abs(p - q)
    guarded_min = np.minimum(p, q) + FACTOR
    return sum(abs_diff / guarded_min)
                   
def vicis_symm1(P, Q):
    """Return sum((P-Q)^2 / (min(P,Q)^2 + FACTOR)) over components."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    diff = p - q
    smaller = np.minimum(p, q)
    return sum(diff * diff / (smaller ** 2 + FACTOR))
                   
def vicis_symm2(P, Q):
    """Return sum((P-Q)^2 / (min(P,Q) + FACTOR)) over components."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    diff = p - q
    smaller = np.minimum(p, q)
    return sum(diff * diff / (smaller + FACTOR))
                   
def vicis_symm3(P, Q):
    """Return sum((P-Q)^2 / (max(P,Q) + FACTOR)) over components."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    diff = p - q
    larger = np.maximum(p, q)
    return sum(diff * diff / (larger + FACTOR))
                   
def min_symm(P, Q):
    """Return the smaller of sum((P-Q)^2 / (P+FACTOR)) and sum((P-Q)^2 / (Q+FACTOR))."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    diff = p - q
    relative_to_p = sum(diff * diff / (p + FACTOR))
    relative_to_q = sum(diff * diff / (q + FACTOR))
    return min(relative_to_p, relative_to_q)
                   
def max_symm(P, Q):
    """Return the larger of sum((P-Q)^2 / (P+FACTOR)) and sum((P-Q)^2 / (Q+FACTOR))."""
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)
    diff = p - q
    relative_to_p = sum(diff * diff / (p + FACTOR))
    relative_to_q = sum(diff * diff / (q + FACTOR))
    return max(relative_to_p, relative_to_q)

### Plots

In [16]:
def ecdf(xdata):
    """
    Builds the x and y series of an empirical cumulative distribution plot

    Args: list of values (e.g. error distances)

    Returns: (sorted values, cumulative fractions 1/n, 2/n, ..., 1) as numpy arrays
    """
    n_samples = len(xdata)
    sorted_values = np.sort(xdata)
    cumulative_fraction = np.arange(1, n_samples + 1) / n_samples
    return sorted_values, cumulative_fraction

def plot_ecdf(title, figure, legend, *args):
    """
    Draws one ECDF curve per data series on a matplotlib figure

    Args: title of figure, number of figure, legend labels (one per series), data series

    Returns: nothing
    """
    plt.figure(figure)

    # one dotted curve per series, built from the ecdf x/y data
    for series in args:
        xs, ys = ecdf(series)
        plt.plot(xs, ys, marker='.', linestyle='none', markersize=3)

    plt.legend(list(legend))
    plt.title(title)
    plt.xlabel('Error distance')
    plt.ylabel('Error probability')
    plt.margins(0.1)
    
def scatter(x, y, title, xlabel, ylabel):
    """
    Shows a scatter plot of y against x with a fitted straight line on top

    The line is the degree-1 fit returned by np.polyfit.

    Args: xdata, ydata, title, x-axis label, y-axis label

    Returns: nothing
    """
    xs = np.array(x, dtype=float)
    ys = np.array(y, dtype=float)
    slope, intercept = np.polyfit(xs, ys, 1)

    plt.scatter(xs, ys, c="blue", s=3)
    plt.plot(xs, slope * xs + intercept, color="red")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()