# Performance Metrics

Given a dataset of stay points:
- How do we decide how accurate and precise it is?
- How do we rank the stay points?

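\begin{align*}
    \operatorname{rank}_t(i) 
        &= \frac{t_i}{t_{\max}}\\
    \operatorname{rank}_n(i)
        &= \frac{n_i}{n_{\max}}\\
    \operatorname{rank}(i)
        &=\sqrt{\operatorname{rank}_t(i)\operatorname{rank}_n(i)}
\end{align*}

where $t_i$ is the `time` of stay point $i$, $n_i$ is its `num_points`, and $t_{\max}$ and $n_{\max}$ are the largest `time` and `num_points` values in the dataset.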

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from math import sqrt

In [2]:
# Fixed reference timestamp (1 Jan 2020); not referenced by any of the cells shown here
START_TIME = datetime(2020,1,1)

In [26]:
def rank(df, var, rank_var, ascending=False):
    '''
    Sort `df` by `var` and add a dense ranking column `rank_var`.

    The rows are ordered by `var` (descending by default) and the index is
    reset. The first row receives rank 1 and the rank grows by one every time
    the value of `var` strictly changes in the sort direction, so tied values
    share a rank and no rank number is skipped.

    :param df: a dataframe containing the column `var`
    :param var: name of the column to rank by
    :param rank_var: name of the integer rank column to create (or overwrite)
    :param ascending: if True the smallest value gets rank 1, otherwise the
        largest value does
    :return: a new, re-indexed dataframe sorted by `var` with `rank_var`
        added; the dataframe passed in is left untouched
    '''

    ordered = df.sort_values(by=var, ascending=ascending).reset_index(drop=True)
    values = ordered[var].tolist()

    # An empty frame yields an empty rank column, so the `astype` below has a
    # column to convert instead of raising KeyError.
    ranks = [1] if values else []
    current = 1

    for previous, value in zip(values, values[1:]):
        # Only a strict change in the sort direction starts a new rank;
        # equal (or incomparable, e.g. NaN) neighbours keep the current one.
        stepped = value > previous if ascending else value < previous
        if stepped:
            current += 1
        ranks.append(current)

    # Assign the whole column at once rather than cell by cell.
    ordered[rank_var] = ranks

    return ordered.astype({rank_var: int})

In [9]:
def importance_score(df):
    '''
    Rank the stay points first by `time` and then by `num_points`.

    :param df: a dataframe containing variables `time`, `num_points`
    :return: the dataframe produced by the second `rank` call, carrying the
        added columns `time_rank` and `points_rank`
    '''

    ranked = df
    for column, rank_column in (("time", "time_rank"),
                                ("num_points", "points_rank")):
        ranked = rank(ranked, column, rank_column)

    return ranked

In [13]:
def gen_staypoints(rows=30, min_points=10, min_time=timedelta(minutes=10)):
    '''
    Generate a dataframe of random synthetic stay points.

    Each row gets a `time` of `min_time` plus a whole number of minutes drawn
    uniformly from [0, 480], and a `num_points` of `min_points` plus a whole
    number drawn uniformly from [0, 50]. Draws come from numpy's global
    random state, so seed it with `np.random.seed` for reproducible output.

    :param rows: number of stay points to generate
    :param min_points: lower bound for `num_points`
    :param min_time: lower bound for `time`, as a timedelta
    :return: a dataframe with columns `time`, `num_points` and `rows` rows
    '''
    df = pd.DataFrame(columns=["time", "num_points"])

    for i in range(rows):
        # Draw the time offset before the point offset so that a seeded
        # generator produces the same rows as before.
        extra_minutes = round(np.random.uniform(low=0, high=480))
        extra_points = round(np.random.uniform(low=0, high=50))
        df.loc[i, :] = [min_time + timedelta(minutes=extra_minutes),
                        min_points + extra_points]

    return df

In [44]:
# Draw a random sample of 20 stay points and display it
df = gen_staypoints(rows=20)
df

Unnamed: 0,time,num_points
0,2:03:00,33
1,4:25:00,21
2,0:35:00,54
3,7:21:00,22
4,1:27:00,22
5,4:07:00,12
6,2:15:00,25
7,5:45:00,15
8,4:31:00,32
9,1:17:00,25


In [45]:
# Add the `time_rank` and `points_rank` columns, then display the result
df = importance_score(df)
df

Unnamed: 0,time,num_points,time_rank,points_rank
0,0:27:00,59,19,1
1,4:54:00,59,5,1
2,0:12:00,57,20,2
3,0:35:00,54,17,3
4,6:17:00,43,3,4
5,0:44:00,40,16,5
6,0:31:00,36,18,6
7,7:11:00,34,2,7
8,2:03:00,33,12,8
9,4:31:00,32,6,9


In [49]:
# Largest duration and point count in the sample, used to normalise the scores
max_time = max(df["time"])
max_pts = max(df["num_points"])

# Scores in (0, 1]: each stay point relative to the best one
df["time_score"] = df.apply(lambda row: row["time"] / max_time, axis=1)
df["pts_score"] = df.apply(lambda row: row["num_points"] / max_pts, axis=1)

# Candidate importance measures:
#   impt1 - geometric mean of the two ranks
#   impt2 - harmonic mean of the two ranks
#   impt3 - geometric mean of the two normalised scores
df["impt1"] = df.apply(
    lambda row: sqrt(row["time_rank"] * row["points_rank"]), axis=1)
df["impt2"] = df.apply(
    lambda row: 2 / (1 / row["time_rank"] + 1 / row["points_rank"]), axis=1)
df["impt3"] = df.apply(
    lambda row: sqrt(row["time_score"] * row["pts_score"]), axis=1)

# impt1 and impt2 are ranked smallest-first, impt3 largest-first;
# the frame ends up sorted by impt3.
for measure, smallest_first in (("impt1", True), ("impt2", True), ("impt3", False)):
    df = rank(df, measure, measure + "_rank", ascending=smallest_first)
df

Unnamed: 0,time,num_points,time_rank,points_rank,time_score,pts_score,impt1,impt2,impt3,impt1_rank,impt2_rank,impt3_rank
0,4:54:00,59,5,1,0.666667,1.0,2.236068,1.666667,0.816497,1,1,1
1,6:17:00,43,3,4,0.854875,0.728814,3.464102,3.428571,0.789332,3,5,2
2,7:11:00,34,2,7,0.977324,0.576271,3.741657,3.111111,0.750469,4,4,3
3,7:21:00,22,1,11,1.0,0.372881,3.316625,1.833333,0.61064,2,2,4
4,4:31:00,32,6,9,0.614512,0.542373,7.348469,7.2,0.577317,9,9,5
5,4:25:00,21,7,12,0.600907,0.355932,9.165151,8.842105,0.462474,11,11,6
6,5:45:00,15,4,13,0.782313,0.254237,7.211103,6.117647,0.445974,8,8,7
7,2:03:00,33,12,8,0.278912,0.559322,9.797959,9.6,0.39497,12,13,8
8,2:15:00,25,11,10,0.306122,0.423729,10.488088,10.47619,0.360157,14,14,9
9,3:37:00,14,9,14,0.492063,0.237288,11.224972,10.956522,0.341703,15,16,10
