In [243]:
import pandas as pd
import datetime
import random
import numpy as np
from IPython.core.display import HTML
import statsmodels.api as sm
import plotly.express as px

In [244]:
### Functions to generate random data with time and floating point number

def get_time(mins, base=None):
    """Return ``mins`` consecutive timestamps spaced one minute apart.

    Parameters
    ----------
    mins : int
        Number of timestamps to generate. Zero or a negative value yields an
        empty list.
    base : datetime-like, optional
        First timestamp of the series. When omitted, the current local time
        (``pd.Timestamp.today()``) is used, which is the original behaviour.
        Pass a fixed value to make the generated timestamps reproducible.

    Returns
    -------
    list of pandas.Timestamp
        ``[base, base + 1 min, ..., base + (mins - 1) min]``.
    """
    start = pd.Timestamp.today() if base is None else pd.Timestamp(base)
    return [start + datetime.timedelta(minutes=offset) for offset in range(mins)]

def get_rt(count, seed=None):
    """Generate ``count`` synthetic RT values with a random share of outliers.

    Between 1% and 10% of the values (the share itself is drawn uniformly)
    are outliers sampled uniformly from [0.9, 4.0] and rounded to 2 decimals.
    The remaining values are ``0.01`` plus a gamma(0.1, 1) variate rounded to
    3 decimals. The two groups are then mixed in random order.

    Parameters
    ----------
    count : int
        Total number of values to return.
    seed : optional
        When given, a private ``random.Random(seed)`` generator is used so the
        same seed always yields the same list. When omitted, the module-level
        ``random`` functions are used, as before.

    Returns
    -------
    list of float
        ``count`` values in shuffled order.
    """
    rng = random if seed is None else random.Random(seed)

    ### decide outlier range between 1% and 10% of total
    outlier = round(rng.uniform(0.01, 0.1) * count)
    regular_values = [0.01 + round(rng.gammavariate(0.1, 1), 3) for _ in range(count - outlier)]
    outlier_values = [round(rng.uniform(0.9, 4.0), 2) for _ in range(outlier)]

    flist = regular_values + outlier_values
    # One shuffle already yields a uniformly random order; repeating it adds nothing.
    rng.shuffle(flist)
    return flist

#### Number of synthetic samples to generate and plot
datapoints = 1_000

In [245]:
# Section heading rendered above the chart.
display(HTML('<H2>RT Plot Against Time</H2>'))

# Build the frame column by column, then promote the timestamps to the index.
cols = ['time','RT']
time_col, rt_col = cols
df = pd.DataFrame({time_col: get_time(datapoints), rt_col: get_rt(datapoints)}).set_index(time_col)

# Line chart of RT over time; kept as the last expression so it is the cell output.
df.plot.line(backend='plotly')

In [246]:
display(HTML('<H2>Basic Stats for RT</H2>'))
# Summary statistics with the 95th and 99th percentiles added, shown one row per column.
rt_summary = df.describe(percentiles=[0.95, 0.99])
rt_summary.transpose()

Unnamed: 0,count,mean,std,min,50%,95%,99%,max
RT,1000.0,0.230804,0.630628,0.01,0.011,1.561,3.3806,3.94


In [247]:
## Problem with histograms -- binning bias: the picture depends on the bin choice

display(HTML('<H2>Histogram of Values</H2>'))
# With backend='plotly' the pandas-style `bins` keyword is not forwarded to
# Plotly Express, so it has no effect; `nbins` is the Plotly Express parameter
# that sets the (maximum) number of bins.
df.plot.hist(backend='plotly', nbins=10)

In [248]:
# Heading, then a box plot of the RT column to expose the outliers.
display(HTML('<H2>BoxPlot to look at Outliers</H2>'))
df.plot(kind='box', backend='plotly')

In [249]:
display(HTML('<H2>10 Min Interval View of 95P</H2>'))
# Bucket the samples into 10-minute intervals and summarise each bucket.
# '10min' replaces the deprecated '10T' offset alias; both mean 10 minutes.
df1 = df.groupby(pd.Grouper(freq='10min')).describe(percentiles=[0.95])
# Show only the sample count, mean and 95th percentile per interval.
df1[[('RT','count'),('RT','mean'),('RT','95%')]].round(3)

Unnamed: 0_level_0,RT,RT,RT
Unnamed: 0_level_1,count,mean,95%
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2023-06-03 14:30:00,3.0,0.011,0.012
2023-06-03 14:40:00,10.0,0.674,3.243
2023-06-03 14:50:00,10.0,0.020,0.065
2023-06-03 15:00:00,10.0,0.041,0.152
2023-06-03 15:10:00,10.0,0.092,0.313
...,...,...,...
2023-06-04 06:30:00,10.0,0.750,2.855
2023-06-04 06:40:00,10.0,0.361,1.842
2023-06-04 06:50:00,10.0,0.444,1.750
2023-06-04 07:00:00,10.0,0.175,0.718


In [250]:
### Assuming 600ms as the high RT tolerance, we look at the 95% for values at or above 600ms
high_threshold=0.6

display(HTML('<H2>10 Min Interval View of Outliers</H2>'))
# Per-10-minute statistics over only the samples at or above the threshold.
# '10min' replaces the deprecated '10T' offset alias; both mean 10 minutes.
df2 = df[df['RT']>=high_threshold].groupby(pd.Grouper(freq='10min')).describe(percentiles=[0.95])
# Keep only the intervals that actually contain at least one outlier.
df3 = df2[df2[('RT','count')]>0.0][[('RT','count'),('RT','mean'),('RT','95%'),('RT','max')]]
# Attach each interval's total sample count (taken from df1) as ('RT','tot_count').
# An inner join keeps exactly the intervals present on both sides, which is what
# the earlier outer join followed by dropna() produced.
df4 = pd.merge(df3, pd.DataFrame(df1[('RT','count')]).rename(columns={'count':'tot_count'}),
               left_index=True, right_index=True, how='inner')
# Share of samples in the interval that are outliers, in percent.
df4[('RT','out%')] = (df4[('RT','count')]/df4[('RT','tot_count')])*100
display(df4[[('RT','count'),('RT','out%'),('RT','95%')]].round(3))
print(f"Total outliers >={high_threshold}s = {df4[('RT','count')].sum()}")

Unnamed: 0_level_0,RT,RT,RT
Unnamed: 0_level_1,count,out%,95%
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2023-06-03 14:40:00,2.0,20.0,3.827
2023-06-03 15:20:00,1.0,10.0,2.410
2023-06-03 15:50:00,2.0,20.0,1.319
2023-06-03 16:00:00,1.0,10.0,2.410
2023-06-03 16:30:00,1.0,10.0,1.055
...,...,...,...
2023-06-04 06:20:00,1.0,10.0,1.580
2023-06-04 06:30:00,3.0,30.0,3.489
2023-06-04 06:40:00,1.0,10.0,2.990
2023-06-04 06:50:00,3.0,30.0,2.116


Total outliers >=0.6s = 103.0


In [251]:
display(HTML('<H2>10 Min Interval Distribution of Outliers</H2>'))
# Turn the ('RT','out%') column into a one-column frame labelled 'out%'.
outlier_pct = df4[('RT','out%')].rename('out%').to_frame()
fig = px.scatter(data_frame=outlier_pct, y='out%')
fig.show()

In [252]:
## Empirical cumulative distribution function (ECDF) of RT
## Definition: https://en.wikipedia.org/wiki/Empirical_distribution_function

display(HTML('<H2> ECDF of RT</H2>'))
ecdf = sm.distributions.ECDF(df['RT'])

# Axis captions keyed by the px.line argument names.
axis_labels = {"x": "RT (sec)", "y": "probability (0-1)"}
fig = px.line(x=ecdf.x, y=ecdf.y, labels=axis_labels)
fig.show()