In [1]:
from src.features.build_features import handle_zeros, KL_apply
from src.visualization.visualize import plot_divergence
from src.data.make_dataset import preprocess
from pathlib import Path
import pandas as pd
import numpy as np


In [2]:
FILE_NAME = 'uniform_date.csv'
PATH = Path('../data/raw')
cycle3 = pd.read_csv(PATH / FILE_NAME)
processed = preprocess(cycle3)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [3]:
processed.shape

(11288, 18)

In [4]:
processed = handle_zeros(processed)

> For the group of timestamps that you are considering for each bubble, we want to calculate the mean for each bin mu_b where b is the bin. Then for each forecast f_t_b, where t is the index of the timestamp, we are going to calculate the KL-divergence
K_t = sum(mu_b * log2(f_t_b / mu_b)) along the b axis
Finally we are going to calculate the mean forKL, mean(KL_t) along the t axis.

**So we will have to compute means for the group, during that interval, then we will compute the divergences from that mean for each forecast in the time interval, and then we will average the divergences.**

In [5]:
processed.head(3)

Unnamed: 0,Question,Uniform Date Format,Ordered.Bin.Number,Forecaster,Forecaster.ID,Forecast,TeamName,Commuity.ID,Number.of.Bins,True.Answer,Weight,WRPS,CrossEntropy,FairSkill,SWRPS,Final,Initial,Process.Gain,Tvalue,Total
0,CFF 307.1,2019-11-11 22:00:34,1,Dimitry Sarin,71,0.7,Kiwi,1,2,0.6,1.0,0.01,-0.695,-0.002,0.1,0,1,0.15,43780.91706,1.0
1,CFF 307.1,2019-11-11 22:00:34,2,Dimitry Sarin,71,0.3,Kiwi,1,2,0.4,1.0,0.01,-0.695,-0.002,0.1,0,1,0.15,43780.91706,1.0
2,CFF 308.1,2019-11-11 22:02:20,1,Dimitry Sarin,71,0.7,Kiwi,1,2,0.07,1.0,0.3969,-1.143,-0.45,0.63,0,1,-0.37,43780.91829,1.0


- The KL-divergence I had in mind is different. First we compute the average of the forecasts in that time window, then we compute for each forecaster the divergence of that forecaster from that time window average. I also prefer to vectorize that operation as I did in DTC, because I get a feel for the dimensions involved.
- vector{Ave_i} dot.product ( log2( {matrix forecasters of shape (m,n)} / vector{Ave_i} ) )
- The division is done by bin with broadcast to all n forecasters. Then when you have all the KL_divergences for each forecaster, you average them to obtain the bubble size for that time window.
- The number you calculated, I believe is sort of an entropy for the average.


Notes:
1. $m$ is the number of bins
2. $n$ is the number of forecasters
3. $i$ is the index of the timestamp

In [6]:
def kullback_leibler(forecasts, average):
    print("forecasts", forecasts)
    print("average", average)
    return -average @ (np.log2(forecasts) - np.log2(average.T))

In [7]:
def test_kullback_leibler():
    forecasts = np.array(
        [[0.1, 0.1],
         [0.9, 0.9]]
    )
    average = np.array([[0.1, 0.9]])
    print((kullback_leibler(forecasts, average) == 0).all())

In [8]:
test_kullback_leibler()

forecasts [[0.1 0.1]
 [0.9 0.9]]
average [[0.1 0.9]]
True


In [9]:
def KL_apply(question):
    if not question.empty:
        average_forecast = question.groupby(
            ['Ordered.Bin.Number']
        ).Forecast.mean().reset_index()
        forecasts = question.pivot_table(
            values='Forecast',
            index='Ordered.Bin.Number',
            columns='Forecaster.ID'
        ).values
        question_avg = average_forecast.Forecast.values.reshape(1, -1)
        divergences = kullback_leibler(forecasts, question_avg)
        print('divs', divergences)
        return np.mean(divergences)

In [10]:
processed.Question.unique()

array(['CFF 307.1', 'CFF 308.1', 'CFF 308.3', 'CFF 308.4', 'CFF 307.3',
       'CFF 307.4', 'CFF 307.2', 'CFF 301.1', 'CFF 301.2', 'CFF 301.4',
       'CFF 301.3', 'CFF 304.1', 'CFF 304.2', 'CFF 304.4', 'CFF 304.3',
       'CFF 308.2'], dtype=object)

In [11]:
processed.head(2)

Unnamed: 0,Question,Uniform Date Format,Ordered.Bin.Number,Forecaster,Forecaster.ID,Forecast,TeamName,Commuity.ID,Number.of.Bins,True.Answer,Weight,WRPS,CrossEntropy,FairSkill,SWRPS,Final,Initial,Process.Gain,Tvalue,Total
0,CFF 307.1,2019-11-11 22:00:34,1,Dimitry Sarin,71,0.7,Kiwi,1,2,0.6,1.0,0.01,-0.695,-0.002,0.1,0,1,0.15,43780.91706,1.0
1,CFF 307.1,2019-11-11 22:00:34,2,Dimitry Sarin,71,0.3,Kiwi,1,2,0.4,1.0,0.01,-0.695,-0.002,0.1,0,1,0.15,43780.91706,1.0


In [30]:
question = 'CFF 307.1'
resampling_period = 'D'

In [31]:
res = processed.query("Question == @question & TeamName == 'Kiwi'").reset_index().groupby(pd.Grouper(
    freq=resampling_period,
    label='right',
    key='Uniform Date Format',
)).apply(KL_apply)

forecasts [[0.6 0.7]
 [0.4 0.3]]
average [[0.65 0.35]]
divs [[0.00763441 0.00834246]]
forecasts [[0.7  0.5  0.83 0.7  0.55]
 [0.3  0.5  0.17 0.3  0.45]]
average [[0.685 0.315]]
divs [[0.00076575 0.10113896 0.09054235 0.00076575 0.05483002]]
forecasts [[0.525 0.5   0.2   0.5   0.6   0.5   0.7   0.25  0.7   0.15 ]
 [0.475 0.5   0.8   0.5   0.4   0.5   0.3   0.75  0.3   0.85 ]]
average [[0.46818182 0.53181818]]
divs [[0.00932299 0.00292313 0.26121487 0.00292313 0.05098242 0.00292313
  0.16758682 0.16001126 0.16758682 0.40901355]]
forecasts [[0.48  0.7   0.435]
 [0.52  0.3   0.565]]
average [[0.5125 0.4875]]
divs [[0.00304943 0.11094037 0.01746129]]
forecasts [[0.43 0.55 0.68 0.42]
 [0.57 0.45 0.32 0.58]]
average [[0.48666667 0.51333333]]
divs [[0.00937015 0.01162289 0.11513729 0.01301115]]
forecasts [[0.425 0.3   0.49 ]
 [0.575 0.7   0.51 ]]
average [[0.388 0.612]]
divs [[0.00407585 0.02536469 0.03032772]]
forecasts [[0.56 0.44]
 [0.44 0.56]]
average [[0.5 0.5]]
divs [[0.01046292 0.010462

In [32]:
res[res < 0].index

DatetimeIndex([], dtype='datetime64[ns]', name='Uniform Date Format', freq=None)

In [87]:
processed.query("Question == 'CFF 307.3'").set_index('Uniform Date Format').between_time('02:31:50', '02:32:01').sort_values(by='Forecast').drop(columns='Forecaster')

Unnamed: 0_level_0,Forecaster.ID,Question,Forecast,TeamName,Commuity.ID,Ordered.Bin.Number,Number.of.Bins,True.Answer,Weight,WRPS,CrossEntropy,FairSkill,SWRPS,Final,Initial,Process.Gain,Tvalue,Total
Uniform Date Format,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2019-11-25 02:32:00,63,CFF 307.3,0.002488,Mango,2,1,5,0.0,2.52,0.0316,-1.069,0.54,0.1777,0,0,-0.342,43794.10556,2.01
2019-11-25 02:32:00,63,CFF 307.3,0.002488,Mango,2,1,5,0.0,2.52,0.0369,-1.075,0.5345,0.1921,1,0,-0.342,,2.01
2019-11-25 02:32:00,63,CFF 307.3,0.004975,Mango,2,2,5,0.02,2.52,0.0316,-1.069,0.54,0.1777,0,0,-0.342,43794.10556,2.01
2019-11-25 02:32:00,63,CFF 307.3,0.014925,Mango,2,2,5,0.02,2.52,0.0369,-1.075,0.5345,0.1921,1,0,-0.342,,2.01
2019-11-25 02:32:00,63,CFF 307.3,0.044776,Mango,2,3,5,0.07,2.52,0.0369,-1.075,0.5345,0.1921,1,0,-0.342,,2.01
2019-11-25 02:32:00,63,CFF 307.3,0.059701,Mango,2,3,5,0.07,2.52,0.0316,-1.069,0.54,0.1777,0,0,-0.342,43794.10556,2.01
2019-11-25 02:32:00,63,CFF 307.3,0.174129,Mango,2,4,5,0.62,2.52,0.0369,-1.075,0.5345,0.1921,1,0,-0.342,,2.01
2019-11-25 02:32:00,63,CFF 307.3,0.179104,Mango,2,4,5,0.62,2.52,0.0316,-1.069,0.54,0.1777,0,0,-0.342,43794.10556,2.01
2019-11-25 02:32:00,63,CFF 307.3,0.253731,Mango,2,5,5,0.29,2.52,0.0316,-1.069,0.54,0.1777,0,0,-0.342,43794.10556,2.01
2019-11-25 02:32:00,63,CFF 307.3,0.263682,Mango,2,5,5,0.29,2.52,0.0369,-1.075,0.5345,0.1921,1,0,-0.342,,2.01


In [85]:
C = 0.005
processed.loc[:, 'Forecast'] = processed.Forecast.replace(0, C)
sums = processed.groupby(
    ['Uniform Date Format', 'Question']
).Forecast.sum().reset_index().rename(columns={'Forecast': 'Total'})

In [18]:
sums.head()

Unnamed: 0,Uniform Date Format,Question,Total
0,2019-11-11 22:00:34,CFF 307.1,1.0
1,2019-11-11 22:02:20,CFF 308.1,1.0
2,2019-11-11 22:02:59,CFF 308.3,1.0
3,2019-11-11 22:03:38,CFF 308.4,1.0
4,2019-11-11 22:06:03,CFF 307.3,1.0


In [102]:
processed.columns

Index(['Forecaster', 'Uniform Date Format', 'Forecaster.ID', 'Question',
       'Forecast', 'TeamName', 'Commuity.ID', 'Ordered.Bin.Number',
       'Number.of.Bins', 'True.Answer', 'Weight', 'WRPS', 'CrossEntropy',
       'FairSkill', 'SWRPS', 'Final', 'Initial', 'Process.Gain', 'Tvalue',
       'Total'],
      dtype='object')

In [22]:
def test_handle_zeros():
    forecasts = merged.Forecast
    grouped = merged.groupby(
        ['Uniform Date Format', 'Question']
    ).Forecast.sum().values
    assert all(np.logical_and(0 <= forecasts, forecasts <= 1))
    assert all(np.isclose(grouped, 1))

In [23]:
test_handle_zeros()

In [88]:
processed.shape

(11351, 20)

In [118]:
index_cols = ['Uniform Date Format', 'Question', 'Ordered.Bin.Number']
agg_dict = {col: 'last' for col in processed.columns if col not in index_cols}
processed.groupby(
    index_cols
, as_index=False).agg(agg_dict)

Unnamed: 0,Uniform Date Format,Question,Ordered.Bin.Number,Forecaster,Forecaster.ID,Forecast,TeamName,Commuity.ID,Number.of.Bins,True.Answer,Weight,WRPS,CrossEntropy,FairSkill,SWRPS,Final,Initial,Process.Gain,Tvalue,Total
0,2019-11-11 22:00:34,CFF 307.1,1,Dimitry Sarin,71,0.700000,Kiwi,1,2,0.60,1.00,0.01,-0.695,-0.0020,0.1000,0,1,0.1500,43780.91706,1.000
1,2019-11-11 22:00:34,CFF 307.1,2,Dimitry Sarin,71,0.300000,Kiwi,1,2,0.40,1.00,0.01,-0.695,-0.0020,0.1000,0,1,0.1500,43780.91706,1.000
2,2019-11-11 22:02:20,CFF 308.1,1,Dimitry Sarin,71,0.700000,Kiwi,1,2,0.07,1.00,0.3969,-1.143,-0.4500,0.6300,0,1,-0.3700,43780.91829,1.000
3,2019-11-11 22:02:20,CFF 308.1,2,Dimitry Sarin,71,0.300000,Kiwi,1,2,0.93,1.00,0.3969,-1.143,-0.4500,0.6300,0,1,-0.3700,43780.91829,1.000
4,2019-11-11 22:02:59,CFF 308.3,1,Dimitry Sarin,71,0.004950,Kiwi,1,5,0.01,2.52,0.1155,-1.599,0.0106,0.3398,0,1,-0.0770,43780.91874,1.010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11283,2019-11-25 04:44:00,CFF 304.4,1,David Smerdon,87,0.004975,Mango,2,5,0.13,2.52,0.1143,-2.363,-0.7540,0.3381,1,0,0.2497,,1.005
11284,2019-11-25 04:44:00,CFF 304.4,2,David Smerdon,87,0.049751,Mango,2,5,0.22,2.52,0.1143,-2.363,-0.7540,0.3381,1,0,0.2497,,1.005
11285,2019-11-25 04:44:00,CFF 304.4,3,David Smerdon,87,0.348259,Mango,2,5,0.32,2.52,0.1143,-2.363,-0.7540,0.3381,1,0,0.2497,,1.005
11286,2019-11-25 04:44:00,CFF 304.4,4,David Smerdon,87,0.447761,Mango,2,5,0.14,2.52,0.1143,-2.363,-0.7540,0.3381,1,0,0.2497,,1.005


In [116]:
processed.shape

(11351, 20)

In [94]:
11351 - 11288

63

In [95]:
processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11351 entries, 0 to 11350
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Forecaster           11351 non-null  object        
 1   Uniform Date Format  11351 non-null  datetime64[ns]
 2   Forecaster.ID        11351 non-null  int64         
 3   Question             11351 non-null  object        
 4   Forecast             11351 non-null  float64       
 5   TeamName             11351 non-null  object        
 6   Commuity.ID          11351 non-null  int64         
 7   Ordered.Bin.Number   11351 non-null  int64         
 8   Number.of.Bins       11351 non-null  int64         
 9   True.Answer          11351 non-null  float64       
 10  Weight               11351 non-null  float64       
 11  WRPS                 11351 non-null  object        
 12  CrossEntropy         11351 non-null  float64       
 13  FairSkill            11351 non-