In [237]:
from src.features.build_features import handle_zeros, resample_KL
from src.visualization.visualize import plot_divergence
from src.data.make_dataset import preprocess
from pathlib import Path
import pandas as pd
import numpy as np


In [238]:
# Location of the raw CSV export, relative to the notebook's working directory.
FILE_NAME = 'uniform_date.csv'
PATH = Path('../data/raw')

# Read the raw table, then run it through the two project helpers in sequence.
# NOTE(review): running this cell prints a pandas "value is trying to be set on
# a copy of a slice" warning; it presumably originates inside `preprocess` or
# `handle_zeros` -- confirm and fix there.
cycle3 = pd.read_csv(PATH / FILE_NAME)
processed = handle_zeros(preprocess(cycle3))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



> For the group of timestamps that you are considering for each bubble, we want to calculate the mean for each bin, $\mu_b$, where $b$ is the bin. Then for each forecast $f_{t,b}$, where $t$ is the index of the timestamp, we are going to calculate the KL-divergence
$KL_t = -\sum_b \mu_b \log_2\left(f_{t,b} / \mu_b\right)$ along the $b$ axis (the leading minus sign makes the divergence non-negative).
Finally we are going to calculate the mean for KL, $\mathrm{mean}(KL_t)$, along the $t$ axis.

**So we will have to compute means for the group, during that interval, then we will compute the divergences from that mean for each forecast in the time interval, and then we will average the divergences.**

In [239]:
# Question whose forecasts are analysed in the cells below.
question = 'CFF 304.4'
# NOTE(review): not referenced by the cells visible here -- the hourly grouping
# further down hard-codes freq='60Min' instead. Confirm whether it is used elsewhere.
resampling_period = 'H'

In [257]:
# Preview the first three rows of the processed frame.
processed.head(3)

Unnamed: 0_level_0,Forecaster,Forecaster.ID,Question,Forecast,TeamName,Commuity.ID,Ordered.Bin.Number,Number.of.Bins,True.Answer,Weight,WRPS,CrossEntropy,FairSkill,SWRPS,Final,Initial,Process.Gain,Tvalue
Uniform Date Format,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2019-11-11 22:00:34,Dimitry Sarin,71,CFF 307.1,0.7,Kiwi,1,1,2,0.6,1.0,0.01,-0.695,-0.002,0.1,0,1,0.15,43780.91706
2019-11-11 22:00:34,Dimitry Sarin,71,CFF 307.1,0.3,Kiwi,1,2,2,0.4,1.0,0.01,-0.695,-0.002,0.1,0,1,0.15,43780.91706
2019-11-11 22:02:20,Dimitry Sarin,71,CFF 308.1,0.3,Kiwi,1,2,2,0.93,1.0,0.3969,-1.143,-0.45,0.63,0,1,-0.37,43780.91829


- The KL-divergence I had in mind is different. First we compute the average of the forecasts in that time window, then we compute for each forecaster the divergence of that forecaster from that time window average. I also prefer to vectorize that operation as I did in DTC, because I get a feel for the dimensions involved.
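- $KL_i = -\,\overline{f}_i^{\top} \log_2\left(F_i / \overline{f}_i\right)$, where $\overline{f}_i$ is the vector of per-bin averages (length $m$) and $F_i$ is the matrix of forecasts of shape $(m, n)$, one column per forecaster. The result has one divergence per forecaster.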
- The division is done by bin with broadcast to all n forecasters. Then when you have all the KL_divergences for each forecaster, you average them to obtain the bubble size for that time window.
- The number you calculated is, I believe, a sort of entropy of the average.


Notes:
1. $m$ is the number of bins
2. $n$ is the number of forecasters (one forecast column per forecaster in the time window)
3. $i$ is the index of the timestamp

In [258]:
def kullback_leibler(forecasts, average):
    """Base-2 KL-divergence of each forecast column from the average.

    Computes ``-average @ (log2(forecasts) - log2(average.T))``, i.e. for
    every column ``f`` of ``forecasts`` the sum over bins of
    ``average * log2(average / f)``.

    Parameters
    ----------
    forecasts : array
        As called in this notebook: shape ``(m, n)``, bins along the rows and
        one column per forecaster.
    average : array
        As called in this notebook: shape ``(1, m)``, the per-bin average.

    Returns
    -------
    array
        Shape ``(1, n)`` for the shapes above: one divergence per column of
        ``forecasts``.
    """
    log_forecasts = np.log2(forecasts)
    # Transposing turns the (1, m) row into an (m, 1) column so that the
    # subtraction broadcasts the average across every forecast column.
    log_average = np.log2(average.T)
    negated_average = -average
    return negated_average @ (log_forecasts - log_average)

In [259]:
def KL_apply(df):
    """Mean KL-divergence of the forecasts in ``df`` from their per-bin average.

    Intended to be passed to ``groupby(...).apply`` on a frame that holds the
    forecasts of one time window.

    For every question present in ``df`` the forecasts are pivoted into a
    (bins x forecasters) matrix and each forecaster column is compared with
    the per-bin average forecast for that question via ``kullback_leibler``.
    The per-forecaster divergences of all questions are then pooled and
    averaged.

    The previous implementation returned from inside the question loop, so
    only the first question was ever used; for a single-question frame the
    result is unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Question``, ``Ordered.Bin.Number``,
        ``Forecaster`` and ``Forecast``.

    Returns
    -------
    float
        Mean divergence over all forecasters (and questions), or ``nan`` when
        ``df`` contains no question at all.
    """
    # Per-bin average forecast for every question in this window.
    average_forecast = df.groupby(
        ['Question', 'Ordered.Bin.Number']
    ).Forecast.mean().reset_index()

    per_forecaster = []
    for question in df.Question.unique():
        # Rows are bins, columns are forecasters. pivot_table aggregates
        # duplicates with its default (mean), so a forecaster with several
        # forecasts for the same bin contributes their average.
        forecasts = df.query("Question == @question").pivot_table(
            values='Forecast',
            index='Ordered.Bin.Number',
            columns='Forecaster'
        ).values
        question_avg = average_forecast.query(
            "Question == @question"
        ).Forecast.values.reshape(1, -1)
        # Flatten the (1, n) result so questions with different numbers of
        # forecasters can be pooled.
        per_forecaster.append(
            np.ravel(kullback_leibler(forecasts, question_avg))
        )

    if not per_forecaster:
        # Nothing to compare: report "no value" instead of implicitly
        # returning None.
        return np.nan
    return np.mean(np.concatenate(per_forecaster))

In [260]:
# Preview the first two rows of the processed frame.
processed.head(2)

Unnamed: 0_level_0,Forecaster,Forecaster.ID,Question,Forecast,TeamName,Commuity.ID,Ordered.Bin.Number,Number.of.Bins,True.Answer,Weight,WRPS,CrossEntropy,FairSkill,SWRPS,Final,Initial,Process.Gain,Tvalue
Uniform Date Format,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2019-11-11 22:00:34,Dimitry Sarin,71,CFF 307.1,0.7,Kiwi,1,1,2,0.6,1.0,0.01,-0.695,-0.002,0.1,0,1,0.15,43780.91706
2019-11-11 22:00:34,Dimitry Sarin,71,CFF 307.1,0.3,Kiwi,1,2,2,0.4,1.0,0.01,-0.695,-0.002,0.1,0,1,0.15,43780.91706


In [262]:
# Keep only the selected question and turn the timestamp index into a regular
# column so that pd.Grouper can bucket on it by name.
question_forecasts = processed.query("Question == @question").reset_index()

# 60-minute windows, each labelled with its right edge.
hourly_windows = pd.Grouper(
    key='Uniform Date Format',
    freq='60Min',
    label='right',
)

# One averaged KL-divergence per window; display the first 60 windows.
question_forecasts.groupby(hourly_windows).apply(KL_apply).head(60)

Uniform Date Format
2019-11-12 00:00:00    0.000000
2019-11-12 01:00:00    0.673535
2019-11-12 02:00:00    0.665612
2019-11-12 03:00:00         NaN
2019-11-12 04:00:00         NaN
2019-11-12 05:00:00         NaN
2019-11-12 06:00:00         NaN
2019-11-12 07:00:00    0.000000
2019-11-12 08:00:00    1.234409
2019-11-12 09:00:00         NaN
2019-11-12 10:00:00         NaN
2019-11-12 11:00:00         NaN
2019-11-12 12:00:00         NaN
2019-11-12 13:00:00    0.000000
2019-11-12 14:00:00         NaN
2019-11-12 15:00:00         NaN
2019-11-12 16:00:00    0.000000
2019-11-12 17:00:00         NaN
2019-11-12 18:00:00    0.000000
2019-11-12 19:00:00    0.000000
2019-11-12 20:00:00         NaN
2019-11-12 21:00:00         NaN
2019-11-12 22:00:00         NaN
2019-11-12 23:00:00         NaN
2019-11-13 00:00:00    0.287569
2019-11-13 01:00:00    0.000000
2019-11-13 02:00:00    0.000000
2019-11-13 03:00:00         NaN
2019-11-13 04:00:00    0.000000
2019-11-13 05:00:00    0.000000
2019-11-13 06:00:00 