# Automating Entropy Calculations Across Examples

In [None]:
import sys
sys.path.append('..')

import torch
import numpy as np
import pandas as pd
from datetime import datetime as dt
from mod.entropy import entropy

### 1. Importing Data

In [None]:
location_path = "data/"
vec_path = "vecs/reddit-vecs.tsv"

# Load the tab-separated table of vectors and their metadata.
df = pd.read_table(location_path + vec_path, lineterminator='\n')

# Column whose values define the groups.
g_col = 'subreddit'
groups = df[g_col].unique()

# Quick corpus details: group names, column names, and for every group the
# number of distinct '_id' values followed by the number of rows.
print(groups)
print(list(df))
for subreddit in groups:
    in_group = df[g_col].isin([subreddit])
    n_ids = len(df['_id'].loc[in_group].unique())
    n_rows = in_group.sum()
    print('{} \t {} {}'.format(subreddit, n_ids, n_rows))

# Sort by group, then by creation date and id, to unscramble cross-post
# differences; afterwards renumber the rows 0..len(df)-1.
df = df.sort_values(by=[g_col, 'comment_created_at', '_id'])
df = df.reset_index(drop=True)

Next, we convert the stringified vectors into the correct format, i.e. a torch tensor.

In [None]:
def revectorize(x):
    """Parse a stringified vector such as '[0.1, 0.2]' into a 1-D float array.

    The value is converted to a string, every '[' and ']' character is
    removed, the remainder is split on ', ', and the pieces are cast to float.
    """
    without_brackets = str(x).translate(str.maketrans('', '', '[]'))
    pieces = without_brackets.split(', ')
    return np.array(pieces).astype(float)

begin = dt.now()

# Parse every stringified vector into a 1-D float array, then pull the
# parsed arrays out of the DataFrame so they are not stored twice.
df['vec'] = df['vec'].apply(revectorize)
Eu = df['vec'].values
del df['vec']

# Stack (rather than concatenate) the per-row 1-D arrays: concatenating them
# along axis 0 would flatten everything into one long 1-D array, whereas
# stacking keeps one row of Eu per row of df, so Eu can later be indexed with
# boolean row masks built from df.
Eu = torch.FloatTensor(np.stack(list(Eu), axis=0))
print('{} vecs made in {}\n'.format(Eu.shape[0], dt.now()-begin))

And for accounting, we add a column 'n' to track the number of tokens in a sentence (per Torch)

In [None]:
# Number of rows recorded for each distinct '_id'.
_idn = df['_id'].value_counts()


def _rows_per_id(comment_id):
    """Look up how many rows of df carry this '_id'."""
    return _idn[comment_id]


# Attach that per-id row count to every row as column 'n'.
df['n'] = df['_id'].apply(_rows_per_id)

### 2. Analyzing the data

First, let's set up a model instance.

In [None]:
# Build the entropy model with its default arguments, then move it to the
# GPU (this requires a CUDA device to be available).
H = entropy()
H = H.cuda()

And as with our word vectors, we set up a new document to stream our outputs to.

In [None]:
import os

# Output CSV, relative to `location_path`.
output_path = "summaries/H-posteriors.csv"

# Metadata columns copied from df for both members of every pair.
meta_data_cols = [
    '_id',
    'subreddit',
    'comment_ups',
    'user',
    'comment_created_at',
    'sub_id'
]

# Empty frame whose only purpose is to define the output column order:
# pair fields, x-side metadata, y-side metadata, then the entropy value.
dfposteriors = pd.DataFrame(
    columns=['x', 'y', 'xtime', 'ytime', 'n']
    + ['x_' + col for col in meta_data_cols]
    + ['y_' + col for col in meta_data_cols]
    + ['H']
)

# Write the header row only when the output file itself is missing or empty.
# The check targets the same path the results are appended to (under
# `location_path`), and the directory is created first so that neither the
# check nor the write fails on a fresh checkout.
_out_file = location_path + output_path
_out_dir = os.path.dirname(_out_file)
if _out_dir:
    os.makedirs(_out_dir, exist_ok=True)
if not os.path.exists(_out_file) or os.path.getsize(_out_file) == 0:
    dfposteriors.to_csv(_out_file, index=False, encoding='utf-8')

And now we can run our model over the data and save the outputs.

We'll start by creating an iterator over all permissible pairwise combinations of ids.

In [None]:
from itertools import combinations

# Every unordered pair of distinct '_id' values. This is a lazy, single-pass
# iterator: it has no len() and is exhausted after one full traversal.
_unique_ids = df['_id'].unique()
total_combinations = combinations(_unique_ids, 2)

And then run our calculations.

In [None]:
begin = dt.now()

# Total number of pairs, used only for progress reporting.  A lazy
# itertools.combinations object has no len(), so fall back to n*(n-1)/2 over
# the distinct ids when `total_combinations` is not a sized container.
try:
    _n_pairs = len(total_combinations)
except TypeError:
    _n_distinct = len(df['_id'].unique())
    _n_pairs = _n_distinct * (_n_distinct - 1) // 2

# Report roughly every 10% of the pairs; never 0, to avoid a modulo-by-zero
# when there are fewer than 10 pairs.
_report_every = max(1, _n_pairs // 10)

_out_file = location_path + output_path
_out_cols = list(dfposteriors)

# Per-id cache: every id takes part in many pairs, so its row mask, vectors,
# timestamp and metadata are computed once instead of once per pair.
_records = {}


def _record_for(comment_id):
    """Return (vectors, timestamp, metadata list) for one '_id'.

    `vectors` are the rows of Eu belonging to the id, `timestamp` is the
    'comment_created_at' value of its first row, and `metadata list` holds
    the `meta_data_cols` values of its first row.
    """
    if comment_id not in _records:
        sel = df['_id'].isin([comment_id]).values
        _records[comment_id] = (
            Eu[sel],
            df['comment_created_at'].loc[sel].values[0],
            df[meta_data_cols].loc[sel].values[0].tolist(),
        )
    return _records[comment_id]


with torch.no_grad():
    for k, (i, j) in enumerate(total_combinations):
        try:
            ex, xt, xmeta = _record_for(i)
            ey, yt, ymeta = _record_for(j)

            # The model returns one value per direction of the pair.
            Hij, Hji = H(ex.cuda(), ey.cuda())
            Hij, Hji = Hij.detach().cpu().item(), Hji.detach().cpu().item()

            # One output row per direction: (i -> j) and (j -> i).
            rows = [
                [i, j, xt, yt, ex.shape[0]] + xmeta + ymeta + [Hij],
                [j, i, yt, xt, ey.shape[0]] + ymeta + xmeta + [Hji],
            ]
            pd.DataFrame(rows, columns=_out_cols).to_csv(
                _out_file, index=False, header=False, mode='a', encoding='utf-8'
            )

        except Exception as ERR:
            # Best effort: report the failing pair and carry on with the rest.
            print(ERR)
            print(i, j, '\n')

        # Progress is reported whether or not the pair succeeded.
        if (k + 1) % _report_every == 0:
            print('combo {}/{}, {}'.format(k+1, _n_pairs, dt.now()-begin))

print('-------+++-------')