# Methods Walkthrough for Section 3.2

### 1. Generating the Corpus from the study

In [None]:
import sys
sys.path.append('path_to_repo')

from data.redditany.corpusv2 import *
from data.redditany.split_comments import split_values
from data.redditany.find_patterns_in_comments import find_pattern, pattern_example

# Build the study corpus and write it to disk as a CSV file.
# NOTE(review): `dfs` is not defined in this cell -- presumably it is exported
# by the star-import of data.redditany.corpusv2 above; confirm.
output_file = "path_to_repo/data/corpus/corpora/corpus.csv"

# Pass the 'body' column through split_values, then through find_pattern with
# the example pattern, and save the result without the row index.
dfs = find_pattern(split_values(dfs, 'body'), 'body', pattern_example)
dfs.to_csv(output_file, encoding='utf-8', index=False)

### 2. Generating Tokens and Vectors for Entropy Analysis Inputs

In [None]:
# The BERT tokens and vectors were generated on a GPU server; the code will be added here later.

### 3. Monte Carlo Sampling and Calculating Entropy from Group Language Patterns

It's worth noting that this entire process is contained in a single script: `./mod/H.py`.

#### 3.1 Initial Set-Up

In [None]:
import sys
sys.path.append('path_to_repo')

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from mod.entropy import entropy

############################################################################################
### Path variables for project on GPU server
############################################################################################
# Directory that holds the input table; every other path below is rooted here.
location_path = "/home/zaq/d/convergence/feminism-menslib-mensrights/women/"
# File name of the input table (joined onto location_path when it is read).
data_name = "vecs.tsv"

# Paths for the artifacts produced by the analysis.
output_path = f"{location_path}posteriors.pt"
H_summary_path = f"{location_path}summaryH.csv"
TTest_summary_path = f"{location_path}TTest.csv"
sample_history_path = f"{location_path}sample_history.csv"

Importing the data and first-pass pre-processing (removing erroneous duplicates if they exist):

In [None]:
# Column that names the community each row belongs to, and the communities kept
# for the analysis.
s_col = 'subreddit_name'
groups = ['menslib', 'feminism', 'mensrights']

df = pd.read_table(location_path+data_name)
print(list(df))
print(df[s_col].value_counts())

# Keep only rows from the groups under study, then renumber the row index 0..n-1.
df = df.loc[
    df[s_col].isin(groups)
    # & df['pattern_found'].values   # optional extra filter, currently disabled
].reset_index(drop=True)

# In case __id values (original comment_id) are wonky, renumber them as
# 0..k-1 following the sorted order of the unique ids. The original values are
# kept in the '--id' column.
df['--id'] = df['__id'].values
update_id_dic = {i:idx for idx,i in enumerate(np.unique(df['__id'].values))}
# map() performs one dictionary lookup per row. Every id is a key of
# update_id_dic by construction, so no value is left unmapped.
df['__id'] = df['__id'].map(update_id_dic)

# Quick summary of what's in the data: columns, then distinct ids per group.
print(list(df))
for subreddit in df[s_col].unique():
    print('{} \t {}'.format(subreddit, df.loc[df[s_col] == subreddit, '__id'].nunique()))

The data file stores the vector for each token as a string, so we'll convert those strings back into numeric vectors.

In [None]:
def vec_from_string(dfi):
    """Parse the stringified vectors in ``dfi['vecs']`` into a float tensor.

    Each entry is expected to be a flat, comma-separated list of numbers,
    optionally wrapped in square brackets (e.g. ``"[0.1, -0.2, 0.3]"``).
    Whitespace around the separators is ignored, so ``"[0.1,-0.2]"`` parses
    the same way as ``"[0.1, -0.2]"``.

    Args:
        dfi: DataFrame whose ``'vecs'`` column holds one string per row.

    Returns:
        A ``torch.FloatTensor`` with one row per entry of ``dfi['vecs']``.
        All entries must contain the same number of values.

    Raises:
        ValueError: If an entry contains something that is not a number.
    """
    rows = []
    for vec in dfi['vecs'].values:
        unbracketed = vec.replace('[', '').replace(']', '')
        # Split on the bare comma: float() tolerates surrounding whitespace,
        # so the parse no longer depends on an exact ", " separator.
        rows.append([float(tok) for tok in unbracketed.split(',')])
    return torch.FloatTensor(rows)

# Parse every row's stringified vector into one tensor, then drop the raw
# string column from the frame now that the tensor holds the same data.
Eu = vec_from_string(df)
df.drop(columns=['vecs'], inplace=True)
print(Eu.size())

#### 3.2 Entropy Model: Description and Sampling Procedure

We'll set up the model itself first using the entropy model class...

Imagine that an interlocutor is playing a kind of language reconstruction game. The interlocutor is given a single utterance from an individual, broken up into tokens. The interlocutor is then given a set of tokens drawn from several utterances, all produced by members of some group. The interlocutor is then asked to take the group's tokens and reconstruct an utterance that means something similar to the sentence they observed from the individual. This process can be repeated for the same original utterance using tokens from several different groups. In this scenario, reconstructed utterances that are more similar in meaning to the original utterance will have lower entropy. Reconstructed utterances that are either less similar or less intelligible will have higher entropy.

We operationalize this language game by calculating entropy for utterances using BERT word vectors (Devlin et al., 2019) to represent each token. This allows us to capture similarity between tokens that are semantically similar but are not a 1:1 mapping of the same word. Let $E_{xi}$ be the BERT word vector for each token $w_i$ in a sentence $x$.

$$E_{xi} = BERT(w_i \in x)$$

The probability that two words are semantically similar to one another based on their word vectors is a function of their location in vector space (Devlin et al., 2019; Mikolov et al., 2013; Pennington et al., 2014). If a word vector were a point in space, words that are more semantically related to one another will be closer to one another. We use cosine similarity (CoS) to calculate the proximity between word vectors. Now, the probability of two word vectors meaning the same thing can be thought of in the following way: if word vectors put words that are more semantically similar to one another closer in space, the probability that a word/token $i$ from a sentence $x$ is semantically similar to a word/token $j$ from a sentence $y$ can be thought of colloquially as how likely you are to hit $xi$ if you were to throw a dart at $yj$. We quantify this intuition about probability and vector space in equation 1 using a Gaussian distribution with a scale parameter $\sigma$ and a location parameter $\mu=1$, such that as the CoS value for the comparison of two word vectors approaches 1 we have maximum confidence that the two words mean the same thing.

$$P(E_{xi} | E_{yj}) = P_{\mathcal{N}}\left( CoS(E_{xi},E_{yj}) \bigg|  \mu=1, \sigma \right)$$

Think of $\sigma$ like the accuracy of the dart thrower, where lower $\sigma$ values equate to the dart thrower only hitting a word/token $xi$ if it is very close to $yj$ in word vector space.

However, we almost never have a reason to compare any one vector from a sentence $xi$ to any single vector from another sentence/distribution, $yj$. Instead, it’s better to ask how likely is a vector $xi$ conditioned on what we know about the total distribution $y$, in which there are $j$ tokens ($j \in y$). A priori, one way of posing this question is by asking “when we compare $xi$ to the entirety of the distribution $y$, which token $j \in y$ returns the maximum likelihood for $xi$ and what is the probability of $xi$ conditioned on that token?” We thus rewrite equation 1 as follows:


$$P(E_{xi} | E_{y}) = P_{\mathcal{N}} \left( \max_{j} \left(CoS(E_{xi},E_{yj}) \right) \bigg|  \mu=1, \sigma \right)$$

From the perspective of a transformer language model like BERT, the only way that the function $\max( CoS(E_{xi},E_{y}))$ can approach 1 is if there exists some overlapping, similar context between $x$ and $y$. Thus, in most cases there are three potential phenomena that increase the probability $P(E_{xi}|E_y)$. Either (1) a number of lexical items in $j \in y$ tend to be semantically similar to $xi$ (because each lexical item $j \in y$ is embedded in a similar context to $xi$) such that any sample from the distribution $y$ will likely contain items that maximize $P(E_{xi} | E_y)$ (i.e. things that are semantically similar to $xi$ are common in the distribution $y$), (2) the distribution $y$ influenced the construction of $xi$ or vice-versa, or (3) the sample described by $y$ is large enough that something semantically approximating $xi$ eventually shows up in the data by sheer chance (which acts as a sort of de facto null hypothesis). Because group members actively seek to increase similarity between each other’s idiolects in intragroup communication, (1) and (2) are orders of magnitude more likely than (3) on most time scales.

Using this probability calculation we can generate an entropy for the entirety of an utterance $x$ by comparing the word vectors for its words/tokens $i$ (i.e. all $i \in x$ or $xi$) to the distribution $y$.

$$H( x ; y ) = -\sum_i P(E_{xi}|E_{y}) \log P(E_{xi}|E_{y})$$

In [None]:
# Instantiate the entropy model. `sigma` is presumably the Gaussian scale
# parameter from the description above -- confirm against mod.entropy.
# NOTE(review): the meaning of `dim=-1` is not visible from this file.
H = entropy(dim=-1, sigma=0.3)  #.to('cuda')

And now, the Monte Carlo (MC) sampling procedure:

In [None]:
# Number of Monte Carlo permutations, ids drawn per source group (x), and ids
# drawn per comparison group (y).
N_permutations, xsize, ysize = 300, 200, 50

# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs away from the GPU server.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Loop-invariant lookups, hoisted out of the sampling loops: the id of every
# row, and the unique ids of each group in order of first appearance.
ids = df['__id'].values
ids_by_group = {
    g: df['__id'].loc[df[s_col].isin([g])].unique()
    for g in groups
}

# NOTE: np.random is not seeded here, so each run draws different samples.
M = []
for permutation in range(N_permutations):
    gM = []
    for groupX in groups:
        # Draw xsize distinct ids from the source group.
        x = np.random.choice(ids_by_group[groupX], size=(xsize,), replace=False)

        m = []
        for xi in x:
            # Rows belonging to the sampled id; moved to the device once
            # and reused for every comparison group.
            _x = ids == xi
            Ex = Eu[_x].to(device)

            # NOTE(review): `samples` is rebuilt for every xi and is never
            # written out (sample_history_path is unused in this cell).
            mi, samples = [], []
            for groupY in groups:
                # Draw ysize distinct ids from the comparison group, never
                # including the id currently being scored.
                candidates = ids_by_group[groupY]
                y = np.random.choice(candidates[candidates != xi], size=(ysize,), replace=False)
                _y = np.isin(ids, y)

                mi.append(H(Ex, Eu[_y].to(device)).view(1,-1).detach().cpu())
                samples.append(str(y.tolist()).replace(',', ''))

            m.append(torch.cat(mi,dim=-1))

        gM.append(torch.cat(m,dim=0).unsqueeze(0))

    M.append(torch.cat(gM,dim=0).unsqueeze(0))

    if ((permutation+1) % 10) == 0:
        print('permutation {}/{}'.format(permutation+1, N_permutations))

# M is a matrix of shape trials x groups x sample_size x group_history_sampled_from
# (assuming H returns a single value per call -- TODO confirm against mod.entropy).
M = torch.cat(M,dim=0)
torch.save({'M':M}, output_path)

#### 3.3 MCSP Analysis

First, we'll create a document containing the mean and median entropy values for each of our conditions:

In [None]:
# One row per (source group, comparison group) pair, holding the mean and
# median of all entropy values in M for that pair (pooled over permutations
# and sampled ids).
# NOTE(review): torch's median is believed to return the lower middle value
# for an even element count rather than averaging the two -- confirm if the
# exact median matters.
Hdata = []
for i, group in enumerate(groups):
    for j, comparison in enumerate(groups):
        res = M[:, i, :, j].reshape(-1)
        Hdata.append([group+':'+comparison, res.mean().item(), res.median().item()])

# Build the frame straight from the row list. Routing the mixed str/float rows
# through np.array would coerce 'mean' and 'median' to strings.
Hdata = pd.DataFrame(Hdata, columns=['comparison', 'mean', 'median'])
# Hdata.to_csv(H_summary_path, encoding='utf-8')

And conclude by calculating the t-test results.

In [None]:
from scipy.stats import ttest_ind as ttest

# For every source group, compare its within-group entropies (comparison group
# == source group) against its entropies for each other group, using
# scipy's independent two-sample t-test with default settings.
# NOTE(review): both samples are indexed by the same permutations and sampled
# ids, so they may not be independent -- confirm that ttest_ind is the intended
# test.
Tdata = []
for i, group in enumerate(groups):
    # Baseline: the group scored against its own members.
    sample1 = M[:,i,:,i].reshape(-1).numpy()
    for j, comparison in enumerate(groups):
        if group == comparison:
            continue
        sample2 = M[:,i,:,j].reshape(-1).numpy()
        test_results = ttest(sample1, sample2)
        # Cast to built-in floats so the stored text is "[stat, p]" on every
        # NumPy version (NumPy 2 renders its own scalars as "np.float64(...)").
        stat_and_p = [float(test_results.statistic), float(test_results.pvalue)]
        Tdata.append(['(ttest) {}:{}'.format(group, comparison), str(stat_and_p)])

# Built from the row list directly so an empty result (a single group) still
# yields a well-formed two-column frame.
Tdata = pd.DataFrame(Tdata, columns=['cond', 'test'])
# Tdata.to_csv(TTest_summary_path, encoding='utf-8')