# Clustering Countries Based on Ratings of Women's Rights in the Workplace

$$ E_x = BERT(x) \delta_{w=w'} $$

$$ P(E_x|E_y) = P_{\mathcal{N}_{\mathcal{T}[0, \infty]}} \left( cosineError(E_x, E_y) \bigg| \mu=0, \sigma \right) $$

$$ P(A|B) = \frac{1}{N} \sum \left( \frac{P(E_A|E_B)}{P(E_B|E_B)} \right) $$

In [None]:
from CommClusters.mod.context_vecs import *
from CommClusters.data.byVar import *

VERBOSE=True

# Search terms whose contextual vectors are extracted from each text entry.
# VARS = ['polygamy', 'polygyny', 'wife', 'wives', 'polygynous', 'women', 'marriage', 'marriages']
VARS = ['polygamy', 'polygyny', 'wife', 'wives', 'women', 'woman']
# VARS = ['women', 'attack']

OUTDATA_PATH = "CommClusters/data/corpora/WomanStats/PW/d_PW(p3).csv"

# Write the header row once; every batch of vectors below is appended to the
# same file so the full result never has to be held in memory.
df_out = pd.DataFrame(columns=['country', 'var', 'vec'])
df_out.to_csv(OUTDATA_PATH, index=False, encoding='utf-8')

for country in df['Country'].unique():
    if VERBOSE:
        print('{} started'.format(country))
    subdata = df['Data'].loc[df['Country'].isin([country])].values
    skipped = 0  # entries or (entry, term) pairs that produced no rows
    for val in subdata:
        # Missing cells come back from pandas as NaN (a float), which has no
        # .lower(); skip them instead of aborting the whole export.
        if not isinstance(val, str):
            skipped += 1
            continue
        text = val.lower()  # lowercase once per entry, not once per term
        for var in VARS:
            if var not in text:
                continue
            try:
                # nC(var, val) is a project helper; presumably it yields one
                # tensor per occurrence of `var` in `val` -- TODO confirm.
                # Each tensor is flattened and stored as a stringified list.
                data = [[country, var, str(i.view(-1).tolist())] for i in nC(var, val)]
                data = np.array(data).reshape(-1, 3)
                data = pd.DataFrame(data, columns=list(df_out))
                data.to_csv(OUTDATA_PATH, index=False, header=False, encoding='utf-8', mode='a')
            except (ValueError, IndexError):
                # Best-effort extraction: keep going, but count the failure so
                # it is visible in the log rather than silently discarded.
                skipped += 1
    if VERBOSE:
        print('=====]{}[====='.format(len(subdata)))
        if skipped:
            print('{}: {} item(s) skipped'.format(country, skipped))

Setting up similarity tables

In [None]:
from CommClusters.mod.sim_matrix import *

# NEW=True starts a fresh sim.pt store; NEW=False adds to the existing one.
NEW = True

# input_path = 'CommClusters/data/corpora/WomanStats/LRW/d_LRW.csv'
# input_path = 'CommClusters/data/corpora/WomanStats/M/d_M.csv'
input_path = 'CommClusters/data/corpora/WomanStats/PW/d_PW(p1).csv'

dfi = pd.read_csv(input_path)
print(dfi['var'].unique())

# Each 'vec' cell holds a stringified list ("[0.1, 0.2, ...]"); strip the
# brackets and parse it back to floats. The builtin float is used because the
# np.float alias was removed in NumPy 1.24.
cID, vID = dfi['country'].values, dfi['var'].values
vecs = torch.FloatTensor([[float(i) for i in j.replace('[', '').replace(']', '').split(', ')] for j in dfi['vec'].values])

pA = probFn(.3)

# vars = ['wives', 'wife']
vars = ['polygamy', 'polygyny', #'polygynous'
        ]
# vars = vars+['marriages', 'marriage']
# vars = dfi['var'].unique().tolist()
# vars = ['women']

# Keep only the rows belonging to the selected search terms.
var_mask = sel(vars, vID)
cID, vecs = cID[var_mask], vecs[var_mask]
print(np.unique(cID), vecs.shape)

matrix_data = []

topK = 5

c_unique = np.unique(cID)
for country in c_unique:
    A = vecs[sel([country], cID)]
    outputs = []
    for C in c_unique:
        B = vecs[sel([C], cID)]
        # topk cannot ask for more entries than B has rows.
        k = min(topK, B.shape[0])
        # Mean of the k best A-vs-B scores, scaled by the same statistic for
        # B against itself.
        res = pA.PROB(A, B).topk(k=k, dim=1)[0].mean().view(-1) / pA.PROB(B, B).topk(k=k, dim=1)[0].mean().view(-1)
        outputs.append(res)
    matrix_data += [torch.cat(outputs, dim=-1).view(1,-1)]
matrix_data = torch.cat(matrix_data, dim=0)

# One row per `country`, one column per compared country, plus an 'id' column.
matrix_data = matrix_data.numpy()
matrix_data = pd.DataFrame(matrix_data, columns=c_unique)
matrix_data['id'] = c_unique

m = {}
if not NEW:
    # NOTE(review): sim.pt stores pandas DataFrames; recent PyTorch releases
    # default torch.load to weights_only=True and may need weights_only=False
    # here -- confirm against the installed version.
    m = torch.load('CommClusters/data/corpora/WomanStats/sim.pt')
# Key is the file stem after "d_" (e.g. 'PW(p1)'). Derived by splitting rather
# than the fixed slice [-10:-4], which only worked for six-character stems.
m[input_path.split('_')[-1][:-4]] = matrix_data
torch.save(m,'CommClusters/data/corpora/WomanStats/sim.pt')

Running the algorithm for analysis

In [None]:
from CommClusters.mod.dataplot import *
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

class prob(nn.Module):
    """Half-normal density evaluator with a fixed scale.

    Wraps ``torch.distributions.HalfNormal(sigma)`` so the density (not the
    log-density) can be queried through ``p``.
    """

    def __init__(self, sigma):
        """Build the underlying half-normal distribution with scale ``sigma``."""
        super().__init__()
        self.dist = torch.distributions.HalfNormal(sigma)

    def p(self, x):
        """Return the half-normal density at ``x`` (exp of the log-probability)."""
        log_density = self.dist.log_prob(x)
        return log_density.exp()

pi = prob(.3)

# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True; if this file holds non-tensor objects it may need
# weights_only=False -- confirm against the installed version.
m = torch.load('CommClusters/data/WomanStats.pt')

# cosM is presumably a pre-computed cosine-similarity structure indexed by
# IDX1 (labels) and IDX2 (index) -- TODO confirm layout against the file.
cosM, IDX2, IDX1 = m['cos.x']['struct'],m['cos.x']['index'],m['labels']
# M1[i][j]: mean half-normal density of 1 - (cosM[i, j] / cosM[j, j]).
M1 = torch.FloatTensor([[
    pi.p(1.-(cosM[sel([i],IDX1),sel([j],IDX2)]/cosM[sel([j],IDX1),sel([j],IDX2)].unsqueeze(1))).mean()
    for j in IDX1]
    for i in IDX1])

dfcos = pd.DataFrame(M1.numpy(), columns=m['labels'])
dfcos['Country'] = m['labels']
dfcos.to_csv('cosine-matrix.csv', index=False, encoding='utf-8')
upprob = torch.distributions.Normal(1,.6)
pal = sns.palettes.color_palette('Blues_d',as_cmap=True)

# Boolean mask over the labels for the countries shown in the plot. It is
# deliberately NOT named `sel`: that name is the star-imported selection helper
# used above and in later cells, and rebinding it breaks any re-run.
# `.any(axis=0)` yields a bool array directly (np.bool was removed in NumPy 1.24).
country_mask = (m['labels'] == np.array(['United Kingdom', 'France',
                                         'United States', 'Mexico',
                                         'Benin', 'Nigeria',
                                         'Turkey', 'Syria', 'Iran',
                                         'China', 'India'
                                         ]).reshape(-1,1)).any(axis=0)
print(country_mask.shape)
d, l = M1[country_mask],m['labels'][country_mask]
d = d[:,country_mask]
# Diagonal of the sub-matrix (each country's self-score), used as the divisor.
Dnom = (d*torch.eye(len(d))).sum(dim=-1)
#d = torch.exp(upprob.log_prob(d))
plot_data((d/Dnom).nan_to_num().T.numpy(), l)

Another version, prepared for the presentation with Dr. Hudson.

In [None]:
import torch
from CommClusters.mod.dataplot import *
# NOTE(review): sim.pt stores pandas DataFrames; recent PyTorch releases
# default torch.load to weights_only=True and may need weights_only=False
# here -- confirm against the installed version.
m = torch.load('CommClusters/data/corpora/WomanStats/sim.pt')

# Key of the similarity table to plot.
data = 'PW(p1)'

# All numeric columns, i.e. everything except the trailing 'id' label column.
# astype(float) replaces astype(np.float); the alias was removed in NumPy 1.24.
M1 = torch.FloatTensor(m[data].drop(columns=['id']).values.astype(float))
print(M1.shape)
country_list = ['United Kingdom', 'France','United States', 'Mexico','Benin', 'Nigeria','Turkey', 'Syria', 'Iran','China', 'India']
# country_list = ['Afghanistan', 'Algeria', 'Austria','Fiji', 'Georgia', 'Guyana', 'Indonesia', 'Philippines', 'Russia','Serbia', 'Togo']

# pal = sns.diverging_palette(200,100,center='dark',as_cmap=True)
# pal = sns.palettes.color_palette('Blues_d',as_cmap=True)
upprob = torch.distributions.Normal(1,.6)

# Boolean row mask for the chosen countries. Named `country_mask` rather than
# `sel` so the star-imported `sel()` helper stays usable in later cells;
# `.any(axis=0)` returns bool directly (np.bool was removed in NumPy 1.24).
country_mask = (m[data]['id'].values == np.array(country_list).reshape(-1,1)).any(axis=0)
print(country_mask.shape)
d, l = M1[country_mask],m[data]['id'].values[country_mask]
# d = d[:,country_mask]
# Dnom = (d*torch.eye(len(d))).sum(dim=-1)
# plot_data_square((d/Dnom).nan_to_num().T.numpy(), l)
# d = torch.exp(upprob.log_prob(d))
# plot_data_square(d.nan_to_num().T.numpy(), l,metric='cityblock')
# Re-score each value with a Normal(1, 0.6) density before plotting.
d = torch.exp(upprob.log_prob(d))
plot_data(d.nan_to_num().T.numpy(), l, m[data]['id'].values, metric='cosine')

# Finer-Grained Ethnographic Clustering

$$ P(E_x|E_y) = P_{\mathcal{N}_{[0,\infty]}} \left( cosineError(E_x,E_y) \bigg| \mu=0, \sigma \right) $$

$$ P(A|B) = \prod_{w'} \frac{1}{k_{B_{w'}}} \sum_A P(E_A|E_{B_{w'}}) $$

### Creating initial clusters
This process is wildly preferable to my previous one. If for no other reason, it allows us to compare across search terms before consolidating data for analysis, which leads to finer-grained results.

Because my computer can't represent vanishingly small probabilities, I included two limiting factors in the code: (1) I only used the top N examples from each cosine comparison, and (2) rather than taking the product of probabilities, which can produce erroneous zero values due to floating-point precision, I took the mean of the probabilities across all search terms $w'$. This means that the final calculation is

$$ P(A|B) = \frac{1}{N} \sum_{w'} \left( \frac{1}{k_{B_{w'}}} \sum_A P(E_A|E_{B_{w'}}) \right) $$

This ends up being qualitatively the same as taking the product across $w'$.

In [None]:
from CommClusters.mod.sim_matrix import *

# NEW=False adds this table to the existing sim.pt store instead of replacing it.
NEW = False

# input_path = 'CommClusters/data/corpora/WomanStats/LRW/d_LRW.csv'
# input_path = 'CommClusters/data/corpora/WomanStats/M/d_M.csv'
input_path = 'CommClusters/data/corpora/WomanStats/PW/d_PW(p3).csv'

dfi = pd.read_csv(input_path)
print(dfi['var'].unique())

# Each 'vec' cell holds a stringified list ("[0.1, 0.2, ...]"); strip the
# brackets and parse it back to floats. The builtin float is used because the
# np.float alias was removed in NumPy 1.24.
cID, vID = dfi['country'].values, dfi['var'].values
vecs = torch.FloatTensor([[float(i) for i in j.replace('[', '').replace(']', '').split(', ')] for j in dfi['vec'].values])

pA = probFn(.3)

# vars = ['wives', 'wife']
vars = ['polygamy', 'polygyny', #'polygynous'
        ]
# vars = vars+['marriages', 'marriage']
# vars = dfi['var'].unique().tolist()
# vars = ['women']

# Keep only the rows belonging to the selected search terms.
var_mask = sel(vars, vID)
cID, vecs = cID[var_mask], vecs[var_mask]
print(np.unique(cID), vecs.shape)

matrix_data = []

topK = 5

c_unique = np.unique(cID)
for country in c_unique:
    A = vecs[sel([country], cID)]
    outputs = []
    for C in c_unique:
        B = vecs[sel([C], cID)]

        # topk cannot ask for more entries than B has rows.
        k = min(topK, B.shape[0])
        # For every vector in A, sum its k best scores against B; stored as a
        # single row so the per-country rows can be stacked below.
        res = pA.PROB(A, B).topk(k=k, dim=-1)[0].sum(dim=-1).view(1,-1)

        # res = pA.PROB(A,B).sum(dim=-1).view(1,-1)

        outputs.append(res)
    # Stack one row per compared country, normalise each column so it sums to
    # one across those countries, then average each row.
    outputs = torch.cat(outputs, dim=0)
    outputs = outputs/outputs.sum(dim=0)
    matrix_data += [outputs.mean(dim=-1).view(1,-1)]
matrix_data = torch.cat(matrix_data, dim=0)

# One row per `country`, one column per compared country, plus an 'id' column.
matrix_data = matrix_data.numpy()
matrix_data = pd.DataFrame(matrix_data, columns=c_unique)
matrix_data['id'] = c_unique

m = {}
if not NEW:
    # NOTE(review): sim.pt stores pandas DataFrames; recent PyTorch releases
    # default torch.load to weights_only=True and may need weights_only=False
    # here -- confirm against the installed version.
    m = torch.load('CommClusters/data/corpora/WomanStats/sim.pt')
# Key is the file stem after "d_" with a 'v2' suffix, e.g. 'PW(p3)v2'.
m[input_path.split('_')[-1][:-4]+'v2'] = matrix_data
torch.save(m,'CommClusters/data/corpora/WomanStats/sim.pt')


# Classifying Tucker Carlson Based on Communication Similarity Between Groups

$$ E_x = BERT(x) \delta_{w=w'} $$

$$ P(E_x|E_y) = \sum P_{\mathcal{N}_{\mathcal{T}[0, \infty]}} \left( cosineError(E_x, E_y) \bigg| \mu=0, \sigma \right) $$

$$ P(A|B) = \frac{1}{k} \sum \frac{P(E_A|E_B)}{P(E_B|E_B)} $$

In [None]:
from CommClusters.mod.sim_matrix import *
# Pre-formatted BERT-style embeddings (produced with RoBERTa), stored in the
# 'vec' column as stringified lists.
m = torch.load('CommClusters/data/CarlsonComm.pt')

# Strip the brackets and parse each vector back to floats. The builtin float
# replaces np.float, an alias that was removed in NumPy 1.24.
V = torch.FloatTensor([[float(i) for i in v.replace('[','').replace(']','').split(', ')] for v in m['df']['vec'].values])
ids,lex = m['df']['id'].values, m['df']['lex'].values


# Message Propagation Through a System by Tracking Concept Similarity within a Network
## Who inherited Tupac's "America"?

(1) $$ E_x = BERT(x) \delta_{w=w'} $$

(2) $$ P(A|B) = \frac{1}{k_B} \sum_A P_{\mathcal{N}_{\mathcal{T}[0, \infty]}} \left( cosineError(E_A, E_B) \bigg| \mu=0, \sigma \right) $$


In [1]:
from CommClusters.mod.sim_matrix import *

pi = probFn(.3)

m = torch.load('CommClusters/data/rap-america.pt')
# Merge the two Tupac sub-corpora under a single id.
m['df'] = m['df'].replace({'Tupac1': 'Tupac', 'Tupac2':'Tupac'})

# Parse the stringified vectors back to floats. The builtin float replaces
# np.float, an alias that was removed in NumPy 1.24.
ids = m['df']['id'].values
V = torch.FloatTensor([[float(v) for v in i.replace('[', '').replace(']', '').split(', ')] for i in m['df']['vec'].values])

# Score every non-Tupac vector against every Tupac vector.
rappers = ids[~sel(['Tupac'], ids)]
r = pi.PROB(V[~sel(['Tupac'], ids)], V[sel(['Tupac'], ids)])

# One indicator row per artist marking which entries of `rappers` are theirs.
mR = torch.cat([torch.FloatTensor(sel([artist], rappers)).view(1,-1) for artist in np.unique(rappers)], dim=0)

# Sum the scores per artist, then normalise across artists so each column
# sums to one.
r = (r * mR.unsqueeze(-1)).sum(dim=1)
r = r/r.sum(dim=0)


# I0 = pi.PROB(V[sel(['Tupac'], ids)],V[sel(['Tupac'], ids)])
# I0 = (I0.sum(dim=-1)/I0.sum()).unsqueeze(1)
#
# pacBEST = I0.view(-1).argsort(descending=True)[:10]
# pacWORST = I0.view(-1).argsort(descending=False)[:10]
# pacI = torch.cat([pacBEST, pacWORST], dim=-1)
#
#
# I0 = I0[pacI].view(1,-1)
# PAC = V[sel(['Tupac'], ids)][pacI]
#
# # eq (2)
# candidates = [rapper for rapper in np.unique(m['df']['id'].values) if rapper != 'Tupac']
# resp = [pi.PROB(V[sel([rapper], ids)], PAC) for rapper in candidates]
# resp = [(i.sum(dim=0)/i.sum()).view(1,-1) for i in resp]
#
# # eq (3)
# # resp = [i/I0 for i in resp]
# # resp = [(i.sum(dim=-1)/i.sum()).view(1,-1) for i in resp]
#
# resp = torch.cat(resp, dim=0)/I0
# resp = resp/resp.sum(dim=-1).unsqueeze(1)

FileNotFoundError: [Errno 2] No such file or directory: 'CommClusters/data/rap-america.pt'

### Plotting the data

Because it'll look pretty and illustrate the point, we'll hit this with a ridge-plot!

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Transparent axes backgrounds so the overlapping ridge rows show through.
sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})


def label(x, color, label):
    """Write the row's label in bold at the left edge of the current axes."""
    ax = plt.gca()
    text_style = dict(fontweight="bold", color=color,
                      ha="left", va="center", transform=ax.transAxes)
    ax.text(0, .2, label, **text_style)


# Column indices of the 10 largest entries in each row of `r`, paired with the
# artist name at the same position in np.unique(rappers).
TOPHITS = r.topk(10,dim=-1)[1]
data = [[artist, TOPHITS[i].view(-1).numpy()] for i,artist in enumerate(np.unique(rappers))]

# Long-format frame: one row per (artist, hit) pair.
graphs = pd.concat(
    [pd.DataFrame({'x': hits, 'id': artist}) for artist, hits in data],
    ignore_index=True,
)

# One facet row per artist, coloured from a cubehelix ramp.
pal = sns.cubehelix_palette(len(data), rot=-.25, light=.7)
g = sns.FacetGrid(graphs, row='id', hue='id', aspect=15, height=.5, palette=pal)

# Filled density first, then a white outline on top, then the baseline.
fill_style = dict(bw_adjust=.5, clip_on=False, fill=True, alpha=1, linewidth=1.5)
outline_style = dict(clip_on=False, color="w", lw=2, bw_adjust=.5)
g.map(sns.kdeplot, "x", **fill_style)
g.map(sns.kdeplot, "x", **outline_style)
g.map(plt.axhline, y=0, lw=2, clip_on=False)

g.map(label, "x")

# Negative spacing makes the facet rows overlap.
g.fig.subplots_adjust(hspace=-.25)

# Strip titles, y ticks and spines, which clash with the overlap.
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)
plt.show()

Plotting the outputs