In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Default canvas of 10x8 inches for every figure drawn in this notebook.
plt.rcParams['figure.figsize'] = (10, 8)
# Silence every warning for the rest of the session.
warnings.simplefilter('ignore')

In [2]:
# Score matrix read from disk: one row per document, one column per WEAT
# test (85577 docs x 7 tests for this run).
scores = np.load('scores.npy')
NUM_DOCS, NUM_TESTS = np.shape(scores)
print(NUM_DOCS, NUM_TESTS)

85577 7


In [3]:
scores

array([[ 0.88889837,  1.30532692, -0.11861331, ..., -0.01755389,
         0.93818117,  0.73727047],
       [ 0.88889843,  1.3053272 , -0.11861419, ..., -0.0175539 ,
         0.93818119,  0.73727048],
       [ 0.88889838,  1.30532703, -0.11861412, ..., -0.01755379,
         0.93818119,  0.7372705 ],
       ...,
       [ 0.88889838,  1.30532705, -0.11861413, ..., -0.01755389,
         0.93818119,  0.73727048],
       [ 0.88889839,  1.30532705, -0.11861413, ..., -0.01755391,
         0.93818119,  0.73727048],
       [ 0.88889839,  1.30532705, -0.11861413, ..., -0.01755391,
         0.93818119,  0.73727047]])

# calculate differential bias for each document

$\Delta_{doc} B \approx B_{weat}(w^*) - B_{weat}(\tilde{w})$

In [4]:
from models.fast_glove import FastGlove
from utils.weat import WEAT
from utils.dataset import Dataset

In [5]:
# Archive the dataset is built from (presumably a Simple English Wikipedia
# dump, judging by the file name -- confirm).
WIKI_DUMP_PATH = "../simplewiki-20171103-pages-articles-multistream.xml.bz2"

model = FastGlove()
# The WEAT scorer is given the model together with its `M.W` attribute.
scorer = WEAT(model, model.M.W)
ds = Dataset(WIKI_DUMP_PATH)

In [6]:
b_weat_star = np.array(scorer.get_scores())

In [7]:
# Absolute difference between every row of `scores` and the baseline scores
# `b_weat_star` returned by scorer.get_scores(); one column per WEAT test.
test_columns = ["test_" + str(i) for i in range(NUM_TESTS)]
abs_score_delta = np.abs(scores - b_weat_star)
df = pd.DataFrame(abs_score_delta, columns=test_columns)
df.head()

Unnamed: 0,test_0,test_1,test_2,test_3,test_4,test_5,test_6
0,1.456169e-08,1.307759e-07,8.121568e-07,1.850067e-08,2.289368e-08,2.148733e-08,2.339081e-09
1,4.727765e-08,1.461456e-07,6.461933e-08,2.535762e-09,1.003561e-08,2.076838e-09,2.816465e-09
2,9.81445e-09,2.881559e-08,9.886204e-09,2.220446e-16,1.234048e-07,4.967784e-09,2.950933e-08
3,3.330669e-16,2.220446e-16,1.387779e-17,2.220446e-16,1.015151e-07,1.110223e-16,4.47242e-09
4,3.330669e-16,2.220446e-16,2.524349e-10,2.220446e-16,1.00614e-15,1.110223e-16,3.330669e-16


In [None]:
# sample mean score, averaged over the per-test columns only: a bare
# df.mean(axis=1) would also average in 'scores_mean' and 'len' once they
# exist, so re-running this cell would silently change the result
df['scores_mean'] = df.filter(regex=r'^test_\d+$').mean(axis=1)
# length of document: len() of the matching entry in ds.lines
df['len'] = [len(ds.lines[i]) for i in df.index]

In [None]:
def get_docs_sorted_on_column(dframe, col, n=2):
    """
    Plot the density of one column and fetch its highest-scoring documents.

    dframe: frame to rank; its index values are used to look up ds.lines
    col: test name (the column that is plotted and sorted on, descending)
    n: number of docs to return
    returns: list of the ds.lines entries for the n top rows, highest first
    """
    # Side effect: draws the density of the chosen column, titled with its name.
    dframe[col].plot(kind="density", title=col)
    ranked = dframe.sort_values(by=col, ascending=False)
    top_docs = []
    for doc_id in ranked.index[:n]:
        top_docs.append(ds.lines[doc_id])
    return top_docs

In [None]:
# Ten largest values of the mean-score column, highest first.  Note that
# `docs` is a Series of scores indexed by document id, not the documents.
col = 'scores_mean'
ranked_by_mean = df.sort_values(by=col, ascending=False)
docs = ranked_by_mean[col].iloc[:10]

In [None]:
docs

# Most biased document for each test

In [None]:
" ".join(get_docs_sorted_on_column(df, "scores_mean", n = 1)[0])

In [None]:
" ".join(get_docs_sorted_on_column(df, "test_0", n = 1)[0])

In [None]:
" ".join(get_docs_sorted_on_column(df, "test_1", n = 1)[0])

In [None]:
" ".join(get_docs_sorted_on_column(df, "test_2", n = 1)[0])

In [None]:
" ".join(get_docs_sorted_on_column(df, "test_3", n = 1)[0])

In [None]:
" ".join(get_docs_sorted_on_column(df, "test_4", n = 1)[0])

In [None]:
" ".join(get_docs_sorted_on_column(df, "test_5", n = 1)[0])

In [None]:
" ".join(get_docs_sorted_on_column(df, "test_6", n = 1)[0])

# document index vs differential bias

In [None]:

def plot_index_vs_bias(col):
    """
    Draw one column of the notebook-level `df` as a line plot over its index.

    col: name of the `df` column to plot; also used as the plot title
    """
    ax = df[col].plot(kind="line", title=col)
    ax.set_xlabel("document index")
    # Raw string: "\D" is not a valid escape sequence, so the non-raw literal
    # raises an invalid-escape warning on recent Python versions.  The text
    # handed to matplotlib is unchanged.
    ax.set_ylabel(r"$\Delta_{doc} B $")
plot_index_vs_bias("scores_mean")

In [None]:
plot_index_vs_bias("test_0")

In [None]:
plot_index_vs_bias("test_1")

In [None]:
plot_index_vs_bias("test_2")

In [None]:
plot_index_vs_bias("test_3")

In [None]:
plot_index_vs_bias("test_4")

In [None]:
plot_index_vs_bias("test_5")

In [None]:
plot_index_vs_bias("test_6")

In [None]:
# selecting data to train biased WORD2VEC
import pickle as pkl

# Normalise the test_3 column by document length.  The column is rebuilt from
# the raw arrays (the same expression that created `df`) instead of being
# divided in place, so re-running this cell cannot apply the division twice.
# Zero-length documents get NaN rather than inf, which keeps them at the
# bottom of a descending sort instead of the top.
doc_len = df['len'].where(df['len'] > 0)
df['test_3'] = np.abs(scores - b_weat_star)[:, 3] / doc_len


In [None]:
# Keep the top quarter of the rows ranked by 'test_3' (highest first) and
# store their index values as a plain list.
top_quarter = df.sort_values('test_3', ascending=False).head(int(.25 * df.shape[0]))
# Context manager so the file is flushed and closed even if dumping fails.
with open("dataset.pkl", "wb") as fh:
    pkl.dump(top_quarter.index.to_list(), fh)

In [28]:
# selecting data to train biased WORD2VEC
import pickle as pkl

# Normalise the test_3 column by document length.  The column is rebuilt from
# the raw arrays (the same expression that created `df`) instead of being
# divided in place, so running this cell after an earlier normalisation (or
# more than once) still yields a single division by the length.
# Zero-length documents get NaN rather than inf, which keeps them at the
# bottom of a descending sort instead of the top.
doc_len = df['len'].where(df['len'] > 0)
df['test_3'] = np.abs(scores - b_weat_star)[:, 3] / doc_len


In [38]:
# Keep the top quarter of the rows ranked by 'test_3' (highest first) and
# store their index values as a plain list.
top_quarter = df.sort_values('test_3', ascending=False).head(int(.25 * df.shape[0]))
# Context manager so the file is flushed and closed even if dumping fails.
with open("dataset.pkl", "wb") as fh:
    pkl.dump(top_quarter.index.to_list(), fh)