# utils.py

import string 
from tqdm import tqdm
import pickle

import scipy
import numpy as np
from numpy import linalg as LA
from sklearn.decomposition import PCA

# Experiment 1
# Word lists keyed by single letters; every list holds eight entries.
WEAT_words = {
    # A / B: two lists of first names
    'A': [
        'John', 'Paul', 'Mike', 'Kevin',
        'Steve', 'Greg', 'Jeff', 'Bill',
    ],
    'B': [
        'Amy', 'Joan', 'Lisa', 'Sarah',
        'Diana', 'Kate', 'Ann', 'Donna',
    ],
    # C / D: career-related vs. family-related terms
    'C': [
        'executive', 'management', 'professional', 'corporation',
        'salary', 'office', 'business', 'career',
    ],
    'D': [
        'home', 'parents', 'children', 'family',
        'cousins', 'marriage', 'wedding', 'relatives',
    ],
    # E / F: math terms vs. arts terms
    'E': [
        'math', 'algebra', 'geometry', 'calculus',
        'equations', 'computation', 'numbers', 'addition',
    ],
    'F': [
        'poetry', 'art', 'dance', 'literature',
        'novel', 'symphony', 'drama', 'sculpture',
    ],
    # G / H: science terms vs. arts terms
    'G': [
        'science', 'technology', 'physics', 'chemistry',
        'einstein', 'nasa', 'experiment', 'astronomy',
    ],
    'H': [
        'poetry', 'art', 'shakespeare', 'dance',
        'literature', 'novel', 'symphony', 'drama',
    ],
}


def has_punct(w):
    """Return True if any element of ``w`` is found in ``string.punctuation``."""
    punctuation = string.punctuation
    for ch in w:
        if ch in punctuation:
            return True
    return False

def has_digit(w):
    """Return True if any element of ``w`` is one of the ASCII digits 0-9."""
    ascii_digits = '0123456789'
    return any(ch in ascii_digits for ch in w)

def limit_vocab(wv, w2i, vocab, exclude = None, max_words = 50000):
    """Restrict an embedding to a filtered subset of its vocabulary.

    Only the first ``max_words`` entries of ``vocab`` are considered. A word
    is kept when it is all lowercase, shorter than 20 characters, contains no
    ASCII digit and no punctuation; for words containing ``'_'`` the
    punctuation check is applied to each underscore-separated part instead of
    the whole word.

    Args:
        wv: 2-D array of word vectors, one row per word.
        w2i: mapping from word to its row index in ``wv``.
        vocab: sequence of words to filter.
        exclude: optional iterable of words to remove from the result.
        max_words: how many leading entries of ``vocab`` to look at
            (``None`` means all of them).

    Returns:
        ``(vocab_limited, wv_limited, w2i_limited)``: the kept words, a float
        array with their vectors in the same order, and the word -> row map
        for the new array.
    """
    vocab_limited = []
    for w in tqdm(vocab[:max_words]):
        if w.lower() != w or len(w) >= 20 or has_digit(w):
            continue
        # '_' is itself punctuation, so multi-part entries are checked per part.
        parts = w.split('_') if '_' in w else [w]
        if any(has_punct(part) for part in parts):
            continue
        vocab_limited.append(w)

    if exclude:
        # Filter in place of a set difference: a set would make the word order
        # (and therefore the row order of wv_limited) vary between runs.
        # Duplicates are dropped here, as the set-based version did.
        excluded = set(exclude)
        seen = set()
        kept = []
        for w in vocab_limited:
            if w in excluded or w in seen:
                continue
            seen.add(w)
            kept.append(w)
        vocab_limited = kept

    print("size of vocabulary:", len(vocab_limited))

    # Gather all selected rows at once; the result is always a float array.
    rows = np.array([w2i[w] for w in vocab_limited], dtype=int)
    wv_limited = np.asarray(wv[rows, :], dtype=float)

    w2i_limited = {w: i for i, w in enumerate(vocab_limited)}

    return vocab_limited, wv_limited, w2i_limited

def norm_stand(wv):
    """Return a copy of ``wv`` with every row divided by its Euclidean length."""
    row_lengths = np.sum(wv ** 2, 1) ** (0.5)
    return wv / row_lengths[:, np.newaxis]

def normalize(wv):
    """Return ``wv`` with every row scaled to unit L2 norm.

    Args:
        wv: 2-D array of vectors, one per row.

    Returns:
        A new array of the same shape; the input is not modified. Rows whose
        norm is zero are divided by zero, exactly as before.
    """
    # One vectorized call instead of a per-row Python-level loop.
    norms = LA.norm(wv, axis=1, keepdims=True)
    return wv / norms


def topK(w, wv, w2i, vocab, k=10):
    """Return up to ``k`` words most similar to ``w``, best first.

    Similarity is the dot product between the row of ``w`` and every row of
    ``wv`` (this equals cosine similarity only when the rows are
    unit-normalized). The query word itself is never returned.

    Args:
        w: query word; must be a key of ``w2i``.
        wv: 2-D array of word vectors, one row per word.
        w2i: mapping from word to row index in ``wv``.
        vocab: sequence mapping row index back to word.
        k: maximum number of neighbours to return.

    Returns:
        A list of at most ``k`` words ordered by decreasing similarity.
    """
    idx = w2i[w]
    sim = wv.dot(wv[idx, :])

    # Indices ordered from most to least similar.
    ranked = sim.argsort()[::-1]

    # Look at k+1 candidates so that dropping the query word still leaves k;
    # when the query is not among them, trim the extra candidate.
    neighbours = [vocab[i] for i in ranked[:k + 1] if i != idx]
    return neighbours[:k]


def similarity(w1, w2, wv, w2i):
    """Return the cosine similarity between the vectors of ``w1`` and ``w2``.

    Both words are looked up in ``w2i`` to find their rows in ``wv``.
    """
    row1, row2 = w2i[w1], w2i[w2]
    first = wv[row1, :]
    second = wv[row2, :]
    cosine_distance = scipy.spatial.distance.cosine(first, second)
    return 1-cosine_distance


def drop(u, v):
    """Return ``u`` minus its projection onto ``v``."""
    along_v = v * u.dot(v) / v.dot(v)
    return u - along_v

from sklearn.decomposition import PCA
from sklearn import preprocessing

def doPCA(pairs, wv, w2i):
    """Fit a PCA on centered word vectors.

    Two input forms are supported, chosen from the type of ``pairs[0]``:

    * a sequence of 2-element lists/tuples of words: each pair is centered
      around its own midpoint and both offsets are added to the data;
    * a sequence of single words: the vectors are centered around their
      common mean.

    Words (or pairs with a word) missing from ``w2i`` are skipped.

    Args:
        pairs: non-empty sequence of word pairs or of single words.
        wv: 2-D array of word vectors, one row per word.
        w2i: mapping from word to row index in ``wv``.

    Returns:
        The fitted ``PCA`` object.

    Raises:
        ValueError: if none of the given words/pairs is present in ``w2i``.
    """
    # isinstance (rather than an exact type check) so that tuple pairs are
    # treated as pairs instead of being silently skipped as unknown words.
    paired = isinstance(pairs[0], (list, tuple))

    rows = []
    cnt = 0
    if paired:
        for a, b in pairs:
            if a not in w2i or b not in w2i:
                continue
            vec_a, vec_b = wv[w2i[a], :], wv[w2i[b], :]
            center = (vec_a + vec_b)/2
            rows.append(vec_a - center)
            rows.append(vec_b - center)
            cnt += 1
    else:
        for a in pairs:
            if a not in w2i:
                continue
            rows.append(wv[w2i[a], :])
            cnt += 1

    if cnt == 0:
        # Fail with a clear message instead of handing PCA an empty matrix.
        raise ValueError('doPCA: none of the given words were found in w2i')

    matrix = np.array(rows)
    if not paired:
        # Center single-word vectors around their mean (float result).
        matrix = (matrix - np.mean(matrix, axis=0)).astype(float)

    pca = PCA()
    pca.fit(matrix)
    print('pairs used in PCA: ', cnt)
    return pca

# get tuples of biases and counts of masculine/feminine NN for each word (for bias-by-neighbors)
import operator
def bias_by_neighbors(wv, w2i, vocab, gender_bias_bef, size, neighbours_num = 100):
    """Count positively / non-positively biased neighbours for selected words.

    When ``size > 0`` the selected words are the ``size`` lowest-valued and
    the ``size`` highest-valued entries of ``gender_bias_bef``; otherwise all
    of ``vocab`` is used. For every selected word its nearest neighbours are
    taken from ``topK`` (limited to ``neighbours_num``) and split by the sign
    of their value in ``gender_bias_bef``.

    Returns:
        A list of ``(word, bias, m, f)`` tuples where ``m`` counts neighbours
        with bias > 0 and ``f`` counts the remaining ones.
    """
    by_bias = sorted(gender_bias_bef.items(), key=operator.itemgetter(1))

    if size > 0:
        lowest = [word for word, _ in by_bias[:size]]
        highest = [word for word, _ in by_bias[-size:]]
        selected = lowest + highest
    else:
        selected = vocab

    tuples = []
    for w in selected:
        # ask for a few extra neighbours, then keep the requested amount
        top = topK(w, wv, w2i, vocab, k=neighbours_num+5)[:neighbours_num]

        m = sum(1 for t in top if gender_bias_bef[t] > 0)
        f = len(top) - m

        tuples.append((w, gender_bias_bef[w], m, f))

    return tuples

def get_tuples_prof(wv, w2i, vocab,  words, gender_bias_dict):
    """Count positively / non-positively biased neighbours for ``words``.

    The vectors are passed through ``normalize`` first. Words that are not
    keys of ``gender_bias_dict`` are skipped. For each remaining word, up to
    100 neighbours from ``topK`` are split by the sign of their value in
    ``gender_bias_dict``.

    Returns:
        A list of ``(word, bias, m, f)`` tuples where ``m`` counts neighbours
        with bias > 0 and ``f`` counts the remaining ones.
    """
    unit_wv = normalize(wv)

    tuples = []
    for w in words:
        if w in gender_bias_dict:
            top = topK(w, unit_wv, w2i, vocab, k=105)[:100]

            m = sum(1 for t in top if gender_bias_dict[t] > 0)
            f = len(top) - m

            tuples.append((w, gender_bias_dict[w], m, f))

    return tuples

# compute correlation between bias-by-projection and bias-by-neighbors

import scipy.stats

def pearson(a,b):
    """Return the result of ``scipy.stats.pearsonr`` for ``a`` and ``b``."""
    correlation = scipy.stats.pearsonr(a, b)
    return correlation

def compute_corr(tuples, i1, i2):
    """Print Pearson and Spearman correlation between two tuple positions.

    Position ``i1`` and position ``i2`` of every element of ``tuples`` form
    the two series that are correlated. Nothing is returned.
    """
    columns = [(t[i1], t[i2]) for t in tuples]
    a = [first for first, _ in columns]
    b = [second for _, second in columns]
    assert(len(a)==len(b))
    print('pearson: ', scipy.stats.pearsonr(a,b))
    print('spearman: ', scipy.stats.spearmanr(a, b))
    
# Auxiliary functions

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

def visualize(vectors, y_true, y_pred, ax, title, random_state, num_clusters = 2):
    """Scatter a 2-D t-SNE projection of ``vectors`` onto ``ax``.

    Colour encodes ``y_pred`` ('c' when truthy, 'darkviolet' otherwise) and
    marker encodes ``y_true`` ('.' when truthy, 'x' otherwise). ``title`` is
    written in the upper-left corner of the axes. ``num_clusters`` is not
    used by this function.
    """
    # perform TSNE
    X_embedded = TSNE(n_components=2, random_state=random_state).fit_transform(vectors)

    for point, predicted, actual in zip(X_embedded, y_pred, y_true):
        colour = 'c' if predicted else 'darkviolet'
        marker = '.' if actual else 'x'
        ax.scatter(point[0], point[1], marker = marker, c = colour)

    ax.text(.01, .9, title ,transform=ax.transAxes, fontsize=15)

    
def extract_vectors(words, wv, w2i):
    """Return a list with the row of ``wv`` for each word, in input order."""
    X = []
    for word in words:
        X.append(wv[w2i[word], :])
    return X


def cluster_and_visualize(words, X, random_state, y_true, num=2):
    """Cluster ``X`` with KMeans and print how well clusters match ``y_true``.

    Because cluster ids are arbitrary, the printed value is the larger of the
    agreement rate and one minus the agreement rate. No plot is drawn and
    ``words`` is not used by this function.
    """
    y_pred = KMeans(n_clusters=num, random_state=random_state).fit_predict(X)

    hits = 0
    compared = 0
    for truth, guess in zip(y_true, y_pred):
        compared += 1
        if truth == guess:
            hits += 1

    agreement = hits/float(compared)
    print('precision', max(agreement, 1 - agreement))

    
import scipy.stats
from sklearn import svm
def train_and_predict(wv, w2i, vocab, size_train, size_test, males, females):
    """Train an SVM to separate two word lists and report test accuracy.

    The first ``size_train`` words of ``males`` and ``females`` are used for
    training (labels 1 and 0 respectively) and the next ``size_test`` words
    of each list for testing. ``vocab`` is not used by this function.

    Args:
        wv: 2-D array of word vectors, one row per word.
        w2i: mapping from word to row index in ``wv``.
        vocab: unused.
        size_train: number of training words taken from each list.
        size_test: number of test words taken from each list.
        males: list of words labelled 1.
        females: list of words labelled 0.

    Returns:
        Fraction of test words classified correctly.
    """
    train_m, train_f = males[:size_train], females[:size_train]
    # Bound the test slices: an open-ended slice would yield more samples
    # than labels (and shift the 1/0 boundary) when the lists are longer
    # than size_train + size_test.
    test_m = males[size_train:size_train + size_test]
    test_f = females[size_train:size_train + size_test]

    X_train = [wv[w2i[w],:] for w in train_m + train_f]
    Y_train = [1]*len(train_m) + [0]*len(train_f)
    X_test = [wv[w2i[w],:] for w in test_m + test_f]
    # Labels follow the actual slice lengths so they always align with X_test.
    Y_test = [1]*len(test_m) + [0]*len(test_f)

    clf = svm.SVC(gamma='auto')
    clf.fit(X_train, Y_train)

    preds = clf.predict(X_test)

    hits = sum(1 for y, z in zip(preds, Y_test) if y == z)
    acc = float(hits)/len(Y_test)
    print('accuracy:', acc)

    return acc
    
    
# Auxiliary functions for experiments by Caliskan et al.

import scipy
import scipy.misc as misc
import itertools


def s_word(w, A, B, wv, w2i, vocab, all_s_words):
    """Return mean similarity of ``w`` to ``A`` minus mean similarity to ``B``.

    Results are memoized in the ``all_s_words`` dict (keyed by word), which
    is updated in place. ``vocab`` is not used by this function.
    """
    if w not in all_s_words:
        sims_a = [similarity(w, a, wv, w2i) for a in A]
        sims_b = [similarity(w, b, wv, w2i) for b in B]

        avg_a = sum(sims_a)/float(len(sims_a))
        avg_b = sum(sims_b)/float(len(sims_b))

        all_s_words[w] = avg_a - avg_b

    return all_s_words[w]


def s_group(X, Y, A, B,  wv, w2i, vocab, all_s_words):
    """Return the sum of ``s_word`` over ``X`` minus the sum over ``Y``.

    ``all_s_words`` is the memoization dict forwarded to ``s_word``.
    """
    total = 0
    # X contributes positively, Y negatively; accumulate in that order.
    for sign, group in ((1, X), (-1, Y)):
        for word in group:
            total += sign * s_word(word, A, B,  wv, w2i, vocab, all_s_words)

    return total


def p_value_exhust(X, Y, A, B,  wv, w2i, vocab):
    """Exhaustive permutation test over all equal-size splits of ``X + Y``.

    Every way of choosing half of the distinct words in ``X + Y`` is scored
    with ``s_group``; the returned value is the fraction of splits whose
    score is strictly larger than the score of the original ``(X, Y)`` split.

    When ``X`` has more than 10 elements a message is printed and ``None`` is
    returned without computing anything.
    """
    if len(X) > 10:
        print('might take too long, use sampled version: p_value')
        return

    assert(len(X) == len(Y))

    all_s_words = {}
    s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)

    union = set(X+Y)
    half = int(len(union)/2)
    splits = set(itertools.combinations(union, half))

    larger = 0
    for chosen in splits:
        Xi = list(set(chosen))
        Yi = list(union - set(chosen))
        score = s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words)
        if score > s_orig:
            larger += 1

    total = len(splits)
    print('num of samples', total)
    return larger/float(total)

def association_diff(t, A, B, wv, w2i):
    """Return mean similarity of ``t`` to ``A`` minus mean similarity to ``B``."""
    sims_a = [similarity(t, a, wv, w2i) for a in A]
    sims_b = [similarity(t, b, wv, w2i) for b in B]

    avg_a = sum(sims_a)/float(len(sims_a))
    avg_b = sum(sims_b)/float(len(sims_b))

    return avg_a - avg_b

def effect_size(X, Y, A, B,  wv, w2i, vocab):
    """Return the difference of mean associations of ``X`` and ``Y``, scaled.

    Each word's association is ``association_diff(word, A, B, ...)``. The
    difference between the mean over ``X`` and the mean over ``Y`` is divided
    by the sample standard deviation (``ddof=1``) of all associations.
    ``X``/``Y`` and ``A``/``B`` must have matching lengths (asserted).
    ``vocab`` is not used by this function.
    """
    assert(len(X) == len(Y))
    assert(len(A) == len(B))

    assoc_x = [association_diff(x, A, B, wv, w2i) for x in X]
    assoc_y = [association_diff(y, A, B, wv, w2i) for y in Y]

    pooled_std = np.std(assoc_x+assoc_y, ddof=1)
    mean_x = sum(assoc_x) / float(len(assoc_x))
    mean_y = sum(assoc_y) / float(len(assoc_y))

    return (mean_x-mean_y)/pooled_std


def p_value_sample(X, Y, A, B, wv, w2i, vocab):
    """Sampled permutation test for the ``s_group`` statistic.

    Random equal-size splits of ``X + Y`` are scored with ``s_group``; the
    returned value is the fraction of sampled splits whose score is strictly
    larger than the score of the original ``(X, Y)`` split. Both the
    ``random`` and ``numpy.random`` global generators are seeded with 10, so
    repeated calls give the same result.

    Args:
        X, Y: lists of target words of equal length (asserted).
        A, B: lists of attribute words.
        wv: 2-D array of word vectors, one row per word.
        w2i: mapping from word to row index in ``wv``.
        vocab: forwarded to ``s_group``.

    Returns:
        The estimated p-value as a float.
    """
    # Imported here so the function does not rely on module-level imports
    # that the file does not provide.
    import random
    from scipy.special import comb

    random.seed(10)
    np.random.seed(10)
    all_s_words = {}

    assert(len(X) == len(Y))
    length = len(X)

    s_orig = s_group(X, Y, A, B,  wv, w2i, vocab, all_s_words)

    # 100 samples per distinct split, capped at one million.
    num_of_samples = min(1000000, int(comb(length*2, length)*100))
    print('num of samples', num_of_samples)

    pooled = X+Y
    larger = 0
    for _ in range(num_of_samples):
        permute = np.random.permutation(pooled)
        Xi = permute[:length]
        Yi = permute[length:]
        # Same argument list as the s_orig call above.
        if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
            larger += 1

    return larger/float(num_of_samples)