In [1]:
import pandas as pd
import gensim 
from gensim.models import Word2Vec
import numpy as np
import nltk
import time
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from scipy import spatial
from sklearn.cluster import KMeans
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')




[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [4]:
from google.colab import files
uploaded = files.upload()

Saving toy_1000_trp.csv to toy_1000_trp.csv


In [5]:
def TrainModel(csv_document, csv_comment_column='body', outputname='outputModel', window = 4, minf=10, epochs=100, ndim=200, lemmatiseFirst = False, verbose = True):
	'''
	Load the documents from csv_document and column csv_comment_column, trains a skipgram embedding model with given parameters and saves it in outputname.
	csv_document <str> : path to reddit csv dataset
	csv_comment_column <str> : column where comments are stored
	outputname <str> : output model name
	window = 4, minf=10, epochs=100, ndim=200, lemmatiseFirst = False, tolower= True : Training and preprocessing parameters
	'''

	def loadCSVAndPreprocess(path, column = 'body', nrowss=None, verbose = True):
		'''
		input:
		path <str> : path to csv file
		column <str> : column with text
		nrowss <int> : number of rows to process, leave None if all
		verbose <True/False> : verbose output
		tolower <True/False> : transform all text to lowercase
		returns:
		list of preprocessed sentences
		'''
		trpCom = pd.read_csv(path, lineterminator='\n', nrows=nrowss)
		documents = []
		for i, row in enumerate(trpCom[column]):
			

			if i%500000 == 0 and verbose == True:
				print('\t...processing line {}'.format(i))
			try:
				pp = gensim.utils.simple_preprocess (row)
				if(lemmatiseFirst == True):
					pp = [wordnet_lemmatizer.lemmatize(w, pos="n") for w in pp]
				documents.append(pp)
			except:
				if(verbose):
					print('\terror with row {}'.format(row))
		print('Done reading all documents')
		return documents

	def trainWEModel(documents, outputfile, ndim, window, minfreq, epochss):
		'''
		documents list<str> : List of texts preprocessed
		outputfile <str> : final file will be saved in this path
		ndim <int> : embedding dimensions
		window <int> : window when training the model
		minfreq <int> : minimum frequency, words with less freq will be discarded
		epochss <int> : training epochs
		'''
		starttime = time.time()
		print('->->Starting training model {} with dimensions:{}, minf:{}, epochs:{}'.format(outputfile,ndim, minfreq, epochss))
		model = gensim.models.Word2Vec (documents, size=ndim, window=window, min_count=minfreq, workers=5)
		model.train(documents,total_examples=len(documents),epochs=epochss)
		model.save(outputfile)
		print('->-> Model saved in {}'.format(outputfile))     

     
	print('->Starting with {} [{}], output {}, window {}, minf {}, epochs {}, ndim {}'.format(csv_document,csv_comment_column,outputname, window, minf, epochs, ndim))
	docs = loadCSVAndPreprocess(csv_document, csv_comment_column, nrowss=None, verbose=verbose)
	starttime = time.time()
	print('-> Output will be saved in {}'.format(outputname))
	trainWEModel(docs, outputname, ndim, window, minf, epochs)
	print('-> Model creation ended in {} seconds'.format(time.time()-starttime))

In [6]:
sid = SentimentIntensityAnalyzer()
def GetTopMostBiasedWords(modelpath, topk, c1, c2, pos = ['JJ','JJR','JJS'], verbose = True):
	'''
	modelpath <str> : path to skipgram w2v model
	topk <int> : topk words
	c1 list<str> : list of words for target set 1
	c2 list<str> : list of words for target set 2
	pos list<str> : List of parts of speech we are interested in analysing
	verbose <bool> : True/False
	'''

	def calculateCentroid(model, words):
		embeddings = [np.array(model[w]) for w in words if w in model]
		centroid = np.zeros(len(embeddings[0]))
		for e in embeddings:
			centroid += e
		return centroid/len(embeddings)

	def getCosineDistance(embedding1, embedding2):       
		return spatial.distance.cosine(embedding1, embedding2)


	#select the interesting subset of words based on pos
	model = Word2Vec.load(modelpath)
	words_sorted = sorted( [(k,v.index, v.count) for (k,v) in model.wv.vocab.items()] ,  key=lambda x: x[1], reverse=False)
	words = [w for w in words_sorted if nltk.pos_tag([w[0]])[0][1] in pos]

	if len(c1) < 1 or len(c2) < 1 or len(words) < 1:
		print('[!] Not enough word concepts to perform the experiment')
		return None

	centroid1, centroid2 = calculateCentroid(model, c1),calculateCentroid(model, c2)
	winfo = []
	for i, w in enumerate(words):
		word = w[0]
		freq = w[2]
		rank = w[1]
		pos = nltk.pos_tag([word])[0][1]
		wv = model[word]
		sent = sid.polarity_scores(word)['compound']
		#estimate cosinedistance diff
		d1 = getCosineDistance(centroid1, wv)
		d2 = getCosineDistance(centroid2, wv)
		bias = d2-d1

		winfo.append({'word':word, 'bias':bias, 'freq':freq, 'pos':pos, 'wv':wv, 'rank':rank, 'sent':sent} )

		if(i%100 == 0 and verbose == True):
			print('...'+str(i), end="")

	#Get max and min topk biased words...
	biasc1 = sorted( winfo, key=lambda x:x['bias'], reverse=True )[:min(len(winfo), topk)]
	biasc2 = sorted( winfo, key=lambda x:x['bias'], reverse=False )[:min(len(winfo), topk)]
    #move the ts2 bias to the positive space
	for w2 in biasc2:
		w2['bias'] = w2['bias']*-1
    
	return [biasc1, biasc2]

In [7]:

def Cluster(biasc1, biasc2, r, repeatk, verbose = True):
	'''
	biasc1 list<words> : List of words biased towards target concept1 as returned by GetTopMostBiasedWords
	biasc2 list<words> : List of words biased towards target concept2 as returned by GetTopMostBiasedWords
	r <int> : reduction factor used to determine k for the kmeans; k = r * len(voc) 
	repeatk <int> : Number of Clustering to perform only to keep the partition with best intrasim
	'''
	def getCosineDistance(embedding1, embedding2): 
		return spatial.distance.cosine(embedding1, embedding2)
	def getIntraSim(partition):
		iS = 0
		for cluster in partition:
			iS += getIntraSimCluster(cluster)
		return iS/len(partition)
	def getIntraSimCluster(cluster):
		if(len(cluster)==1):
			return 0
		sim = 0; c = 0
		for i in range(len(cluster)):
			w1 = cluster[i]['wv']
			for j in range(i+1, len(cluster)):
				w2 = cluster[j]['wv']
				sim+= 1-getCosineDistance(w1,w2)
				c+=1
		return sim/c
	def createPartition(embeddings, biasw, k):
		preds = KMeans (n_clusters=k).fit_predict(embeddings)
		#first create the proper clusters, then estiamte avg intra sim
		all_clusters = []
		for i in range(0, k):
			clust = []
			indexes = np.where(preds == i)[0]
			for idx in indexes:
				clust.append(biasw[idx])
			all_clusters.append(clust)
		score = getIntraSim(all_clusters)
		return [score, all_clusters]


	k = int(r * (len(biasc1)+len(biasc2))/2)
	emb1, emb2  = [w['wv'] for w in biasc1], [w['wv'] for w in biasc2]
	mis1, mis2 = [0,[]], [0,[]]	#here we will save partitions with max sim for both target sets
	for run in range(repeatk):
		p1 = createPartition(emb1, biasc1, k)
		if(p1[0] > mis1[0]):
			mis1 = p1
		p2 = createPartition(emb2, biasc2, k)
		if(p2[0] > mis2[0]):
			mis2 = p2
		if(verbose == True):
			print('New partition for ts1, intrasim: ', p1[0])
			print('New partition for ts2, intrasim: ', p2[0])

	print('[*] Intrasim of best partition found for ts1, ', mis1[0])
	print('[*] Intrasim of best partition found for ts2, ', mis2[0])
	return [mis1[1], mis2[1]]
		

In [8]:
'''
Train an embeddings model using word2vec with different parameters.
'''


trainingSetups = [
    {'csvfile': "toy_1000_trp.csv", 'outputFile': 'Models', 'w':4, 'minf': 10, 'epochs':10 ,'ndim':200},
    #....
]

for setup in trainingSetups:
    TrainModel(setup['csvfile'], 
           'body',
           outputname = setup['outputFile'],
           window = setup['w'],
           minf = setup['minf'],
           epochs = setup['epochs'],
           ndim = setup['ndim'],
           verbose = False
           )

->Starting with toy_1000_trp.csv [body], output Models, window 4, minf 10, epochs 10, ndim 200




Done reading all documents
-> Output will be saved in Models
->->Starting training model Models with dimensions:200, minf:10, epochs:10
->-> Model saved in Models
-> Model creation ended in 254.67558908462524 seconds


In [9]:
'''
List of target sets used in this work, replace them in GetTopMostBiasedWords to obtain different sets of biases
or create your own target sets to represent a concept!
'''

women=["sister" , "female" , "woman" , "girl" , "daughter" , "she" , "hers" , "her"]
men=["brother" , "male" , "man" , "boy" , "son" , "he" , "his" , "him"]  

islam = ["allah", "ramadan", "turban", "emir", "salaam", "sunni", "koran", "imam", "sultan", "prophet", "veil", "ayatollah", "shiite", "mosque", "islam", "sheik", "muslim", "muhammad"]
christian = ["baptism", "messiah", "catholicism", "resurrection", "christianity", "salvation", "protestant", "gospel", "trinity", "jesus", "christ", "christian", "cross", "catholic", "church"]

white_names = ["harris", "nelson", "robinson", "thompson", "moore", "wright", "anderson", "clark", "jackson", "taylor", "scott", "davis", "allen", "adams", "lewis", "williams", "jones", "wilson", "martin", "johnson"]
hispanic_names= ["ruiz", "alvarez", "vargas", "castillo", "gomez", "soto", "gonzalez", "sanchez", "rivera", "mendoza", "martinez", "torres", "rodriguez", "perez", "lopez", "medina", "diaz", "garcia", "castro", "cruz"]


In [10]:
'''
Call GetTopMostBiasedWords to obtain a list of the topk words with POS = ['JJ','JJR','JJS'] 
most biased towards women and men target sets in the model.

The function returns two word lists, b1 and b2, which contain all words from the embedding model most biased towards
women (b1) and men (b2). 
'''

modelpath = 'Models'  #add your model here!
[b1,b2] = GetTopMostBiasedWords(
        modelpath,
        300,
        women,
        men,
        ['JJ','JJR','JJS'],
        True)

  del sys.path[0]
  del sys.path[0]


...0...100...200...300...400...500...600...700...800...900

In [11]:
'''
List all topk biased words
'''
print('biased towards ', women)
print( [b['word'] for b in b1[:30]] )
print('biased towards ', men)
print( [b['word'] for b in b2[:30]] )

biased towards  ['sister', 'female', 'woman', 'girl', 'daughter', 'she', 'hers', 'her']
['available', 'formal', 'mutual', 'second', 'third', 'small', 'okcupid', 'comfortable', 'free', 'viable', 'informal', 'neutral', 'common', 'continued', 'inexpensive', 'arrive', 'variable', 'suitable', 'compatible', 'unplanned', 'geographical', 'single', 'exclusive', 'wide', 'enjoyable', 'organic', 'specific', 'instantaneous', 'chic', 'ready']
biased towards  ['brother', 'male', 'man', 'boy', 'son', 'he', 'his', 'him']
['homosexual', 'ouch', 'unfriended', 'lest', 'respectable', 'unapologetic', 'underwear', 'overall', 'glorious', 'inexperienced', 'poor', 'enable', 'heterosexual', 'dependable', 'pathetic', 'argumentative', 'abusive', 'total', 'abrasive', 'lustful', 'nasty', 'sensitive', 'hippy', 'vicious', 'obsessive', 'bearable', 'metaphorical', 'pedantic', 'psychotic', 'delighted']


In [12]:
'''
Every word returned by GetTopMostBiasedWords contains the next attributes:
word : Word 
bias : Bias strength towards target set 1 (in this example) when compared to target set 2
freq : Frequency of word in the vocabulary of the model
pos  : Part of speech as determined by NLTK
wv   : Embedding of the word, used for clustering later
rank : Frequency ranking of the word in model's vocabulary
sent : Sentiment of word [-1,1], as determined by nltk.sentiment.vader

Here we show the firt word biased towards women in the toy dataset
'''
b1[0]

{'bias': 0.17276236633485464,
 'freq': 1251,
 'pos': 'JJ',
 'rank': 874,
 'sent': 0.0,
 'word': 'available',
 'wv': array([ 0.07670205, -0.6021249 , -0.53167796, -1.3170775 ,  0.7293684 ,
         1.6956341 ,  1.2280496 ,  0.21225786, -0.6355291 ,  2.3789127 ,
         1.5463523 ,  0.99710315, -0.43532225, -0.04214282,  0.09582525,
         1.2594104 ,  1.6384648 , -1.093577  ,  0.2618014 , -0.2249213 ,
         1.3811804 , -0.7382754 , -0.6766405 ,  1.7232198 , -2.1547043 ,
        -0.5104562 , -0.19955562, -0.78912383, -0.7859196 , -0.15164861,
        -0.7796448 , -0.34839168, -1.4293412 ,  0.88701063,  0.8642951 ,
         1.5783696 , -0.7250896 , -0.9839688 ,  0.20061877,  1.8219093 ,
         1.4210857 ,  0.89474654, -1.2435374 ,  1.9881847 , -0.69109607,
         1.1907544 ,  0.5251658 , -2.0476773 ,  2.3237197 , -0.688011  ,
         2.0926065 , -1.6444218 ,  0.25367507,  0.590795  ,  0.8323878 ,
        -0.20088811,  0.7689315 , -1.6661421 , -0.78327775, -0.09116092,
         

In [17]:
'''
Here we show the firt word biased towards men in the toy dataset
'''
b2[5]

{'bias': 0.16837280946765665,
 'freq': 13,
 'pos': 'JJ',
 'rank': 11468,
 'sent': 0.0,
 'word': 'unapologetic',
 'wv': array([ 0.06084435, -0.01344283, -0.28146487, -0.07252999, -0.01667341,
        -0.03266788,  0.01186611, -0.04430195,  0.17923376, -0.14431918,
         0.1304737 , -0.02197906, -0.02756139,  0.1220159 , -0.07651004,
         0.24718468,  0.07713524,  0.27304015, -0.14844182, -0.37265205,
         0.18438892,  0.07987133, -0.05941497,  0.04338085, -0.1629849 ,
         0.02878095,  0.03314534, -0.09514685,  0.03163952, -0.3032108 ,
         0.14570816,  0.0770566 ,  0.41796938, -0.00723047,  0.13007405,
         0.13973737, -0.00296919, -0.19125172, -0.06367651,  0.06598613,
         0.2773335 , -0.21882269, -0.03792665, -0.09654344, -0.06552457,
        -0.24015099, -0.32271338, -0.13033666,  0.03015729, -0.09921532,
         0.05086885,  0.2273484 , -0.25470018,  0.21543704, -0.22669107,
         0.12643558, -0.19124629,  0.05064442,  0.17519145,  0.10620119,
      

In [14]:
'''
Cluster words into concepts by leveragin their embedding distributions, where
b1 : list of biased words towards target set 1
b2 : list of biased words towards target set 2
r  : r parameter for k-means clustering, where k = r*len(b)
100: partition repetitoins for k-means, keeping the partition with best intrasim

The function returns:
List of clusters in a partition and words clustered in each cluster, for both target sets (cl1, cl2).
'''
[cl1,cl2] = Cluster(b1,b2, 0.15, 100)

New partition for ts1, intrasim:  0.13540145996845449
New partition for ts2, intrasim:  0.08075208835402477
New partition for ts1, intrasim:  0.17872933043384304
New partition for ts2, intrasim:  0.10049539933916939
New partition for ts1, intrasim:  0.12282290394118611
New partition for ts2, intrasim:  0.078077233342781
New partition for ts1, intrasim:  0.14728380905850963
New partition for ts2, intrasim:  0.06977397075713551
New partition for ts1, intrasim:  0.15352796818025846
New partition for ts2, intrasim:  0.08254230192983798
New partition for ts1, intrasim:  0.13858360628881414
New partition for ts2, intrasim:  0.09808052887733464
New partition for ts1, intrasim:  0.1368876396117701
New partition for ts2, intrasim:  0.07512129606351435
New partition for ts1, intrasim:  0.14293421566351894
New partition for ts2, intrasim:  0.08103004605536236
New partition for ts1, intrasim:  0.14560969713896535
New partition for ts2, intrasim:  0.09812389924174884
New partition for ts1, intrasim

In [15]:
'''
Exploring the conceptual biases from partition biased towards ts1, only printing the words in each cluster.
'''
#conceptual biases for target set 1
print(len(cl1))
for cluster in cl1:
    print( [k['word'] for k in cluster] )

45
['okcupid', 'senior', 'native', 'english', 'original']
['available', 'suitable', 'single', 'potential', 'attractive', 'desirable']
['uncomfortable']
['viable', 'informal', 'continued', 'variable', 'unplanned', 'geographical', 'organic', 'instantaneous', 'chic', 'earliest', 'actual', 'individual', 'noncommittal', 'fourth', 'innocuous', 'highest', 'plausible', 'unexpected', 'virtual', 'ethical', 'immaterial', 'visible', 'accessible', 'probable', 'weakest', 'arbitrary', 'unrelated', 'incompatible', 'ritual', 'genetic', 'noble', 'hypothetical', 'tangible', 'substantive', 'nonsexual', 'controversial', 'unnatural', 'stumble', 'dynamic', 'additional', 'crumble', 'frivolous', 'rapid', 'diplomatic', 'preferable', 'situational', 'significant', 'definitive', 'unlocked', 'modest', 'doable', 'preliminary', 'unapproachable', 'ordinary', 'lavish', 'laughable', 'foreseeable', 'economic', 'justifiable', 'vocabulary', 'unannounced', 'ineffective', 'negotiable', 'unwilling', 'questionable', 'environme

In [16]:
'''
Exploring the conceptual biases from partition biased towards ts2, only printing the words in each cluster.
'''
print(len(cl2))
for cluster in cl2:
  print( [k['word'] for k in cluster] )

45
['nasty', 'gross', 'puppy', 'sappy', 'grand', 'sudden', 'hilarious']
['sensitive', 'powerful', 'dramatic', 'attentive', 'critical', 'generous']
['social']
['conscious']
['due']
['old']
['inexperienced', 'oblivious']
['underwear', 'bisexual', 'steady', 'black', 'rich', 'obese', 'modern', 'fetish', 'african', 'military', 'promiscuous', 'serial', 'unemployed']
['entire', 'whole']
['experienced']
['overall', 'psychological', 'behavioral', 'severe', 'manifest', 'tiny', 'imaginary', 'inevitable', 'unrequited', 'former', 'internal', 'biological', 'interpersonal', 'ultimate']
['sexual']
['young']
['full']
['angry']
['bigger']
['related']
['such']
['horrible']
['aware']
['guilty']
['unattractive']
['respectable', 'abrasive', 'neurotic', 'obnoxious', 'egotistical', 'apologetic', 'misogynistic', 'courageous', 'antisocial', 'stereotypical', 'pretentious', 'unfaithful', 'charismatic', 'civil', 'competitive', 'honorable', 'expressive', 'exceptional', 'apprehensive', 'uptight', 'complex', 'humble'