In [1]:
import gensim
import re
import string
import pandas as pd
from collections import defaultdict

# Word Embeddings

Define sentence iterator. 
Perform pre-processing:
0. Lowercase & tokenize
1. Replace @MENTION & URL

### Train embeddings on sentences using Word2Vec

In [None]:
import os
class MySentences(object):
    """Iterable over the whitespace-tokenized lines of a single corpus file.

    The file is re-opened on every call to ``__iter__``, so one instance can
    be iterated any number of times.

    Parameters
    ----------
    dirname : str
        Directory that contains the corpus file.
    fname : str
        Name of the corpus file inside ``dirname``.
    """

    def __init__(self, dirname, fname):
        self.dirname = dirname
        self.fname = fname

    def __iter__(self):
        """Yield each line of the file as a list of whitespace-separated tokens."""
        path = os.path.join(self.dirname, self.fname)
        # The context manager closes the handle once the pass over the file
        # finishes (or the generator is discarded) instead of leaking it.
        with open(path) as handle:
            for line in handle:
                yield line.split()

In [None]:
# Stream the preprocessed Spanish corpus line by line and fit Word2Vec on it.
sentences = MySentences('../data/SentimentCorpus/', 'es.tsv.lower.preprocessed')
model = gensim.models.Word2Vec(
    sentences,
    size=400,
    alpha=0.025,
    window=5,
    min_count=5,
    max_vocab_size=None
)

In [None]:
# Persist the Spanish model at the path the "Expand ANEW and ANSW" section
# loads it from (../data/Word2Vec/es.word2vec), so a fresh run feeds the
# expansion step with the model trained here.
model.save('../data/Word2Vec/es.word2vec')

In [None]:
# Sanity check: the five words the Spanish model ranks as most similar to 'hombre'.
model.most_similar(positive=['hombre'], topn=5)

In [None]:
# Sanity check: the five words the Spanish model ranks as most similar to 'mujer'.
model.most_similar(positive=['mujer'], topn=5)

In [None]:
# Words most similar to 'apple' (no topn given, so the library default applies).
model.most_similar(positive=['apple'])

### English

In [None]:
# Train the English embeddings with the same hyper-parameters as the Spanish model.
sentences = MySentences('../data/SentimentCorpus/', 'en.tsv.lower.preprocessed')
model = gensim.models.Word2Vec(sentences, size=400, alpha=0.025, window=5, min_count=5, max_vocab_size=None)
# Save to the path the expansion section loads the English model from
# (../data/Word2Vec/en.word2vec) so both steps agree on the file location.
model.save('../data/Word2Vec/en.word2vec')

In [None]:
# Sanity check: the five words the English model ranks as most similar to 'man'.
model.most_similar(positive=['man'], topn=5)

In [None]:
# Sanity check: the five words the English model ranks as most similar to 'woman'.
model.most_similar(positive=['woman'], topn=5)

In [None]:
# Analogy-style query: words close to 'samsung' + 'smartphone' and away from 'phone'.
model.most_similar(positive=['samsung', 'smartphone'], negative=['phone'])

# Expand ANEW and ANSW
Spanish:

In [35]:
# Load the previously saved Spanish Word2Vec model from disk.
es_model = gensim.models.Word2Vec.load('../data/Word2Vec/es.word2vec')

In [36]:
# Read the tab-separated Spanish lexicon (ANSW), report its row count and
# show the rows with the lowest mean valence.
answ = pd.read_csv('../data/SentimentCorpus/ANEW/ANSW.tsv', sep='\t')
# Parenthesized form prints the same on Python 2 and is valid on Python 3.
print(len(answ))
answ.sort_values(by='Val-Mn-All').head()

1034


Unnamed: 0,S-Word,Val-Mn-All,Val-Sd-All,Aro-Mn-All,Aro-Sd-All,Freq
341,violación,1.11,0.48,7.98,1.51,9.29
585,muerto,1.17,0.57,5.99,2.89,123.39
25,asesinar,1.18,0.55,7.44,2.15,4.64
479,guerra,1.23,0.72,7.28,2.23,251.61
99,muerte,1.23,0.64,6.46,2.76,257.32


In [37]:
# Words without a recorded frequency get a frequency of 1.
answ['Freq'] = answ['Freq'].fillna(1)

In [38]:
# Expand the Spanish lexicon: every seed word lends its ratings, scaled by
# similarity, to each of its embedding neighbours whose similarity is > 0.5.
# NOTE(review): multiplying the ratings (and their std / frequency) by a
# similarity below 1 shrinks every derived score, so derived words can end up
# below the lowest rating of the original list — confirm this is intended.
rating_cols = [u'Val-Mn-All', u'Val-Sd-All', u'Aro-Mn-All', u'Aro-Sd-All', 'Freq']

res = []
for idx, row in answ.iterrows():
    try:
        sims = es_model.most_similar(positive=[row['S-Word']], topn=100)
    except KeyError:
        # Seed word is missing from the embedding vocabulary: nothing to expand.
        # (A bare `except:` would also hide KeyboardInterrupt and real bugs.)
        sims = []

    close_neighbours = [(sim, weight) for sim, weight in sims if weight > 0.5]
    for sim, weight in close_neighbours:
        r = {'word': sim}
        for c in rating_cols:
            r[c] = row[c] * weight
        res.append(r)

In [39]:
# Turn the list of per-neighbour records into a frame and inspect the
# lowest-valence rows.
res = pd.DataFrame(res)
# Parenthesized form prints the same on Python 2 and is valid on Python 3.
print(len(res))
res.sort_values(by='Val-Mn-All').head()

15469


Unnamed: 0,Aro-Mn-All,Aro-Sd-All,Freq,Val-Mn-All,Val-Sd-All,word
669,3.726865,1.076984,2.324281,0.591089,0.275507,agarrarlos
668,3.732478,1.078606,2.327782,0.591979,0.275922,mandarlas
667,3.747961,1.08308,2.337438,0.594435,0.277067,referirse
666,3.748557,1.083252,2.33781,0.594529,0.277111,secuestrar
665,3.751257,1.084033,2.339493,0.594957,0.277311,defraudar


### Concat original ANSW

In [40]:
# Give the original lexicon the same column layout as `res` (its word column
# is called 'S-Word' there) and stack it under the expanded rows.
original = answ[['Aro-Mn-All', 'Aro-Sd-All', 'Val-Mn-All', 'Val-Sd-All', 'Freq', 'S-Word']]
# rename() returns a new frame instead of overwriting .columns on a selection.
original = original.rename(columns={'S-Word': 'word'})
tmp = pd.concat([res, original])
print(len(tmp))
tmp.sort_values(by='Val-Mn-All').head()

16503


Unnamed: 0,Aro-Mn-All,Aro-Sd-All,Freq,Val-Mn-All,Val-Sd-All,word
669,3.726865,1.076984,2.324281,0.591089,0.275507,agarrarlos
668,3.732478,1.078606,2.327782,0.591979,0.275922,mandarlas
667,3.747961,1.08308,2.337438,0.594435,0.277067,referirse
666,3.748557,1.083252,2.33781,0.594529,0.277111,secuestrar
665,3.751257,1.084033,2.339493,0.594957,0.277311,defraudar


### Compress & Remove Duplicates

In [41]:
# Collapse duplicate words: a word may appear several times (as a neighbour of
# different seeds and/or as an original entry); keep one row per word holding
# the arithmetic mean of each score.
res = tmp
score_cols = ['Val-Mn-All', 'Val-Sd-All', 'Aro-Mn-All', 'Aro-Sd-All', 'Freq']
out_names = ['Valence', 'Valence.std', 'Arousal', 'Arousal.std', 'Frequency']

# word -> list of (valence, valence std, arousal, arousal std, frequency)
res2 = defaultdict(list)
# Zipping the columns visits the rows in order without the per-row Series
# construction that iterrows() performs.
for record in zip(res['word'], *[res[c] for c in score_cols]):
    res2[record[0]].append(tuple(float(v) for v in record[1:]))

averaged = dict((name, {}) for name in out_names)
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
for word, ratings in res2.items():
    means = [sum(col) / float(len(col)) for col in zip(*ratings)]
    for name, mean in zip(out_names, means):
        averaged[name][word] = mean

# Rows are indexed by word, one column per averaged score.
tmp = pd.DataFrame(averaged)
tmp.sort_values(by='Valence').head()

Unnamed: 0,Arousal,Arousal.std,Frequency,Valence,Valence.std
agarrarlos,3.726865,1.076984,2.324281,0.591089,0.275507
secuestrar,3.748557,1.083252,2.33781,0.594529,0.277111
someterse,3.751942,1.084231,2.339921,0.595066,0.277361
extorsionar,3.770322,1.089542,2.351384,0.597981,0.27872
impactar,3.771986,1.090023,2.352422,0.598245,0.278843


In [42]:
# Summary statistics of the averaged frequency column.
tmp['Frequency'].describe()

count    9604.000000
mean        8.399357
std        28.930689
min         0.000000
25%         0.760956
50%         2.005471
75%         5.025509
max       850.890000
Name: Frequency, dtype: float64

In [43]:
# Report the size of the de-duplicated Spanish lexicon and write it to CSV
# (the word index is written as the first column).
res = tmp
# Parenthesized form prints the same on Python 2 and is valid on Python 3.
print(len(res))
res.to_csv('../data/expandedANSW.csv')

9604


In [44]:
# Drop the references to the Spanish model and lexicon before the English run.
es_model = res = None

### __English:__

In [20]:
# Load the previously saved English Word2Vec model from disk.
en_model = gensim.models.Word2Vec.load('../data/Word2Vec/en.word2vec')

In [21]:
# Read the English lexicon (ANEW) and show the rows with the lowest valence.
anew = pd.read_csv('../data/SentimentCorpus/ANEW/anew_list.csv')
anew.sort_values(by='Valence').head()

Unnamed: 0,Description,WordNo,Valence,std,Arousal,std.1,Dominance,std.2,Frequency
756,rape,344,1.25,0.91,6.81,3.17,2.97,2.94,5
896,suicide,419,1.25,0.69,5.73,3.14,3.58,3.02,17
357,funeral,178,1.39,0.87,4.94,3.21,2.97,2.55,33
763,rejected,349,1.5,1.09,6.37,2.56,2.72,2.58,33
130,cancer,60,1.5,0.85,6.42,2.83,3.42,2.99,25


In [27]:
# Words without a recorded frequency get a frequency of 1.
# NOTE(review): the original remark ("Not so good Turing") presumably means this
# is a crude stand-in for proper Good-Turing smoothing — confirm with the author.
anew['Frequency'] = anew['Frequency'].fillna(1)

In [28]:
# Expand the English lexicon: every seed word lends its ratings, scaled by
# similarity, to each of its embedding neighbours whose similarity is > 0.5.
# NOTE(review): multiplying the ratings (and their std / frequency) by a
# similarity below 1 shrinks every derived score, so derived words can end up
# below the lowest rating of the original list — confirm this is intended.
rating_cols = [u'Valence', u'std', u'Arousal', u'std.1', 'Frequency']

res = []
for idx, row in anew.iterrows():
    try:
        sims = en_model.most_similar(positive=[row['Description']], topn=100)
    except KeyError:
        # Seed word is missing from the embedding vocabulary: nothing to expand.
        # (A bare `except:` would also hide KeyboardInterrupt and real bugs.)
        sims = []

    close_neighbours = [(sim, weight) for sim, weight in sims if weight > 0.5]
    for sim, weight in close_neighbours:
        r = {'word': sim}
        for c in rating_cols:
            r[c] = row[c] * weight
        res.append(r)

In [29]:
# Turn the list of per-neighbour records into a frame and inspect the
# lowest-valence rows.
res = pd.DataFrame(res)
# Parenthesized form prints the same on Python 2 and is valid on Python 3.
print(len(res))
res.sort_values(by='Valence').head()

10568


Unnamed: 0,Arousal,Frequency,Valence,std,std.1,word
8212,3.426078,3.018571,0.628869,0.457817,1.594811,falsely
8211,3.427497,3.019822,0.629129,0.458006,1.595472,racism
8210,3.440826,3.031565,0.631576,0.459787,1.601677,murder
8209,3.452972,3.042266,0.633805,0.46141,1.60733,intercourse
8208,3.463143,3.051228,0.635672,0.46277,1.612065,targeting


In [30]:
# The same word can be reached from several seeds; show every row for one example.
res[res['word'] == 'terrorists']

Unnamed: 0,Arousal,Frequency,Valence,std,std.1,word
1987,2.47319,4.766318,1.758242,1.228651,1.244539,terrorists
9559,4.191347,0.576526,0.97433,0.818667,1.372133,terrorists


### Concat original ANEW

In [31]:
# Give the original lexicon the same column layout as `res` (its word column
# is called 'Description' there) and stack it under the expanded rows.
original = anew[['Arousal', 'std.1', 'Valence', 'std', 'Frequency', 'Description']]
# rename() returns a new frame instead of overwriting .columns on a selection.
original = original.rename(columns={'Description': 'word'})
tmp = pd.concat([res, original])
print(len(tmp))
tmp.sort_values(by='Valence').head()

11602


Unnamed: 0,Arousal,Frequency,Valence,std,std.1,word
8212,3.426078,3.018571,0.628869,0.457817,1.594811,falsely
8211,3.427497,3.019822,0.629129,0.458006,1.595472,racism
8210,3.440826,3.031565,0.631576,0.459787,1.601677,murder
8209,3.452972,3.042266,0.633805,0.46141,1.60733,intercourse
8208,3.463143,3.051228,0.635672,0.46277,1.612065,targeting


### Remove Duplicates

In [32]:
# Collapse duplicate words: a word may appear several times (as a neighbour of
# different seeds and/or as an original entry); keep one row per word holding
# the arithmetic mean of each score.
res = tmp
score_cols = ['Valence', 'std', 'Arousal', 'std.1', 'Frequency']
out_names = ['Valence', 'Valence.std', 'Arousal', 'Arousal.std', 'Frequency']

# word -> list of (valence, valence std, arousal, arousal std, frequency)
res2 = defaultdict(list)
# Zipping the columns visits the rows in order without the per-row Series
# construction that iterrows() performs.
for record in zip(res['word'], *[res[c] for c in score_cols]):
    res2[record[0]].append(tuple(float(v) for v in record[1:]))

tmp = dict((name, {}) for name in out_names)
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
for word, ratings in res2.items():
    means = [sum(col) / float(len(col)) for col in zip(*ratings)]
    for name, mean in zip(out_names, means):
        tmp[name][word] = mean

# Rows are indexed by word, one column per averaged score.
res = pd.DataFrame(tmp)
print(len(res))
res.sort_values(by='Valence').head()

7284


Unnamed: 0,Arousal,Arousal.std,Frequency,Valence,Valence.std
falsely,3.426078,1.594811,3.018571,0.628869,0.457817
murder,3.440826,1.601677,3.031565,0.631576,0.459787
statutory,3.499274,1.628884,3.083061,0.642304,0.467598
herfinally,3.577588,1.665338,3.15206,0.656679,0.478062
consenting,3.707448,1.725787,3.266474,0.680515,0.495415


In [33]:
# Label-based lookup of the averaged scores for one word. `.loc` replaces the
# deprecated `.ix` indexer, which newer pandas releases no longer provide.
res.loc['terrorists']

Arousal        3.332268
Arousal.std    1.308336
Frequency      2.671422
Valence        1.366286
Valence.std    1.023659
Name: terrorists, dtype: float64

In [34]:
# Write the de-duplicated English lexicon to CSV (the word index is written as the first column).
res.to_csv('../data/expandedANEW.csv')