In [1]:
import gensim
import re
import string
import pandas as pd
from collections import defaultdict

# Word Embeddings

Define sentence iterator. 
Perform pre-processing:
0. Lowercase & tokenize
1. Replace @MENTION & URL

### Train embeddings on sentences using Word2Vec

In [None]:
import os
class MySentences(object):
    """Re-iterable stream of whitespace-tokenised sentences read from one file.

    Every iteration pass re-opens ``dirname/fname`` and yields one list of
    tokens per line (``str.split()`` with no arguments), so the same object
    can be traversed more than once.
    """

    def __init__(self, dirname, fname):
        """Remember the directory and file name of the corpus to stream."""
        self.dirname = dirname
        self.fname = fname

    def __iter__(self):
        """Yield the tokens of each line of the corpus file, in file order."""
        # Use a context manager so the handle is closed when the pass ends
        # (or the generator is discarded) instead of leaking until it is
        # garbage-collected.
        with open(os.path.join(self.dirname, self.fname)) as corpus:
            for line in corpus:
                yield line.split()

In [None]:
# Stream the Spanish corpus file one tokenised line at a time.
sentences = MySentences(
    '../data/SentimentCorpus/',
    'es.tsv.neg.lower.preprocessed',
)

# Train the Spanish Word2Vec model on that stream.
model = gensim.models.Word2Vec(
    sentences,
    size=400,
    alpha=0.025,
    window=5,
    min_count=5,
    max_vocab_size=None,
)

In [None]:
# Persist the model trained on the Spanish corpus.
# NOTE(review): the model is loaded further down from
# '../data/Word2Vec/es.word2vec', which is not this path — confirm the file
# is moved/copied in between.
model.save('../data/SentimentCorpus/es.word2vec')

In [None]:
# Sanity check: the five nearest neighbours of 'hombre'.
model.most_similar(positive=['hombre'], topn=5)

In [None]:
# Sanity check: the five nearest neighbours of 'mujer'.
model.most_similar(positive=['mujer'], topn=5)

In [None]:
# Sanity check: nearest neighbours of 'apple' (no `topn` given, so the
# library default number of results is returned).
model.most_similar(positive=['apple'])

### English

In [None]:
# Stream the English corpus file one tokenised line at a time.
sentences = MySentences(
    '../data/SentimentCorpus/',
    'en.tsv.neg.lower.preprocessed',
)

# Train the English Word2Vec model with the same settings as the Spanish one.
model = gensim.models.Word2Vec(
    sentences,
    size=400,
    alpha=0.025,
    window=5,
    min_count=5,
    max_vocab_size=None,
)

# NOTE(review): the English model is loaded further down from
# '../data/Word2Vec/en.word2vec', which is not this path — confirm the file
# is moved/copied in between.
model.save('../data/en.word2vec')

In [None]:
# Sanity check: the five nearest neighbours of 'man'.
model.most_similar(positive=['man'], topn=5)

In [None]:
# Sanity check: the five nearest neighbours of 'woman'.
model.most_similar(positive=['woman'], topn=5)

In [None]:
# Analogy-style query: words close to 'samsung' + 'smartphone' - 'phone'.
model.most_similar(positive=['samsung', 'smartphone'], negative=['phone'])

# Expand ANEW and ANSW
Spanish:

In [2]:
# Load the previously trained Spanish Word2Vec model from disk.
# NOTE(review): this path differs from the one the training section saves to
# ('../data/SentimentCorpus/es.word2vec') — confirm which location is current.
es_model = gensim.models.Word2Vec.load('../data/Word2Vec/es.word2vec')

In [3]:
# Load the Spanish ANSW lexicon (tab-separated file).
answ = pd.read_csv('../data/SentimentCorpus/ANEW/ANSW.tsv', sep='\t')
# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(len(answ))
# Preview the entries with the lowest mean valence.
answ.sort_values(by='Val-Mn-All').head()

1034


Unnamed: 0,S-Word,Val-Mn-All,Val-Sd-All,Aro-Mn-All,Aro-Sd-All,Freq
341,violación,1.11,0.48,7.98,1.51,9.29
585,muerto,1.17,0.57,5.99,2.89,123.39
25,asesinar,1.18,0.55,7.44,2.15,4.64
479,guerra,1.23,0.72,7.28,2.23,251.61
99,muerte,1.23,0.64,6.46,2.76,257.32


In [4]:
# Replace missing frequencies with 1 before they are scaled by similarity
# in the expansion loop below.
answ.Freq = answ.Freq.fillna(1) 

In [5]:
# Expand the lexicon: each sufficiently similar embedding neighbour of an
# ANSW word gets a row holding that word's scores multiplied by the
# neighbour's similarity.
res = []

for _, row in answ.iterrows():
    try:
        sims = es_model.most_similar(positive=[row['S-Word']], topn=100)
    except (KeyError, TypeError):
        # KeyError: the word is not in the model's vocabulary.
        # TypeError: presumably a non-string (e.g. missing) lexicon entry —
        # TODO confirm against the data.
        # A bare `except:` would also swallow KeyboardInterrupt and hide real
        # bugs (e.g. `es_model` being None), silently yielding an empty result.
        continue

    for sim, weight in sims:
        # Keep only neighbours with similarity strictly above 0.5.
        if weight <= 0.5:
            continue
        r = {'word': sim}
        for c in [u'Val-Mn-All', u'Val-Sd-All', u'Aro-Mn-All', u'Aro-Sd-All', 'Freq']:
            r[c] = row[c] * weight
        res.append(r)

In [6]:
# Turn the list of row dicts into a DataFrame (`res` is rebound from list to
# frame here; later cells expect the frame).
res = pd.DataFrame(res)
# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(len(res))
# Preview the expanded rows with the lowest scaled valence.
res.sort_values(by='Val-Mn-All').head()

20113


Unnamed: 0,Aro-Mn-All,Aro-Sd-All,Freq,Val-Mn-All,Val-Sd-All,word
847,3.871718,1.118843,2.41462,0.614063,0.286216,trepar
846,3.873786,1.119441,2.41591,0.614391,0.286369,espiar
845,3.875528,1.119944,2.416996,0.614667,0.286497,impactar
844,3.876684,1.120278,2.417717,0.61485,0.286583,agradar
843,3.87893,1.120927,2.419117,0.615207,0.286749,castigar


### Concat original ANSW

In [7]:
# Original ANSW entries in the same layout as the expanded rows. Renaming by
# label (instead of overwriting `.columns` positionally) cannot silently
# mislabel a column if the selection order is ever changed.
original = answ[
    ['Aro-Mn-All', 'Aro-Sd-All', 'Val-Mn-All', 'Val-Sd-All', 'Freq', 'S-Word']
].rename(columns={'S-Word': 'word'})
# Stack expanded and original rows; duplicates are merged in the next step.
tmp = pd.concat([res, original])
# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(len(tmp))
tmp.sort_values(by='Val-Mn-All').head()

21147


Unnamed: 0,Aro-Mn-All,Aro-Sd-All,Freq,Val-Mn-All,Val-Sd-All,word
847,3.871718,1.118843,2.41462,0.614063,0.286216,trepar
846,3.873786,1.119441,2.41591,0.614391,0.286369,espiar
845,3.875528,1.119944,2.416996,0.614667,0.286497,impactar
844,3.876684,1.120278,2.417717,0.61485,0.286583,agradar
843,3.87893,1.120927,2.419117,0.615207,0.286749,castigar


### Compress & Remove Duplicates

In [8]:
# A word can occur several times (as a neighbour of several lexicon words
# and/or as an original entry). Collapse to one row per word holding the
# plain mean of each score column.
res = tmp

# groupby/mean replaces the row-by-row accumulation and the Python-2-only
# dict.iteritems(). Columns are selected in alphabetical order of their new
# names so the resulting frame (and the CSV written from it) keeps the same
# column order as before.
# NOTE: groupby skips rows whose 'word' is missing and mean() ignores NaN
# values, whereas a manual sum would propagate them.
tmp = (
    res.groupby('word')[['Aro-Mn-All', 'Aro-Sd-All', 'Freq', 'Val-Mn-All', 'Val-Sd-All']]
    .mean()
    .rename(columns={
        'Aro-Mn-All': 'Arousal',
        'Aro-Sd-All': 'Arousal.std',
        'Freq': 'Frequency',
        'Val-Mn-All': 'Valence',
        'Val-Sd-All': 'Valence.std',
    })
)
# Keep the index unnamed so the CSV header written later is unchanged.
tmp.index.name = None
tmp.sort_values(by='Valence').head()

Unnamed: 0,Arousal,Arousal.std,Frequency,Valence,Valence.std
dispararle,3.880378,1.121346,2.42002,0.615436,0.286856
creerle,3.882232,1.121881,2.421177,0.61573,0.286993
azotar,3.886537,1.123126,2.423862,0.616413,0.287311
vinculan,3.903528,1.128036,2.434458,0.619108,0.288567
convencer,3.923802,1.133894,2.447102,0.622323,0.290066


In [9]:
# Summary statistics of the averaged frequency column after expansion.
tmp.Frequency.describe()

count    11199.000000
mean         7.955937
std         27.674871
min          0.000000
25%          0.806012
50%          2.032133
75%          4.856515
max        850.890000
Name: Frequency, dtype: float64

In [10]:
# Write the expanded, de-duplicated Spanish lexicon (index = word) to CSV.
res = tmp
# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(len(res))
res.to_csv('../data/expandedANSW.csv')

11199


In [None]:
# Drop the references to the Spanish model and results before the English
# half of the notebook — presumably to free memory.
es_model = None
res = None

### __English:__

In [11]:
# Load the previously trained English Word2Vec model from disk.
# NOTE(review): this path differs from the one the training section saves to
# ('../data/en.word2vec') — confirm which location is current.
en_model = gensim.models.Word2Vec.load('../data/Word2Vec/en.word2vec')

In [12]:
# Load the English ANEW lexicon and preview the entries with the lowest valence.
anew = pd.read_csv('../data/SentimentCorpus/ANEW/anew_list.csv')
anew.sort_values(by='Valence').head()

Unnamed: 0,Description,WordNo,Valence,std,Arousal,std.1,Dominance,std.2,Frequency
756,rape,344,1.25,0.91,6.81,3.17,2.97,2.94,5
896,suicide,419,1.25,0.69,5.73,3.14,3.58,3.02,17
357,funeral,178,1.39,0.87,4.94,3.21,2.97,2.55,33
763,rejected,349,1.5,1.09,6.37,2.56,2.72,2.58,33
130,cancer,60,1.5,0.85,6.42,2.83,3.42,2.99,25


In [13]:
# Missing frequencies are set to 1: a crude placeholder rather than a proper
# estimate (the original note, "Not so good Turing", presumably alludes to
# Good-Turing smoothing).
anew.Frequency = anew.Frequency.fillna(1)

In [14]:
# Expand the lexicon: each sufficiently similar embedding neighbour of an
# ANEW word gets a row holding that word's scores multiplied by the
# neighbour's similarity.
res = []

for _, row in anew.iterrows():
    try:
        sims = en_model.most_similar(positive=[row['Description']], topn=100)
    except (KeyError, TypeError):
        # KeyError: the word is not in the model's vocabulary.
        # TypeError: presumably a non-string (e.g. missing) lexicon entry —
        # TODO confirm against the data.
        # A bare `except:` would also swallow KeyboardInterrupt and hide real
        # bugs (e.g. `en_model` being None), silently yielding an empty result.
        continue

    for sim, weight in sims:
        # Keep only neighbours with similarity strictly above 0.5.
        if weight <= 0.5:
            continue
        r = {'word': sim}
        for c in [u'Valence', u'std', u'Arousal', u'std.1', 'Frequency']:
            r[c] = row[c] * weight
        res.append(r)

In [15]:
# Turn the list of row dicts into a DataFrame (`res` is rebound from list to
# frame here; later cells expect the frame).
res = pd.DataFrame(res)
# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(len(res))
# Preview the expanded rows with the lowest scaled valence.
res.sort_values(by='Valence').head()

17216


Unnamed: 0,Arousal,Frequency,Valence,std,std.1,word
13255,3.441265,2.526626,0.631656,0.459846,1.601881,terrorist
13254,3.442732,2.527704,0.631926,0.460042,1.602564,circumcision
13253,3.461108,2.541195,0.635299,0.462498,1.611118,consent
13252,3.463372,2.542858,0.635714,0.4628,1.612172,slavery
13251,3.471704,2.548975,0.637244,0.463913,1.61605,latestage


In [16]:
# Example of a word that was reached from more than one lexicon entry: it
# appears once per source word, each time with differently scaled scores.
res[(res.word == 'terrorists')]

Unnamed: 0,Arousal,Frequency,Valence,std,std.1,word
3106,2.579376,4.418631,1.833732,1.281403,1.297973,terrorists
15652,4.413415,0.607072,1.025952,0.862043,1.444832,terrorists


### Concat original ANEW

In [17]:
# Original ANEW entries in the same layout as the expanded rows. Renaming by
# label (instead of overwriting `.columns` positionally) cannot silently
# mislabel a column if the selection order is ever changed.
original = anew[
    ['Arousal', 'std.1', 'Valence', 'std', 'Frequency', 'Description']
].rename(columns={'Description': 'word'})
# Stack expanded and original rows; duplicates are merged in the next step.
tmp = pd.concat([res, original])
# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(len(tmp))
tmp.sort_values(by='Valence').head()

18250


Unnamed: 0,Arousal,Frequency,Valence,std,std.1,word
13255,3.441265,2.526626,0.631656,0.459846,1.601881,terrorist
13254,3.442732,2.527704,0.631926,0.460042,1.602564,circumcision
13253,3.461108,2.541195,0.635299,0.462498,1.611118,consent
13252,3.463372,2.542858,0.635714,0.4628,1.612172,slavery
13251,3.471704,2.548975,0.637244,0.463913,1.61605,latestage


### Remove Duplicates

In [18]:
# A word can occur several times (as a neighbour of several lexicon words
# and/or as an original entry). Collapse to one row per word holding the
# plain mean of each score column.
#
# groupby/mean replaces the row-by-row accumulation and the Python-2-only
# dict.iteritems(). Columns are selected in alphabetical order of their new
# names so the resulting frame (and the CSV written from it) keeps the same
# column order as before.
# NOTE: groupby skips rows whose 'word' is missing and mean() ignores NaN
# values, whereas a manual sum would propagate them.
res = (
    tmp.groupby('word')[['Arousal', 'std.1', 'Frequency', 'Valence', 'std']]
    .mean()
    .rename(columns={'std.1': 'Arousal.std', 'std': 'Valence.std'})
)
# Keep the index unnamed so the CSV header written later is unchanged.
res.index.name = None
# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(len(res))
res.sort_values(by='Valence').head()

10074


Unnamed: 0,Arousal,Arousal.std,Frequency,Valence,Valence.std
violence,3.676364,1.711318,2.699239,0.67481,0.491262
murder,3.849967,1.792129,2.826701,0.706675,0.51446
violentrape,3.978278,1.851856,2.920909,0.730227,0.531605
stung,3.249207,1.305804,16.832628,0.765119,0.555987
slavery,3.042771,1.522202,6.65576,0.767449,0.473695


In [19]:
# Averaged scores for a word that had duplicate rows before merging.
# `.loc` is the label-based lookup; `.ix` is deprecated and absent from
# current pandas releases.
res.loc['terrorists']

Arousal        3.496395
Arousal.std    1.371402
Frequency      2.512851
Valence        1.429842
Valence.std    1.071723
Name: terrorists, dtype: float64

In [20]:
# Write the expanded, de-duplicated English lexicon (index = word) to CSV.
res.to_csv('../data/expandedANEW.csv')

In [24]:
# Spot-check the neighbours of 'violent'.
# NOTE(review): in the recorded output the closest neighbour is the antonym
# 'nonviolent' (similarity > 0.5). If 'violent' is an ANEW entry, the
# expansion above would hand that antonym a scaled copy of its scores —
# worth confirming whether this is acceptable.
en_model.most_similar(positive=['violent'])


[('nonviolent', 0.5183918476104736),
 ('hateful', 0.5122030973434448),
 ('immoral', 0.5042961835861206),
 ('civilized', 0.49878281354904175),
 ('harassment', 0.49625760316848755),
 ('divisive', 0.4957655072212219),
 ('bigoted', 0.49566975235939026),
 ('alleged', 0.49454841017723083),
 ('vacuous', 0.49234598875045776),
 ('flawed', 0.49025771021842957)]