In [1]:
import gensim
import re
import string
import pandas as pd
from collections import defaultdict

# Word Embeddings

Define sentence iterator. 
Perform pre-processing:
0. Lowercase & tokenize
1. Replace @MENTION & URL

### Train embeddings on sentences using Word2Vec

In [2]:
import os
class MySentences(object):
    """Re-iterable stream of whitespace-tokenized sentences read from one text file.

    Every line of the file is treated as one sentence. A new file handle is
    opened on each iteration, so the object can be traversed more than once.
    """

    def __init__(self, dirname, fname):
        """Remember where the corpus file lives.

        Args:
            dirname: directory that contains the corpus file.
            fname: name of the corpus file inside ``dirname``.
        """
        self.dirname = dirname
        self.fname = fname

    def __iter__(self):
        """Yield each line of the corpus file as a list of whitespace-separated tokens."""
        # The context manager closes the handle as soon as the file is exhausted
        # (or the generator is closed) instead of leaving it open.
        with open(os.path.join(self.dirname, self.fname)) as corpus:
            for line in corpus:
                yield line.split()

In [None]:
# Stream the pre-processed `es` corpus file line by line and fit Word2Vec on it.
sentences = MySentences(
    dirname='../data/SentimentCorpus/',
    fname='es.tsv.neg.lower.preprocessed',
)
model = gensim.models.Word2Vec(
    sentences,
    size=400,
    alpha=0.025,
    window=5,
    min_count=5,
    max_vocab_size=None,
)

In [None]:
# Persist the trained model to disk.
# NOTE(review): the lexicon-expansion section loads '../data/Word2Vec/es.word2vec',
# not this path — confirm whether the file is moved by hand or one path is stale.
model.save('../data/SentimentCorpus/es.word2vec')

In [None]:
# Sanity check: the five nearest neighbours of 'hombre' in the trained model.
model.most_similar(positive=['hombre'], topn=5)

In [None]:
# Sanity check: the five nearest neighbours of 'mujer' in the trained model.
model.most_similar(positive=['mujer'], topn=5)

In [None]:
# Nearest neighbours of 'apple'; no `topn` is passed, so the library default applies.
model.most_similar(positive=['apple'])

### English

In [None]:
# Stream the pre-processed `en` corpus file line by line and fit Word2Vec on it.
sentences = MySentences(
    dirname='../data/SentimentCorpus/',
    fname='en.tsv.neg.lower.preprocessed',
)
model = gensim.models.Word2Vec(
    sentences,
    size=400,
    alpha=0.025,
    window=5,
    min_count=5,
    max_vocab_size=None,
)
# NOTE(review): saved directly under '../data/', while the lexicon-expansion
# section loads '../data/Word2Vec/en.word2vec' — confirm the intended location.
model.save('../data/en.word2vec')

In [None]:
# Sanity check: the five nearest neighbours of 'man' in the trained model.
model.most_similar(positive=['man'], topn=5)

In [None]:
# Sanity check: the five nearest neighbours of 'woman' in the trained model.
model.most_similar(positive=['woman'], topn=5)

In [None]:
# Analogy-style query: words close to 'samsung' + 'smartphone' and away from 'phone'.
model.most_similar(positive=['samsung', 'smartphone'], negative=['phone'])

# Expand ANEW and ANSW
Spanish:

In [None]:
# Load the saved Spanish Word2Vec model from disk.
# NOTE(review): the training section saves to '../data/SentimentCorpus/es.word2vec';
# this path is different — confirm the file actually exists here.
es_model = gensim.models.Word2Vec.load('../data/Word2Vec/es.word2vec')

In [None]:
# Load the tab-separated ANSW lexicon, report its size, and preview the rows
# with the smallest 'Val-Mn-All' values.
answ = pd.read_csv('../data/SentimentCorpus/ANEW/ANSW.tsv', sep='\t')
print(len(answ))  # parenthesised form is valid on both Python 2 and Python 3
answ.sort_values(by='Val-Mn-All').head()

In [None]:
# Rows without a frequency count get a frequency of 1.
answ['Freq'] = answ['Freq'].fillna(1)

In [None]:
# Expand the lexicon: every embedding neighbour of an ANSW word inherits that
# word's scores, each multiplied by the similarity between the two words.
answ_score_cols = [u'Val-Mn-All', u'Val-Sd-All', u'Aro-Mn-All', u'Aro-Sd-All', 'Freq']
res = []

for idx, row in answ.iterrows():
    try:
        sims = es_model.most_similar(positive=[row['S-Word']], topn=100)
    except Exception:
        # Best-effort lookup: a word the model cannot resolve (presumably one
        # missing from its vocabulary) contributes no neighbours. Catching
        # Exception instead of using a bare `except` lets KeyboardInterrupt and
        # SystemExit still stop this long-running loop.
        continue

    for sim, weight in sims:
        # Keep only neighbours whose similarity exceeds 0.5.
        if weight > 0.5:
            r = {'word': sim}
            for c in answ_score_cols:
                r[c] = row[c] * weight
            res.append(r)

In [None]:
# Turn the list of per-neighbour records into a DataFrame, report its size,
# and preview the rows with the smallest 'Val-Mn-All' values.
res = pd.DataFrame(res)
print(len(res))  # parenthesised form is valid on both Python 2 and Python 3
res.sort_values(by='Val-Mn-All').head()

### Concat original ANSW

In [None]:
# Give the original ANSW rows the same column names as the expanded rows
# ('S-Word' becomes 'word'), then stack both sets into one frame.
original = answ[['Aro-Mn-All', 'Aro-Sd-All', 'Val-Mn-All', 'Val-Sd-All', 'Freq', 'S-Word']]
original.columns = ['Aro-Mn-All','Aro-Sd-All', 'Val-Mn-All', 'Val-Sd-All', 'Freq', 'word']
tmp = pd.concat([res, original])
print(len(tmp))  # parenthesised form is valid on both Python 2 and Python 3
tmp.sort_values(by='Val-Mn-All').head()

### Compress & Remove Duplicates

In [None]:
# Collapse duplicate words: a word that appears several times (as an original
# entry and/or as a neighbour of several entries) gets the plain mean of each score.
# NOTE(review): this cell overwrites `tmp`, its own input, so re-running it
# without first re-running the concat cell will fail.
res = tmp
answ_source_cols = ['Val-Mn-All', 'Val-Sd-All', 'Aro-Mn-All', 'Aro-Sd-All', 'Freq']
answ_target_cols = ['Valence', 'Valence.std', 'Arousal', 'Arousal.std', 'Frequency']

# word -> list of score tuples, one tuple per occurrence, ordered like answ_source_cols
res2 = defaultdict(list)
for idx, row in res.iterrows():
    res2[row['word']].append(tuple(float(row[col]) for col in answ_source_cols))

tmp = {'Valence': {}, 'Valence.std': {}, 'Arousal':{}, 'Arousal.std':{}, 'Frequency':{}}
# .items() works on both Python 2 and 3 (iteritems() exists only on Python 2).
for word, occurrences in res2.items():
    # zip(*occurrences) regroups the tuples column-wise so each score is averaged on its own.
    means = [sum(values) / float(len(values)) for values in zip(*occurrences)]
    for target, mean in zip(answ_target_cols, means):
        tmp[target][word] = mean

tmp = pd.DataFrame(tmp)
tmp.sort_values(by = 'Valence').head()

In [None]:
# Summary statistics of the averaged 'Frequency' column.
tmp.Frequency.describe()

In [None]:
# Report the size of the de-duplicated Spanish lexicon and write it to CSV.
res = tmp
print(len(res))  # parenthesised form is valid on both Python 2 and Python 3
res.to_csv('../data/expandedANSW.csv')

In [None]:
# Drop the references to the Spanish model and its result frame before the English pass.
es_model = res = None

### __English:__

In [None]:
# Load the saved English Word2Vec model from disk.
# NOTE(review): the training section saves to '../data/en.word2vec';
# this path is different — confirm the file actually exists here.
en_model = gensim.models.Word2Vec.load('../data/Word2Vec/en.word2vec')

In [None]:
# Load the ANEW lexicon and preview the rows with the smallest 'Valence' values.
anew = pd.read_csv('../data/SentimentCorpus/ANEW/anew_list.csv')
anew.sort_values(by='Valence').head()

In [None]:
# Rows without a frequency count get a frequency of 1. Original author's remark:
# "Not so good Turing" (presumably a nod to Good-Turing smoothing).
anew['Frequency'] = anew['Frequency'].fillna(1)

In [None]:
# Expand the lexicon: every embedding neighbour of an ANEW word inherits that
# word's scores, each multiplied by the similarity between the two words.
anew_score_cols = [u'Valence', u'std', u'Arousal', u'std.1', 'Frequency']
res = []

for idx, row in anew.iterrows():
    try:
        sims = en_model.most_similar(positive=[row['Description']], topn=100)
    except Exception:
        # Best-effort lookup: a word the model cannot resolve (presumably one
        # missing from its vocabulary) contributes no neighbours. Catching
        # Exception instead of using a bare `except` lets KeyboardInterrupt and
        # SystemExit still stop this long-running loop.
        continue

    for sim, weight in sims:
        # Keep only neighbours whose similarity exceeds 0.5.
        if weight > 0.5:
            r = {'word': sim}
            for c in anew_score_cols:
                r[c] = row[c] * weight
            res.append(r)

In [None]:
# Turn the list of per-neighbour records into a DataFrame, report its size,
# and preview the rows with the smallest 'Valence' values.
res = pd.DataFrame(res)
print(len(res))  # parenthesised form is valid on both Python 2 and Python 3
res.sort_values(by='Valence').head()

In [None]:
# Spot check: all expanded rows whose word is 'terrorists'.
res[(res.word == 'terrorists')]

### Concat original ANEW

In [None]:
# Give the original ANEW rows the same column names as the expanded rows
# ('Description' becomes 'word'), then stack both sets into one frame.
original = anew[['Arousal','std.1', 'Valence', 'std', 'Frequency' ,'Description']]
original.columns = ['Arousal', 'std.1', 'Valence', 'std', 'Frequency', 'word']
tmp = pd.concat([res, original])
print(len(tmp))  # parenthesised form is valid on both Python 2 and Python 3
tmp.sort_values(by='Valence').head()

### Remove Duplicates

In [None]:
# Collapse duplicate words: a word that appears several times (as an original
# entry and/or as a neighbour of several entries) gets the plain mean of each score.
# NOTE(review): this cell rebinds both `tmp` and `res`, so re-running it
# without first re-running the concat cell will fail.
res = tmp
anew_source_cols = ['Valence', 'std', 'Arousal', 'std.1', 'Frequency']
anew_target_cols = ['Valence', 'Valence.std', 'Arousal', 'Arousal.std', 'Frequency']

# word -> list of score tuples, one tuple per occurrence, ordered like anew_source_cols
res2 = defaultdict(list)
for idx, row in res.iterrows():
    res2[row['word']].append(tuple(float(row[col]) for col in anew_source_cols))

tmp = {'Valence': {}, 'Valence.std': {}, 'Arousal':{}, 'Arousal.std':{}, 'Frequency':{}}
# .items() works on both Python 2 and 3 (iteritems() exists only on Python 2).
for word, occurrences in res2.items():
    # zip(*occurrences) regroups the tuples column-wise so each score is averaged on its own.
    means = [sum(values) / float(len(values)) for values in zip(*occurrences)]
    for target, mean in zip(anew_target_cols, means):
        tmp[target][word] = mean

res = pd.DataFrame(tmp)
print(len(res))  # parenthesised form is valid on both Python 2 and Python 3
res.sort_values(by = 'Valence').head()

In [None]:
# Spot check: the averaged scores stored under the index label 'terrorists'.
# `.loc` is the label-based indexer; `.ix` is deprecated and absent from pandas >= 1.0.
res.loc['terrorists']

In [None]:
# Write the de-duplicated English lexicon to CSV (the word is the index column).
res.to_csv('../data/expandedANEW.csv')