In [3]:
import os
import sys
import re
import numpy as np
import pandas
import gensim
import collections
import pandas as pd
import gzip

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("darkgrid")

from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.decomposition import IncrementalPCA
from sklearn.manifold import TSNE
from collections import defaultdict
from nltk.corpus import stopwords

# Gzip-compressed corpus; it is read below as UTF-8 text and split on '\n'.
DATA_FILE = '../data/all_titles_sw.txt.gz'
# Directory that receives the exported embedding and the accuracy CSV.
EMBEDDING_OUT_PATH = '../w2v/embeddings/'

## Read Data

In [4]:
# Load the whole gzipped corpus into memory and split it into one raw string per line.
with gzip.open(DATA_FILE, 'rb') as f:
    file_content = f.read()
# str.split('\n') keeps a trailing empty string when the file ends with a newline.
all_sentences = file_content.decode('utf-8').split('\n')

# Read the project stop-word list straight from disk. The previous call,
# stopwords.words(sw_path), pushed an absolute path through NLTK's stop-word
# corpus reader: it only worked when the NLTK "stopwords" corpus was installed
# and depended on how the reader joins paths. Reading the file directly needs
# neither and produces the same list: one entry per line, blank lines dropped.
sw_path = os.path.abspath("../stopwords.txt")
with open(sw_path, encoding='utf-8') as sw_file:
    sw = [line for line in sw_file.read().splitlines() if line.rstrip()]

## Build Data

In [3]:
def sent_to_words(sentences):
    """Lazily convert each sentence into a list of tokens.

    Every item of ``sentences`` is coerced with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One token list is
    yielded per input item, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_sentence in sentences:
        # NOTE(review): per the gensim docs, deacc=True strips accent marks;
        # punctuation is removed by the tokenizer itself, not by this flag.
        yield tokenize(str(raw_sentence), deacc=True)
# Materialize the tokenized corpus. This rebinds `all_sentences` from raw strings
# to token lists, so the cell is not idempotent: re-running it alone would pass
# each token list back through str() inside sent_to_words.
all_sentences  = list(sent_to_words(all_sentences))
# Peek at the first tokenized document.
all_sentences[:1]

[['corneal',
  'endothelial',
  'deposits',
  'associated',
  'with',
  'rifabutin',
  'use']]

In [4]:
def make_bigrams(bigram, texts):
    """Return a list holding ``bigram[doc]`` for every document in ``texts``.

    ``bigram`` only needs to support indexing by a document
    (presumably a gensim phrase model -- verify against the caller).
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram[tokens])
    return phrased_docs

In [5]:
def make_trigrams(bigram, trigram, texts):
    """Return a list holding ``trigram[bigram[doc]]`` for every document in ``texts``.

    Each document is first indexed into ``bigram`` and that result is then
    indexed into ``trigram``.
    """
    merged_docs = []
    for tokens in texts:
        bigram_tokens = bigram[tokens]
        merged_docs.append(trigram[bigram_tokens])
    return merged_docs

In [6]:
# Build the bigram and trigram phrase models.
# First pass: learn phrases directly from the tokenized corpus.
bigram = Phrases(all_sentences, min_count=5, threshold=100) # higher threshold fewer phrases.
# Second pass: learn phrases over the bigram-transformed corpus; min_count is left at its default here.
trigram = Phrases(bigram[all_sentences], threshold=100)  

In [7]:
# Apply the bigram model to every document.
# NOTE(review): the following cell rebinds `data_ngrams` to the trigram output, so this
# bigram-only result is not used by any later cell visible in this notebook.
data_ngrams = make_bigrams(bigram, all_sentences)

In [8]:
# Apply the bigram model and then the trigram model to every document; this is the
# corpus used for the frequency count and for Word2Vec training below.
data_ngrams = make_trigrams(bigram, trigram, all_sentences)

In [9]:
# Number of tokenized documents in the corpus.
len(all_sentences)

2160491

In [10]:
# Count how often each token (including merged n-grams) occurs across all documents.
# The counts are tallied in one pass and stored in a defaultdict(int), keyed in
# first-seen order.
token_counts = collections.Counter(
    token for doc_tokens in data_ngrams for token in doc_tokens
)
word_freq = defaultdict(int, token_counts)
# Number of distinct tokens.
len(word_freq)

422774

#### for data with stopwords

In [11]:
# Ten most frequent tokens that are not stop words, as an OrderedDict of
# token -> count in descending frequency order.
# A set gives O(1) membership tests; the original scanned the `sw` list once per
# vocabulary entry and filtered the entire vocabulary before keeping ten items.
stopword_set = set(sw)
ranked_items = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
top10 = collections.OrderedDict()
for token, count in ranked_items:
    if token in stopword_set:
        continue
    top10[token] = count
    if len(top10) == 10:
        # Stop as soon as ten entries are collected.
        break
top10

OrderedDict([('brazil', 144894),
             ('study', 106474),
             ('analysis', 88876),
             ('effects', 85923),
             ('brazilian', 81035),
             ('patients', 78301),
             ('evaluation', 62146),
             ('activity', 60475),
             ('rats', 53699),
             ('properties', 47204)])

#### for data without stopwords

In [12]:
#top10 = collections.OrderedDict(sorted(word_freq.items(), key=lambda x:x[1], reverse=True)[:10])
#top10

## Build Model

In [13]:
# Train Word2Vec on the n-gram-merged corpus.
# NOTE(review): `size` and `iter` look like the gensim 3.x keyword names (gensim 4
# documents `vector_size` and `epochs`) -- confirm the pinned gensim version before upgrading.
# NOTE(review): no `seed` is passed, so repeated runs are not expected to reproduce
# the exact similarity scores shown below -- confirm whether reproducibility matters here.
model = Word2Vec(data_ngrams, 
                 min_count=3,   # Ignore words that appear fewer than this many times
                 size=300,      # Dimensionality of word embeddings
                 workers=10,    # Number of worker threads (parallelisation)
                 window=5,      # Context window for words during training
                 iter=30)       # Number of training passes over the corpus

In [14]:
# Size of the trained model's vocabulary.
# NOTE(review): `wv.vocab` appears to be gensim 3.x API -- confirm before upgrading gensim.
len(model.wv.vocab)

224630

## Test Model

In [15]:
# Sanity check: nearest neighbours of a single query word in the embedding space.
model.wv.most_similar(positive='wireless')

[('wireless_networks', 0.5335965752601624),
 ('wireless_sensor_networks', 0.5160489082336426),
 ('wireless_sensor', 0.4690731167793274),
 ('bluetooth', 0.4526071846485138),
 ('ieee', 0.4511309266090393),
 ('reconfigurable', 0.4503900408744812),
 ('mimo', 0.44723522663116455),
 ('energyefficient', 0.4468957185745239),
 ('ofdm', 0.4348280429840088),
 ('wlan', 0.42918577790260315)]

In [23]:
# One column per top-10 token; each cell holds a (neighbour, similarity) tuple as
# returned by most_similar, with rows ordered by rank.
neighbours_by_word = {
    query: model.wv.most_similar(positive=query) for query in top10.keys()
}
df = pd.DataFrame(neighbours_by_word, columns=top10.keys())
df

Unnamed: 0,brazil,study,analysis,effects,brazilian,patients,evaluation,activity,rats,properties
0,"(brasil, 0.8688706159591675)","(studies, 0.7355968356132507)","(analyses, 0.7738421559333801)","(effect, 0.9310852289199829)","(brasilian, 0.848153829574585)","(patient, 0.8059606552124023)","(assessment, 0.7546509504318237)","(activities, 0.8008769750595093)","(rat, 0.827140212059021)","(proprieties, 0.6129109859466553)"
1,"(brazilian, 0.8031655550003052)","(investigation, 0.6739801168441772)","(analisys, 0.6831382513046265)","(influence, 0.7686419486999512)","(brazil, 0.80316561460495)","(patientes, 0.7938013076782227)","(study, 0.6117603778839111)","(acitivity, 0.5780574083328247)","(mice, 0.7445719242095947)","(property, 0.5564204454421997)"
2,"(brazi, 0.7469620704650879)","(analysis, 0.6443814039230347)","(study, 0.6443814039230347)","(impact, 0.6441784501075745)","(brasil, 0.6299118995666504)","(individuals, 0.7883312106132507)","(analysis, 0.5995725989341736)","(activityof, 0.570565938949585)","(rabbits, 0.6112174987792969)","(propertiesof, 0.5214764475822449)"
3,"(brasilian, 0.6604605913162231)","(evaluation, 0.6117604970932007)","(evaluation, 0.5995726585388184)","(efects, 0.6215373277664185)","(braziliam, 0.6071518659591675)","(subjects, 0.7555750608444214)","(evalution, 0.5922452211380005)","(activies, 0.5061926245689392)","(ratsa, 0.5670409202575684)","(propeties, 0.47726237773895264)"
4,"(spain, 0.6341298222541809)","(assessment, 0.5354693531990051)","(assessment, 0.5787420272827148)","(influences, 0.6062849760055542)","(spanish, 0.5729133486747742)","(women, 0.7411853075027466)","(assesment, 0.5624517202377319)","(acivity, 0.48496779799461365)","(mouse, 0.5656639933586121)","(behavior, 0.4641653895378113)"
5,"(brazilstrongp, 0.6220083236694336)","(investigations, 0.5060980319976807)","(analisis, 0.5593183040618896)","(efect, 0.5589128136634827)","(sao_paulo, 0.5689626932144165)","(children, 0.7381284236907959)","(evaluations, 0.5607291460037231)","(activy, 0.47533169388771057)","(wistar, 0.5301536917686462)","(characteristics, 0.4538622200489044)"
6,"(portugal, 0.6136013269424438)","(stydy, 0.48788881301879883)","(investigation, 0.5209058523178101)","(role, 0.4827345013618469)","(brazils, 0.5603929758071899)","(pacients, 0.718267023563385)","(investigation, 0.5300816297531128)","(activiy, 0.46622419357299805)","(rabbit, 0.5016534328460693)","(stability, 0.4474780559539795)"
7,"(brazils, 0.60157310962677)","(sudy, 0.46877536177635193)","(studies, 0.5072707533836365)","(depends, 0.4525492787361145)","(brazillian, 0.5571433305740356)","(outpatients, 0.6770029067993164)","(evaluating, 0.5208792686462402)","(activites, 0.44755539298057556)","(dogs, 0.5006157159805298)","(proprierties, 0.43291914463043213)"
8,"(argentina, 0.5932648181915283)","(sutdy, 0.46010395884513855)","(analysys, 0.47804951667785645)","(impacts, 0.43384480476379395)","(latin_american, 0.5461232662200928)","(patiens, 0.643639862537384)","(assessing, 0.4701904058456421)","(actvity, 0.4389968812465668)","(hamsters, 0.49119776487350464)","(behaviour, 0.4326108694076538)"
9,"(sao_paulo, 0.5817981958389282)","(analisys, 0.4498993158340454)","(modeling, 0.45176786184310913)","(depending, 0.4296967387199402)","(rio_de_janeiro, 0.5454093217849731)","(adults, 0.6371790170669556)","(evaluate, 0.4666847288608551)","(action, 0.4334433078765869)","(balbc_mice, 0.47453898191452026)","(activity, 0.3933919668197632)"


## Save Model

In [17]:
# Derive the output file names from the corpus file name with its extensions removed.
# The previous fixed `[:-4]` slice removed four characters, but '.gz' is only three:
# 'all_titles_sw.txt.gz' became 'all_titles_sw.tx', so the outputs were written as
# '*.tx.bin' and '*.tx.csv'. Stripping the known suffixes explicitly yields
# 'all_titles_sw.bin' / 'all_titles_sw.csv' and still handles a plain '.txt' input.
# NOTE(review): anything that loads the old '*.tx.bin' file must be pointed at the new name.
dataset_stem = os.path.basename(DATA_FILE)
for extension in ('.gz', '.txt'):
    if dataset_stem.endswith(extension):
        dataset_stem = dataset_stem[:-len(extension)]
embedding_name = dataset_stem + '.bin'
accuracy_file = dataset_stem + '.csv'

In [18]:
#model.save(EMBEDDING_OUT_PATH + embedding_name)

In [19]:
#model = Word2Vec.load("../lattes/embedding/embedding")

In [20]:
# Export the word vectors in binary word2vec format.
# Create the output directory first so a missing folder does not fail the export
# after the long training run; os.path.join keeps the path valid even if
# EMBEDDING_OUT_PATH loses its trailing slash.
os.makedirs(EMBEDDING_OUT_PATH, exist_ok=True)
model.wv.save_word2vec_format(os.path.join(EMBEDDING_OUT_PATH, embedding_name), binary=True)

In [21]:
# Evaluate the model on the analogy benchmark file.
# The path was hard-coded to one user's home directory; it can now be overridden with
# the QUESTIONS_WORDS_PATH environment variable. The default is the original location,
# so existing runs behave as before.
questions_path = os.environ.get(
    'QUESTIONS_WORDS_PATH', '/home/antonio/lattes/questions-words.txt'
)
# NOTE(review): this cell's recorded output is a warning fragment; `Word2Vec.accuracy`
# is presumably deprecated in the installed gensim -- confirm the replacement API and
# its return shape before switching, since the next cell feeds `accuracy` to a DataFrame.
accuracy = model.accuracy(questions_path)

  """Entry point for launching an IPython kernel.


In [22]:
# Persist the accuracy results as a '|'-separated CSV next to the embedding.
# Uses the `pd` alias like the rest of the notebook, builds the path with os.path.join,
# and makes sure the output directory exists before writing.
os.makedirs(EMBEDDING_OUT_PATH, exist_ok=True)
pd.DataFrame(accuracy).to_csv(os.path.join(EMBEDDING_OUT_PATH, accuracy_file), sep='|')