#  Processamento de Linguagem Natural - Word2Vec

## Similaridade de Texto

Como computar a similaridade entre duas strings?

In [1]:
import warnings
warnings.filterwarnings("ignore")
from nltk.cluster.util import cosine_distance
from nltk import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
import codecs

In [2]:
a = 'Refrigerador Brastemp CFR45 20L frostfree'
b = 'Geladeira Brastemp CFR45 20L com desgelo automático'

In [3]:
# Tokens similares
tokensA = a.split()
tokensB = b.split()
set(tokensA).intersection(tokensB)

{'20L', 'Brastemp', 'CFR45'}

In [4]:
similar = len(set(tokensA).intersection(tokensB))
total = len(set(tokensA).union(tokensB))
print ('{} tokens similares de {} tokens: {:0.2f}% de similaridade'.format(similar, total, similar/total*100))

3 tokens similares de 9 tokens: 33.33% de similaridade


## Usando word2vec Para Computar Similaridades entre Vetores
https://radimrehurek.com/gensim/models/word2vec.html

Word2vec é um grupo de modelos relacionados que são usados para produzir word embeddings. Esses modelos são redes neurais artificiais de duas camadas que são treinadas para reconstruir contextos linguísticos de palavras. O Word2vec toma como entrada um grande corpus de texto e produz um espaço vetorial, tipicamente de várias centenas de dimensões, com cada palavra única no corpus sendo atribuída um vetor correspondente no espaço. Os vetores de palavras são posicionados no espaço vetorial de tal forma que as palavras que compartilham contextos comuns no corpus estão localizadas próximas umas das outras no espaço.

O Word2vec foi criado por uma equipe de pesquisadores liderada por Tomas Mikolov no Google. O algoritmo foi posteriormente analisado e explicado por outros pesquisadores. Incorporar vetores criados usando o algoritmo Word2vec tem muitas vantagens em comparação com algoritmos anteriores como Latent Semantic Analysis.

In [6]:
# Leitura do Corpus - Produtos
import codecs

In [7]:
# Carregando o conteúdo do Corpus para um objeto Python
with codecs.open('corpus.txt', encoding = 'utf8') as fp:
    corpus = fp.read()

In [11]:
corpus[:120]

'Kit com 4 Pneus de Alta Performance Pirelli Aro 16 205/55R16 Phantom\nChegou o kit que junta resistência e conforto, além'

In [12]:
from nltk import sent_tokenize, word_tokenize

In [13]:
# Tokenization com NLTK - este processo é demorado!!!
sentences = [[w.lower() for w in word_tokenize(sentence, language = 'portuguese')] 
             for sentence in sent_tokenize(corpus, language = 'portuguese')]

In [15]:
print(sentences[0])
print(sentences[1])

['kit', 'com', '4', 'pneus', 'de', 'alta', 'performance', 'pirelli', 'aro', '16', '205/55r16', 'phantom', 'chegou', 'o', 'kit', 'que', 'junta', 'resistência', 'e', 'conforto', ',', 'além', 'de', 'níveis', 'máximos', 'de', 'segurança', '.']
['são', '4', 'pneus', 'para', 'seu', 'carro', 'ficar', 'completo', 'e', 'com', 'a', 'qualificação', 'que', 'você', 'precisa', '.']


In [16]:
from gensim.models import Word2Vec

In [17]:
?Word2Vec

In [18]:
# Treinando o modelo
modelo = Word2Vec(sentences, size = 100, window = 5, min_count = 5, workers = 8)
modelo.init_sims(replace = True)

In [19]:
modelo.most_similar('geladeira')

[('cozinha', 0.603610098361969),
 ('sala', 0.6010788083076477),
 ('inverse', 0.5992774963378906),
 ('bebida', 0.5664677023887634),
 ('estação', 0.5561043620109558),
 ('garagem', 0.5558688640594482),
 ('torneira', 0.5476281046867371),
 ('banheira', 0.5454915761947632),
 ('facilite', 0.5445488691329956),
 ('condensadora', 0.529249370098114)]

In [20]:
modelo.most_similar('computador')

[('dispositivo', 0.7884957194328308),
 ('telefone', 0.787076473236084),
 ('pc', 0.7727987170219421),
 ('navegador', 0.7233446836471558),
 ('computador-', 0.7225916981697083),
 ('veículo', 0.6774432063102722),
 ('aplicativo', 0.6563814878463745),
 ('console', 0.656001091003418),
 ('conecte', 0.6487194895744324),
 ('pendrive', 0.635190486907959)]

In [21]:
modelo.most_similar('carregador')

[('veicular', 0.8345345854759216),
 ('adaptador', 0.8042702674865723),
 ('12v', 0.7721058130264282),
 ('recarregável', 0.7611072659492493),
 ('bivolt', 0.7256201505661011),
 ('bastão', 0.7100176811218262),
 ('lítio', 0.7043484449386597),
 ('dock', 0.7039474844932556),
 ('usb3.0', 0.696510374546051),
 ('rj45', 0.6964977383613586)]

### Dataset Brown - 1.014.312 words
Political
Sports
Society
Spot News
Financial
Cultural, etc

In [45]:
import gensim
from nltk.corpus import brown
model = gensim.models.Word2Vec(brown.sents())

In [26]:
model.similarity('university','school')

0.84210825691298152

In [46]:
model.doesnt_match('breakfast cereal dinner lunch'.split())

'cereal'

In [47]:
model.most_similar(positive=['woman','home'], negative=['man'], topn = 1)

[('clothes', 0.8275762796401978)]

In [48]:
model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1)

[('Spain', 0.937682032585144)]

In [49]:
model.most_similar('price')

[('specific', 0.9589784145355225),
 ('adjustment', 0.9588124752044678),
 ('substantial', 0.9587265253067017),
 ('actual', 0.9569110870361328),
 ('substance', 0.9552959203720093),
 ('wage', 0.9545634984970093),
 ('provision', 0.9526653289794922),
 ('requirement', 0.9524614810943604),
 ('analysis', 0.952355146408081),
 ('primarily', 0.9522107839584351)]

In [50]:
model.most_similar('money', topn=5)

[('care', 0.9107005596160889),
 ('chance', 0.8993101716041565),
 ('job', 0.8890410661697388),
 ('everything', 0.863770067691803),
 ('trouble', 0.8619292378425598)]

### Dataset subjectivity - temas variados

In [32]:
from nltk.corpus import subjectivity

In [33]:
model = gensim.models.Word2Vec(subjectivity.sents())

In [34]:
model.similarity('university','school')

0.99844252724117244

In [41]:
model.most_similar(positive=['woman', "work"], negative=['age'], topn = 1)

[('boy', 0.9965816736221313)]

In [42]:
modelo.most_similar('man')

[('fall', 0.8365045785903931),
 ('so', 0.8174225091934204),
 ('forum', 0.8065845966339111),
 ('her', 0.7995058298110962),
 ('yours', 0.7960962057113647),
 ('est', 0.7954405546188354),
 ('christina', 0.7952874898910522),
 ('gang', 0.7915248870849609),
 ('calvin', 0.7889474630355835),
 ('let', 0.7771402597427368)]

In [43]:
modelo.most_similar('book')

[('provides', 0.906433641910553),
 ('offers', 0.8989260196685791),
 ('approach', 0.8901066780090332),
 ('gives', 0.8850250244140625),
 ('through', 0.8822256326675415),
 ('read', 0.8804838061332703),
 ('comprehensive', 0.8787658214569092),
 ('starting', 0.8769986629486084),
 ('presents', 0.8768618702888489),
 ('great', 0.8727860450744629)]

### Dataset MovieReviews - Cinema

In [53]:
from nltk.corpus import movie_reviews, treebank

In [63]:
movie_reviews.sents()

[['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get', 'into', 'an', 'accident', '.'], ...]

In [66]:
print(movie_reviews.words()[:30])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his']


In [67]:
# aplica o algoritmo Word2Vec
mr = Word2Vec(movie_reviews.sents())

In [74]:
mr.most_similar('king')

[('princess', 0.8578184843063354),
 ('robin', 0.8438497185707092),
 ('captain', 0.8431352376937866),
 ('neil', 0.8429046869277954),
 ('edward', 0.8417447805404663),
 ('chris', 0.8391830325126648),
 ('george', 0.8385847806930542),
 ('david', 0.8332217931747437),
 ('paul', 0.8303455114364624),
 ('jackson', 0.8275794982910156)]

In [75]:
mr.most_similar('father')

[('mother', 0.9693006277084351),
 ('wife', 0.9567008018493652),
 ('son', 0.9455345869064331),
 ('brother', 0.9371278882026672),
 ('daughter', 0.9293205738067627),
 ('husband', 0.9215914011001587),
 ('girlfriend', 0.9062857627868652),
 ('sister', 0.8998033404350281),
 ('boyfriend', 0.8947833776473999),
 ('boss', 0.8794924020767212)]

In [76]:
mr.most_similar('bad')

[('good', 0.8460168838500977),
 ('cool', 0.7931435108184814),
 ('stupid', 0.7368031740188599),
 ('funny', 0.7174288034439087),
 ('nice', 0.7068173885345459),
 ('dumb', 0.7039850354194641),
 ('great', 0.6515941619873047),
 ('scary', 0.643081545829773),
 ('tough', 0.6386542916297913),
 ('basically', 0.63398277759552)]

In [77]:
mr.most_similar('horror')

[('slasher', 0.8947128653526306),
 ('science', 0.865003228187561),
 ('fiction', 0.8451724052429199),
 ('animated', 0.8113899827003479),
 ('action', 0.7863391637802124),
 ('disaster', 0.7821124792098999),
 ('teen', 0.7691265344619751),
 ('genre', 0.7654432058334351),
 ('budget', 0.7644156217575073),
 ('pulp', 0.7450346946716309)]

In [78]:
mr.most_similar('city')

[('planet', 0.7939170598983765),
 ('york', 0.7932645082473755),
 ('space', 0.7749285697937012),
 ('south', 0.7659956812858582),
 ('world', 0.7627872228622437),
 ('apes', 0.7576571702957153),
 ('angels', 0.7515949010848999),
 ('united', 0.7486950755119324),
 ('war', 0.7461063861846924),
 ('jungle', 0.7420922517776489)]

In [79]:
mr.most_similar('couples')

[('caricatures', 0.895542323589325),
 ('cardboard', 0.8932735919952393),
 ('standouts', 0.891860842704773),
 ('barrels', 0.8906723260879517),
 ('musicians', 0.8880579471588135),
 ('passengers', 0.8830095529556274),
 ('males', 0.878976047039032),
 ('oppression', 0.8779348134994507),
 ('artists', 0.8778836727142334),
 ('painted', 0.8760911226272583)]

### Dataset brasileiro - Floresta (textos variados)

In [73]:
from nltk.corpus import floresta
floresta.sents()

[['Um', 'revivalismo', 'refrescante'], ['O', '7_e_Meio', 'é', 'um', 'ex-libris', 'de', 'a', 'noite', 'algarvia', '.'], ...]

In [80]:
f = Word2Vec(floresta.sents(), size = 100, window = 5, min_count = 5, workers = 8)
f.init_sims(replace = True)

In [81]:
f.most_similar('casa')

[('passou', 0.9998331069946289),
 ('meio', 0.9997837543487549),
 ('Folha', 0.9997823238372803),
 ('outras', 0.9997642636299133),
 ('hora', 0.9997502565383911),
 ('partida', 0.9997397661209106),
 ('rede', 0.9997308254241943),
 ('jogadores', 0.9997305870056152),
 ('parar', 0.999725341796875),
 ('reunião', 0.9996980428695679)]

In [86]:
f.most_similar('política', topn=9)

[('história', 0.9997707605361938),
 ('caso', 0.9997049570083618),
 ('morte', 0.9996691346168518),
 ('está', 0.9996117353439331),
 ('já', 0.9995875358581543),
 ('vai', 0.9995788931846619),
 ('rua', 0.9995462894439697),
 ('também', 0.9995440244674683),
 ('depois', 0.9995191097259521)]

In [89]:
f.most_similar('noite', topn=20)

[('parte', 0.9997426271438599),
 ('já', 0.9996792674064636),
 ('através', 0.9996192455291748),
 ('Europa', 0.9996017217636108),
 ('depois', 0.999599277973175),
 ('Segundo', 0.9995843768119812),
 ('bem_como', 0.9995585680007935),
 ('mais', 0.9995459914207458),
 ('último', 0.9995173215866089),
 ('abertura', 0.9994987845420837),
 ('morte', 0.9994944930076599),
 ('mesma', 0.999457836151123),
 ('primeira', 0.9994251728057861),
 ('produção', 0.9994106292724609),
 ('este', 0.9994090795516968),
 ('cinema', 0.9994083046913147),
 ('até', 0.9993873834609985),
 ('fim', 0.999384880065918),
 ('trabalho', 0.9993780255317688),
 ('política', 0.9993778467178345)]