# **Извлечение коллокаций + NER**



## Три способа извлечения наименований товаров



1.   Взять родовое понятие, обучить word2vec на текстах отзывах, получить близкие к родовому слову синонимы и доставать из текстов эти слова. Минусы: не удастся вытащить прям все, потому что в отзывах могут и вовсе не встречаться родовые понятия, кроме того, многие родовые слова для музыкальных инструментов состоят не из одного слова, что затрудняет автоматический выбор синонимов.
2.   Взять из метадаты названия и добавить их в словарь к родовым понятиям. Плюсы: высокая полнота. Минусы: очень много ручной работы.
3.   Взять родовые понятия в word2vec, взять синонимы к ним, заменить их на какой-нибудь тег типа NE. Обучить модель предсказывать по контексту будет ли там NE или нет. Плюсы: так можно предсказывать также названия. Минусы: проблемы с длинной NE, так как названия могут состоять из нескольких слов.





В качестве датасета я вырала [датасет](http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz) Musical Instruments

In [90]:
import json
import nltk
import gzip
import re
import gensim
import pandas as pd
import tqdm
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from sklearn.utils import shuffle
from nltk.collocations import *
from string import punctuation
from collections import Counter

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [26]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [105]:
!wget 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Digital_Music_5.json.gz'

--2022-12-14 16:08:24--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Digital_Music_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 32052614 (31M) [application/x-gzip]
Saving to: ‘reviews_Digital_Music_5.json.gz.1’


2022-12-14 16:08:30 (5.37 MB/s) - ‘reviews_Digital_Music_5.json.gz.1’ saved [32052614/32052614]



In [106]:
reviews = getDF('reviews_Digital_Music_5.json.gz')

In [107]:
def preprocess(text, stopwords, delete_stopwords=False):
    text = text.lower().strip()
    for char in punctuation:
        text = text.replace(char, " ")
    tokens = text.split()
    new_text = []
    for i in tokens:
        if delete_stopwords:
            if i not in stopwords:
                new_text.append(i)
        else:
            new_text.append(i)
    return new_text
stop = stopwords.words('english')

In [108]:
reviews = shuffle(reviews)

In [109]:
corpus = [preprocess(i, stop) for i in reviews[:300000].reviewText.tolist()]

In [110]:
pd.set_option('display.max_colwidth', None)

In [111]:
reviews['reviewText'][:30]

52357                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [112]:
model = gensim.models.Word2Vec(corpus, window=2)

Возьмём в качестве родовых понятий `sound`, `music`, `musical`, `record`, `stringed`, `instrument`




In [114]:
entities = []
for i in model.wv.most_similar('sound', topn=15):
    entities.append(i[0])

In [115]:
for i in model.wv.most_similar('music', topn=15):
    entities.append(i[0])

In [116]:
for i in model.wv.most_similar('album', topn=15):
    entities.append(i[0])

In [117]:
for i in model.wv.most_similar('genre', topn=15):
    entities.append(i[0])

In [118]:
for i in model.wv.most_similar('guitar', topn=15):
    entities.append(i[0])

In [119]:
for i in model.wv.most_similar('song', topn=15):
    entities.append(i[0])

In [120]:
for i in model.wv.most_similar('rock', topn=15):
    entities.append(i[0])

In [121]:
entities = [lemmatizer.lemmatize(i) for i in entities]

In [122]:
entities = set(entities)
entities = list(entities)
len(entities)

91

In [123]:
def get_mention(text, entities):
    mentions = []
    for word in text.split():
        if lemmatizer.lemmatize(word) in entities:
            mentions.append(word)
    return mentions

In [124]:
new_corpus = []

for i in corpus:
    new_corpus.append(' '.join(i))

In [125]:
for i in range(len(new_corpus)):
    new_corpus[i] = ' '.join(preprocess(new_corpus[i], stop, True))

In [126]:
bigrams = ngrams(' '.join(new_corpus).split(), 2)

# Считаем эн-граммы

In [127]:
bi = Counter(bigrams)

In [128]:
bi_entities = []
for i in bi.most_common():
    if lemmatizer.lemmatize(i[0][0]) in entities or lemmatizer.lemmatize(i[0][1]) in entities:
        bi_entities.append(' '.join([i[0][0], i[0][1]]))

In [129]:
trigrams = ngrams(' '.join(new_corpus).split(), 3)
tri = Counter(trigrams)
tri_entities = []
for i in tri.most_common():
    if lemmatizer.lemmatize(i[0][0]) in entities or lemmatizer.lemmatize(i[0][1]) in entities or lemmatizer.lemmatize(i[0][2]) in entities:
        tri_entities.append(' '.join([i[0][0], i[0][1], i[0][2]]))

# Коллокации

In [130]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(' '.join(new_corpus).split())
finder.apply_freq_filter(20)

In [131]:
pmi = []
for item in entities:
    rate = [i for i in finder.nbest(bigram_measures.pmi, 1000000) if item in i]
    pmi.append([' '.join(i) for i in rate])

In [132]:
pmi[0][:30]

['minneapolis sound',
 'sound affects',
 'surround sound',
 'sound effects',
 'fuller sound',
 'sound revenge',
 'sound alike',
 'sound quality',
 'sound dated',
 'signature sound',
 'wall sound',
 'distinctive sound',
 'organic sound',
 'distinct sound',
 'philly sound',
 'sound system',
 'remastered sound',
 'layers sound',
 'improved sound',
 'unique sound',
 'trademark sound',
 'sound silence',
 'safe sound',
 'retro sound',
 'sound fresh',
 'sound similar',
 'doesnt sound',
 'polished sound',
 'motown sound',
 'oriented sound']

In [133]:
jaccard = []
for item in entities:
    rate = [i for i in finder.nbest(bigram_measures.jaccard, 1000000) if item in i]
    jaccard.append([' '.join(i) for i in rate])

In [134]:
jaccard[0][:30]

['sound quality',
 'sound like',
 'sound effects',
 'unique sound',
 'rock sound',
 'songs sound',
 'sound good',
 'overall sound',
 'new sound',
 'wall sound',
 'sound much',
 'guitar sound',
 'sound great',
 'may sound',
 'sound fresh',
 'sound similar',
 'sound bit',
 'signature sound',
 'band sound',
 'remastered sound',
 'make sound',
 'pop sound',
 'sound better',
 'makes sound',
 'different sound',
 'sound dated',
 'sound little',
 'would sound',
 'vocals sound',
 'sound alike']

In [135]:
dice = []
for item in entities:
    rate = [i for i in finder.nbest(bigram_measures.dice, 1000000) if item in i][:150]
    dice.append([' '.join(i) for i in rate])

In [136]:
dice[0][:30]

['sound quality',
 'sound like',
 'sound effects',
 'unique sound',
 'rock sound',
 'songs sound',
 'sound good',
 'overall sound',
 'new sound',
 'wall sound',
 'sound much',
 'guitar sound',
 'sound great',
 'may sound',
 'sound fresh',
 'sound similar',
 'sound bit',
 'signature sound',
 'band sound',
 'remastered sound',
 'make sound',
 'pop sound',
 'sound better',
 'makes sound',
 'different sound',
 'sound dated',
 'sound little',
 'would sound',
 'vocals sound',
 'sound alike']

In [137]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = TrigramCollocationFinder.from_words(' '.join(new_corpus).split())
finder.apply_freq_filter(20)

In [138]:
pmi_tri = []
for item in entities:
    rate = [i for i in finder.nbest(trigram_measures.pmi, 1000000000) if item in i][:150]
    pmi_tri.append([' '.join(i) for i in rate])
pmi_tri[0][:30]

['west coast sound',
 'new wave sound',
 'sound quality excellent',
 'old school sound',
 'quality sound excellent',
 'songs sound alike',
 'hip hop sound',
 'r b sound',
 'still sound fresh',
 'supposed sound like',
 'sound quality good',
 'sound exactly like',
 'better sound quality',
 'songs sound similar',
 'great sound quality',
 'good sound quality',
 'trying sound like',
 'sound quality great',
 'hard rock sound',
 'sound like anything',
 'sound much better',
 'may sound like',
 'sound like recorded',
 'makes sound like',
 'making sound like',
 'sound like something',
 'sound pretty good',
 'pop rock sound',
 'might sound like',
 'sound much like']

In [139]:
jaccard_tri = []
for item in entities:
    rate = [i for i in finder.nbest(trigram_measures.jaccard, 1000000000) if item in i][:150]
    jaccard_tri.append([' '.join(i) for i in rate])
jaccard_tri[0][:30]

['r b sound',
 'sound quality excellent',
 'hip hop sound',
 'songs sound like',
 'west coast sound',
 'quality sound excellent',
 'new wave sound',
 'sound quality good',
 'sound much like',
 'old school sound',
 'hard rock sound',
 'would sound like',
 'sound like something',
 'better sound quality',
 'great sound quality',
 'sound much better',
 'good sound quality',
 'still sound fresh',
 'make sound like',
 'may sound like',
 'makes sound like',
 'sound quality great',
 'really sound like',
 'pop rock sound',
 'songs sound similar',
 'trying sound like',
 'sound pretty good',
 'many songs sound',
 'sound exactly like',
 'sound like could']

In [141]:
likelihood = []
for item in entities:
    rate = [i for i in finder.nbest(trigram_measures.likelihood_ratio, 1000000000) if item in i][:150]
    likelihood.append([' '.join(i) for i in rate])
likelihood[0][:30]

['hip hop sound',
 'r b sound',
 'west coast sound',
 'sound pretty good',
 'old school sound',
 'new wave sound',
 'sound much better',
 'hard rock sound',
 'songs sound like',
 'sound like something',
 'make sound like',
 'may sound like',
 'makes sound like',
 'sound like old',
 'quot sound like',
 'trying sound like',
 'sound like one',
 'song sound like',
 'making sound like',
 'supposed sound like',
 'would sound like',
 'really sound like',
 'might sound like',
 'vocals sound like',
 'sound like could',
 'sound like anything',
 'album sound like',
 'sound like recorded',
 'music sound like',
 'beats sound like']

# Пять примеров

In [142]:
examples = {}
for idx, item in enumerate(entities[:5]):
    examples[item] = jaccard[idx][:5] + likelihood[idx][:5]

In [156]:
for key,val in examples.items():
  print('\n', key,'\n---\n', '\n'.join(val))


 sound 
---
 sound quality
sound like
sound effects
unique sound
rock sound
hip hop sound
r b sound
west coast sound
sound pretty good
old school sound

 style 
---
 vocal style
unique style
musical style
singing style
change style
hip hop style
style hip hop
r b style
style r b
laid back style

 rap 
---
 gangsta rap
rap game
rap fan
coast rap
southern rap
rap hip hop
hip hop rap
one best rap
west coast rap
old school rap

 effort 
---
 sophomore effort
solid effort
solo effort
latest effort
sophmore effort
first solo effort

 genre 
---
 genre bending
sub genre
metal genre
musical genre
genre music
