In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.test.utils import datapath
from gsdmm import MovieGroupProcess

# KeyBERT
from keybert import KeyBERT

# spacy
import spacy

# gensim logging
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/theo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [3]:
df = pd.read_csv('../train_40k.csv')

In [4]:
# convert to list
data = df.Text.values.tolist()

# remove emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# remove single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['The description and photo on this product needs to be changed to indicate '
 'this product is the BuffalOs version of this beef jerky.']


In [5]:
# split senteces to list of words
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['the', 'description', 'and', 'photo', 'on', 'this', 'product', 'needs', 'to', 'be', 'changed', 'to', 'indicate', 'this', 'product', 'is', 'the', 'buffalos', 'version', 'of', 'this', 'beef', 'jerky']]


In [6]:
# Build bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# show trigram
print(trigram_mod[bigram_mod[data_words[0]]])

['the', 'description', 'and', 'photo', 'on', 'this', 'product', 'needs', 'to', 'be', 'changed', 'to', 'indicate', 'this', 'product', 'is', 'the', 'buffalos', 'version', 'of', 'this', 'beef_jerky']


In [7]:
# stopwords, bigrams, trigrams, lemmatization functions
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

In [8]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Make Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 
nlp = spacy.load("en_core_web_sm")

# Do lemmatization
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['description', 'photo', 'product', 'need', 'change', 'indicate', 'product', 'version']]


In [9]:
print(data_lemmatized[0])

['description', 'photo', 'product', 'need', 'change', 'indicate', 'product', 'version']


In [10]:
dictionary_sample = gensim.corpora.Dictionary(data_lemmatized)
print(dictionary_sample)

Dictionary(33665 unique tokens: ['change', 'description', 'indicate', 'need', 'photo']...)


In [11]:
sample_vocab_length = len(dictionary_sample)

In [12]:
# initialize GSDMM
gsdmm = MovieGroupProcess(K=50, alpha=0.1, beta=0.1, n_iters=10)

y = gsdmm.fit(data_lemmatized, sample_vocab_length)

In stage 0: transferred 38405 clusters with 50 clusters populated
In stage 1: transferred 21353 clusters with 50 clusters populated
In stage 2: transferred 9535 clusters with 46 clusters populated
In stage 3: transferred 6894 clusters with 39 clusters populated
In stage 4: transferred 5713 clusters with 35 clusters populated
In stage 5: transferred 4814 clusters with 35 clusters populated
In stage 6: transferred 4297 clusters with 34 clusters populated
In stage 7: transferred 4028 clusters with 34 clusters populated
In stage 8: transferred 4003 clusters with 34 clusters populated
In stage 9: transferred 3916 clusters with 33 clusters populated


In [14]:
top_words(gsdmm.cluster_word_distribution, top_index, 20)


Cluster 12 : [('toy', 4933), ('love', 4690), ('play', 4208), ('old', 4153), ('get', 3444), ('game', 3005), ('year', 2847), ('buy', 2830), ('great', 2774), ('son', 2211), ('fun', 2040), ('make', 2000), ('time', 1994), ('little', 1973), ('daughter', 1921), ('month', 1816), ('well', 1627), ('kid', 1622), ('really', 1615), ('good', 1553)]

Cluster 49 : [('get', 3345), ('use', 2437), ('make', 2430), ('product', 2401), ('time', 2330), ('work', 2267), ('well', 2186), ('go', 1943), ('good', 1832), ('take', 1791), ('buy', 1786), ('also', 1623), ('try', 1615), ('little', 1468), ('first', 1442), ('day', 1435), ('even', 1433), ('really', 1414), ('find', 1406), ('much', 1390)]

Cluster 28 : [('get', 1213), ('buy', 1081), ('well', 1064), ('great', 1036), ('product', 958), ('make', 895), ('love', 877), ('use', 822), ('easy', 760), ('bed', 753), ('good', 742), ('put', 720), ('work', 709), ('baby', 666), ('also', 636), ('fit', 635), ('dog', 635), ('go', 631), ('little', 626), ('look', 622)]

Cluster 3

<h1>Predict Unseen<h1>

In [15]:
def sent_to_words(sentence):
    yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [16]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [17]:
def make_bigrams(texts):
    bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in texts]

In [18]:
def make_trigrams(texts):
    bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram = gensim.models.Phrases(bigram[texts], threshold=100)  
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [19]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    nlp = spacy.load("en_core_web_sm")
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [20]:
def formatUnseen(unseen_document):
    data_words = list(sent_to_words(unseen_document))
    # Remove Stop Words
    data_words_nostops = remove_stopwords(sent_to_words(data_words))
    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return data_lemmatized

In [21]:
from operator import itemgetter
def getTopTopic(unseen_document):
    data_lemmatized = formatUnseen(unseen_document)
    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)
    # Create Corpus
    other_texts = data_lemmatized
    other_corpus = [id2word.doc2bow(text) for text in other_texts]
    unseen_doc = other_corpus[0]
    vector = gsdmm.score(unseen_doc)
    print(vector)
    # get the element that has the highest score
# this will then be the topic that fits the unseen_document the best
    index = vector.index(max(vector))
    print(vector[index])
    print(index)
    return index

In [25]:
# Selected sentneces to test by uncommenting
# unseen_document = "Beetlejuice is a movie I consider to be one of Tim Burton's best movies. I also consider it to be one of those kind of movies that could have come out only in the 80s (much like Labyrinth starring David Bowie and Jennifer Connelly).Beetlejuice deals with a recently deceased married couple, the Maitlands, who finds themselves essentially "'trapped'" in their former house for the next century or so. Unfortunately, this means living with the house's new owners, the Deetz. The Maitlands don't mind Deetz daughter Lydia played by Winona Ryder so much but her much more obnoxious parents and want to scare them away from the house. Their case worker tells them only one thing"
# unseen_document = "I was looking for some blue wax because I noticed that in European Wax Center usethat kind of wax and it is so less painfull. I really think that this product has the same effect. The kit is perfectly designed to make your waxing easier and with minimal pain as possible."
unseen_document = "I have been having eye irritation and issues for a while now. Every eye makeup remover burns and irritates my eyes. However, when I use jojoba oil, my make up comes off easily and there is no irritation. It is the best thing ever!!!! You can read the other reviews to see the other AMAZING benefits of it :)"
# unseen_document = "I am a very picky person when it comes to men's cologne and women's perfumes. I only use Herrera. Nothing comes close to Herrera for Men. It has a very unique smell and isn't overwhelming. It's sure to set yourself apart from the traditional guy cologne's and old man musks out there. Everyone always wants to know what I am wearing because it is so unique and pleasing to smell. Try it, you'll love it and so will your lady, period."
# unseen_document = "This is a great product at a great price. I have been using a Waterpik for years and love it."

# proccess unseen_document and get the top topic and its words
hotTopic = getTopTopic(unseen_document)
top_words(gsdmm.cluster_word_distribution, [hotTopic], 20)

[0.06231680932669086, 0.003907617835293337, 0.18917573328514292, 0.003907617835293337, 9.989724884871218e-18, 0.003907617835293337, 0.003907617835293337, 0.039080130600877644, 2.2312759901962293e-22, 0.003907617835293337, 7.875480012040694e-12, 2.5204938812069545e-05, 4.5610112014628927e-32, 3.001715627168319e-21, 0.010329735082790326, 0.003907617835293337, 0.11851480313599229, 0.05670912194869464, 0.09504155027882039, 0.003907617835293337, 1.296960965770374e-21, 0.003907617835293337, 1.2093618915470952e-11, 0.14427686892640004, 3.0755539584471496e-13, 0.04988920934776385, 1.527125454854196e-09, 0.003907617835293337, 6.287717567829975e-26, 0.06359817234815798, 1.3084113173646442e-21, 0.003907617835293337, 0.10461251079859575, 0.003907617835293337, 2.454459013185963e-24, 2.4648783437525203e-11, 0.003907617835293337, 0.003907617835293337, 2.317472953263218e-19, 9.048905837444215e-08, 0.003907617835293337, 0.003907617835293337, 2.3832051095301657e-23, 0.003907617835293337, 5.5472016551770