In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Read in Dataset

In [2]:
# Load the pre-cleaned review data; the path is relative to the notebook's
# working directory.
reviews_csv = '../data/curated/reviews/yiting_cleaned_reviews.csv'
df = pd.read_csv(reviews_csv)
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column, which usually
# means the CSV was written with its index -- consider index_col=0 (confirm first).
df

Unnamed: 0,Sentiment,Time,Text,processed_text
0,0,18/6/21,This is a very healthy dog food. Good for thei...,healthy dog food good digestion also good smal...
1,0,7/7/21,I've been very pleased with the Natural Balanc...,pleased natural balance dog food dog issue dog...
2,0,18/6/21,"Before I was educated about feline nutrition, ...",educate feline nutrition allow cat become addi...
3,0,7/7/21,"My holistic vet recommended this, along with a...",holistic vet recommend along brand try cat pre...
4,0,1/7/21,I bought this coffee because its much cheaper ...,buy coffee much cheaper ganocafe organic reish...
...,...,...,...,...
5439,1,26/2/21,"This is an okay gift box, only if you like med...",okay gift box like mediocre cheese summer saus...
5440,1,18/12/19,It looks llike I just walked into a raw deal. ...,look llike walked raw deal item intolerably st...
5441,1,19/1/20,Thank god that i tasted the metal before i swa...,thank god taste metal swallow dont even get ge...
5442,1,13/9/20,This product was very good when I began buying...,product good begin buy lately terrible taste r...


# TF-IDF

In [5]:
# Build a TF-IDF document-term matrix from the already-preprocessed review text.
# The vocabulary is capped at 1000 terms, each of which must occur in at least
# 3 documents; sublinear_tf switches on sublinear term-frequency scaling.
# NOTE(review): this matrix is later fed to LatentDirichletAllocation, which is
# normally fitted on raw term counts -- confirm TF-IDF input is intended.
vectorizer = TfidfVectorizer(
    max_features=1000,
    min_df=3,
    sublinear_tf=True,
)
vectorized_text = vectorizer.fit_transform(df['processed_text'])

In [6]:
# Shape of the TF-IDF matrix: (number of documents, vocabulary size).
vectorized_text.shape

(5444, 1000)

In [7]:
# Vocabulary terms, in the column order of the TF-IDF matrix.
# NOTE(review): this echoes the whole vocabulary array; slicing the result
# (e.g. [:20]) would keep the cell output short.
vectorizer.get_feature_names_out()

array(['able', 'absolutely', 'acid', 'actual', 'actually', 'add',
       'addition', 'admit', 'adult', 'afternoon', 'aftertaste', 'age',
       'ago', 'agree', 'air', 'allergy', 'allow', 'almond', 'almost',
       'alone', 'along', 'already', 'also', 'alternative', 'although',
       'always', 'amaze', 'amazing', 'amazon', 'amount', 'animal',
       'another', 'anymore', 'anyone', 'anything', 'anyway', 'anywhere',
       'appear', 'apple', 'area', 'aroma', 'around', 'arrive',
       'artificial', 'asian', 'ask', 'assume', 'ate', 'available',
       'avoid', 'aware', 'away', 'awesome', 'awful', 'baby', 'back',
       'bad', 'bag', 'bake', 'balance', 'ball', 'banana', 'bar', 'barely',
       'bargain', 'base', 'basically', 'batch', 'bbq', 'bean', 'beat',
       'become', 'beef', 'begin', 'believe', 'benefit', 'berry', 'best',
       'better', 'beverage', 'big', 'biscuit', 'bisquick', 'bit', 'bite',
       'bitter', 'black', 'bland', 'blend', 'blood', 'blue', 'blueberry',
       'body', '

# LDA

In [26]:
# Fit a 4-topic LDA model with the online learning method; random_state is
# fixed so that re-running the cell reproduces the same topics.
lda_model = LatentDirichletAllocation(
    n_components=4,
    learning_method='online',
    random_state=42,
)

# One row per document, one column per topic (the per-document topic weights).
lda_topics = lda_model.fit_transform(vectorized_text)

In [27]:
# Approximate log-likelihood of the data under the fitted model: higher is better.
log_likelihood = lda_model.score(vectorized_text)
print(log_likelihood)

# Perplexity = exp(-1 * log-likelihood per word): lower is better.
perplexity = lda_model.perplexity(vectorized_text)
print(perplexity)

-161749.9024621396
1260.9737478668708


In [28]:
# components_ has one row per topic and one column per vocabulary term.
# The values are unnormalised topic-word weights (pseudo-counts), not probabilities;
# divide each row by its sum to obtain the per-topic word distribution.
lda_model.components_.shape

(4, 1000)

In [29]:
# Print the 10 highest-weighted vocabulary terms of every topic.
vocab = vectorizer.get_feature_names_out()
for topic_idx, weights in enumerate(lda_model.components_):
    # A stable sort on the negated weights orders terms by descending weight and
    # keeps vocabulary order among ties.
    top_idx = np.argsort(-weights, kind='stable')[:10]
    print(f"Topic {topic_idx}: ")
    print(" ".join(vocab[top_idx]))

Topic 0: 
salt sauce chicken use soup make cook taste oil great
Topic 1: 
coffee tea taste flavor cup like drink try great water
Topic 2: 
product price buy good get great love taste order find
Topic 3: 
sugar sweet syrup juice soda flavor orange fruit taste like


In [30]:
# Dominant topic of a document = column index of the largest weight in its row
# of lda_topics. np.argmax returns the first maximal index, so on ties it picks
# the same topic as the earlier per-row np.where(...)[0][0] lookup, but without
# a Python-level loop over the documents.
dominant_topic_list = np.argmax(lda_topics, axis=1).tolist()
df['dominant_topic'] = dominant_topic_list

In [31]:
# Number of documents assigned to each dominant topic, ordered by topic id.
df['dominant_topic'].value_counts().sort_index()

0     438
1    1508
2    3285
3     213
Name: dominant_topic, dtype: int64

# Classifier for Predicting the Dominant Topic

In [32]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [33]:
# Hold out 20% of the documents for evaluation; the split is seeded for reproducibility.
# NOTE(review): the target is the argmax of each lda_topics row, and lda_topics is also
# the feature matrix, so the classifier only has to re-learn argmax -- a high test
# accuracy is expected and says little. Confirm the intended features.
# NOTE(review): the classes are imbalanced; consider stratify=df['dominant_topic'].
x_train, x_test, y_train, y_test = train_test_split(
    lda_topics,
    df['dominant_topic'],
    test_size=0.2,
    random_state=1,
)

In [41]:
# Class distribution of the training labels.
y_train.to_frame().value_counts()

dominant_topic
2                 2632
1                 1214
0                  340
3                  169
dtype: int64

In [34]:
# Polynomial-kernel support vector classifier with regularisation strength C=1.0.
# NOTE(review): random_state presumably has no effect unless probability=True -- confirm.
svc = SVC(
    kernel='poly',
    C=1.0,
    random_state=1,
)

In [35]:
# Train the classifier on the training split (topic weights -> dominant topic id).
svc.fit(x_train, y_train)

In [36]:
# Predicted dominant-topic labels for the held-out documents.
# Despite the name, this holds predicted y values for x_test, not features.
x_test_predicted = svc.predict(x_test)

In [37]:
# Number of held-out documents whose predicted topic equals the true label.
(x_test_predicted == y_test).sum()

1079

In [38]:
# Test accuracy: correct predictions divided by the size of the held-out set.
(x_test_predicted == y_test).sum() / len(y_test)

0.9908172635445363