<a href="https://colab.research.google.com/github/zmfong/AMLproject/blob/main/AML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

# Torch ML libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Misc.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Set initial variables and constants
# Ask the inline backend to render figures in 'retina' format.
%config InlineBackend.figure_format='retina'

# Graph designs
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
# Overrides the 'muted' palette selected by sns.set() above with the custom colours.
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
# Default figure size as (width, height).
rcParams['figure.figsize'] = 12, 8

# Random seed for reproducibility (seeds both the NumPy and the PyTorch global generators)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Load the data

In [19]:
# The CSV path below lives under /content/drive, but the Drive mount is commented out.
# NOTE(review): presumably Drive must already be mounted in this runtime for the read
# to succeed on a fresh session — confirm, or re-enable the two lines below.
#from google.colab import drive
#drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/reddit_comments.csv')
# Show (rows, columns) as the cell output.
df.shape

(12495, 5)

In [20]:
# Remove the 'Unnamed: 0' column (presumably an index column that was written into
# the CSV). The frame is rebound instead of mutated in place, and errors='ignore'
# makes the cell idempotent: the previous in-place drop raised KeyError when the
# cell was executed a second time because the column was already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df

Unnamed: 0,user,comment,date_time,sub_title
0,raymmm,Weird I got two burgers when I ordered this pr...,1.684483e+09,Why would McDonalds do this to their customers...
1,JaphieJaphie,Since so many have said the promo was real and...,1.684484e+09,Why would McDonalds do this to their customers...
2,surigato,I have ordered and received 2fillet o fish as ...,1.684483e+09,Why would McDonalds do this to their customers...
3,parasaiteeee,Foodpanda has always been and will always be a...,1.684484e+09,Why would McDonalds do this to their customers...
4,LeftCarpet3520,"As some1 who worked in CS, it is usually a pla...",1.684487e+09,Why would McDonalds do this to their customers...
...,...,...,...,...
12490,Bcpjw,"Yeah, this would never happen at odette! Lol!\...",1.663348e+09,Burger King staff shouts ‘for at least 10 minu...
12491,Bcpjw,"Damn, wondering how the employee’s family felt...",1.663347e+09,Burger King staff shouts ‘for at least 10 minu...
12492,patricklhe,The restaurant may not be busy but could still...,1.663341e+09,Burger King staff shouts ‘for at least 10 minu...
12493,tom-slacker,Shots fired,1.663346e+09,Burger King staff shouts ‘for at least 10 minu...


Preprocessing

In [12]:
import nltk
# Download the NLTK "stopwords" corpus (required by text_preprocessing below)
nltk.download("stopwords")
from nltk.corpus import stopwords

def text_preprocessing(s):
    """
    Normalise a comment string before vectorisation.

    Steps, applied in this order:
    - Lowercase the sentence
    - Change "'t" to " not"
    - Remove "@name" mentions (also when the mention ends the string)
    - Isolate and remove punctuations except "?"
    - Remove other special characters
    - Remove English stop words except "not" and "can"
    - Collapse repeated whitespace and strip both ends

    Args:
        s (str): text to clean.

    Returns:
        str: the lowercased, cleaned text with tokens separated by single spaces.
    """
    # Imported locally so the function works even if no earlier cell ran `import re`.
    import re

    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name; `$` lets a mention at the very end of the text match as well
    # (the previous pattern required a whitespace character after the name).
    s = re.sub(r'(@.*?)(?:\s|$)', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'.
    # The stop-word list is fetched once per call and stored in a set; previously
    # stopwords.words('english') was re-evaluated for every single token.
    english_stopwords = set(stopwords.words('english'))
    always_keep = {'not', 'can'}
    s = " ".join(word for word in s.split()
                 if word not in english_stopwords or word in always_keep)
    # Collapse whitespace runs and remove leading/trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    return s

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [28]:
import nltk
from nltk.corpus import stopwords  # English stop-word corpus
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Stop words held in a set for fast membership tests inside clean_text.
stop_words = set(stopwords.words('english'))

In [29]:
def clean_text(headline):
    """
    Tokenise, filter and lemmatise a piece of text.

    The input is converted with str() first, so non-string values are accepted.
    Tokens that appear in the module-level `stop_words` set (case-sensitive
    comparison) or that are three characters or shorter are discarded; the
    remaining tokens are lemmatised with WordNetLemmatizer.

    NOTE(review): word_tokenize and WordNetLemmatizer presumably need NLTK data
    packages beyond "stopwords"; no download for them is visible in this
    notebook — confirm they are available on a fresh runtime.

    Args:
        headline: text (or any value convertible with str()) to clean.

    Returns:
        str: the surviving lemmas joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    kept = []
    for token in word_tokenize(str(headline)):
        # Skip stop words and very short tokens.
        if token in stop_words or len(token) <= 3:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

In [35]:
import re  # text_preprocessing looks up `re` in the notebook globals when it runs

# Two cleaning passes over the raw comments: lemmatisation/stop-word filtering
# (clean_text) followed by lowercasing and punctuation stripping (text_preprocessing).
df['cleaned_comment'] = (
    df['comment']
    .apply(clean_text)
    .apply(text_preprocessing)
)
df

Unnamed: 0,user,comment,date_time,sub_title,cleaned_comment
0,raymmm,Weird I got two burgers when I ordered this pr...,1.684483e+09,Why would McDonalds do this to their customers...,weird burger ordered promo yesterday edit thin...
1,JaphieJaphie,Since so many have said the promo was real and...,1.684484e+09,Why would McDonalds do this to their customers...,since many said promo real indeed burger inclu...
2,surigato,I have ordered and received 2fillet o fish as ...,1.684483e+09,Why would McDonalds do this to their customers...,ordered received 2fillet fish well
3,parasaiteeee,Foodpanda has always been and will always be a...,1.684484e+09,Why would McDonalds do this to their customers...,foodpanda always always trash company work pan...
4,LeftCarpet3520,"As some1 who worked in CS, it is usually a pla...",1.684487e+09,Why would McDonalds do this to their customers...,some1 worked usually play safe cover backside ...
...,...,...,...,...,...
12490,Bcpjw,"Yeah, this would never happen at odette! Lol!\...",1.663348e+09,Burger King staff shouts ‘for at least 10 minu...,yeah would never happen odette really time pla...
12491,Bcpjw,"Damn, wondering how the employee’s family felt...",1.663347e+09,Burger King staff shouts ‘for at least 10 minu...,damn wondering employee family felt watching v...
12492,patricklhe,The restaurant may not be busy but could still...,1.663341e+09,Burger King staff shouts ‘for at least 10 minu...,restaurant busy could still shorthanded
12493,tom-slacker,Shots fired,1.663346e+09,Burger King staff shouts ‘for at least 10 minu...,shots fired


Topic Analysis

In [37]:
# TF-IDF representation of the cleaned comments, capped at 1000 features and
# using scikit-learn's built-in English stop-word list.
vect = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)
vect_text = vect.fit_transform(df['cleaned_comment'])

In [58]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 10-topic LDA model on the TF-IDF matrix (online learning, one iteration,
# fixed random_state for repeatable results).
lda_model = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    random_state=2023,
    max_iter=1,
)
# Topic weights per document; row 0 is inspected in the next cell.
lda_top = lda_model.fit_transform(vect_text)

In [62]:
# Show the topic mixture of the first document as percentages.
print("Document 0: ")
for topic_idx, weight in enumerate(lda_top[0]):
    print("Topic ", topic_idx, ": ", weight*100, "%")

Document 0: 
Topic  0 :  1.891948697181528 %
Topic  1 :  64.26150569054275 %
Topic  2 :  1.8921785815579426 %
Topic  3 :  1.8912872428900114 %
Topic  4 :  12.26828026719669 %
Topic  5 :  5.8674778977739255 %
Topic  6 :  1.8912654286980075 %
Topic  7 :  6.251743882439256 %
Topic  8 :  1.8930271457040269 %
Topic  9 :  1.8912851660158787 %


In [60]:
# List the 20 highest-weighted vocabulary terms of every LDA topic.
vocab = vect.get_feature_names_out()
for i, comp in enumerate(lda_model.components_):
    # Pair each term with its weight in this topic and keep the 20 largest.
    sorted_words = sorted(zip(vocab, comp), key=lambda pair: pair[1], reverse=True)[:20]
    print("Topic " + str(i) + ": ")
    # Emit the terms on one line terminated by a newline; previously every word
    # was printed with end=" " and no final newline, so the next "Topic N:"
    # header was appended to the end of the preceding word list.
    print(" ".join(word for word, _weight in sorted_words))

Topic 0: 
deleted life arnold shit sound like wrong kind black economy recently removed bring place bos happened crispy using guy coffee Topic 1: 
look need cheese yeah business like double check popeyes think care going cost home dude open game definitely real better Topic 2: 
time like want getting make price mean coke come malaysia send waiting prefer instead fresh problem tell quality food weekend Topic 3: 
jollibee better used nice feel local thank anymore choice agree option experience really working know article live super market cheaper Topic 4: 
food long week remember fast right expensive meat liao good yishun eating thought guess time suck sorry thats restaurant went Topic 5: 
meal thanks hope good tried happy actually wonder store start menu mcspicy branch like feel morning time night reddit mcdonalds Topic 6: 
chicken best breast haha wing rice sure like pretty feeling size miss thigh singapore great hand short area stop fine Topic 7: 
http com maybe outlet covid drumstick