In [54]:
import numpy as np
import pandas as pd
import warnings
import datetime

from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from transformers import pipelines

import re
from collections import OrderedDict, Counter
import itertools
import string

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import praw

import json

# Notebook-wide settings: raise pandas' display limits so long comments and
# tall frames stay readable, and seed NumPy's global RNG so the random row
# picks further down are repeatable.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 100)

seed = 55
np.random.seed(seed)

In [2]:
# Load the comment-level dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='./data/data_final.csv')

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head()

Unnamed: 0,id,tconst,title,originalTitle,comments,runtimeMinutes,startYear,post_date_utc,post_year,post_month,post_day,genres,numVotes,averageRating
0,vzcwal,tt13406136,the princess,The Princess,Joey King needs a new agent. She’s proven she has talent but she has so many terrible films on h...,94.0,2022,1657851000.0,2022,7,14,"Action,Drama,Fantasy",11474,5.6
1,vzcwal,tt13406136,the princess,The Princess,"Silly, but entertaining and non stop action",94.0,2022,1657851000.0,2022,7,14,"Action,Drama,Fantasy",11474,5.6
2,vzcwal,tt13406136,the princess,The Princess,"The yassification of The Raid\n\nActually, this was fun enough and mad respect to Joey King for ...",94.0,2022,1657851000.0,2022,7,14,"Action,Drama,Fantasy",11474,5.6
3,vzcwal,tt13406136,the princess,The Princess,"Honestly, this was pretty fun. The plot is nothing special yes.\n\nBut Joey King was actually e...",94.0,2022,1657851000.0,2022,7,14,"Action,Drama,Fantasy",11474,5.6
4,vzcwal,tt13406136,the princess,The Princess,"Man, I loved this movie. Yeah, it was campy, but whatever. The premise worked for me, I liked th...",94.0,2022,1657851000.0,2022,7,14,"Action,Drama,Fantasy",11474,5.6


In [4]:
# ---- Text preprocessing: normalise, tokenize, remove stop words, stem ----

In [5]:
# Lower-case every comment with the vectorised string accessor instead of a
# per-row Python lambda. Unlike the lambda, `.str.lower()` passes missing
# values through rather than raising on them.
df['comments'] = df['comments'].str.lower()

In [6]:
# Replace anything that starts with "http" and runs to the next whitespace
# with the literal placeholder 'HYPERLINK'. `regex=True` is spelled out
# because pandas treats the pattern as a plain string otherwise.
df['comments'] = df['comments'].str.replace(r'http\S+', 'HYPERLINK', regex=True)

In [7]:
# A token is either
#   (a) a run of two or more ASCII letters that may contain one internal
#       straight apostrophe (e.g. "don't"), or
#   (b) the standalone one-letter words "I" / "a", in either case.
# Digits, punctuation and typographic (curly) apostrophes never match.
token_pattern = r"[a-zA-Z]+'?[a-zA-Z]+|\b[iIaA]\b"

tokenizer = RegexpTokenizer(pattern=token_pattern)

In [55]:
# Snowball stemmer configured for English; shared by the stemming cells below.
stemmer = SnowballStemmer(language='english')

In [13]:
# Union of spaCy's and NLTK's English stop words, de-duplicated via a set.
sw_spacy = [*nlp.Defaults.stop_words]
sw_nltk = stopwords.words('english')
stopword_list = list({*sw_spacy, *sw_nltk})

In [56]:
# Stem the stop words too, so they can be compared against stemmed tokens.
stopword_list_stem = list(map(stemmer.stem, stopword_list))

In [14]:
# One list of regex-matched word tokens per comment.
df['comments_tok'] = df['comments'].map(tokenizer.tokenize)

In [15]:
# Drop stop words from each token list. Membership is checked against a set:
# the list version rescans every stop word for every token, which adds up
# over the whole comment corpus. The filtered output is unchanged.
stopword_set = set(stopword_list)
df['comments_tok_sw'] = df['comments_tok'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stopword_set]
)

In [57]:
# Stem every token of every comment (stop words are still present here).
df['comments_stem'] = df['comments_tok'].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)

In [60]:
# Drop stemmed stop words from the stemmed token lists. As with the unstemmed
# filter, a set makes each membership test constant-time; the filtered output
# is unchanged.
stopword_stem_set = set(stopword_list_stem)
df['comments_stem_sw'] = df['comments_stem'].apply(
    lambda stems: [stem for stem in stems if stem not in stopword_stem_set]
)

In [61]:
# Display the frame with the derived token and stem columns added above.
df

Unnamed: 0,id,tconst,title,originalTitle,comments,runtimeMinutes,startYear,post_date_utc,post_year,post_month,post_day,genres,numVotes,averageRating,comments_tok,comments_tok_sw,comments_stem,comments_stem_sw
0,vzcwal,tt13406136,the princess,The Princess,joey king needs a new agent. she’s proven she has talent but she has so many terrible films on h...,94.0,2022,1.657851e+09,2022,7,14,"Action,Drama,Fantasy",11474,5.6,"[joey, king, needs, a, new, agent, she, proven, she, has, talent, but, she, has, so, many, terri...","[joey, king, needs, new, agent, proven, talent, terrible, films, resume]","[joey, king, need, a, new, agent, she, proven, she, has, talent, but, she, has, so, mani, terrib...","[joey, king, need, new, agent, proven, talent, terribl, film, resum]"
1,vzcwal,tt13406136,the princess,The Princess,"silly, but entertaining and non stop action",94.0,2022,1.657851e+09,2022,7,14,"Action,Drama,Fantasy",11474,5.6,"[silly, but, entertaining, and, non, stop, action]","[silly, entertaining, non, stop, action]","[silli, but, entertain, and, non, stop, action]","[silli, entertain, non, stop, action]"
2,vzcwal,tt13406136,the princess,The Princess,"the yassification of the raid\n\nactually, this was fun enough and mad respect to joey king for ...",94.0,2022,1.657851e+09,2022,7,14,"Action,Drama,Fantasy",11474,5.6,"[the, yassification, of, the, raid, actually, this, was, fun, enough, and, mad, respect, to, joe...","[yassification, raid, actually, fun, mad, respect, joey, king, putting, effort, action, scenes, ...","[the, yassif, of, the, raid, actual, this, was, fun, enough, and, mad, respect, to, joey, king, ...","[yassif, raid, actual, fun, mad, respect, joey, king, effort, action, scene, stunt, could'v, kis..."
3,vzcwal,tt13406136,the princess,The Princess,"honestly, this was pretty fun. the plot is nothing special yes.\n\nbut joey king was actually e...",94.0,2022,1.657851e+09,2022,7,14,"Action,Drama,Fantasy",11474,5.6,"[honestly, this, was, pretty, fun, the, plot, is, nothing, special, yes, but, joey, king, was, a...","[honestly, pretty, fun, plot, special, yes, joey, king, actually, enjoyable, fight, scenes, surp...","[honest, this, was, pretti, fun, the, plot, is, noth, special, yes, but, joey, king, was, actual...","[honest, pretti, fun, plot, special, yes, joey, king, actual, enjoy, fight, scene, surpris, good..."
4,vzcwal,tt13406136,the princess,The Princess,"man, i loved this movie. yeah, it was campy, but whatever. the premise worked for me, i liked th...",94.0,2022,1.657851e+09,2022,7,14,"Action,Drama,Fantasy",11474,5.6,"[man, i, loved, this, movie, yeah, it, was, campy, but, whatever, the, premise, worked, for, me,...","[man, loved, movie, yeah, campy, premise, worked, liked, performances, action, solid, loved, fig...","[man, i, love, this, movi, yeah, it, was, campi, but, whatev, the, premis, work, for, me, i, lik...","[man, love, movi, yeah, campi, premis, work, like, perform, action, solid, love, fight, sequenc,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70687,47szbr,tt2652118,"crouching tiger, hidden dragon: sword of destiny","Crouching Tiger, Hidden Dragon: Sword of Destiny",i was entertained bcuz i love kung fu movies but i agree this was pretty bad.,96.0,2016,1.456542e+09,2016,2,26,"Action,Adventure,Drama",19664,6.1,"[i, was, entertained, bcuz, i, love, kung, fu, movies, but, i, agree, this, was, pretty, bad]","[entertained, bcuz, love, kung, fu, movies, agree, pretty, bad]","[i, was, entertain, bcuz, i, love, kung, fu, movi, but, i, agre, this, was, pretti, bad]","[entertain, bcuz, love, kung, fu, movi, agre, pretti, bad]"
70688,47szbr,tt2652118,"crouching tiger, hidden dragon: sword of destiny","Crouching Tiger, Hidden Dragon: Sword of Destiny",is the original on netflix?,96.0,2016,1.456542e+09,2016,2,26,"Action,Adventure,Drama",19664,6.1,"[is, the, original, on, netflix]","[original, netflix]","[is, the, origin, on, netflix]","[origin, netflix]"
70689,47szbr,tt2652118,"crouching tiger, hidden dragon: sword of destiny","Crouching Tiger, Hidden Dragon: Sword of Destiny","wait, it came out?",96.0,2016,1.456542e+09,2016,2,26,"Action,Adventure,Drama",19664,6.1,"[wait, it, came, out]","[wait, came]","[wait, it, came, out]","[wait, came]"
70690,47szbr,tt2652118,"crouching tiger, hidden dragon: sword of destiny","Crouching Tiger, Hidden Dragon: Sword of Destiny",a sequel to a phenomal epic movie with 10 years of planning... HYPERLINK,96.0,2016,1.456542e+09,2016,2,26,"Action,Adventure,Drama",19664,6.1,"[a, sequel, to, a, phenomal, epic, movie, with, years, of, planning, HYPERLINK]","[sequel, phenomal, epic, movie, years, planning, HYPERLINK]","[a, sequel, to, a, phenom, epic, movi, with, year, of, plan, hyperlink]","[sequel, phenom, epic, movi, year, plan, hyperlink]"


In [62]:
# ---- Lemmatization with spaCy and movie-level aggregation ----

In [None]:
def spacy_lemmatize(text):
    """
    Return the lower-cased spaCy lemmas of a string as one flat list.

    The text is split into sentences with spaCy, each sentence is reduced to
    word tokens by the module-level NLTK ``tokenizer``, and the surviving
    tokens are re-joined with single spaces and run through spaCy again to
    obtain lemmas.

    Parameters
    ----------
    text : str
        Text to lemmatize.

    Returns
    -------
    list of str
        Lemmas in document order; empty when nothing in ``text`` matches the
        tokenizer pattern.
    """
    sentence_tokens = (tokenizer.tokenize(sent.text) for sent in nlp(text).sents)
    # A sentence with no matching tokens would only produce an empty Doc, so
    # it is skipped rather than sent through the pipeline.
    cleaned_sentences = [' '.join(tokens) for tokens in sentence_tokens if tokens]
    # nlp.pipe processes the sentences as a batch instead of one pipeline
    # call per sentence.
    return [
        token.lemma_.lower()
        for doc in nlp.pipe(cleaned_sentences)
        for token in doc
    ]

In [None]:
# Lemmatize every comment. This runs the spaCy pipeline on each row, so
# expect it to be the slow step of the notebook.
df['comments_lem'] = df['comments'].map(spacy_lemmatize)

In [None]:
# Inspect the lemma lists produced for each comment.
df['comments_lem']

In [None]:
# Collapse to one row per post id: concatenate the lemma lists of all of a
# post's comments into a single flat list.
comments_lem_imploded = (
    df.groupby('id')['comments_lem']
      .agg(lambda lemma_lists: list(itertools.chain.from_iterable(lemma_lists)))
      .reset_index()
)

In [None]:
# Inspect the per-id lemma lists. The frame built above is named
# `comments_lem_imploded`; `comments_imploded` is never defined and would
# raise NameError on a fresh kernel.
comments_lem_imploded

In [None]:
# Movie-level metadata only, one row per post id. Every per-comment column is
# stripped before de-duplicating; otherwise the first comment's token and stem
# lists would be carried into the per-movie table as if they described the
# whole movie. errors='ignore' keeps the cell runnable when some of the
# derived columns have not been created in the current session.
comment_level_cols = [
    'comments',
    'comments_tok',
    'comments_tok_sw',
    'comments_stem',
    'comments_stem_sw',
    'comments_lem',
]
just_movies = (
    df.drop(columns=comment_level_cols, errors='ignore')
      .drop_duplicates(subset='id')
)

In [None]:
# Attach the aggregated lemma list to each movie's metadata row; the inner
# join keeps only ids present in both frames.
df2 = just_movies.merge(comments_lem_imploded, on='id', how='inner')

In [None]:
# Inspect the merged movie-level frame.
df2

In [None]:
# Remove stop words from each movie's lemma list. Each list pools the lemmas
# of all of a movie's comments, so membership is checked against a set
# (constant-time) rather than scanning the stop-word list once per lemma.
# The filtered output is unchanged.
stopword_set = set(stopword_list)
df2['comments_lem_no_sw'] = df2['comments_lem'].apply(
    lambda lemmas: [lemma for lemma in lemmas if lemma not in stopword_set]
)

In [None]:
# Inspect df2 again, now including the stop-word-filtered lemma column.
df2

In [None]:
# ---- Lemmatization helpers: tokenizer, stop words, spacy_lemmatize ----

In [None]:
# Same pattern as the tokenizer cell near the top of the notebook. A token is
# either a run of two or more ASCII letters with at most one internal
# straight apostrophe, or the standalone one-letter words "I" / "a".
token_pattern = r"[a-zA-Z]+'?[a-zA-Z]+|\b[iIaA]\b"

tokenizer = RegexpTokenizer(pattern=token_pattern)

In [None]:
# Union of spaCy's and NLTK's English stop words, de-duplicated via a set,
# plus the contraction "i'm" added by hand.
sw_spacy = [*nlp.Defaults.stop_words]
sw_nltk = stopwords.words('english')
stopword_list = list({*sw_spacy, *sw_nltk})
stopword_list.append("i'm")

In [None]:
# NOTE(review): this notebook defines spacy_lemmatize in more than one cell;
# whichever cell ran last wins. Consider keeping a single definition.
def spacy_lemmatize(text):
    """
    Return the lower-cased spaCy lemmas of a string as one flat list.

    The text is split into sentences with spaCy, each sentence is reduced to
    word tokens by the module-level NLTK ``tokenizer``, and the surviving
    tokens are re-joined with single spaces and run through spaCy again to
    obtain lemmas.

    Parameters
    ----------
    text : str
        Text to lemmatize.

    Returns
    -------
    list of str
        Lemmas in document order; empty when nothing in ``text`` matches the
        tokenizer pattern.
    """
    sentence_tokens = (tokenizer.tokenize(sent.text) for sent in nlp(text).sents)
    # A sentence with no matching tokens would only produce an empty Doc, so
    # it is skipped rather than sent through the pipeline.
    cleaned_sentences = [' '.join(tokens) for tokens in sentence_tokens if tokens]
    # nlp.pipe processes the sentences as a batch instead of one pipeline
    # call per sentence.
    return [
        token.lemma_.lower()
        for doc in nlp.pipe(cleaned_sentences)
        for token in doc
    ]

In [None]:
# ---- Spot checks of the lemmatizer on individual comments ----

In [None]:
# Pick one fixed comment (row label 70690) to experiment with.
example = df.loc[70690, 'comments']

In [None]:
# Show the selected comment text.
example

In [None]:
# Same pattern as the tokenizer cell near the top of the notebook. A token is
# either a run of two or more ASCII letters with at most one internal
# straight apostrophe, or the standalone one-letter words "I" / "a".
token_pattern = r"[a-zA-Z]+'?[a-zA-Z]+|\b[iIaA]\b"

tokenizer = RegexpTokenizer(pattern=token_pattern)

In [None]:
# Union of spaCy's and NLTK's English stop words, de-duplicated via a set,
# plus the contraction "i'm" added by hand.
sw_spacy = [*nlp.Defaults.stop_words]
sw_nltk = stopwords.words('english')
stopword_list = list({*sw_spacy, *sw_nltk})
stopword_list.append("i'm")

In [None]:
# NOTE(review): this notebook defines spacy_lemmatize in more than one cell;
# whichever cell ran last wins. Consider keeping a single definition.
def spacy_lemmatize(text):
    """
    Return the lower-cased spaCy lemmas of a string as one flat list.

    The text is split into sentences with spaCy, each sentence is reduced to
    word tokens by the module-level NLTK ``tokenizer``, and the surviving
    tokens are re-joined with single spaces and run through spaCy again to
    obtain lemmas.

    Parameters
    ----------
    text : str
        Text to lemmatize.

    Returns
    -------
    list of str
        Lemmas in document order; empty when nothing in ``text`` matches the
        tokenizer pattern.
    """
    sentence_tokens = (tokenizer.tokenize(sent.text) for sent in nlp(text).sents)
    # A sentence with no matching tokens would only produce an empty Doc, so
    # it is skipped rather than sent through the pipeline.
    cleaned_sentences = [' '.join(tokens) for tokens in sentence_tokens if tokens]
    # nlp.pipe processes the sentences as a batch instead of one pipeline
    # call per sentence.
    return [
        token.lemma_.lower()
        for doc in nlp.pipe(cleaned_sentences)
        for token in doc
    ]

In [None]:
# Spot check: draw one row label at random, show that row, then print the
# lemmas produced for its comment.
idx = np.random.choice(df.index)
display(df.loc[[idx]])
print(spacy_lemmatize(df.loc[idx, 'comments']))