In [1]:
import numpy as np
import pandas as pd
import warnings
import datetime

from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.metrics import Precision, Recall, AUC, RootMeanSquaredError, mean_absolute_error
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD
from tensorflow.random import set_seed

from transformers import pipelines

import re
from collections import OrderedDict, Counter
import itertools
import string

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import praw

import json

pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 100
seed = 55
np.random.seed(seed)

In [2]:
df = pd.read_csv('./data/data_final.csv')

In [3]:
df.head()

Unnamed: 0,id,tconst,title,originalTitle,comments,runtimeMinutes,startYear,post_date_utc,post_year,post_month,post_day,genres,numVotes,averageRating
0,vzcwal,tt13406136,the princess,The Princess,Joey King needs a new agent. She’s proven she has talent but she has so many terrible films on h...,94.0,2022,1657851000.0,2022,7,14,"Action,Drama,Fantasy",11474,5.6
1,vzcwal,tt13406136,the princess,The Princess,"Silly, but entertaining and non stop action",94.0,2022,1657851000.0,2022,7,14,"Action,Drama,Fantasy",11474,5.6
2,vzcwal,tt13406136,the princess,The Princess,"The yassification of The Raid\n\nActually, this was fun enough and mad respect to Joey King for ...",94.0,2022,1657851000.0,2022,7,14,"Action,Drama,Fantasy",11474,5.6
3,vzcwal,tt13406136,the princess,The Princess,"Honestly, this was pretty fun. The plot is nothing special yes.\n\nBut Joey King was actually e...",94.0,2022,1657851000.0,2022,7,14,"Action,Drama,Fantasy",11474,5.6
4,vzcwal,tt13406136,the princess,The Princess,"Man, I loved this movie. Yeah, it was campy, but whatever. The premise worked for me, I liked th...",94.0,2022,1657851000.0,2022,7,14,"Action,Drama,Fantasy",11474,5.6


In [4]:
###########################

In [5]:
df_imploded  = df.groupby(['id', 'title', 'averageRating'])[['comments']].agg(list).reset_index()

In [6]:
df_imploded['comments'] = df_imploded['comments'].apply(lambda x: ' '.join(x))

In [7]:
df_imploded['comments'] = df_imploded['comments'].apply(lambda x: x.lower())

In [8]:
df_imploded['comments'] = df_imploded['comments'].apply(
    lambda x: re.sub(
        pattern=r'http\S+', repl='HYPERLINK', string=x)
        )

In [9]:
token_pattern = r"[a-zA-Z]+'?[a-zA-Z]+|\b[iIaA]\b"
# Pattern: Any word with at least two characters, including up to one apostrophe
# Also captures the English words "I" and "a".

tokenizer = RegexpTokenizer(token_pattern)

In [10]:
stemmer = SnowballStemmer('english')

In [11]:
sw_spacy = list(nlp.Defaults.stop_words)
sw_nltk = stopwords.words('english')
stopword_list = list(set(sw_spacy + sw_nltk))

In [12]:
stopword_list_stem = [stemmer.stem(sw) for sw in stopword_list]

In [13]:
df_imploded['comments_tok'] = df_imploded['comments'].apply(tokenizer.tokenize)

In [14]:
df_imploded['comments_tok_sw'] = df_imploded['comments_tok'].apply(lambda x: [t for t in x if t not in stopword_list])

In [15]:
df_imploded['comments_stem'] = df_imploded['comments_tok'].apply(lambda x: [stemmer.stem(t) for t in x])

In [16]:
df_imploded['comments_stem_sw'] = df_imploded['comments_stem'].apply(lambda x: [t for t in x if t not in stopword_list_stem])

In [17]:
df_imploded

Unnamed: 0,id,title,averageRating,comments,comments_tok,comments_tok_sw,comments_stem,comments_stem_sw
0,47szbr,"crouching tiger, hidden dragon: sword of destiny",6.1,"just finished this, and i thought it was ok. compared to the first it seemed a lot more, i don't...","[just, finished, this, and, i, thought, it, was, ok, compared, to, the, first, it, seemed, a, lo...","[finished, thought, ok, compared, lot, know, cheesy, sorta, fun, way, glad, cast, moving, wu, da...","[just, finish, this, and, i, thought, it, was, ok, compar, to, the, first, it, seem, a, lot, mor...","[finish, thought, ok, compar, lot, know, cheesi, sorta, fun, way, glad, cast, wu, dang, mountain..."
1,48vhc8,zootopia,8.0,"""hold on, walter and jesse are at the door.""\n\ni thought that scene felt similar to breaking ba...","[hold, on, walter, and, jesse, are, at, the, door, i, thought, that, scene, felt, similar, to, b...","[hold, walter, jesse, door, thought, scene, felt, similar, breaking, bad, consider, zootopia, sm...","[hold, on, walter, and, jess, are, at, the, door, i, thought, that, scene, felt, similar, to, br...","[hold, walter, jess, door, thought, scene, felt, similar, break, bad, consid, zootopia, smartest..."
2,48vhmk,whiskey tango foxtrot,6.6,"i personally enjoyed it. not the best, not the worst, but i liked it. i liked the way they handl...","[i, personally, enjoyed, it, not, the, best, not, the, worst, but, i, liked, it, i, liked, the, ...","[personally, enjoyed, best, worst, liked, liked, way, handled, humor, especially, genre, martin,...","[i, person, enjoy, it, not, the, best, not, the, worst, but, i, like, it, i, like, the, way, the...","[person, enjoy, best, worst, like, like, way, handl, humor, especi, genr, martin, freeman, scruf..."
3,48vhsf,london has fallen,5.9,"if you go into this movie expecting good action, gerard butler being a badass, and a terrible pl...","[if, you, go, into, this, movie, expecting, good, action, gerard, butler, being, a, badass, and,...","[movie, expecting, good, action, gerard, butler, badass, terrible, plot, bases, covered, kind, m...","[if, you, go, into, this, movi, expect, good, action, gerard, butler, be, a, badass, and, a, ter...","[movi, expect, good, action, gerard, butler, badass, terribl, plot, base, cover, kind, movi, com..."
4,49wvnj,10 cloverfield lane,7.2,"i kept going back and forth: ""is he crazy? no, he's right. no he's crazy. holy shit he's both...","[i, kept, going, back, and, forth, is, he, crazy, no, he's, right, no, he's, crazy, holy, shit, ...","[kept, going, forth, crazy, he's, right, he's, crazy, holy, shit, he's, crazy, right, edge, seat...","[i, kept, go, back, and, forth, is, he, crazi, no, he, right, no, he, crazi, holi, shit, he, bot...","[kept, forth, crazi, right, crazi, holi, shit, crazi, right, edg, seat, movi, essenti, hour, lon..."
...,...,...,...,...,...,...,...,...
917,vzcv66,where the crawdads sing,7.1,i did enjoy her house representing the 2 different ways the men treated her . tate was invited o...,"[i, did, enjoy, her, house, representing, the, different, ways, the, men, treated, her, tate, wa...","[enjoy, house, representing, different, ways, men, treated, tate, invited, enter, invited, obser...","[i, did, enjoy, her, hous, repres, the, differ, way, the, men, treat, her, tate, was, invit, ove...","[enjoy, hous, repres, differ, way, men, treat, tate, invit, enter, invit, observ, respect, chase..."
918,vzcvkz,mrs harris goes to paris,7.1,this was so cute it just made me smile the whole time. highly recommend. the only word for this...,"[this, was, so, cute, it, just, made, me, smile, the, whole, time, highly, recommend, the, only,...","[cute, smile, time, highly, recommend, word, movie, sweet, lovely, tiny, french, pastry, film, w...","[this, was, so, cute, it, just, made, me, smile, the, whole, time, high, recommend, the, onli, w...","[cute, smile, time, high, recommend, word, movi, sweet, love, tini, french, pastri, film, want, ..."
919,vzcvsd,the sea beast,7.1,absolutely crazy that netflix dropped this and also the mitchells vs the machines with almost no...,"[absolutely, crazy, that, netflix, dropped, this, and, also, the, mitchells, vs, the, machines, ...","[absolutely, crazy, netflix, dropped, mitchells, vs, machines, fanfare, movies, incredible, anim...","[absolut, crazi, that, netflix, drop, this, and, also, the, mitchel, vs, the, machin, with, almo...","[absolut, crazi, netflix, drop, mitchel, vs, machin, fanfar, movi, incred, anim, great, stori, w..."
920,vzcw0a,the man from toronto,5.8,o offence to woody but i feel like the original casting of jason statham would have at least im...,"[offence, to, woody, but, i, feel, like, the, original, casting, of, jason, statham, would, have...","[offence, woody, feel, like, original, casting, jason, statham, improved, shitxhow, slightly, ea...","[offenc, to, woodi, but, i, feel, like, the, origin, cast, of, jason, statham, would, have, at, ...","[offenc, woodi, feel, like, origin, cast, jason, statham, improv, shitxhow, slight, easi, tell, ..."


In [None]:
# fig, axes = plt.subplots(figsize=(7, 24))
# fd_0 = FreqDist(df['comments_stem_sw'].explode()).most_common(100)
# fd_0 = OrderedDict(fd_0)
# tokens_0 = list(fd_0.keys())[::-1]
# freq_0 = list(fd_0.values())[::-1]
# # fd_1 = FreqDist(series[y==1].explode()).most_common(cutoff)
# # fd_1 = OrderedDict(fd_1)
# # tokens_1 = list(fd_1.keys())[::-1]
# # freq_1 = list(fd_1.values())[::-1]
# # shared_tokens = [t for t in tokens_0 if t in tokens_1]
# axes.barh(y=tokens_0, width=freq_0)
# # axes[1].barh(y=tokens_1, width=freq_1, color=['C6' if token in shared_tokens else 'C0' for token in tokens_1])
# axes.set_ylabel('Tokens', size=8)
# axes.set_xlabel('Frequency', size=8)
# # axes[1].set_xlabel('Frequency', size=10)
# # fig.suptitle(f'Top {cutoff} {token_type}', size=15)
# axes.set_title('Non-Disaster')
# # axes[1].set_title('Disaster')
# # custom_bars = [Line2D([0], [0], color='C6', lw=10), Line2D([0], [0], color='C0', lw=10)]
# # axes.legend(custom_bars, ['In common', 'Not in common'])
# # axes[1].legend(custom_bars, ['In common', 'Not in common'])
# fig.set_facecolor('white');
# plt.tight_layout()
# plt.show()

In [18]:
comments = df_imploded.drop(columns='averageRating')
target = df_imploded['averageRating']

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [94]:
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: [stemmer.stem(t) for t in tokenizer.tokenize(x)], 
    stop_words=stopword_list_stem,
    max_features=1000
    )

tfidf_mat = vectorizer.fit_transform(comments['comments'])
tfidf_mat.shape



(922, 1000)

In [95]:
def extract_best_indices(m, topk, mask=None):
    """
    Use sum of the cosine distance over all tokens.
    m (np.array): cos matrix of shape (nb_in_tokens, nb_dict_tokens)
    topk (int): number of indices to return (from high to lowest in order)
    """
    # return the sum on all tokens of cosinus for each sentence
    if len(m.shape) > 1:
        cos_sim = np.mean(m, axis=0) 
    else: 
        cos_sim = m
    index = np.argsort(cos_sim)[::-1] # from highest idx to smallest score 
    if mask is not None:
        assert mask.shape == m.shape
        mask = mask[index]
    else:
        mask = np.ones(len(cos_sim))
    mask = np.logical_or(cos_sim[index] != 0, mask) #eliminate 0 cosine distance
    best_index = index[mask][:topk]  
    return best_index

In [96]:
def get_recommendations_tfidf(sentence, tfidf_mat):
    
    """
    Return the database sentences in order of highest cosine similarity relatively to each 
    token of the target sentence. 
    """
    # Embed the query sentence
    tokens = [stemmer.stem(t) for t in tokenizer.tokenize(sentence)]
    vec = vectorizer.transform(tokens)
    # Create list with similarity between query and dataset
    mat = cosine_similarity(vec, tfidf_mat)
    # Best cosine distance for each token independantly
    print(mat.shape)
    best_index = extract_best_indices(mat, topk=10)
    return best_index

In [97]:
df_imploded.sample()

Unnamed: 0,id,title,averageRating,comments,comments_tok,comments_tok_sw,comments_stem,comments_stem_sw
115,5rrasx,rings,4.5,priest: samara can't hurt me cuz i'm blind lol\n\nsamara: bet *the bye bye man* was at least fun...,"[priest, samara, can't, hurt, me, cuz, i'm, blind, lol, samara, bet, the, bye, bye, man, was, at...","[priest, samara, can't, hurt, cuz, i'm, blind, lol, samara, bet, bye, bye, man, fun, ineptitude,...","[priest, samara, can't, hurt, me, cuz, i'm, blind, lol, samara, bet, the, bye, bye, man, was, at...","[priest, samara, can't, hurt, cuz, i'm, blind, lol, samara, bet, bye, bye, man, fun, ineptitud, ..."


In [179]:
query_sentence = 'superhero action crime drama dark night vengance'

best_index = get_recommendations_tfidf(query_sentence, tfidf_mat)

display(df_imploded[['title' , 'averageRating']].iloc[best_index])

(7, 922)


Unnamed: 0,title,averageRating
795,kate,6.2
770,jolt,5.6
566,extraction,6.7
223,proud mary,5.0
319,the night comes for us,6.9
300,peppermint,6.4
375,cold pursuit,6.2
921,the princess,5.6
458,angel has fallen,6.4
845,the 355,5.5


In [None]:
# Splitting the full dataset into training and testing data

X_train, X_test, y_train, y_test = train_test_split(
    comments, target, test_size=0.2, random_state=seed
)

# Splitting off a validation set

X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=.5, random_state=seed
)

In [None]:
df_imploded['comments_stem'].apply(lambda x: ' '.join(x))

In [None]:
df_imploded['comments'].sample().apply(lambda x: [stemmer.stem(t) for t in tokenizer.tokenize(x)])

In [None]:
tfidf = TfidfVectorizer(
    tokenizer=lambda x: [stemmer.stem(t) for t in tokenizer.tokenize(x)], 
    stop_words=stopword_list_stem,
    max_features=1000
    )

X_train_vec = tfidf.fit_transform(X_train['comments'])

X_train_vec_df = pd.DataFrame(
    X_train_vec.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=X_train.index
    )
X_train_vec_df.head()


In [None]:
scaler = StandardScaler()
X_train_vec_df_scaled = scaler.fit_transform(X_train_vec_df)
X_train_vec_df_scaled = pd.DataFrame(X_train_vec_df_scaled, index=X_train.index, columns=X_train_vec_df.columns)

X_train_vec_df_scaled.head()

In [None]:
X_test_vec = tfidf.transform(X_test['comments'])

X_test_vec_df = pd.DataFrame(
    X_test_vec.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=X_test.index
    )

X_test_vec_df_scaled = scaler.transform(X_test_vec_df)
X_test_vec_df_scaled = pd.DataFrame(X_test_vec_df_scaled, index=X_test.index, columns=X_test_vec_df.columns)

In [None]:
X_val_vec = tfidf.transform(X_val['comments'])

X_val_vec_df = pd.DataFrame(
    X_val_vec.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=X_val.index
    )

X_val_vec_df_scaled = scaler.transform(X_val_vec_df)
X_val_vec_df_scaled = pd.DataFrame(X_val_vec_df_scaled, index=X_val.index, columns=X_val_vec_df.columns)

In [None]:
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(np.array(y_train).reshape(-1,1))
y_test_scaled = scaler_y.transform(np.array(y_test).reshape(-1,1))
y_val_scaled = scaler_y.transform(np.array(y_val).reshape(-1,1))

In [None]:
n_input = X_train_vec_df_scaled.shape[1]
n_input

In [None]:
set_seed(seed)

model_baseline = models.Sequential()

model_baseline.add(layers.Dropout(0.2, input_shape=(n_input,)))

model_baseline.add(layer=layers.Dense(
    units=1000,
    activation='relu',
    input_shape=(n_input,),
    kernel_regularizer=regularizers.L2(.25)
    ))

model_baseline.add(layers.Dropout(0.2))

model_baseline.add(layer=layers.Dense(
    units=500,
    activation='relu',
    kernel_regularizer=regularizers.L2(.25)
    ))

model_baseline.add(layer=layers.Dense(
    units=1,
    ))

model_baseline.compile(
    optimizer='sgd',
    loss='mse',
    metrics=[RootMeanSquaredError()]
    )

early_stopping = [
    EarlyStopping(monitor='val_loss', patience=10)
    ]

model_hist_baseline = model_baseline.fit(
    np.array(X_train_vec_df_scaled),
    y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_val_vec_df_scaled, y_val),
    verbose=True,
    callbacks=early_stopping
)

In [None]:
def plot_nn_curves(model_history):

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18,5))
    fl_ax = axes.flatten()
    for idx, metric in enumerate(['loss', 'root_mean_squared_error']):
        pair = [m for m in model_history.history.keys() if metric in m]
        fl_ax[idx].plot(model_history.history[pair[0]], label=metric)
        fl_ax[idx].plot(model_history.history[pair[1]], label=metric+'_val')
        fl_ax[idx].set_xlabel('epochs')
        fl_ax[idx].set_ylabel(metric)
        fl_ax[idx].set_title(f'{metric.upper()} Evaluation')
        fl_ax[idx].legend()
        plt.tight_layout();

plot_nn_curves(model_hist_baseline)


In [None]:
model_baseline.evaluate(X_val_vec_df_scaled, y_val, return_dict=True)

In [None]:
y_val[17:23]

In [None]:
(model_baseline.predict(X_val_vec_df_scaled))[17:23]

In [None]:
#########################################

In [None]:
def spacy_lemmatize(text):
    """
    Uses NLTK and SpaCy to tokenize a string and return the lemma of each token.
    """
    sents = [s.text for s in nlp(text).sents]
    sents_tokenized = [tokenizer.tokenize(sent) for sent in sents]
    docs = [nlp(' '.join(tokens)) for tokens in sents_tokenized]
    docs_lemmatized = [[t.lemma_.lower() for t in doc] for doc in docs]
    return list(itertools.chain.from_iterable(docs_lemmatized))

In [None]:
df['comments_lem'] = df['comments'].apply(spacy_lemmatize)

In [None]:
df['comments_lem']

In [None]:
comments_lem_imploded = df.groupby('id').agg(
    {'comments_lem': lambda x: list(itertools.chain.from_iterable(x))}
    ).reset_index()

In [None]:
comments_imploded

In [None]:
just_movies = df.drop(columns=['comments', 'comments_lem']).drop_duplicates(subset='id')

In [None]:
df2 = pd.merge(
    left=just_movies,
    right=comments_lem_imploded,
    how='inner',
    on='id'
)

In [None]:
df2

In [None]:
df2['comments_lem_no_sw'] = df2['comments_lem'].apply(lambda x: [t for t in x if t not in stopword_list])

In [None]:
df2

In [None]:
################################

In [None]:
token_pattern = r"[a-zA-Z]+'?[a-zA-Z]+|\b[iIaA]\b"
# Pattern: Any word with at least two characters, including up to one apostrophe
# Also captures the English words "I" and "a".

tokenizer = RegexpTokenizer(token_pattern)

In [None]:
sw_spacy = list(nlp.Defaults.stop_words)
sw_nltk = stopwords.words('english')
stopword_list = list(set(sw_spacy + sw_nltk))
stopword_list.extend(["i'm"])

In [None]:
def spacy_lemmatize(text):
    """
    Uses NLTK and SpaCy to tokenize a string and return the lemma of each token.
    """
    sents = [s.text for s in nlp(text).sents]
    sents_tokenized = [tokenizer.tokenize(sent) for sent in sents]
    docs = [nlp(' '.join(tokens)) for tokens in sents_tokenized]
    docs_lemmatized = [[t.lemma_.lower() for t in doc] for doc in docs]
    return list(itertools.chain.from_iterable(docs_lemmatized))

In [None]:
################################

In [None]:
example = df['comments'].loc[70690]

In [None]:
example

In [None]:
token_pattern = r"[a-zA-Z]+'?[a-zA-Z]+|\b[iIaA]\b"
# Pattern: Any word with at least two characters, including up to one apostrophe
# Also captures the English words "I" and "a".

tokenizer = RegexpTokenizer(token_pattern)

In [None]:
sw_spacy = list(nlp.Defaults.stop_words)
sw_nltk = stopwords.words('english')
stopword_list = list(set(sw_spacy + sw_nltk))
stopword_list.extend(["i'm"])

In [None]:
def spacy_lemmatize(text):
    """
    Uses NLTK and SpaCy to tokenize a string and return the lemma of each token.
    """
    sents = [s.text for s in nlp(text).sents]
    sents_tokenized = [tokenizer.tokenize(sent) for sent in sents]
    docs = [nlp(' '.join(tokens)) for tokens in sents_tokenized]
    docs_lemmatized = [[t.lemma_.lower() for t in doc] for doc in docs]
    return list(itertools.chain.from_iterable(docs_lemmatized))

In [None]:
idx = np.random.choice(df.index)

display(df.loc[[idx]])

print(spacy_lemmatize(df.loc[idx]['comments']))