In [1]:
import numpy as np
import pandas as pd

import re
import string
from datetime import datetime

import praw

import json

pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 100
seed = 55
np.random.seed(seed)

In [2]:
# Reading in list of movies

movies_cleaned_df = pd.read_csv('../data/movies_cleaned.csv', index_col='id')

In [3]:
# Reading in the comments I collected

with open('../data/movies_comments.json') as f:
    movies_comments_dict = json.load(f)

In [4]:
# Making the comments collection a DataFrame

movies_comments_df = pd.DataFrame(movies_comments_dict).T

In [5]:
movies_comments_df

Unnamed: 0,title,comments,post_date_utc
vzcwal,the princess,[Joey King needs a new agent. She’s proven she has talent but she has so many terrible films on ...,1657850947.0
vzcw0a,the man from toronto,[ O offence to Woody but I feel like the original casting of Jason Statham would have at least i...,1657850925.0
vzcvsd,the sea beast,[Absolutely crazy that Netflix dropped this and also The Mitchells Vs The Machines with almost n...,1657850907.0
vzcvkz,mrs. harris goes to paris,"[This was so cute it just made me smile the whole time. Highly recommend., The only word for th...",1657850890.0
vzcv66,where the crawdads sing,[I did enjoy her house representing the 2 different ways the men treated her . Tate was invited ...,1657850854.0
...,...,...,...
49wvnj,10 cloverfield lane,"[I kept going back and forth: ""Is he crazy? No, he's right. No he's crazy. Holy shit he's bot...",1457666387.0
48vhsf,london has fallen,"[If you go into this movie expecting good action, Gerard Butler being a badass, and a terrible p...",1457060665.0
48vhmk,whiskey tango foxtrot,"[I personally enjoyed it. Not the best, not the worst, but I liked it. I liked the way they hand...",1457060598.0
48vhc8,zootopia,"[""Hold on, Walter and Jesse are at the door.""\n\nI thought that scene felt similar to breaking b...",1457060484.0


In [6]:
# Combining international and domestic releases (same title; different discussion thread)

In [7]:
movies_comments_df[movies_comments_df['title'].duplicated(keep=False)]

Unnamed: 0,title,comments,post_date_utc
qd6vqv,dune,"[The balls of Denis Villeneuve to put ""Dune: Part 1"" at the front of a film without the sequel g...",1634868214.0
pprhvt,dune,"[Seeing Jason Momoa clean shaven shook me to my core.\n\nVisually, the movie was absolutely stun...",1631845730.0
ltdr45,the girl on the train,"[Wait, this is getting a remake already?, Lmaoooooooo. Why did they remake this, Out of all the ...",1614391733.0
aq72ni,how to train your dragon: the hidden world,[To go to the international thread for the film go here: [https://redd.it/am2yeh](https://redd.i...,1550804401.0
am2yeh,how to train your dragon: the hidden world,"[I agree that this is the weakest entry for the series but that does not mean it is bad at all, ...",1549033910.0
a6qh98,aquaman,[This thread is for the US release of the film. [You can go here to see the original internation...,1545361207.0
a4rs7c,mortal engines,[This is for the US release of the film. [You can see the international release thread here.](ht...,1544756423.0
a4qfoa,aquaman,"[[removed], This movie was really fond of the ""tender moment suddenly interrupted by big explosi...",1544648421.0
a40xfn,mortal engines,"[>Andy Serkis as London\n\nI.. what? , [deleted], The movie was just a chore to sit through for ...",1544196791.0
8sxn3q,jurassic world: fallen kingdom,[This is the thread for the US release. To see [the international thread click here.](https://ww...,1529633205.0


In [8]:
# The Girl on the Train is not a duplicate. Two movies with the same name.

In [9]:
multi_discussion_movies = \
    movies_comments_df[
        movies_comments_df['title'].duplicated()
        ]['title'].drop(index='56973x').tolist()

In [10]:
multi_discussion_index = \
    movies_comments_df[
        movies_comments_df['title'].duplicated(keep=False)
        ].drop(index=['56973x', 'ltdr45']).index.tolist()

In [11]:
movies_comments_df[movies_comments_df['title'].isin(multi_discussion_movies)]['title'].value_counts()

dune                                          2
how to train your dragon: the hidden world    2
aquaman                                       2
mortal engines                                2
jurassic world: fallen kingdom                2
the death of stalin                           2
coco                                          2
american made                                 2
alien: covenant                               2
guardians of the galaxy vol. 2                2
t2 trainspotting                              2
rogue one: a star wars story                  2
Name: title, dtype: int64

In [12]:
for title in multi_discussion_movies:
    duo = movies_comments_df[movies_comments_df['title'] == title]
    discussion_1 = duo.index[0]
    discussion_2 = duo.index[1]
    movies_comments_df.loc[discussion_1]['comments'] +=\
        movies_comments_df.loc[discussion_2]['comments']
    movies_comments_df = movies_comments_df.drop(index=discussion_2)

In [13]:
# Double checking if there are more duplicates

movies_comments_df\
    [(movies_comments_df['title']\
        .apply(lambda x: x.replace("the ", "") if x.startswith("the ") else x))\
            .apply(lambda x: x[:8]).duplicated(keep=False)]

Unnamed: 0,title,comments,post_date_utc
v8wun8,jurassic world dominion,"[My favourite dino was Salad Fingers., I liked how Malcolm's line about ""you exploited people's ...",1654826563.0
utjdh7,downton abbey: a new era,[this was like snuggling up with a cozy blanket. loved it. i think it was better than the first ...,1653014013.0
ujcuuw,doctor strange in the multiverse of madness,"[America: I can’t control my power\n\nDoctor strange: yeah you can\n\nAmerica: true, They really...",1651802362.0
tysxfs,sonic the hedgehog 2,"[My theater went absolutely nuts at the post credit sequence, With phenomenal cosmic powers, I'l...",1649383955.0
tvh87q,apollo 10½: a space age childhood,"[The marketing for this must have sucked.\n\nI'm usually up to date on movie releases, even no b...",1649013443.0
tnbhdb,the lost city,"[I love the comedy trope of casually killing a random henchman, then reeling with horror at the ...",1648173775.0
t7pqlt,jeen-yuhs: a kanye trilogy part 3,[The saddest part was Kanye's friend realizing that Kanye is losing his grip on reality and reas...,1646536518.0
sv7agc,jeen-yuhs: a kanye trilogy part 1,"[If you’re a Kanye fan, or a just a hip hop fan in general, this is a must watch. It’s incredibl...",1645153666.0
s4a86d,hotel transylvania: transformania,[At one point the movie stops for like two minutes so all the characters can do the Cha-Cha Slid...,1642215928.0
qjg3x9,army of thieves,"[ I would really like to see a sequel. Perhaps a heist during the zombie apocalypse., I guess I'...",1635647029.0


In [14]:
# Still need to merge these movies:

# - star wars: the last jedi -- 7jwxnd, 7rb3uy
# - avengers: infinity war -- 8f84h0, 8gvr6n
# - jeen-yuhs -- sv7agc, t7pqlt
# - avengers: endgame -- bh8iei, bk33kl
# - they shall not grow old -- 9x244z, a5yk8k
# - thor: ragnarok -- 7agfes, 78ivjl

In [15]:
# star wars: the last jedi -- 7jwxnd, 7rb3uy

In [16]:
movies_comments_df.loc[['7jwxnd', '7rb3uy']]

Unnamed: 0,title,comments,post_date_utc
7jwxnd,star wars: episode viii – the last jedi,"[""Luke Skywalker projecting a force ghost across the galaxy to fuck with Kylo Ren"" is the level ...",1513306809.0
7rb3uy,star wars: the last jedi (thread vol. 2),"[Side-quest literally started as a videogame cutscene. \n\nMaz in a ridiculous situation, hologr...",1516294928.0


In [17]:
# Merging the threads

movies_comments_df.loc['7jwxnd']['comments']\
     += movies_comments_df.loc['7rb3uy']['comments']

movies_comments_df = movies_comments_df.drop(index=['7rb3uy'])

In [18]:
# avengers: infinity war -- 8f84h0, 8gvr6n

In [19]:
movies_comments_df.loc[['8f84h0', '8gvr6n']]

Unnamed: 0,title,comments,post_date_utc
8f84h0,avengers: infinity war,[So I guess Avengers is doing pretty well huh? As you can probably tell this is one of our most ...,1524794408.0
8gvr6n,avengers: infinity war (thread vol. 2),[This is the the second thread due to high volume of the first thread. You can [see the initial ...,1525399266.0


In [20]:
# Removing administrative comments

movies_comments_df.loc['8f84h0']['comments']\
    = movies_comments_df.loc['8f84h0']['comments'][1:]

movies_comments_df.loc['8gvr6n']['comments']\
    = movies_comments_df.loc['8gvr6n']['comments'][1:]

In [21]:
# Merging the threads

movies_comments_df.loc['8f84h0']['comments']\
     += movies_comments_df.loc['8gvr6n']['comments']
     
movies_comments_df = movies_comments_df.drop(index=['8gvr6n'])

In [22]:
# jeen-yuhs -- sv7agc, t7pqlt

# This is more of a three-part docuseries than a movie. 
# Plus, Part 2 is missing from the dataset. 
# Better to drop it. Sorry, Kanye fans.

In [23]:
movies_comments_df = movies_comments_df.drop(['sv7agc', 't7pqlt'])

In [24]:
# avengers: endgame -- bh8iei, bk33kl

In [25]:
movies_comments_df.loc[['bh8iei', 'bk33kl']]

Unnamed: 0,title,comments,post_date_utc
bh8iei,avengers: endgame,"[We are officially posting the new official discussion thread on **May 2nd, 21:00 (-5:00 GMT)**,...",1556247619.0
bk33kl,avengers: endgame (2nd thread),"[""I can do this all day.""\n\n""Yeah, ugh, I know.""\n\nMy favorite part about Cap vs Cap is that C...",1556848803.0


In [26]:
# Removing administrative comments

movies_comments_df.loc['bh8iei']['comments']\
    = movies_comments_df.loc['bh8iei']['comments'][1:]

In [27]:
# Merging the threads

movies_comments_df.loc['bh8iei']['comments'] +=\
    movies_comments_df.loc['bk33kl']['comments']

movies_comments_df = movies_comments_df.drop(index=['bk33kl'])

In [28]:
# they shall not grow old -- 9x244z, a5yk8k

In [29]:
movies_comments_df.loc[['9x244z', 'a5yk8k']]

Unnamed: 0,title,comments,post_date_utc
9x244z,they shall not grow old (uk release),"[""Everyone says when you are faced with death you see your past, but when you're 19 you don't ha...",1542337222.0
a5yk8k,they shall not grow old,[This discussion is for the US release of the film. [You can see the international release threa...,1545102023.0


In [30]:
# Removing administrative comments
movies_comments_df.loc['a5yk8k']['comments']\
    = movies_comments_df.loc['a5yk8k']['comments'][1:]

In [31]:
# Merging the threads

movies_comments_df.loc['a5yk8k']['comments'] +=\
    movies_comments_df.loc['9x244z']['comments']

movies_comments_df = movies_comments_df.drop(index=['9x244z'])

In [32]:
# - thor: ragnarok -- 7agfes, 78ivjl

In [33]:
movies_comments_df.loc[['7agfes', '78ivjl']]

Unnamed: 0,title,comments,post_date_utc
7agfes,thor: ragnarok,[[Click here to go to international thread](https://www.reddit.com/r/movies/comments/78ivjl/offi...,1509674697.0
78ivjl,thor: rangarok,"[Honestly, the Marvel movies have recently been trying a little too hard to be funny as of late....",1508879521.0


In [34]:
# Removing administrative comments
movies_comments_df.loc['7agfes']['comments']\
    = movies_comments_df.loc['7agfes']['comments'][1:]

In [35]:
# Merging the threads

movies_comments_df.loc['7agfes']['comments'] +=\
    movies_comments_df.loc['78ivjl']['comments']

movies_comments_df = movies_comments_df.drop(index=['78ivjl'])

In [36]:
# correct spelling

movies_comments_df.loc['7agfes']['title'] = 'thor: ragnarok'

In [37]:
# correcting one more movie title:
movies_comments_df.loc['ncomky']

title                                                                             army of the dead (theater release)
comments         [I just want to say every plot point involving Dave Bautista's daughter was awful. Other than th...
post_date_utc                                                                                           1621044683.0
Name: ncomky, dtype: object

In [38]:
movies_comments_df.loc['ncomky']['title'] = 'army of the dead'

In [39]:
# Supposedly all movies have been merged now

# Adding date columns

In [40]:
movies_comments_df['post_year'] = movies_comments_df['post_date_utc'].apply(lambda x: datetime.fromtimestamp(x).year)
movies_comments_df['post_month'] = movies_comments_df['post_date_utc'].apply(lambda x: datetime.fromtimestamp(x).month)
movies_comments_df['post_day'] = movies_comments_df['post_date_utc'].apply(lambda x: datetime.fromtimestamp(x).day)

In [41]:
movies_comments_df.sample(3)

Unnamed: 0,title,comments,post_date_utc,post_year,post_month,post_day
4b1l4n,pee-wee's big holiday,"[I just finished watching it, and I'm grinning from ear to ear. It's hard to believe how well th...",1458357496.0,2016,3,18
4sww1w,ghostbusters,[Just a note since last year we've had a rule for official discussions that states if it's clear...,1468547759.0,2016,7,14
k4c9oz,small axe: lover's rock,[Steve McQueen releasing the most gorgeous film of the year all about the amazing beauty of inti...,1606792181.0,2020,11,30


In [42]:
# Making a CSV of JUST the movie title, id, and date it was discussed on Reddit.
# This is will be helpful when trying to match movies to their counterparts in a ratings dataset (like IMDb).

In [43]:
reddit_movies_final = movies_comments_df.drop(columns=['comments', 'post_date_utc']).reset_index()
reddit_movies_final.rename(columns={'index': 'id'}, inplace=True)

In [44]:
reddit_movies_final.sample(10)

Unnamed: 0,id,title,post_year,post_month,post_day
834,5hbav3,office christmas party,2016,12,8
35,t0tapr,studio 666,2022,2,24
18,v3p27b,crimes of the future,2022,6,2
630,97ybbl,alpha,2018,8,16
872,4w86a7,suicide squad,2016,8,4
6,vtzd6m,marcel the shell with shoes on,2022,7,7
90,r1mrog,bruised,2021,11,24
592,9x2e23,fantastic beasts: the crimes of grindelwald,2018,11,15
396,ej1r13,the grudge,2020,1,2
577,a5yi33,spider-man: into the spider-verse,2018,12,13


In [45]:
# Run this once
# reddit_movies_final.to_csv("../data/reddit_movies_final.csv", index=True)

In [46]:
######################

In [47]:
# Exploding the comments

comments_df = movies_comments_df.explode('comments').reset_index()
comments_df.rename(columns={'index': 'id'}, inplace=True)
comments_df.sample(5)

Unnamed: 0,id,title,comments,post_date_utc,post_year,post_month,post_day
72306,4i35uf,captain america: civil war,"Who knew Captain America would be the most consistent, highest quality series in the MCU?",1462500708.0,2016,5,5
19404,k2cra9,the christmas chronicles 2,The movie could have been better if it wasn't filled with cringy bits and completely unrealistic...,1606521784.0,2020,11,27
19087,k6zkca,sound of metal,Movies about drummers seem to stick... Whiplash and now Sound of Metal.,1607136537.0,2020,12,4
21235,iu19h6,the devil all the time,Pattinson impresses me more with every role,1600280499.0,2020,9,16
29743,dvuao3,the good liar,"Enjoyed this. Yes, the final twist was a little out of nowhere, but I enjoyed how they set it up...",1573786812.0,2019,11,14


In [48]:
# Check comments for duplicates
# Manually review every comment where the first 50 characters are appear elsewhere.
# Could be: spam, deleted, removed
# Also could be same commenter in two threads about the same movie

comment_counts = comments_df['comments'].apply(lambda x: x[:50]).value_counts()
maybe_duplicate_comments = comment_counts[comment_counts > 1].keys()
maybe_duplicate_comments = list(maybe_duplicate_comments)
len(maybe_duplicate_comments)

65

In [49]:
# Review the comments in maybe_duplicate_comments. 
# Decide which are actually duplicates, which could be coincidental, 
# and which should only be kept in one thread (same movie)

In [50]:
list(enumerate(maybe_duplicate_comments))

[(0, '[deleted]'),
 (1, '[removed]'),
 (2, 'I liked it.'),
 (3, 'I liked it'),
 (4, "Heads up there's some new features on YouPoll!\n\nNo"),
 (5, 'This movie was better than it had any right to be.'),
 (6, 'I enjoyed it.'),
 (7, 'What a boring movie'),
 (8, '\n\nInterdum et malesuada fames ac ante ipsum primis'),
 (9, 'Watched **The death of Stalin** this weekend, got '),
 (10, 'Loved it!'),
 (11, '“Sisus death was just as much your fault as it was'),
 (12, 'This movie was way better than fantasy island but '),
 (13, 'There is something after the credits. Something re'),
 (14, 'My favorite part of this movie is how utterly abso'),
 (15, "I don't get the ending"),
 (16, 'this movie fucking sucks'),
 (17, '“Some people, they will never accept him, but some'),
 (18, 'I loved all the jokes about Dave not being able to'),
 (19, 'Just finished the book and went to see the movie. '),
 (20, "This was one of the worst movies I've seen this ye"),
 (21, 'A kid behind me in the theater wouldn’t si

In [51]:
# Manual review, since there are only 65

In [52]:
checking = -1

In [117]:
# Run this cell multiple times to check for duplicates

checking += 1
print(checking)

comments_df[
    comments_df['comments'].apply(lambda x: x[:50])\
         == maybe_duplicate_comments[checking]
    ]

64


Unnamed: 0,id,title,comments,post_date_utc,post_year,post_month,post_day
41414,a9glap,holmes & watson,One of the worst movies I've ever seen.,1545759097.0,2018,12,25
66967,5ef0tn,bad santa 2,One of the worst movies I've ever seen.,1479869996.0,2016,11,22


In [118]:
drop_all = [
    0, 1, 4, 12, 21, 41, 42, 43, 46, 52, 
    ]

keep_one = [
    9, 14, 18, 19, 22, 25, 26, 31, 32, 33, 34, 37, 39, 40, 
    44, 47, 49, 56, 57, 
    ]

do_nothing = [
    2, 3, 5, 6, 7, 8, 10, 11, 13, 15, 16, 17, 20, 23, 24, 
    27, 28, 29, 30, 35, 36, 38, 45, 48, 50, 51, 53, 54, 55,
    58, 59, 60, 61, 62, 63, 64
    ]

In [119]:
i_to_drop = []

for i, comment in enumerate(maybe_duplicate_comments):
    if i in drop_all:
        i_to_drop.extend(comments_df[comments_df['comments'].apply(lambda x: x[:50]) == comment].index.tolist())
    elif i in keep_one: 
        i_to_drop.extend(comments_df[comments_df['comments'].apply(lambda x: x[:50]) == comment].index.tolist()[1:])

In [120]:
comments_df = comments_df.drop(index=i_to_drop)

In [121]:
##################
# Checking for other administrative comments
# They are usually the first comment

In [122]:
df_only_first_comment = comments_df.drop_duplicates(subset='id', keep='first')

In [123]:
# Checking for common phrases that would be in an administrative comment.

mask = (
    df_only_first_comment['comments'].apply(lambda x: 'heads up' in x.lower())\
        | df_only_first_comment['comments'].apply(lambda x: 'FYI' in x.lower())
            | df_only_first_comment['comments'].apply(lambda x: 'r/movies' in x.lower())\
                | df_only_first_comment['comments'].apply(lambda x: 'pinned' in x.lower())
    |                df_only_first_comment['comments'].apply(lambda x: 'AMA' in x)
)

df_only_first_comment[mask]

Unnamed: 0,id,title,comments,post_date_utc,post_year,post_month,post_day
237,vzcuye,paws of fury: the legend of hank,"So this movie starts, right? And the setup has me like, oh this must be a rip-off of Blazing Sad...",1657850837.0,2022,7,14
1493,uynerm,top gun: maverick,"As it's unpinned, don't forget to give the [Bruckheimer AMA](https://www.reddit.com/r/movies/com...",1653616846.0,2022,5,26
2311,u93jj5,the northman,[Robert Eggers and two historians who worked on the movie did a Live Talk with us this morning. ...,1650593018.0,2022,4,21
2484,tysxuy,"everything, everywhere, all at once","[Just FYI, Ke Huy Quan graced us with an AMA for this movie today.](https://www.reddit.com/r/mov...",1649383992.0,2022,4,7
4012,t68wa3,the batman,"/r/movies discusses *The Batman* on reddit talk, hosted by /u/LiteraryBoner! https://www.reddit....",1646362789.0,2022,3,3
21651,imqeot,mulan,"that 40k post about Mulan that we removed:\n\n[After seeing Mulan 2020, I'm noticing a pattern r...",1599260819.0,2020,9,4
41728,a6qh98,aquaman,This thread is for the US release of the film. [You can go here to see the original internationa...,1545361207.0,2018,12,20
42450,a4rs7c,mortal engines,This is for the US release of the film. [You can see the international release thread here.](htt...,1544756423.0,2018,12,13
46737,9bplvq,searching,Heads up Aneesh Chaganty & Sev Ohanian had an AMA on our subreddit and Sev is still answering qu...,1535680816.0,2018,8,30
48645,8yf14q,sorry to bother you,[Be sure to check out the AMA with the writer/director here](https://www.reddit.com/r/movies/com...,1531439745.0,2018,7,12


In [124]:
# Only the first one is an actual comment about a movie. That was some good search criteria.

In [125]:
df_only_first_comment[mask].index[1:]

Int64Index([ 1493,  2311,  2484,  4012, 21651, 41728, 42450, 46737, 48645,
            49145, 51053, 52846, 57755, 59713, 61258, 61658, 62610, 70932],
           dtype='int64')

In [126]:
comments_df = comments_df.drop(
    df_only_first_comment[mask].index[1:]
    )

In [127]:
# Run this once
# comments_df.to_csv("../data/comments_exploded.csv", index=True)

In [128]:
pd.read_csv("../data/comments_exploded.csv", index_col=0)

Unnamed: 0,id,title,comments,post_date_utc,post_year,post_month,post_day
0,vzcwal,the princess,Joey King needs a new agent. She’s proven she has talent but she has so many terrible films on h...,1.657851e+09,2022,7,14
1,vzcwal,the princess,"Silly, but entertaining and non stop action",1.657851e+09,2022,7,14
2,vzcwal,the princess,"The yassification of The Raid\n\nActually, this was fun enough and mad respect to Joey King for ...",1.657851e+09,2022,7,14
3,vzcwal,the princess,"Honestly, this was pretty fun. The plot is nothing special yes.\n\nBut Joey King was actually e...",1.657851e+09,2022,7,14
4,vzcwal,the princess,"Man, I loved this movie. Yeah, it was campy, but whatever. The premise worked for me, I liked th...",1.657851e+09,2022,7,14
...,...,...,...,...,...,...,...
73535,47szbr,"crouching tiger, hidden dragon: sword of destiny",I was entertained bcuz i love Kung Fu Movies but I agree this was pretty bad.,1.456542e+09,2016,2,26
73536,47szbr,"crouching tiger, hidden dragon: sword of destiny",Is the original on netflix?,1.456542e+09,2016,2,26
73537,47szbr,"crouching tiger, hidden dragon: sword of destiny","wait, it came out?",1.456542e+09,2016,2,26
73538,47szbr,"crouching tiger, hidden dragon: sword of destiny",A sequel to a phenomal epic movie with 10 years of planning... https://m.youtube.com/watch?v=CrH...,1.456542e+09,2016,2,26
