In [1]:
import numpy as np
import pandas as pd

import re
import string

import praw

import json

# Display settings: show up to 100 characters per cell and up to 100 rows.
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_rows", 100)

# Fixed seed for NumPy's global random state.
seed = 55
np.random.seed(seed)

In [2]:
# Load the cleaned list of movies; the `id` column becomes the row index
# so that rows can be looked up by `id`.
movies_cleaned_df = pd.read_csv(
    filepath_or_buffer='../data/movies_cleaned.csv',
    index_col='id',
)

In [3]:
# Reading in the comments I collected.
# The encoding is given explicitly: JSON text is UTF-8, and without it
# `open` falls back to the platform's default encoding, which can mangle
# non-ASCII characters in comments (e.g. typographic apostrophes).

with open('../data/movies_comments.json', encoding='utf-8') as f:
    movies_comments_dict = json.load(f)

In [4]:
# Restructuring the comments collection: each thread id now maps to a
# record holding the movie title (looked up in `movies_cleaned_df`) and the
# thread's comments.
# A comprehension is used so that no loop variable leaks into the notebook
# namespace (the original loop variable `id` shadowed the builtin), and the
# title is fetched with a single `.loc[row, column]` lookup.

movies_comments_dict = {
    thread_id: {
        'title': movies_cleaned_df.loc[thread_id, 'title'],
        'comments': thread_comments,
    }
    for thread_id, thread_comments in movies_comments_dict.items()
}

In [5]:
# Making the comments collection a DataFrame.
# The dict is keyed by thread id, so the frame is built with one column per
# thread and then flipped to get one row per thread.

df_movies_comments = pd.DataFrame(data=movies_comments_dict).transpose()

In [6]:
# Combining international and domestic releases (same title; different discussion thread)

In [7]:
# Every thread whose title is shared with at least one other thread.
df_movies_comments.loc[df_movies_comments['title'].duplicated(keep=False)]

Unnamed: 0,title,comments
aq72ni,how to train your dragon: the hidden world,[To go to the international thread for the film go here: [https://redd.it/am2yeh](https://redd.i...
am2yeh,how to train your dragon: the hidden world,"[I agree that this is the weakest entry for the series but that does not mean it is bad at all, ..."
a6qh98,aquaman,[This thread is for the US release of the film. [You can go here to see the original internation...
a4rs7c,mortal engines,[This is for the US release of the film. [You can see the international release thread here.](ht...
a4qfoa,aquaman,"[[removed], This movie was really fond of the ""tender moment suddenly interrupted by big explosi..."
a40xfn,mortal engines,"[>Andy Serkis as London\n\nI.. what? , [deleted], The movie was just a chore to sit through for ..."
8sxn3q,jurassic world: fallen kingdom,[This is the thread for the US release. To see [the international thread click here.](https://ww...
8pa372,jurassic world: fallen kingdom,[Good to know that the trope of the smart guy saying something sciency and then another dude say...
833jvx,the death of stalin,"[I love how they didn’t even attempt accents, gave it a much more real feeling, Zhukov stole eve..."
7eoph7,coco,"[**Official Pre-Film Short Discussion: Olaf's Frozen Adventure** \n\nDirected by: Kevin Deters, ..."


In [8]:
# Titles of the repeated threads (every occurrence after the first).
multi_discussion_movies = df_movies_comments.loc[
    df_movies_comments['title'].duplicated(), 'title'
].tolist()

In [9]:
# Thread ids of every row involved in a repeated title (first occurrences included).
multi_discussion_index = df_movies_comments.loc[
    df_movies_comments['title'].duplicated(keep=False)
].index.tolist()

In [10]:
# Number of threads per repeated title.
df_movies_comments.loc[
    df_movies_comments['title'].isin(multi_discussion_movies), 'title'
].value_counts()

how to train your dragon: the hidden world    2
aquaman                                       2
mortal engines                                2
jurassic world: fallen kingdom                2
the death of stalin                           2
coco                                          2
american made                                 2
alien: covenant                               2
guardians of the galaxy vol. 2                2
t2 trainspotting                              2
rogue one: a star wars story                  2
Name: title, dtype: int64

In [11]:
# For each repeated title, fold the second thread's comments into the first
# thread and drop the second thread's row.
# `.at` reads and writes the single cell directly. The original chained form
# (`df.loc[row]['comments'] += ...`) assigned into a temporary Series and only
# took effect because `list +=` mutates the stored list in place.
for title in multi_discussion_movies:
    duo = df_movies_comments[df_movies_comments['title'] == title]
    discussion_1 = duo.index[0]
    discussion_2 = duo.index[1]
    df_movies_comments.at[discussion_1, 'comments'] = (
        df_movies_comments.at[discussion_1, 'comments']
        + df_movies_comments.at[discussion_2, 'comments']
    )
    df_movies_comments = df_movies_comments.drop(index=discussion_2)

In [12]:
# Double checking no remaining duplicates.
# Titles are compared on a rough key: if the title starts with "the ", every
# "the " in it is removed, and then only the first 8 characters are kept.

def _title_key(title):
    """Return the 8-character comparison key used to spot near-duplicate titles."""
    if title.startswith("the "):
        title = title.replace("the ", "")
    return title[:8]


df_movies_comments[
    df_movies_comments['title'].map(_title_key).duplicated(keep=False)
]

Unnamed: 0,title,comments
v8wun8,jurassic world dominion,"[My favourite dino was Salad Fingers., I liked how Malcolm's line about ""you exploited people's ..."
utjdh7,downton abbey: a new era,[this was like snuggling up with a cozy blanket. loved it. i think it was better than the first ...
ujcuuw,doctor strange in the multiverse of madness,"[America: I can’t control my power\n\nDoctor strange: yeah you can\n\nAmerica: true, They really..."
tysxfs,sonic the hedgehog 2,"[My theater went absolutely nuts at the post credit sequence, With phenomenal cosmic powers, I'l..."
tvh87q,apollo 10½: a space age childhood,"[The marketing for this must have sucked.\n\nI'm usually up to date on movie releases, even no b..."
tnbhdb,the lost city,"[I love the comedy trope of casually killing a random henchman, then reeling with horror at the ..."
t7pqlt,jeen-yuhs: a kanye trilogy part 3,[The saddest part was Kanye's friend realizing that Kanye is losing his grip on reality and reas...
sv7agc,jeen-yuhs: a kanye trilogy part 1,"[If you’re a Kanye fan, or a just a hip hop fan in general, this is a must watch. It’s incredibl..."
s4a86d,hotel transylvania: transformania,[At one point the movie stops for like two minutes so all the characters can do the Cha-Cha Slid...
qjg3x9,army of thieves,"[ I would really like to see a sequel. Perhaps a heist during the zombie apocalypse., I guess I'..."


In [13]:
# Still need to merge these movies:

# - star wars: the last jedi -- 7jwxnd, 7rb3uy
# - avengers: infinity war -- 8f84h0, 8gvr6n
# - jeen-yuhs -- sv7agc, t7pqlt
# - avengers: endgame -- bh8iei, bk33kl
# - they shall not grow old -- 9x244z, a5yk8k
# - thor: ragnarok -- 7agfes, 78ivjl

In [14]:
# star wars: the last jedi -- 7jwxnd, 7rb3uy

In [15]:
# Preview both threads before merging.
df_movies_comments.loc[['7jwxnd', '7rb3uy'], :]

Unnamed: 0,title,comments
7jwxnd,star wars: episode viii – the last jedi,"[""Luke Skywalker projecting a force ghost across the galaxy to fuck with Kylo Ren"" is the level ..."
7rb3uy,star wars: the last jedi (thread vol. 2),"[Side-quest literally started as a videogame cutscene. \n\nMaz in a ridiculous situation, hologr..."


In [16]:
# Merging the threads: append the comments of 7rb3uy to 7jwxnd, then drop
# the 7rb3uy row. `.at` writes the cell directly instead of assigning through
# a chained `.loc[row]['comments']` lookup.

df_movies_comments.at['7jwxnd', 'comments'] = (
    df_movies_comments.at['7jwxnd', 'comments']
    + df_movies_comments.at['7rb3uy', 'comments']
)

df_movies_comments = df_movies_comments.drop(index=['7rb3uy'])

In [17]:
# avengers: infinity war -- 8f84h0, 8gvr6n

In [18]:
# Preview both threads before merging.
df_movies_comments.loc[['8f84h0', '8gvr6n'], :]

Unnamed: 0,title,comments
8f84h0,avengers: infinity war,[So I guess Avengers is doing pretty well huh? As you can probably tell this is one of our most ...
8gvr6n,avengers: infinity war (thread vol. 2),[This is the the second thread due to high volume of the first thread. You can [see the initial ...


In [19]:
# Removing administrative comments: drop the first entry of each thread's
# comment list.
# `.at` is used because the chained form `df.loc[row]['comments'] = ...`
# assigns into a temporary Series and is not guaranteed to update the frame
# (it never does under pandas Copy-on-Write).

df_movies_comments.at['8f84h0', 'comments'] = \
    df_movies_comments.at['8f84h0', 'comments'][1:]

df_movies_comments.at['8gvr6n', 'comments'] = \
    df_movies_comments.at['8gvr6n', 'comments'][1:]

In [20]:
# Merging the threads: append the comments of 8gvr6n to 8f84h0, then drop
# the 8gvr6n row. `.at` writes the cell directly instead of assigning through
# a chained `.loc[row]['comments']` lookup.

df_movies_comments.at['8f84h0', 'comments'] = (
    df_movies_comments.at['8f84h0', 'comments']
    + df_movies_comments.at['8gvr6n', 'comments']
)

df_movies_comments = df_movies_comments.drop(index=['8gvr6n'])

In [21]:
# jeen-yuhs -- sv7agc, t7pqlt

# This is more of a three-part docuseries than a movie. 
# Plus, Part 2 is missing from the dataset. 
# Better to drop it. Sorry, Kanye fans.

In [22]:
# Remove both jeen-yuhs threads from the collection.
df_movies_comments = df_movies_comments.drop(index=['sv7agc', 't7pqlt'])

In [23]:
# avengers: endgame -- bh8iei, bk33kl

In [24]:
# Preview both threads before merging.
df_movies_comments.loc[['bh8iei', 'bk33kl'], :]

Unnamed: 0,title,comments
bh8iei,avengers: endgame,"[We are officially posting the new official discussion thread on **May 2nd, 21:00 (-5:00 GMT)**,..."
bk33kl,avengers: endgame (2nd thread),"[""I can do this all day.""\n\n""Yeah, ugh, I know.""\n\nMy favorite part about Cap vs Cap is that C..."


In [25]:
# Removing administrative comments: drop the first entry of the bh8iei
# comment list.
# `.at` is used because the chained form `df.loc[row]['comments'] = ...`
# assigns into a temporary Series and is not guaranteed to update the frame.

df_movies_comments.at['bh8iei', 'comments'] = \
    df_movies_comments.at['bh8iei', 'comments'][1:]

In [26]:
# Merging the threads: append the comments of bk33kl to bh8iei, then drop
# the bk33kl row. `.at` writes the cell directly instead of assigning through
# a chained `.loc[row]['comments']` lookup.

df_movies_comments.at['bh8iei', 'comments'] = (
    df_movies_comments.at['bh8iei', 'comments']
    + df_movies_comments.at['bk33kl', 'comments']
)

df_movies_comments = df_movies_comments.drop(index=['bk33kl'])

In [27]:
# they shall not grow old -- 9x244z, a5yk8k

In [28]:
# Preview both threads before merging.
df_movies_comments.loc[['9x244z', 'a5yk8k'], :]

Unnamed: 0,title,comments
9x244z,they shall not grow old (uk release),"[""Everyone says when you are faced with death you see your past, but when you're 19 you don't ha..."
a5yk8k,they shall not grow old,[This discussion is for the US release of the film. [You can see the international release threa...


In [29]:
# Removing administrative comments: drop the first entry of the a5yk8k
# comment list.
# `.at` is used because the chained form `df.loc[row]['comments'] = ...`
# assigns into a temporary Series and is not guaranteed to update the frame.
df_movies_comments.at['a5yk8k', 'comments'] = \
    df_movies_comments.at['a5yk8k', 'comments'][1:]

In [30]:
# Merging the threads: append the comments of 9x244z to a5yk8k, then drop
# the 9x244z row. `.at` writes the cell directly instead of assigning through
# a chained `.loc[row]['comments']` lookup.

df_movies_comments.at['a5yk8k', 'comments'] = (
    df_movies_comments.at['a5yk8k', 'comments']
    + df_movies_comments.at['9x244z', 'comments']
)

df_movies_comments = df_movies_comments.drop(index=['9x244z'])

In [31]:
# - thor: ragnarok -- 7agfes, 78ivjl

In [32]:
# Preview both threads before merging.
df_movies_comments.loc[['7agfes', '78ivjl'], :]

Unnamed: 0,title,comments
7agfes,thor: ragnarok,[[Click here to go to international thread](https://www.reddit.com/r/movies/comments/78ivjl/offi...
78ivjl,thor: rangarok,"[Honestly, the Marvel movies have recently been trying a little too hard to be funny as of late...."


In [33]:
# Removing administrative comments: drop the first entry of the 7agfes
# comment list.
# `.at` is used because the chained form `df.loc[row]['comments'] = ...`
# assigns into a temporary Series and is not guaranteed to update the frame.
df_movies_comments.at['7agfes', 'comments'] = \
    df_movies_comments.at['7agfes', 'comments'][1:]

In [34]:
# Merging the threads: append the comments of 78ivjl to 7agfes, then drop
# the 78ivjl row. `.at` writes the cell directly instead of assigning through
# a chained `.loc[row]['comments']` lookup.

df_movies_comments.at['7agfes', 'comments'] = (
    df_movies_comments.at['7agfes', 'comments']
    + df_movies_comments.at['78ivjl', 'comments']
)

df_movies_comments = df_movies_comments.drop(index=['78ivjl'])

In [35]:
# Number of comments now stored for the merged 7agfes thread.
len(df_movies_comments.at['7agfes', 'comments'])

199

In [36]:
# Correct spelling: make sure the surviving thread carries the right title.
# In the preview, the misspelled title ('thor: rangarok') sits on 78ivjl,
# the row dropped by the merge, so this mainly acts as a safeguard.
# `.at` sets the cell directly; the chained `df.loc[row]['title'] = ...`
# writes to a temporary Series.

df_movies_comments.at['7agfes', 'title'] = 'thor: ragnarok'

In [37]:
############################
######################################################

# Supposedly all movies have been merged now

In [38]:
# Exploding the comments: one row per comment instead of one row per thread.
# The thread id moves from the index into a regular column.

df = df_movies_comments.explode(column='comments')
df = df.reset_index(drop=False)

In [39]:
# Name the three columns positionally: thread id, movie title, comment text.
column_names = ['id', 'title', 'comments']
df.columns = column_names

In [40]:
########################

In [41]:
# TODO: redo everything below here with wider criteria for what counts as a duplicate comment.

In [42]:
# Check comments for duplicates.
# Any comment whose first 50 characters occur more than once is flagged for
# manual review. A repeat could be spam, a deleted/removed placeholder, or
# the same commenter posting in two threads about the same movie.

comment_prefixes = df['comments'].str[:50]
comment_counts = comment_prefixes.value_counts()
maybe_duplicate_comments = comment_counts[comment_counts > 1].index.tolist()
len(maybe_duplicate_comments)

65

In [43]:
# Review the comment prefixes in maybe_duplicate_comments.
# Decide which are actual duplicates, which could be coincidental,
# and which should be kept in only one thread (same movie).

In [44]:
# Number each flagged prefix so it can be referred to by position.
[(position, prefix) for position, prefix in enumerate(maybe_duplicate_comments)]

[(0, '[deleted]'),
 (1, '[removed]'),
 (2, 'I liked it.'),
 (3, 'I liked it'),
 (4, "Heads up there's some new features on YouPoll!\n\nNo"),
 (5, 'This movie was better than it had any right to be.'),
 (6, 'I enjoyed it.'),
 (7, '\n\nInterdum et malesuada fames ac ante ipsum primis'),
 (8, 'What a boring movie'),
 (9, 'Loved it!'),
 (10, 'This is one of those films where any of the three '),
 (11, 'I cried tears of joy and passed out multiple times'),
 (12, 'My favorite part of this movie is how utterly abso'),
 (13, 'Terrible'),
 (14, 'I was pleasantly surprised by how good this movie '),
 (15, "Fuck it I'll just say it, Jeff Daniels was so fuck"),
 (16, 'No way any teenage girl would just leave her phone'),
 (17, 'Eh.'),
 (18, 'I might be going against the grain here, but I lef'),
 (19, 'I enjoyed this movie more than I thought I would. '),
 (20, 'Wow'),
 (21, 'Even in the UK the marketing tried to hide the fac'),
 (22, 'Hey all! So the creator of YouPoll who created our'),
 (23, 'Of

In [45]:
# Manual review, since there are only 65.
# Each number is a position in the enumerated `maybe_duplicate_comments`
# listing; together the three lists cover positions 0 through 64.

# Prefixes for which every matching comment is removed.
drop_all = [
    0, 1, 4, 7,
    22, 23,
    43, 46,
    56,
    60, 62,
]

# Prefixes for which only the first matching comment is kept.
keep_one = [
    10, 12, 15, 16, 18,
    21, 25, 26, 27,
    31, 32, 33, 35, 38, 39,
    44, 47,
    50, 53, 54, 57, 58,
]

# Prefixes left untouched.
do_nothing = [
    2, 3, 5, 6, 8, 9,
    11, 13, 14, 17, 19,
    20, 24, 28, 29,
    30, 34, 36, 37,
    40, 41, 42, 45, 48, 49,
    51, 52, 55, 59,
    61, 63, 64,
]

In [46]:
# checking = 19
# print(checking)

# df[
#     df['comments'].apply(lambda x: x[:50])\
#          == maybe_duplicate_comments[checking]
#     ]

In [47]:
# Collect the row labels to delete, based on the manual review lists:
#   - position in `drop_all`: every row with that 50-character prefix
#   - position in `keep_one`: every such row except the first
#   - anything else: nothing is dropped
# The prefix Series does not change inside the loop, so it is computed once
# instead of being rebuilt over the whole frame for every flagged prefix.
comment_prefixes = df['comments'].str[:50]

i_to_drop = []

for i, comment in enumerate(maybe_duplicate_comments):
    matching_rows = comment_prefixes[comment_prefixes == comment].index.tolist()
    if i in drop_all:
        i_to_drop.extend(matching_rows)
    elif i in keep_one:
        i_to_drop.extend(matching_rows[1:])

In [48]:
# Remove the rows collected in `i_to_drop`.
df = df.drop(i_to_drop, axis='index')

In [49]:
############################################

In [50]:
##################
# Checking for other administrative comments
# They are usually the first comment

In [51]:
# Keep only the first remaining row of each thread id.
df_only_first_comment = df[~df['id'].duplicated(keep='first')]

In [52]:
# Checking for common phrases that would be in an administrative comment.
# All phrases are matched case-insensitively except 'AMA', which is matched
# as written.
# The FYI check previously compared the uppercase literal 'FYI' against
# lowercased text and so could never match; it now looks for 'fyi'.
# NOTE(review): this can widen the match, so re-check the rows shown below
# before dropping anything based on this mask.

first_comment_text = df_only_first_comment['comments']
first_comment_lower = first_comment_text.str.lower()

mask = (
    first_comment_lower.str.contains('heads up', regex=False)
    | first_comment_lower.str.contains('fyi', regex=False)
    | first_comment_lower.str.contains('r/movies', regex=False)
    | first_comment_lower.str.contains('pinned', regex=False)
    | first_comment_text.str.contains('AMA', regex=False)
)

df_only_first_comment[mask]

Unnamed: 0,id,title,comments
229,vzcuye,paws of fury: the legend of hank,"So this movie starts, right? And the setup has me like, oh this must be a rip-off of Blazing Sad..."
1485,uynerm,top gun: maverick,"As it's unpinned, don't forget to give the [Bruckheimer AMA](https://www.reddit.com/r/movies/com..."
2296,u93jj5,the northman,[Robert Eggers and two historians who worked on the movie did a Live Talk with us this morning. ...
2469,tysxuy,"everything, everywhere, all at once","[Just FYI, Ke Huy Quan graced us with an AMA for this movie today.](https://www.reddit.com/r/mov..."
3995,t68wa3,the batman,"/r/movies discusses *The Batman* on reddit talk, hosted by /u/LiteraryBoner! https://www.reddit...."
21625,imqeot,mulan (2020),"that 40k post about Mulan that we removed:\n\n[After seeing Mulan 2020, I'm noticing a pattern r..."
41702,a6qh98,aquaman,This thread is for the US release of the film. [You can go here to see the original internationa...
42424,a4rs7c,mortal engines,This is for the US release of the film. [You can see the international release thread here.](htt...
46711,9bplvq,searching,Heads up Aneesh Chaganty & Sev Ohanian had an AMA on our subreddit and Sev is still answering qu...
48619,8yf14q,sorry to bother you,[Be sure to check out the AMA with the writer/director here](https://www.reddit.com/r/movies/com...


In [53]:
# Only the first one is an actual comment about a movie. That was some good search criteria.

In [54]:
# Drop the flagged first comments, keeping the one that manual review found
# to be a genuine comment (thread vzcuye, the first flagged row).
# The kept row is named by thread id rather than by position (`index[1:]`),
# so the result does not depend on the order of the flagged rows.
reviewed_genuine_ids = ['vzcuye']

flagged_first_comments = df_only_first_comment[mask]
admin_first_comments = flagged_first_comments[
    ~flagged_first_comments['id'].isin(reviewed_genuine_ids)
]

df = df.drop(index=admin_first_comments.index)

In [55]:
# Final look at the cleaned frame: one row per comment, with thread id and title.
df

Unnamed: 0,id,title,comments
0,vzcwal,the princess,Joey King needs a new agent. She’s proven she has talent but she has so many terrible films on h...
1,vzcwal,the princess,"Silly, but entertaining and non stop action"
2,vzcwal,the princess,"The yassification of The Raid\n\nActually, this was fun enough and mad respect to Joey King for ..."
3,vzcwal,the princess,"Honestly, this was pretty fun. The plot is nothing special yes.\n\nBut Joey King was actually e..."
4,vzcwal,the princess,"Man, I loved this movie. Yeah, it was campy, but whatever. The premise worked for me, I liked th..."
...,...,...,...
73509,47szbr,"crouching tiger, hidden dragon: sword of destiny",I was entertained bcuz i love Kung Fu Movies but I agree this was pretty bad.
73510,47szbr,"crouching tiger, hidden dragon: sword of destiny",Is the original on netflix?
73511,47szbr,"crouching tiger, hidden dragon: sword of destiny","wait, it came out?"
73512,47szbr,"crouching tiger, hidden dragon: sword of destiny",A sequel to a phenomal epic movie with 10 years of planning... https://m.youtube.com/watch?v=CrH...


In [56]:
# Save the cleaned comments, keeping the row index as the first CSV column.
df.to_csv(path_or_buf="../data/data.csv", index=True)

In [57]:
# Round-trip check: reading the file back should reproduce `df` exactly.
reloaded = pd.read_csv("../data/data.csv", index_col=0)
reloaded.equals(df)

True