In [1]:
import numpy as np
import pandas as pd

import re
import string
from datetime import datetime

import praw

import json

pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 100
seed = 55
np.random.seed(seed)

In [2]:
# Reading in list of movies

movies_cleaned_df = pd.read_csv('../data/movies_cleaned.csv', index_col='id')

In [3]:
# Reading in the comments I collected

with open('../data/movies_comments.json') as f:
    movies_comments_dict = json.load(f)

In [4]:
# # Restructuring the comments collection

# for id in movies_comments_dict.keys():
#     movies_comments_dict[id] = {
#         'title': movies_cleaned_df.loc[id]['title'],
#         'comments': movies_comments_dict[id]
#     }

In [5]:
# Making the comments collection a DataFrame

df_movies_comments = pd.DataFrame(movies_comments_dict).T

In [6]:
df_movies_comments

Unnamed: 0,title,comments,post_date_utc
vzcwal,the princess,[Joey King needs a new agent. She’s proven she has talent but she has so many terrible films on ...,1657850947.0
vzcw0a,the man from toronto,[ O offence to Woody but I feel like the original casting of Jason Statham would have at least i...,1657850925.0
vzcvsd,the sea beast,[Absolutely crazy that Netflix dropped this and also The Mitchells Vs The Machines with almost n...,1657850907.0
vzcvkz,mrs. harris goes to paris,"[This was so cute it just made me smile the whole time. Highly recommend., The only word for th...",1657850890.0
vzcv66,where the crawdads sing,[I did enjoy her house representing the 2 different ways the men treated her . Tate was invited ...,1657850854.0
...,...,...,...
49wvnj,10 cloverfield lane,"[I kept going back and forth: ""Is he crazy? No, he's right. No he's crazy. Holy shit he's bot...",1457666387.0
48vhsf,london has fallen,"[If you go into this movie expecting good action, Gerard Butler being a badass, and a terrible p...",1457060665.0
48vhmk,whiskey tango foxtrot,"[I personally enjoyed it. Not the best, not the worst, but I liked it. I liked the way they hand...",1457060598.0
48vhc8,zootopia,"[""Hold on, Walter and Jesse are at the door.""\n\nI thought that scene felt similar to breaking b...",1457060484.0


In [7]:
# Combining international and domestic releases (same title; different discussion thread)

In [8]:
df_movies_comments[df_movies_comments['title'].duplicated(keep=False)]

Unnamed: 0,title,comments,post_date_utc
aq72ni,how to train your dragon: the hidden world,[To go to the international thread for the film go here: [https://redd.it/am2yeh](https://redd.i...,1550804401.0
am2yeh,how to train your dragon: the hidden world,"[I agree that this is the weakest entry for the series but that does not mean it is bad at all, ...",1549033910.0
a6qh98,aquaman,[This thread is for the US release of the film. [You can go here to see the original internation...,1545361207.0
a4rs7c,mortal engines,[This is for the US release of the film. [You can see the international release thread here.](ht...,1544756423.0
a4qfoa,aquaman,"[[removed], This movie was really fond of the ""tender moment suddenly interrupted by big explosi...",1544648421.0
a40xfn,mortal engines,"[>Andy Serkis as London\n\nI.. what? , [deleted], The movie was just a chore to sit through for ...",1544196791.0
8sxn3q,jurassic world: fallen kingdom,[This is the thread for the US release. To see [the international thread click here.](https://ww...,1529633205.0
8pa372,jurassic world: fallen kingdom,[Good to know that the trope of the smart guy saying something sciency and then another dude say...,1528376296.0
833jvx,the death of stalin,"[I love how they didn’t even attempt accents, gave it a much more real feeling, Zhukov stole eve...",1520564945.0
7eoph7,coco,"[**Official Pre-Film Short Discussion: Olaf's Frozen Adventure** \n\nDirected by: Kevin Deters, ...",1511330460.0


In [9]:
multi_discussion_movies = \
    df_movies_comments[df_movies_comments['title'].duplicated()]['title'].tolist()

In [10]:
multi_discussion_index = \
    df_movies_comments[df_movies_comments['title'].duplicated(keep=False)].index.tolist()

In [11]:
df_movies_comments[df_movies_comments['title'].isin(multi_discussion_movies)]['title'].value_counts()

how to train your dragon: the hidden world    2
aquaman                                       2
mortal engines                                2
jurassic world: fallen kingdom                2
the death of stalin                           2
coco                                          2
american made                                 2
alien: covenant                               2
guardians of the galaxy vol. 2                2
t2 trainspotting                              2
rogue one: a star wars story                  2
Name: title, dtype: int64

In [12]:
for title in multi_discussion_movies:
    duo = df_movies_comments[df_movies_comments['title'] == title]
    discussion_1 = duo.index[0]
    discussion_2 = duo.index[1]
    df_movies_comments.loc[discussion_1]['comments'] +=\
        df_movies_comments.loc[discussion_2]['comments']
    df_movies_comments = df_movies_comments.drop(index=discussion_2)

In [13]:
# Double checking if there are more duplicates

df_movies_comments\
    [(df_movies_comments['title']\
        .apply(lambda x: x.replace("the ", "") if x.startswith("the ") else x))\
            .apply(lambda x: x[:8]).duplicated(keep=False)]

Unnamed: 0,title,comments,post_date_utc
v8wun8,jurassic world dominion,"[My favourite dino was Salad Fingers., I liked how Malcolm's line about ""you exploited people's ...",1654826563.0
utjdh7,downton abbey: a new era,[this was like snuggling up with a cozy blanket. loved it. i think it was better than the first ...,1653014013.0
ujcuuw,doctor strange in the multiverse of madness,"[America: I can’t control my power\n\nDoctor strange: yeah you can\n\nAmerica: true, They really...",1651802362.0
tysxfs,sonic the hedgehog 2,"[My theater went absolutely nuts at the post credit sequence, With phenomenal cosmic powers, I'l...",1649383955.0
tvh87q,apollo 10½: a space age childhood,"[The marketing for this must have sucked.\n\nI'm usually up to date on movie releases, even no b...",1649013443.0
tnbhdb,the lost city,"[I love the comedy trope of casually killing a random henchman, then reeling with horror at the ...",1648173775.0
t7pqlt,jeen-yuhs: a kanye trilogy part 3,[The saddest part was Kanye's friend realizing that Kanye is losing his grip on reality and reas...,1646536518.0
sv7agc,jeen-yuhs: a kanye trilogy part 1,"[If you’re a Kanye fan, or a just a hip hop fan in general, this is a must watch. It’s incredibl...",1645153666.0
s4a86d,hotel transylvania: transformania,[At one point the movie stops for like two minutes so all the characters can do the Cha-Cha Slid...,1642215928.0
qjg3x9,army of thieves,"[ I would really like to see a sequel. Perhaps a heist during the zombie apocalypse., I guess I'...",1635647029.0


In [14]:
# Still need to merge these movies:

# - star wars: the last jedi -- 7jwxnd, 7rb3uy
# - avengers: infinity war -- 8f84h0, 8gvr6n
# - jeen-yuhs -- sv7agc, t7pqlt
# - avengers: endgame -- bh8iei, bk33kl
# - they shall not grow old -- 9x244z, a5yk8k
# - thor: ragnarok -- 7agfes, 78ivjl

In [15]:
# star wars: the last jedi -- 7jwxnd, 7rb3uy

In [16]:
df_movies_comments.loc[['7jwxnd', '7rb3uy']]

Unnamed: 0,title,comments,post_date_utc
7jwxnd,star wars: episode viii – the last jedi,"[""Luke Skywalker projecting a force ghost across the galaxy to fuck with Kylo Ren"" is the level ...",1513306809.0
7rb3uy,star wars: the last jedi (thread vol. 2),"[Side-quest literally started as a videogame cutscene. \n\nMaz in a ridiculous situation, hologr...",1516294928.0


In [17]:
# Merging the threads

df_movies_comments.loc['7jwxnd']['comments']\
     += df_movies_comments.loc['7rb3uy']['comments']

df_movies_comments = df_movies_comments.drop(index=['7rb3uy'])

In [18]:
# avengers: infinity war -- 8f84h0, 8gvr6n

In [19]:
df_movies_comments.loc[['8f84h0', '8gvr6n']]

Unnamed: 0,title,comments,post_date_utc
8f84h0,avengers: infinity war,[So I guess Avengers is doing pretty well huh? As you can probably tell this is one of our most ...,1524794408.0
8gvr6n,avengers: infinity war (thread vol. 2),[This is the the second thread due to high volume of the first thread. You can [see the initial ...,1525399266.0


In [20]:
# Removing administrative comments

df_movies_comments.loc['8f84h0']['comments']\
    = df_movies_comments.loc['8f84h0']['comments'][1:]

df_movies_comments.loc['8gvr6n']['comments']\
    = df_movies_comments.loc['8gvr6n']['comments'][1:]

In [21]:
# Merging the threads

df_movies_comments.loc['8f84h0']['comments']\
     += df_movies_comments.loc['8gvr6n']['comments']
     
df_movies_comments = df_movies_comments.drop(index=['8gvr6n'])

In [22]:
# jeen-yuhs -- sv7agc, t7pqlt

# This is more of a three-part docuseries than a movie. 
# Plus, Part 2 is missing from the dataset. 
# Better to drop it. Sorry, Kanye fans.

In [23]:
df_movies_comments = df_movies_comments.drop(['sv7agc', 't7pqlt'])

In [24]:
# avengers: endgame -- bh8iei, bk33kl

In [25]:
df_movies_comments.loc[['bh8iei', 'bk33kl']]

Unnamed: 0,title,comments,post_date_utc
bh8iei,avengers: endgame,"[We are officially posting the new official discussion thread on **May 2nd, 21:00 (-5:00 GMT)**,...",1556247619.0
bk33kl,avengers: endgame (2nd thread),"[""I can do this all day.""\n\n""Yeah, ugh, I know.""\n\nMy favorite part about Cap vs Cap is that C...",1556848803.0


In [26]:
# Removing administrative comments

df_movies_comments.loc['bh8iei']['comments']\
    = df_movies_comments.loc['bh8iei']['comments'][1:]

In [27]:
# Merging the threads

df_movies_comments.loc['bh8iei']['comments'] +=\
    df_movies_comments.loc['bk33kl']['comments']

df_movies_comments = df_movies_comments.drop(index=['bk33kl'])

In [28]:
# they shall not grow old -- 9x244z, a5yk8k

In [29]:
df_movies_comments.loc[['9x244z', 'a5yk8k']]

Unnamed: 0,title,comments,post_date_utc
9x244z,they shall not grow old (uk release),"[""Everyone says when you are faced with death you see your past, but when you're 19 you don't ha...",1542337222.0
a5yk8k,they shall not grow old,[This discussion is for the US release of the film. [You can see the international release threa...,1545102023.0


In [30]:
# Removing administrative comments
df_movies_comments.loc['a5yk8k']['comments']\
    = df_movies_comments.loc['a5yk8k']['comments'][1:]

In [31]:
# Merging the threads

df_movies_comments.loc['a5yk8k']['comments'] +=\
    df_movies_comments.loc['9x244z']['comments']

df_movies_comments = df_movies_comments.drop(index=['9x244z'])

In [32]:
# - thor: ragnarok -- 7agfes, 78ivjl

In [33]:
df_movies_comments.loc[['7agfes', '78ivjl']]

Unnamed: 0,title,comments,post_date_utc
7agfes,thor: ragnarok,[[Click here to go to international thread](https://www.reddit.com/r/movies/comments/78ivjl/offi...,1509674697.0
78ivjl,thor: rangarok,"[Honestly, the Marvel movies have recently been trying a little too hard to be funny as of late....",1508879521.0


In [34]:
# Removing administrative comments
df_movies_comments.loc['7agfes']['comments']\
    = df_movies_comments.loc['7agfes']['comments'][1:]

In [35]:
# Merging the threads

df_movies_comments.loc['7agfes']['comments'] +=\
    df_movies_comments.loc['78ivjl']['comments']

df_movies_comments = df_movies_comments.drop(index=['78ivjl'])

In [36]:
# correct spelling

df_movies_comments.loc['7agfes']['title'] = 'thor: ragnarok'

In [37]:
# correcting one more movie title:
df_movies_comments.loc['ncomky']

title                                                                             army of the dead (theater release)
comments         [I just want to say every plot point involving Dave Bautista's daughter was awful. Other than th...
post_date_utc                                                                                           1621044683.0
Name: ncomky, dtype: object

In [38]:
df_movies_comments.loc['ncomky']['title'] = 'army of the dead'

In [39]:
############################
######################################################

# Supposedly all movies have been merged now

In [40]:
# Exploding the comments

df = df_movies_comments.explode('comments').reset_index()

In [41]:
df

Unnamed: 0,index,title,comments,post_date_utc
0,vzcwal,the princess,Joey King needs a new agent. She’s proven she has talent but she has so many terrible films on h...,1657850947.0
1,vzcwal,the princess,"Silly, but entertaining and non stop action",1657850947.0
2,vzcwal,the princess,"The yassification of The Raid\n\nActually, this was fun enough and mad respect to Joey King for ...",1657850947.0
3,vzcwal,the princess,"Honestly, this was pretty fun. The plot is nothing special yes.\n\nBut Joey King was actually e...",1657850947.0
4,vzcwal,the princess,"Man, I loved this movie. Yeah, it was campy, but whatever. The premise worked for me, I liked th...",1657850947.0
...,...,...,...,...
73531,47szbr,"crouching tiger, hidden dragon: sword of destiny",I was entertained bcuz i love Kung Fu Movies but I agree this was pretty bad.,1456541822.0
73532,47szbr,"crouching tiger, hidden dragon: sword of destiny",Is the original on netflix?,1456541822.0
73533,47szbr,"crouching tiger, hidden dragon: sword of destiny","wait, it came out?",1456541822.0
73534,47szbr,"crouching tiger, hidden dragon: sword of destiny",A sequel to a phenomal epic movie with 10 years of planning... https://m.youtube.com/watch?v=CrH...,1456541822.0


In [42]:
df.columns = ['id', 'title', 'comments', 'post_date_utc']

In [43]:
########################

In [44]:
# Check comments for duplicates
# Manually review every comment where the first 50 characters are duplicated.
# Could be: spam, deleted, removed
# Also could be same commenter in two threads about the same movie

comment_counts = df['comments'].apply(lambda x: x[:50]).value_counts()
maybe_duplicate_comments = comment_counts[comment_counts > 1].keys()
maybe_duplicate_comments = list(maybe_duplicate_comments)
len(maybe_duplicate_comments)

65

In [45]:
# Review the comments in duplicate_comment_counts. 
# Decide which are actually duplicates, which could be coincidental, 
# and which should only be kept in one thread (same movie)

In [46]:
list(enumerate(maybe_duplicate_comments))

[(0, '[deleted]'),
 (1, '[removed]'),
 (2, 'I liked it.'),
 (3, 'I liked it'),
 (4, "Heads up there's some new features on YouPoll!\n\nNo"),
 (5, 'This movie was better than it had any right to be.'),
 (6, '\n\nInterdum et malesuada fames ac ante ipsum primis'),
 (7, 'I enjoyed it.'),
 (8, 'What a boring movie'),
 (9, '“Some people, they will never accept him, but some'),
 (10, "Fuck it I'll just say it, Jeff Daniels was so fuck"),
 (11, 'This movie was better than I thought it would be. '),
 (12, "#Reminder if your comment clearly shows you haven'"),
 (13, 'I cried tears of joy and passed out multiple times'),
 (14, 'Terrible'),
 (15, 'I enjoyed this movie more than I thought I would. '),
 (16, 'This movie was way better than it had any right to'),
 (17, 'Loved it!'),
 (18, 'Lmao I didn’t watch any trailers or look at the po'),
 (19, 'This is one of those films where any of the three '),
 (20, 'Boring'),
 (21, 'I might be going against the grain here, but I lef'),
 (22, "If the goal w

In [47]:
# Manual review, since there are only 65

drop_all = [
    0, 1, 4, 7, 22, 23, 43,
    46, 56, 60, 62
    ]

keep_one = [
    10, 12, 15, 16, 18,
    21, 25, 26, 27, 31, 32,
    33, 35, 38, 39, 44, 47,
    50, 53, 54, 57, 58
    ]

do_nothing = [
    2, 3, 5, 6, 8, 9, 11,
    13, 14, 17, 19, 20, 24,
    28, 29, 30, 34, 36, 37,
    40, 41, 42, 45, 48, 49,
    51, 52, 55, 59, 61, 63,
    64
    ]

In [48]:
# Manual review, since there are only 65

drop_all = [
    0, 1, 4, 6, 12, 26, 28, 40, 54, 63
    ]

keep_one = [
    10, 18, 19, 21, 22, 23, 27, 31, 32, 36, 45, 49, 50, 52,
    53, 55, 59, 60, 62, 64
    ]

do_nothing = [
    2, 3, 5, 7, 8, 9, 11, 13, 14, 15, 16, 17, 20, 24, 25, 29,
    30, 33, 34, 35, 37, 38, 39, 41, 42, 43, 44, 46, 47, 48,
    51, 56, 57, 58, 61, 
    ]

In [49]:
# checking += 1
# print(checking)

# df[
#     df['comments'].apply(lambda x: x[:50])\
#          == maybe_duplicate_comments[checking]
#     ]

In [50]:
i_to_drop = []

for i, comment in enumerate(maybe_duplicate_comments):
    if i in drop_all:
        i_to_drop.extend(df[df['comments'].apply(lambda x: x[:50]) == comment].index.tolist())
    elif i in keep_one: 
        i_to_drop.extend(df[df['comments'].apply(lambda x: x[:50]) == comment].index.tolist()[1:])

In [51]:
df = df.drop(index=i_to_drop)

In [52]:
##################
# Checking for other administrative comments
# They are usually the first comment

In [53]:
df_only_first_comment = df.drop_duplicates(subset='id', keep='first')

In [54]:
# Checking for common phrases that would be in an administrative comment.

mask = (
    df_only_first_comment['comments'].apply(lambda x: 'heads up' in x.lower())\
        | df_only_first_comment['comments'].apply(lambda x: 'FYI' in x.lower())
            | df_only_first_comment['comments'].apply(lambda x: 'r/movies' in x.lower())\
                | df_only_first_comment['comments'].apply(lambda x: 'pinned' in x.lower())
    |                df_only_first_comment['comments'].apply(lambda x: 'AMA' in x)
)

df_only_first_comment[mask]

Unnamed: 0,id,title,comments,post_date_utc
237,vzcuye,paws of fury: the legend of hank,"So this movie starts, right? And the setup has me like, oh this must be a rip-off of Blazing Sad...",1657850837.0
1493,uynerm,top gun: maverick,"As it's unpinned, don't forget to give the [Bruckheimer AMA](https://www.reddit.com/r/movies/com...",1653616846.0
2311,u93jj5,the northman,[Robert Eggers and two historians who worked on the movie did a Live Talk with us this morning. ...,1650593018.0
2484,tysxuy,"everything, everywhere, all at once","[Just FYI, Ke Huy Quan graced us with an AMA for this movie today.](https://www.reddit.com/r/mov...",1649383992.0
4011,t68wa3,the batman,"/r/movies discusses *The Batman* on reddit talk, hosted by /u/LiteraryBoner! https://www.reddit....",1646362789.0
21647,imqeot,mulan (2020),"that 40k post about Mulan that we removed:\n\n[After seeing Mulan 2020, I'm noticing a pattern r...",1599260819.0
41724,a6qh98,aquaman,This thread is for the US release of the film. [You can go here to see the original internationa...,1545361207.0
42446,a4rs7c,mortal engines,This is for the US release of the film. [You can see the international release thread here.](htt...,1544756423.0
46733,9bplvq,searching,Heads up Aneesh Chaganty & Sev Ohanian had an AMA on our subreddit and Sev is still answering qu...,1535680816.0
48641,8yf14q,sorry to bother you,[Be sure to check out the AMA with the writer/director here](https://www.reddit.com/r/movies/com...,1531439745.0


In [55]:
# Only the first one is an actual comment about a movie. That was some good search criteria.

In [56]:
df = df.drop(
    df_only_first_comment[mask].index[1:]
    )

In [57]:
# Adding date columns

In [58]:
df['post_date_utc'] = df['post_date_utc'].astype(int)

In [59]:
df['post_year'] = df['post_date_utc'].apply(lambda x: datetime.fromtimestamp(x).year)
df['post_month'] = df['post_date_utc'].apply(lambda x: datetime.fromtimestamp(x).month)
df['post_day'] = df['post_date_utc'].apply(lambda x: datetime.fromtimestamp(x).day)

In [60]:
df.to_csv("../data/data.csv", index=True)

In [63]:
pd.read_csv("../data/data.csv", index_col=0)

Unnamed: 0,id,title,comments,post_date_utc,post_year,post_month,post_day
0,vzcwal,the princess,Joey King needs a new agent. She’s proven she has talent but she has so many terrible films on h...,1657850947,2022,7,14
1,vzcwal,the princess,"Silly, but entertaining and non stop action",1657850947,2022,7,14
2,vzcwal,the princess,"The yassification of The Raid\n\nActually, this was fun enough and mad respect to Joey King for ...",1657850947,2022,7,14
3,vzcwal,the princess,"Honestly, this was pretty fun. The plot is nothing special yes.\n\nBut Joey King was actually e...",1657850947,2022,7,14
4,vzcwal,the princess,"Man, I loved this movie. Yeah, it was campy, but whatever. The premise worked for me, I liked th...",1657850947,2022,7,14
...,...,...,...,...,...,...,...
73531,47szbr,"crouching tiger, hidden dragon: sword of destiny",I was entertained bcuz i love Kung Fu Movies but I agree this was pretty bad.,1456541822,2016,2,26
73532,47szbr,"crouching tiger, hidden dragon: sword of destiny",Is the original on netflix?,1456541822,2016,2,26
73533,47szbr,"crouching tiger, hidden dragon: sword of destiny","wait, it came out?",1456541822,2016,2,26
73534,47szbr,"crouching tiger, hidden dragon: sword of destiny",A sequel to a phenomal epic movie with 10 years of planning... https://m.youtube.com/watch?v=CrH...,1456541822,2016,2,26
