In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import praw

import re
import string

In [2]:
# Cap the rendered width of each DataFrame column at 75 characters.
pd.set_option("display.max_colwidth", 75)

### Using PRAW library as a wrapper for the Reddit API

In [3]:
# Read the Reddit API credentials from the local files under ../.secret.
# .strip() removes surrounding whitespace such as a trailing newline, which
# would otherwise be passed along as part of the credential.
with open('../.secret/ZSDSFI_client_id.txt') as f:
    client_id = f.read().strip()

with open('../.secret/ZSDSFI_client_secret.txt') as f:
    client_secret = f.read().strip()

In [4]:
# Create the PRAW client from the client id / secret read earlier.
reddit_credentials = {
    "client_id": client_id,
    "client_secret": client_secret,
    "user_agent": "Movie Scraper by ZSDSFI",
}
reddit = praw.Reddit(**reddit_credentials)

I'm using the subreddit *r/discussionarchive* to get a list of **official** movie discussions on *r/movies*.

In [5]:
# Request the newest submissions from r/discussionarchive (limit=1000) and
# keep each one as a (title, url) tuple.
movie_archive = reddit.subreddit("discussionarchive").new(limit=1000)

posts = [(submission.title, submission.url) for submission in movie_archive]

Reddit has a limitation of 1000 posts per subreddit. This shouldn't be an issue for this project. I have access to movie discussions from February 2016 to July 2022. And each discussion itself contains many comments. That's a lot of data.

In [6]:
# Build the frame with its column names in one step. Naming the columns at
# construction also works when `posts` is empty, whereas assigning
# `df.columns` afterwards raises a length-mismatch error on an empty frame.
df = pd.DataFrame(posts, columns=['post_title', 'url'])

In [7]:
# Inspect the scraped post titles and URLs.
df

Unnamed: 0,post_title,url
0,Official Discussion - The Princess [SPOILERS],https://www.reddit.com/r/movies/comments/vzcwal/official_discussion_the...
1,Official Discussion - The Man From Toronto [SPOILERS],https://www.reddit.com/r/movies/comments/vzcw0a/official_discussion_the...
2,Official Discussion - The Sea Beast [SPOILERS],https://www.reddit.com/r/movies/comments/vzcvsd/official_discussion_the...
3,Official Discussion - Mrs. Harris Goes to Paris [SPOILERS],https://www.reddit.com/r/movies/comments/vzcvkz/official_discussion_mrs...
4,Official Discussion - Where the Crawdads Sing [SPOILERS],https://www.reddit.com/r/movies/comments/vzcv66/official_discussion_whe...
...,...,...
995,Official Discussion: Whiskey Tango Foxtrot [SPOILERS],https://www.reddit.com/r/movies/comments/48vhmk/official_discussion_whi...
996,Official Discussion: Zootopia [SPOILERS],https://redd.it/48vhc8
997,Official Oscar Post Game Thread 2016,https://www.reddit.com/r/movies/comments/488mq3/official_oscar_post_gam...
998,Official Oscar Thread 2016,https://www.reddit.com/r/movies/comments/487kb1/official_oscar_thread_2...


In [8]:
# Normalise titles to lower case so later substring checks are case-insensitive.
df['post_title'] = df['post_title'].str.lower()

In [9]:
# Derive each submission's id from its URL using PRAW's Submission.id_from_url.
df['id'] = df['url'].map(praw.models.Submission.id_from_url)

In [10]:
# There are discussions for award shows and end-of-year discussions. We don't want those.

In [11]:
# Preview candidate non-movie threads: titles lacking "discussion", or
# mentioning "best of", "oscars" or "golden globes".
df.loc[
    ~df['post_title'].str.contains('discussion')
    | df['post_title'].str.contains('best of')
    | df['post_title'].str.contains('oscars')
    | df['post_title'].str.contains('golden globes')
]

Unnamed: 0,post_title,url,id
37,studio 666 [spoilers],https://www.reddit.com/r/movies/comments/t0tapr/official_discussion_stu...,t0tapr
42,official discusion - apollo 10½: a space age childhood [spoilers],https://www.reddit.com/r/movies/comments/tvh87q/official_discusion_apol...,tvh87q
410,oscars 2020: official post-game thread,https://www.reddit.com/r/movies/comments/f1l3r5/rmovies_oscars_2020_off...,f1l3r5
411,official oscars thread 2020,https://www.reddit.com/r/movies/comments/f1i94m/official_oscars_thread_...,f1i94m
425,best of 2019 discussion threads,https://www.reddit.com/r/discussionarchive/comments/ei5p69/best_of_2019...,ei5p69
543,live thread - avengers: endgame [spoilers],https://www.reddit.com/r/movies/comments/bgnl7y/live_thread_avengers_en...,bgnl7y
570,/r/movies oscars 2019: official post-game thread,https://www.reddit.com/r/movies/comments/augsew/rmovies_oscars_2019_off...,augsew
571,official oscars thread 2019,https://www.reddit.com/r/movies/comments/auexh3/official_oscars_thread_...,auexh3
597,best of 2018 discussion threads,https://www.reddit.com/r/discussionarchive/comments/ac80cv/best_of_2018...,ac80cv
748,/r/movies official 2018 golden globes post-game thread,https://www.reddit.com/r/movies/comments/7ovyyn/rmovies_official_2018_g...,7ovyyn


In [12]:
# Flag every thread that is not a single-movie discussion: award shows,
# "best of" round-ups, and anything whose title lacks the word "discussion".
not_movie_thread = (
    df['post_title'].str.contains('golden globes')
    | df['post_title'].str.contains('oscars')
    | df['post_title'].str.contains('best of')
    | ~df['post_title'].str.contains('discussion')
)

# Two genuine movie threads are caught by the "no 'discussion'" rule because
# their titles are malformed ("studio 666 [spoilers]" and the misspelt
# "official discusion - apollo 10½ ..."). Keep them by submission id rather
# than by their position in the filtered frame (previously `.iloc[2:]`), since
# positions shift whenever the scraped listing changes.
KEEP_IDS = ['t0tapr', 'tvh87q']

# Index labels of the rows to drop.
mask_out = df[not_movie_thread & ~df['id'].isin(KEEP_IDS)].index

In [13]:
# Remove the flagged non-movie rows by index label.
df = df.drop(index=mask_out)

In [14]:
# Any duplicates?

In [15]:
# Show every row that is part of a fully duplicated group (all copies included).
df.loc[df.duplicated(keep=False)]

Unnamed: 0,post_title,url,id
367,official discussion - a whisker away [spoilers],https://www.reddit.com/r/movies/comments/hgwqpd/official_discussion_a_w...,hgwqpd
368,official discussion - a whisker away [spoilers],https://www.reddit.com/r/movies/comments/hgwqpd/official_discussion_a_w...,hgwqpd


In [16]:
# Keep only the first occurrence of each duplicated row
# (keep='first' is drop_duplicates' default).
df = df.drop_duplicates()

### Using regex to get movie names from each post.
This will take a bit of grunt work because r/movies doesn't format each of its discussions the exact same way. Thankfully, there is enough of a pattern that I can still do this in a few steps.

In [17]:
# Look at the lower-cased post titles before extracting the movie names.
df['post_title']

0                                   official discussion - the princess [spoilers]
1                           official discussion - the man from toronto [spoilers]
2                                  official discussion - the sea beast [spoilers]
3                      official discussion - mrs. harris goes to paris [spoilers]
4                        official discussion - where the crawdads sing [spoilers]
                                          ...                                    
992                           official discussion: 10 cloverfield lane [spoilers]
994                             official discussion: london has fallen [spoilers]
995                         official discussion: whiskey tango foxtrot [spoilers]
996                                      official discussion: zootopia [spoilers]
999    official discussion - crouching tiger, hidden dragon: sword of destiny ...
Name: post_title, Length: 979, dtype: object

In [18]:
# Work on a separate copy of the post titles, named 'title'.
title_Series = df['post_title'].copy()
title_Series.name = 'title'

In [19]:
# Strip the standard "official discussion: " / "official discussion - " prefix
# and the " [spoilers]" suffix. The pattern is a raw string so that "\[" reaches
# the regex engine as an escaped bracket instead of being an invalid string
# escape sequence.
boilerplate = re.compile(r"(official discussion(: | - )| \[spoilers])")
title_Series = title_Series.apply(lambda x: boilerplate.sub("", x))

In [20]:
# List titles that still contain leftover boilerplate keywords.
title_Series.loc[
    title_Series.str.contains('international')
    | title_Series.str.contains('discussion')
    | title_Series.str.contains('spoilers')
]

139                                                  dune (international release)
472                                                       jesus is king [spoilers
483                                                              joker (spoilers)
520                                                   men in black: international
583            how to train your dragon: the hidden world (international release)
614                                                aquaman (international thread)
617                                         mortal engines (international thread)
662                                                     a simple favor (spoilers)
663                                                       the predator (spoilers)
681                                               ant man and the wasp (spoilers)
690                                                    the first purge (spoilers)
700                         jurassic world: fallen kingdom (international thread)
775             

* International releases should be treated as normal.
* "Spoilers" is sometimes written in different formats.

In [21]:
# Remove the alternative spoiler / international suffixes, one literal at a
# time and in the same order as before.
for suffix in (
    " (spoilers)",
    " [spoilers",
    " (international release)",
    " (international thread)",
):
    title_Series = title_Series.str.replace(suffix, "", regex=False)

In [22]:
# Re-check which titles still contain leftover boilerplate keywords.
title_Series.loc[
    title_Series.str.contains('international')
    | title_Series.str.contains('discussion')
    | title_Series.str.contains('spoilers')
]

520                                                   men in black: international
894              official international discussion - rogue one: a star wars story
900    reminder: remember to sort by "new" in order to see the discussions lis...
912                     official international release discussion: doctor strange
966                   official international release discussion thread - warcraft
970          official international release discussion thread - x-men: apocalypse
977    official international release discussion thread - captain america: civ...
Name: title, dtype: object

In [23]:
# Prefix variants used by the older international discussion threads, each
# followed by ": " or " - ".
pattern = "((official international discussion)|(official international release discussion thread)|(official international release discussion))(: | - )"

# Delete every match of the prefix pattern from each title.
title_Series = title_Series.str.replace(pattern, "", regex=True)

In [24]:
# Final check for titles that still contain boilerplate keywords.
title_Series.loc[
    title_Series.str.contains('international')
    | title_Series.str.contains('discussion')
    | title_Series.str.contains('spoilers')
]

520                                                   men in black: international
900    reminder: remember to sort by "new" in order to see the discussions lis...
Name: title, dtype: object

In [25]:
# Remove row 900

In [26]:
# Attach the cleaned titles as a new 'title' column (aligned on the index).
df = df.assign(title=title_Series)

In [27]:
# Put the cleaned title first, followed by the original title, id and URL.
df = df.loc[:, ['title', 'post_title', 'id', 'url']]

In [28]:
# Drop the row labelled 900 (the "reminder: ..." post found in the check above).
df = df.drop(labels=[900], axis='index')

In [29]:
# A few more quick fixes

In [30]:
# Titles that still carry a "(us thread)" marker.
df.loc[df['title'].str.contains('us thread')]

Unnamed: 0,title,post_title,id,url
693,jurassic world: fallen kingdom (us thread),official discussion - jurassic world: fallen kingdom (us thread) [spoil...,8sxn3q,https://www.reddit.com/r/movies/comments/8sxn3q/official_discussion_jur...
791,american made (us thread),official discussion: american made (us thread) [spoilers],734sen,https://www.reddit.com/r/movies/comments/734sen/official_discussion_ame...


In [31]:
# Titles that still carry a "(us release)" marker.
df.loc[df['title'].str.contains('us release')]

Unnamed: 0,title,post_title,id,url
343,tenet (us release),official discussion - tenet (us release) [spoilers],im7etj,https://www.reddit.com/r/movies/comments/im7etj/official_discussion_ten...
572,how to train your dragon: the hidden world (us release),official discussion - how to train your dragon: the hidden world (us re...,aq72ni,https://www.reddit.com/r/movies/comments/aq72ni/official_discussion_how...
606,aquaman (us release),official discussion: aquaman (us release) [spoilers],a6qh98,https://www.reddit.com/r/movies/comments/a6qh98/official_discussion_aqu...
608,they shall not grow old (us release),official discussion: they shall not grow old (us release) [spoilers],a5yk8k,https://www.reddit.com/r/movies/comments/a5yk8k/official_discussion_the...
612,mortal engines (us release),official discussion: mortal engines (us release) [spoilers],a4rs7c,https://www.reddit.com/r/movies/comments/a4rs7c/official_discussion_mor...
727,the death of stalin (us release),official discussion: the death of stalin (us release) [spoilers],833jvx,https://www.reddit.com/r/movies/comments/833jvx/official_discussion_the...
767,coco (us release),official discussion: coco (us release) [spoilers],7eoph7,https://www.reddit.com/r/movies/comments/7eoph7/official_discussion_coc...
773,thor: ragnarok (us release),official discussion - thor: ragnarok (us release) [spoilers],7agfes,https://www.reddit.com/r/movies/comments/7agfes/official_discussion_tho...
833,alien: covenant (us release),official discussion - alien: covenant (us release) [spoilers],6c0sby,https://www.reddit.com/r/movies/comments/6c0sby/official_discussion_ali...
844,t2 trainspotting (us release),official discussion - t2 trainspotting (us release) [spoilers],65gg0a,https://www.reddit.com/r/movies/comments/65gg0a/official_discussion_t2_...


In [32]:
# Remove the US-specific suffixes from the cleaned titles.
for us_suffix in (" (us thread)", " (us release)"):
    df['title'] = df['title'].str.replace(us_suffix, "", regex=False)

In [33]:
# Find the post whose title misspells "discussion" as "discusion".
df.loc[df['post_title'].str.contains('discusion')]

Unnamed: 0,title,post_title,id,url
42,official discusion - apollo 10½: a space age childhood,official discusion - apollo 10½: a space age childhood [spoilers],tvh87q,https://www.reddit.com/r/movies/comments/tvh87q/official_discusion_apol...


In [34]:
# Manually fix the title of row 42 (the misspelt "discusion" post).
# A single .loc[row, column] assignment writes to df itself; the chained form
# df['title'].loc[42] = ... assigns through an intermediate Series, which
# triggers SettingWithCopyWarning and does not update df under Copy-on-Write.
df.loc[42, 'title'] = 'apollo 10½: a space age childhood'

In [35]:
# Locate the "american assassin" post, whose title has a trailing "• r/movies".
df.loc[df['post_title'].str.contains('american assassin')]

Unnamed: 0,title,post_title,id,url
795,american assassin • r/movies,official discussion - american assassin [spoilers] • r/movies,706ydq,https://www.reddit.com/r/movies/comments/706ydq/official_discussion_ame...


In [36]:
# Manually fix the title of row 795 (drops the trailing "• r/movies").
# Use one .loc[row, column] assignment instead of chained indexing so the
# write reliably lands in df rather than in an intermediate Series.
df.loc[795, 'title'] = 'american assassin'

In [37]:
# Spot-check five random rows; the fixed seed makes the sample reproducible
# when the notebook is re-run.
df.sample(5, random_state=42)

Unnamed: 0,title,post_title,id,url
45,the lost city,official discussion - the lost city [spoilers],tnbhdb,https://www.reddit.com/r/movies/comments/tnbhdb/official_discussion_the...
123,halloween kills,official discussion - halloween kills [spoilers],q8e5te,https://www.reddit.com/r/movies/comments/q8e5te/official_discussion_hal...
615,once upon a deadpool,official discussion: once upon a deadpool [spoilers],a4qc91,https://www.reddit.com/r/movies/comments/a4qc91/official_discussion_onc...
756,the shape of water,official discussion: the shape of water [spoilers],7llz2i,https://www.reddit.com/r/movies/comments/7llz2i/official_discussion_the...
855,kong: skull island,official discussion - kong: skull island [spoilers],5yjq4s,https://www.reddit.com/r/movies/comments/5yjq4s/official_discussion_kon...


**Note that we now have some duplicates in the "title" column, since some movies have separate US and international discussions. I will leave this as is for this notebook.**

In [38]:
# Show all rows whose cleaned title occurs more than once.
df.loc[df['title'].duplicated(keep=False)]

Unnamed: 0,title,post_title,id,url
572,how to train your dragon: the hidden world,official discussion - how to train your dragon: the hidden world (us re...,aq72ni,https://www.reddit.com/r/movies/comments/aq72ni/official_discussion_how...
583,how to train your dragon: the hidden world,official discussion - how to train your dragon: the hidden world (inter...,am2yeh,https://www.reddit.com/r/movies/comments/am2yeh/official_discussion_how...
606,aquaman,official discussion: aquaman (us release) [spoilers],a6qh98,https://www.reddit.com/r/movies/comments/a6qh98/official_discussion_aqu...
612,mortal engines,official discussion: mortal engines (us release) [spoilers],a4rs7c,https://www.reddit.com/r/movies/comments/a4rs7c/official_discussion_mor...
614,aquaman,official discussion: aquaman (international thread) [spoilers],a4qfoa,https://www.reddit.com/r/movies/comments/a4qfoa/official_discussion_aqu...
617,mortal engines,official discussion: mortal engines (international thread) [spoilers],a40xfn,https://www.reddit.com/r/movies/comments/a40xfn/official_discussion_mor...
693,jurassic world: fallen kingdom,official discussion - jurassic world: fallen kingdom (us thread) [spoil...,8sxn3q,https://www.reddit.com/r/movies/comments/8sxn3q/official_discussion_jur...
700,jurassic world: fallen kingdom,official discussion - jurassic world: fallen kingdom (international thr...,8pa372,https://www.reddit.com/r/movies/comments/8pa372/official_discussion_jur...
727,the death of stalin,official discussion: the death of stalin (us release) [spoilers],833jvx,https://www.reddit.com/r/movies/comments/833jvx/official_discussion_the...
767,coco,official discussion: coco (us release) [spoilers],7eoph7,https://www.reddit.com/r/movies/comments/7eoph7/official_discussion_coc...


**Note that some movies have a year in the title because they are remakes. I think that will be OK, at least for this notebook.**

In [39]:
# Titles containing a four-digit year in parentheses, e.g. "dune (2021)".
# Raw string: "\(" and "\d" are regex escapes, not valid Python string escapes.
df[df['title'].str.contains(pat=r"\(\d{4}\)", regex=True)]

Unnamed: 0,title,post_title,id,url
48,cheaper by the dozen (2022),official discussion - cheaper by the dozen (2022) [spoilers],thl0fx,https://www.reddit.com/r/movies/comments/thl0fx/official_discussion_che...
64,texas chainsaw massacre (2022),official discussion - texas chainsaw massacre (2022) [spoilers],sw0jcl,https://www.reddit.com/r/movies/comments/sw0jcl/official_discussion_tex...
83,scream (2022),official discussion - scream (2022) [spoilers],s3hh4h,https://www.reddit.com/r/movies/comments/s3hh4h/official_discussion_scr...
121,dune (2021),official discussion - dune (2021) [spoilers],qd6vqv,https://www.reddit.com/r/movies/comments/qd6vqv/official_discussion_dun...
143,cinderella (2021),official discussion - cinderella (2021) [spoilers],plccrz,https://www.reddit.com/r/movies/comments/plccrz/official_discussion_cin...
...,...,...,...,...
934,pete's dragon (2016),official discussion: pete's dragon (2016) [spoilers],4xbkux,https://www.reddit.com/r/movies/comments/4xbkux/official_discussion_pet...
944,ghostbusters (2016),official discussion: ghostbusters (2016) [spoilers],4sww1w,https://redd.it/4sww1w
979,criminal (2016),official discussion: criminal (2016) [spoilers],4ezvfe,https://redd.it/4ezvfe
982,the jungle book (2016),official discussion: the jungle book (2016) [spoilers],4euj3e,https://redd.it/4euj3e


In [40]:
# Write the cleaned table to CSV; index=False leaves the DataFrame index out of the file.
df.to_csv("../data/movies_cleaned.csv", index=False)

In [41]:
# Read the saved file back and display it to check the export.
pd.read_csv("../data/movies_cleaned.csv")

Unnamed: 0,title,post_title,id,url
0,the princess,official discussion - the princess [spoilers],vzcwal,https://www.reddit.com/r/movies/comments/vzcwal/official_discussion_the...
1,the man from toronto,official discussion - the man from toronto [spoilers],vzcw0a,https://www.reddit.com/r/movies/comments/vzcw0a/official_discussion_the...
2,the sea beast,official discussion - the sea beast [spoilers],vzcvsd,https://www.reddit.com/r/movies/comments/vzcvsd/official_discussion_the...
3,mrs. harris goes to paris,official discussion - mrs. harris goes to paris [spoilers],vzcvkz,https://www.reddit.com/r/movies/comments/vzcvkz/official_discussion_mrs...
4,where the crawdads sing,official discussion - where the crawdads sing [spoilers],vzcv66,https://www.reddit.com/r/movies/comments/vzcv66/official_discussion_whe...
...,...,...,...,...
973,10 cloverfield lane,official discussion: 10 cloverfield lane [spoilers],49wvnj,https://redd.it/49wvnj
974,london has fallen,official discussion: london has fallen [spoilers],48vhsf,https://redd.it/48vhsf
975,whiskey tango foxtrot,official discussion: whiskey tango foxtrot [spoilers],48vhmk,https://www.reddit.com/r/movies/comments/48vhmk/official_discussion_whi...
976,zootopia,official discussion: zootopia [spoilers],48vhc8,https://redd.it/48vhc8
