* Join the IMDb title and ratings dataframes
* Join the combined IMDb dataframe with df
    * Open question: join on both title and year?

In [1]:
import numpy as np
import pandas as pd
import warnings
import datetime

from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from transformers import pipelines

import re
from collections import OrderedDict, Counter
import itertools
import string

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import praw

import json

# Notebook-wide display settings: show up to 100 characters per cell and up to
# 100 rows before pandas truncates a rendered frame.
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_rows", 100)

# Fix NumPy's global random state so any random operations are reproducible.
seed = 55
np.random.seed(seed)

In [2]:
# Load the Reddit comments (one comment per row); the first CSV column holds
# the saved row index.
comments_df = pd.read_csv(
    "data/comments_exploded.csv",
    index_col=0,
)
comments_df.head(5)

Unnamed: 0,id,title,comments,post_date_utc,post_year,post_month,post_day
0,vzcwal,the princess,Joey King needs a new agent. She’s proven she has talent but she has so many terrible films on h...,1657851000.0,2022,7,14
1,vzcwal,the princess,"Silly, but entertaining and non stop action",1657851000.0,2022,7,14
2,vzcwal,the princess,"The yassification of The Raid\n\nActually, this was fun enough and mad respect to Joey King for ...",1657851000.0,2022,7,14
3,vzcwal,the princess,"Honestly, this was pretty fun. The plot is nothing special yes.\n\nBut Joey King was actually e...",1657851000.0,2022,7,14
4,vzcwal,the princess,"Man, I loved this movie. Yeah, it was campy, but whatever. The premise worked for me, I liked th...",1657851000.0,2022,7,14


In [3]:
# Load the Reddit discussion posts (one movie per row); the first CSV column
# holds the saved row index.
movies_df = pd.read_csv(
    "data/reddit_movies_final.csv",
    index_col=0,
)
movies_df.head(5)

Unnamed: 0,id,title,post_year,post_month,post_day
0,vzcwal,the princess,2022,7,14
1,vzcw0a,the man from toronto,2022,7,14
2,vzcvsd,the sea beast,2022,7,14
3,vzcvkz,mrs. harris goes to paris,2022,7,14
4,vzcv66,where the crawdads sing,2022,7,14


In [4]:
###################

In [5]:
# Load the IMDb title.basics dump (tab-separated, gzip-compressed).
# The literal \N is treated as a missing value, and low_memory=False makes
# pandas infer each column's dtype from the whole file instead of per chunk.
title_basics_imdb = pd.read_csv(
    "./data/imdb/title.basics.tsv.gz",
    sep="\t",
    na_values=r"\N",
    compression="gzip",
    low_memory=False,
)
title_basics_imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9237277 entries, 0 to 9237276
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   tconst          object 
 1   titleType       object 
 2   primaryTitle    object 
 3   originalTitle   object 
 4   isAdult         float64
 5   startYear       float64
 6   endYear         float64
 7   runtimeMinutes  object 
 8   genres          object 
dtypes: float64(3), object(6)
memory usage: 634.3+ MB


In [6]:
# Display the raw title.basics frame; the rendering truncates the middle rows.
title_basics_imdb

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9237272,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0.0,2010.0,,,"Action,Drama,Family"
9237273,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0.0,2010.0,,,"Action,Drama,Family"
9237274,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0.0,2010.0,,,"Action,Drama,Family"
9237275,tt9916856,short,The Wind,The Wind,0.0,2015.0,,27,Short


In [7]:
# Count how many IMDb titles fall under each titleType.
title_basics_imdb['titleType'].value_counts()

tvEpisode       6964547
short            890210
movie            621752
video            265317
tvSeries         231314
tvMovie          137588
tvMiniSeries      45198
tvSpecial         38401
videoGame         32274
tvShort           10674
tvPilot               2
Name: titleType, dtype: int64

In [8]:
# Keep only the IMDb titles that could plausibly be discussed in the r/movies
# dataset: feature films ("movie"), not flagged as adult, starting in 2015 or later.
# Rows with a missing isAdult or startYear fail the comparisons and are dropped.
filtered_title_basics_imdb = title_basics_imdb.loc[
    title_basics_imdb['titleType'].eq('movie')
    & title_basics_imdb['isAdult'].eq(0)
    & title_basics_imdb['startYear'].ge(2015)
]
filtered_title_basics_imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140562 entries, 11636 to 9237217
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          140562 non-null  object 
 1   titleType       140562 non-null  object 
 2   primaryTitle    140562 non-null  object 
 3   originalTitle   140562 non-null  object 
 4   isAdult         140562 non-null  float64
 5   startYear       140562 non-null  float64
 6   endYear         0 non-null       float64
 7   runtimeMinutes  103121 non-null  object 
 8   genres          134481 non-null  object 
dtypes: float64(3), object(6)
memory usage: 10.7+ MB


In [9]:
# Drop the columns that carry no information after the filter above:
# endYear has no non-null values and isAdult is 0 for every remaining row.
# errors='ignore' keeps this cell safe to re-run: it rebinds the same name, so
# on a second execution the columns are already gone and a strict drop would
# raise KeyError.
filtered_title_basics_imdb = filtered_title_basics_imdb.drop(
    columns=['endYear', 'isAdult'],
    errors='ignore',
)

In [10]:
# Inspect the filtered movie titles after endYear and isAdult were dropped.
filtered_title_basics_imdb

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,2019.0,,"Action,Crime"
13079,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,2021.0,133,Documentary
61093,tt0062336,movie,The Tango of the Widower and Its Distorting Mirror,El Tango del Viudo y Su Espejo Deformante,2020.0,70,Drama
67639,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,2018.0,122,Drama
81238,tt0083060,movie,The Drive to Win,Sha Ou,2019.0,,"Drama,Sport"
...,...,...,...,...,...,...,...
9237042,tt9916362,movie,Coven,Akelarre,2020.0,92,"Drama,History"
9237074,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,2019.0,,"Adventure,History,War"
9237126,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019.0,123,Drama
9237167,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015.0,57,Documentary


In [11]:
# Load the IMDb title.ratings dump (tab-separated, gzip-compressed).
title_ratings_imdb = pd.read_csv(
    "./data/imdb/title.ratings.tsv.gz",
    sep="\t",
    compression="gzip",
)

In [12]:
# Attach ratings to the filtered movie titles via the shared tconst key.
# The inner join discards movies that have no entry in the ratings table.
imdb_data = filtered_title_basics_imdb.merge(
    title_ratings_imdb,
    on="tconst",
    how='inner',
)

In [13]:
# Inspect the merged IMDb titles + ratings frame.
imdb_data

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,2021.0,133,Documentary,6.4,36
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mirror,El Tango del Viudo y Su Espejo Deformante,2020.0,70,Drama,6.4,161
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,2018.0,122,Drama,6.7,7366
3,tt0083060,movie,The Drive to Win,Sha Ou,2019.0,,"Drama,Sport",6.6,33
4,tt0100275,movie,The Wandering Soap Opera,La Telenovela Errante,2017.0,80,"Comedy,Drama,Fantasy",6.4,333
...,...,...,...,...,...,...,...,...,...
69427,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,2020.0,84,Thriller,5.8,1357
69428,tt9916362,movie,Coven,Akelarre,2020.0,92,"Drama,History",6.4,4872
69429,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,2019.0,,"Adventure,History,War",3.8,14
69430,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019.0,123,Drama,8.3,6


In [14]:
# Keep an independent copy of the titles in their original casing before the
# primaryTitle column is lowercased in the next step.
primaryTitle_Series = imdb_data['primaryTitle'].copy(deep=True)

In [15]:
# Lowercase the IMDb titles in place so they can be compared with the Reddit titles.
# NOTE(review): assumes every title in movies_df is already lowercase, as in the
# rows displayed above; confirm for the full frame.
imdb_data['primaryTitle'] = imdb_data['primaryTitle'].str.lower()

In [16]:
# Check the column dtypes before converting runtimeMinutes and startYear.
imdb_data.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
startYear         float64
runtimeMinutes     object
genres             object
averageRating     float64
numVotes            int64
dtype: object

In [17]:
# startYear has no missing values after the year filter, so it can become a plain int.
# NOTE(review): this presumably matches the dtype of movies_df.post_year, which it
# is joined against later; confirm.
imdb_data['startYear'] = imdb_data['startYear'].astype(int)

# runtimeMinutes was read as object (strings); float keeps NaN for the missing runtimes.
imdb_data['runtimeMinutes'] = imdb_data['runtimeMinutes'].astype(float)

In [18]:
# Count the missing values in each column of imdb_data.
imdb_data.isna().sum()

tconst               0
titleType            0
primaryTitle         0
originalTitle        0
startYear            0
runtimeMinutes    5972
genres             796
averageRating        0
numVotes             0
dtype: int64

In [19]:
# The remaining missing values (runtimeMinutes and genres only) are acceptable for now.

In [20]:
# Recall the layout of movies_df (title, post_year, ...) before joining it with imdb_data.
movies_df.head()

Unnamed: 0,id,title,post_year,post_month,post_day
0,vzcwal,the princess,2022,7,14
1,vzcw0a,the man from toronto,2022,7,14
2,vzcvsd,the sea beast,2022,7,14
3,vzcvkz,mrs. harris goes to paris,2022,7,14
4,vzcv66,where the crawdads sing,2022,7,14


In [23]:
# Match each Reddit discussion to an IMDb movie on (title, year): the lowercased
# primaryTitle must equal the Reddit title, and the IMDb startYear must equal the
# year of the Reddit post. A title-only join (primaryTitle == title) was the
# alternative considered.
# Caveat: a movie discussed in a different calendar year than its IMDb startYear
# will either not match or match a same-named movie from the post's year.
reddit_imdb_movies = imdb_data.merge(
    movies_df,
    how='inner',
    left_on=['primaryTitle', 'startYear'],
    right_on=['title', 'post_year'],
)

In [27]:
# Inspect the joined Reddit + IMDb frame.
reddit_imdb_movies

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,id,title,post_year,post_month,post_day
0,tt0069049,movie,the other side of the wind,The Other Side of the Wind,2018,122.0,Drama,6.7,7366,9t6tc7,the other side of the wind,2018,11,2
1,tt0360556,movie,fahrenheit 451,Fahrenheit 451,2018,100.0,"Drama,Sci-Fi,Thriller",4.9,20429,8kpukl,fahrenheit 451,2018,5,19
2,tt0385887,movie,motherless brooklyn,Motherless Brooklyn,2019,144.0,"Crime,Drama,Mystery",6.8,57144,dps11v,motherless brooklyn,2019,10,31
3,tt0437086,movie,alita: battle angel,Alita: Battle Angel,2019,122.0,"Action,Adventure,Sci-Fi",7.3,265007,aoc6mr,alita: battle angel,2019,2,12
4,tt0448115,movie,shazam!,Shazam!,2019,132.0,"Action,Adventure,Comedy",7.0,329951,b94c5i,shazam!,2019,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
781,tt9779516,movie,i still believe,I Still Believe,2020,116.0,"Biography,Drama,Music",6.4,12058,fht3jn,i still believe,2020,3,12
782,tt9784798,movie,judas and the black messiah,Judas and the Black Messiah,2021,126.0,"Biography,Drama,History",7.4,77737,lj1cfb,judas and the black messiah,2021,2,13
783,tt9812474,movie,lamb,Lamb,2021,106.0,"Drama,Fantasy,Horror",6.3,26411,q3nxg1,lamb,2021,10,7
784,tt9827834,movie,sylvie's love,Sylvie's Love,2020,114.0,"Drama,Music,Romance",6.8,5887,kkbe0l,sylvie's love,2020,12,25


In [28]:
# Several Reddit titles matched more than one IMDb movie.
# Comparing numVotes and the IMDb pages (imdb.com/title/<tconst>) shows that almost
# all of these are coincidences: similarly named movies released in the same year,
# only one of which is the major motion picture that was discussed on Reddit.
# Exceptions:
# - "The Girl on the Train" is a 2016 movie with a 2021 Indian remake, and both
#   were discussed on Reddit.
# - "The Promise" is a 2016 movie that Reddit discussed in 2017, so it was matched
#   with the wrong IMDb movie.
# The coincidental duplicates can be removed by keeping, for each title, the movie
# with more IMDb votes.

reddit_imdb_movies.loc[reddit_imdb_movies['title'].duplicated(keep=False)]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,id,title,post_year,post_month,post_day
26,tt10155932,movie,cinderella,Cinderella,2021,113.0,"Comedy,Family,Fantasy",4.2,41769,plccrz,cinderella,2021,9,9
27,tt8907882,movie,cinderella,Cinderella,2021,122.0,"Drama,Horror",6.0,1721,plccrz,cinderella,2021,9,9
37,tt10333426,movie,bliss,Bliss,2021,103.0,"Drama,Romance,Sci-Fi",5.3,17817,ldxwd6,bliss,2021,2,6
38,tt12092584,movie,bliss,Glück,2021,91.0,"Drama,Romance",5.5,266,ldxwd6,bliss,2021,2,6
39,tt10342730,movie,spiral,Spiral: From the Book of Saw,2021,93.0,"Crime,Horror,Mystery",5.2,52440,nbyiaw,spiral,2021,5,13
40,tt9278312,movie,spiral,Spiral,2021,92.0,Drama,7.1,19,nbyiaw,spiral,2021,5,13
64,tt10832274,movie,swan song,Swan Song,2021,105.0,Drama,7.0,2448,rlvt8t,swan song,2021,12,21
65,tt13207508,movie,swan song,Swan Song,2021,112.0,"Drama,Romance,Sci-Fi",6.8,13848,rlvt8t,swan song,2021,12,21
90,tt11252248,movie,dog,Dog,2022,101.0,"Comedy,Drama",6.5,28949,sv796z,dog,2022,2,17
91,tt19880966,movie,dog,Dog,2022,83.0,"Drama,Thriller",8.2,7,sv796z,dog,2022,2,17


In [54]:
# Row labels excluded from the duplicate handling.
# NOTE(review): presumably these are manually verified rows that should not be
# de-duplicated by vote count (e.g. the two "The Girl on the Train" films); confirm.
# They are index labels of reddit_imdb_movies, so they must be re-checked whenever
# the upstream merge changes. drop() raises KeyError if a label is missing.
manually_excluded_indices = [349, 738, 403, 429]

# Build the duplicated-title rows once and derive both outputs from them, rather
# than repeating the filter-and-drop expression for each output.
duplicate_rows = reddit_imdb_movies[
    reddit_imdb_movies['title'].duplicated(keep=False)
    ].drop(index=manually_excluded_indices)

# unique() keeps first-appearance order, so the list is identical on every run;
# list(set(...)) changes order between kernel restarts because of string hash
# randomization.
duplicate_titles = duplicate_rows['title'].unique().tolist()

# Index labels of every row that belongs to a duplicated title.
duplicate_indices = duplicate_rows.index.tolist()

In [61]:
# For every title flagged as duplicated, count how many joined rows carry it.
reddit_imdb_movies.loc[
    reddit_imdb_movies['title'].isin(duplicate_titles),
    'title',
].value_counts()

cinderella                 2
coco                       2
they shall not grow old    2
cuties                     2
robin hood                 2
the promise                2
polar                      2
stronger                   2
extinction                 2
the circle                 2
rocketman                  2
bliss                      2
anna                       2
love and monsters          2
the princess               2
run                        2
val                        2
dog                        2
swan song                  2
spiral                     2
ava                        2
Name: title, dtype: int64

In [None]:
# TODO: for each duplicated title, keep only the row with more IMDb votes (numVotes).

In [None]:
# imdb_data[
#     (imdb_data['primaryTitle'].str.contains(r"\(\d{4}\)"))
#     ]

In [None]:
# with open('./.secret/ZSDSFI_client_id.txt') as f:
#     client_id = f.read()

# with open('./.secret/ZSDSFI_client_secret.txt') as f:
#     client_secret = f.read()

In [None]:
# reddit = praw.Reddit(
#     client_id=client_id,
#     client_secret=client_secret,
#     user_agent="Movie Scraper by ZSDSFI"
# )