In [49]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
import os
from collections import defaultdict, Counter
import json
import re

In [50]:
df = pd.read_csv('dataset/tmdb_5000_movies.csv')
# filter only US movies
df = df[df.production_countries.str.contains("United States of America")]
df = df.sort_values(by='release_date', ascending=False)
df = df.reset_index(drop=True)

In [51]:
subs = glob("dataset/subtitles/*")

In [52]:
# TODO: movies with a slash had issues with the subs. needs a fix.
for t in df.title.tolist():
    if "/" in t:
        print(t)

Self/less
50/50
Frost/Nixon
Fahrenheit 9/11
Crazy/Beautiful
Face/Off
Nine 1/2 Weeks


In [53]:
def srt_to_list(srt_fname):
    with open(srt_fname, "r", encoding='iso-8859-1') as f:
        sub_text = f.read()
    blocks = sub_text.strip().split("\n\n")
    sub_lines = []
    for block in blocks:
        lines = block.split("\n")[2:]
        # remove empty strings and html tags
        lines = [re.sub(r'<.*?>', "", t) for t in lines if t]
        sub_lines.append("\n".join(lines))
    return sub_lines


def count(sentence_list):
    clean_list = [t.replace("\n", " ").lower().strip() for t in sentence_list]
    return Counter(clean_list)

In [54]:
all_titles = df.title.tolist()
sub_dict = defaultdict(str)
counter = Counter()
for sub in tqdm(subs):
    title = os.path.basename(sub).replace(".en.srt", "")
    if title not in all_titles:
        print("Error: ", title)
        continue
    try:
        sub_lines = srt_to_list(sub)
        sub_dict[title] = sub_lines
        counter += count(sub_lines)
    except Exception as e:
        print(title)
        print(e)

 12%|█▏        | 406/3443 [00:07<01:30, 33.54it/s]

Error:  less


 21%|██        | 719/3443 [00:21<02:23, 19.04it/s]

Error:  subtitles.json


 48%|████▊     | 1636/3443 [01:43<02:36, 11.53it/s]

Error:  Off


 57%|█████▋    | 1959/3443 [02:24<02:31,  9.78it/s]

Error:  Beautiful


 85%|████████▌ | 2930/3443 [05:33<01:26,  5.96it/s]

Error:  2 Weeks


100%|██████████| 3443/3443 [07:07<00:00,  5.35it/s]


In [55]:
counter.most_common(20)

[('', 22431),
 ('yeah.', 9639),
 ('what?', 7676),
 ('okay.', 7660),
 ('no.', 7285),
 ('thank you.', 6154),
 ('yes.', 4281),
 ('come on.', 3892),
 ('all right.', 3231),
 ('hey.', 3094),
 ('no!', 2856),
 ('oh.', 2456),
 ("i don't know.", 2361),
 ("i'm sorry.", 2304),
 ('thanks.', 2243),
 ('hello?', 2236),
 ('hi.', 2066),
 ('hey!', 1971),
 ('oh, my god.', 1910),
 ('come on!', 1896)]

In [56]:
# exploring repeating patterns

n = 0
for k, v in sorted(counter.items(), key=lambda x: -x[1]):
    if len(k) > 25:
        print(k)
        n += 1
    if n > 50:
        break

what are you talking about?
subtitles downloaded from podnapisi.net
advertise your product or brand here contact www.opensubtitles.org today
support us and become vip member  to remove all ads from www.opensubtitles.org
what the hell are you doing?
i don't know what you're talking about.
what's the matter with you?
- thank you. - you're welcome.
- good night. - good night.
come on, come on, come on.
what's that supposed to mean?
what the fuck are you doing?
what the hell is going on?
what do you want me to do?
let me tell you something.
what the fuck are you talking about?
what the hell are you talking about?
that's what i'm talking about.
where do you think you're going?
"at sundown... punjabis have a ball."
are you fucking kidding me?
- thanks. - you're welcome.
don't even think about it.
what the fuck is wrong with you?
what the hell are you doing here?
- good morning. - good morning.
âª it's time for the percolator âª
i want to show you something.
(indistinct conversations)
what 

In [60]:
with open('dataset/subtitles/subtitles.json', 'w') as f:
    json.dump(sub_dict, f)

In [57]:
# use code below in your notebooks to use subtitles

In [62]:
# find json download link in slack

In [19]:
with open('dataset/subtitles/subtitles.json', 'r') as f:
    sub_dict = json.load(f)
sub_dict = defaultdict(str, sub_dict)

In [63]:
# df["subtitles"] = df.title.apply(lambda title: sub_dict[title]) # keep blocks separated
df["subtitles"] = df.title.apply(
    lambda title: "\n\n".join(sub_dict[title]))  # single string
df.head(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,subtitles
0,8000000,"[{""id"": 18, ""name"": ""Drama""}]",,374461,"[{""id"": 1650, ""name"": ""cook""}, {""id"": 6054, ""n...",en,Mr. Church,A unique friendship develops when a little gir...,7.828459,"[{""name"": ""Envision Media Arts"", ""id"": 19456},...",...,2016-09-16,0,104.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,He was the one person she could always count on.,Mr. Church,7.0,129,Henry Joseph Church\ncould have been...\n\nAny...
1,0,"[{""id"": 12, ""name"": ""Adventure""}]",http://www.focusfeatures.com/kicks,385736,"[{""id"": 3405, ""name"": ""blow job""}, {""id"": 1173...",en,Kicks,When his hard-earned kicks get snatched by a l...,3.467923,"[{""name"": ""Bystorm Films"", ""id"": 2903}, {""name...",...,2016-09-09,0,80.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,They aren't just shoes,Kicks,7.5,18,"Minus 15...\n\nT-minus ten, nine,\neight, seve..."
2,8500000,"[{""id"": 18, ""name"": ""Drama""}]",http://www.foxsearchlight.com/thebirthofanation/,339408,"[{""id"": 2831, ""name"": ""slavery""}]",en,The Birth of a Nation,"Nat Turner, a former slave in America, leads a...",9.452808,"[{""name"": ""Phantom Four"", ""id"": 423}, {""name"":...",...,2016-09-09,15861566,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Untold Story of Nat Turner,The Birth of a Nation,6.5,178,Bring him here.\n\nIn the time of our ancestor...
