In [1]:
import pandas as pd
import numpy as np
import json
import pickle

import re

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from pprint import pprint

### Create DataFrame - Line Counts

In [5]:
%%time
with open('scrape_seinfeld/line_count.json') as json_data:
    json_lc = json.load(json_data)

CPU times: user 6.91 ms, sys: 1.49 ms, total: 8.4 ms
Wall time: 7.69 ms


In [6]:
# Investigate json_lc
print(type(json_lc))
print(len(json_lc))

<class 'list'>
176


In [7]:
# Create DataFrame of line counts "df_lc"
# Only take final element of json_lc because spider yielded cumulative results when ran
df_lc = pd.DataFrame.from_dict(json_lc[len(json_lc)-1], orient = 'index').reset_index()

df_lc.columns = ['ep_num', 'line_count']
df_lc['ep_num'] = df_lc.ep_num.astype('int')

In [8]:
# There are about 140K total lines (pre-cleaning)
df_lc.line_count.sum()

140288

### Create DataFrame - Scripts from All Episodes

In [9]:
%%time
with open('scrape_seinfeld/all_eps_3.json') as json_data:
    json_scripts = json.load(json_data)

CPU times: user 169 ms, sys: 37 ms, total: 206 ms
Wall time: 313 ms


In [10]:
# Investigate json_scripts
print(type(json_scripts))
print(len(json_scripts))

<class 'list'>
176


In [11]:
# Check that the line counts in json_scripts match those in df_lc
for i in range(len(json_scripts)):
    ep_num = int(re.findall('\d+', json_scripts[i]['line'][0])[0])
    num_lines = len(json_scripts[i]['line'])
    num_check = df_lc[df_lc.ep_num == ep_num].line_count.iloc[0]
    
    if (num_lines != num_check):
        print('ep_num: ', ep_num, ' ', num_lines, ' ', num_check)
    


In [12]:
# Calling the DataFrame "df1" because it will undrgo some currently unknown number of cleaning operations and re-saves
df1 = pd.DataFrame(json_scripts[0])

for i in range(1, len(json_scripts)):
    df1 = pd.concat([df1, pd.DataFrame(json_scripts[i])])

In [13]:
df1.shape

(141031, 2)

In [14]:
# Not sure why this mismatch exists. Investigate after creation of MVP
141031 - 140288

743

In [15]:
df1.head()

Unnamed: 0,line,line_index
0,"Episode 1 - Good News, Bad News",1_0
1,\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\tpc...,1_1
2,\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\tBr...,1_2
3,\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t(T...,1_3
4,\n\t\t\t\t\t\t\t\t\t\t,1_4


### Clean Seinfeld DataFrame

In [16]:
# Remove '\n' and '\t' characters from the lines
df1['line'] = df1.line.str.replace('\\n','')
df1['line'] = df1.line.str.replace('\\t','')

In [17]:
# Remove all rows that are not either lines or text denoting a new scene
# Upon inspecting the data, I found that the first 11 episodes have text beginning with 'INT.' to denote a new scene,
# and the rest of the episodes contain text enclosed in square brackets

cond_line = df1.line.str.contains(':')
cond_scene1 = df1.line.str.startswith('INT.')
cond_scene2 = df1.line.str.startswith('[')

In [18]:
# Apply conditions defined in above cell
mask = (cond_line | cond_scene1 | cond_scene2)
df2 = df1[mask]

In [19]:
df2.iloc[:5]

Unnamed: 0,line,line_index
1,"pc: 101, season 1, episode 1 (Pilot)",1_1
2,"Broadcast date: July 5, 1989",1_2
11,Regulars:,1_11
16,Guest Stars:,1_16
24,INT. COMEDY CLUB  NIGHT,1_24


In [20]:
# Looking at the rows returned above, additional conditions are to be imposed on the data:
# Row shouldn't contain both "season" and "episode"
# Row shouldn't contain "Broadcast date:"
# Row shouldn't end with a colon
# An alternate strategy would be to create a whitelist of character names that appear before colons in lines
# and use that whitelist to filter the data (while also including scene breaks)

cond_sn_ep = ~(df2.line.str.contains("season") & df2.line.str.contains("episode"))
cond_bdcast = ~df2.line.str.contains("Broadcast")
cond_colon = df2.line.map(lambda x: x[-1] != ":")

In [21]:
# Apply conditions defined in above cell
mask = (cond_sn_ep & cond_bdcast & cond_colon)
df3 = df2[mask]

In [22]:
df3.head()

Unnamed: 0,line,line_index
24,INT. COMEDY CLUB  NIGHT,1_24
28,JERRY: Do you know what this is all about? Do ...,1_28
32,INT. PETES LUNCHEONETTE  DAY,1_32
36,"JERRY: (pointing at Georges shirt) See, to me...",1_36
38,GEORGE: Are you through?,1_38


In [23]:
df3.tail()

Unnamed: 0,line,line_index
813,JERRY: Now thats not fair! So where did you m...,2_813
815,ELAINE: I staked out his health club.,2_815
817,"JERRY: Uh huh. When youre on a stakeout, do y...",2_817
823,INT. COMEDY CLUB  NIGHT,2_823
827,JERRY: Yknow I think that even if youve had ...,2_827


In [24]:
# Before the scene number is assigned to each line, ep_num and line_num are split and converted to integer
# These two new columns functionally replace the col 'line_index'
df4 = (df3
       .assign(ep_num = [int(x[0]) for x in df3.line_index.str.split("_")],
               line_num_orig = [int(x[1]) for x in df3.line_index.str.split("_")])
       .drop('line_index', axis = 1)
       .sort_values(['ep_num', 'line_num_orig'])
      )
df4.head()

Unnamed: 0,line,ep_num,line_num_orig
24,INT. COMEDY CLUB  NIGHT,1,24
28,JERRY: Do you know what this is all about? Do ...,1,28
32,INT. PETES LUNCHEONETTE  DAY,1,32
36,"JERRY: (pointing at Georges shirt) See, to me...",1,36
38,GEORGE: Are you through?,1,38


In [25]:
%%time

# To make this more pythonic, use reduce()?
# FIX TO RESET NUM WHEN EPISODE CHANGES!
new_scene = []
scene_num = []
num = 0
last_episode = 0

for ind, val in df4.iterrows():
    new_bool = (val.line.find("INT.") != -1) | (val.line.find("[") == 0)
    new_scene.append(new_bool)
    
    num += int(new_bool)
    
    if val.ep_num > last_episode:
        num = 1

    scene_num.append(num)
    
    last_episode = val.ep_num

CPU times: user 7.67 s, sys: 61.5 ms, total: 7.73 s
Wall time: 8.13 s


In [26]:
# Just checking...
print(len(new_scene))
print(len(scene_num))
df4.shape

57575
57575


(57575, 3)

In [27]:
df5 = df4.assign(scene_num = scene_num)
df5.head(100)

Unnamed: 0,line,ep_num,line_num_orig,scene_num
24,INT. COMEDY CLUB  NIGHT,1,24,1
28,JERRY: Do you know what this is all about? Do ...,1,28,1
32,INT. PETES LUNCHEONETTE  DAY,1,32,2
36,"JERRY: (pointing at Georges shirt) See, to me...",1,36,2
38,GEORGE: Are you through?,1,38,2
40,"JERRY: You do of course try on, when you buy?",1,40,2
42,"GEORGE: Yes, it was purple, I liked it, I don...",1,42,2
44,"JERRY: Oh, you dont recall?",1,44,2
46,"GEORGE: (on an imaginary microphone) Uh, no, n...",1,46,2
48,"JERRY: Well, senator, Id just like to know, w...",1,48,2


In [28]:
# Define a function determine_line that returns true if and only if the line begins with a letter and it also
# does not begin with "INT."
# This will be used to filter out "lines" that set a new scene or describe what is happening
# (The first 11 episodes use "INT." instead of "[" to denote a new scene)
def determine_line(x):
    a = bool(re.match('^[A-Za-z]', x))
    b = a and not bool(re.match('^INT.', x))
    
    return b
             
real_line = list(map(determine_line, df5.line))         

print(len(real_line))
print(real_line[:5])

57575
[False, True, False, True, True]


In [29]:
# Now let's apply the filter created in the cell above
df6 = df5[real_line]
df6.head()

Unnamed: 0,line,ep_num,line_num_orig,scene_num
28,JERRY: Do you know what this is all about? Do ...,1,28,1
36,"JERRY: (pointing at Georges shirt) See, to me...",1,36,2
38,GEORGE: Are you through?,1,38,2
40,"JERRY: You do of course try on, when you buy?",1,40,2
42,"GEORGE: Yes, it was purple, I liked it, I don...",1,42,2


In [30]:
# Now let's add a new line_num column
line_num = df6.assign(ones = 1).groupby(by=['ep_num', 'scene_num']).ones.cumsum()
df7 = (df6
       .assign(line_num = line_num)
       .rename(index=str, columns={"line": "line_old"}) #set-up for next step
       )
df7.head()

Unnamed: 0,line_old,ep_num,line_num_orig,scene_num,line_num
28,JERRY: Do you know what this is all about? Do ...,1,28,1,1
36,"JERRY: (pointing at Georges shirt) See, to me...",1,36,2,1
38,GEORGE: Are you through?,1,38,2,2
40,"JERRY: You do of course try on, when you buy?",1,40,2,3
42,"GEORGE: Yes, it was purple, I liked it, I don...",1,42,2,4


In [31]:
# Move the character to it's own column
character = df7.line_old.apply(lambda x: x[:x.find(':')])
line = df7.line_old.apply(lambda x: x[x.find(':')+1:].strip())

df8 = (df7
      .assign(character = character,
              line = line)
      .drop('line_old', axis = 1)
      .drop('line_num_orig', axis = 1))
df8.head()

Unnamed: 0,ep_num,scene_num,line_num,character,line
28,1,1,1,JERRY,Do you know what this is all about? Do you kno...
36,1,2,1,JERRY,"(pointing at Georges shirt) See, to me, that ..."
38,1,2,2,GEORGE,Are you through?
40,1,2,3,JERRY,"You do of course try on, when you buy?"
42,1,2,4,GEORGE,"Yes, it was purple, I liked it, I dont actual..."


In [32]:
# To do: take out () and text therein from character (in addition to line) (only drewback is converting ALL (except [CHAR]) to ALL),
# replace '\x91' and other '\x's' (92)
# could also filter that CHAR == CHAR.to_upper() - well, some ELAINE (and JERRY) lines would be filtered out without furth proc
# 
# A couple of actual lines still made it into the CHAR list - just filter them out?
char_list = df8.character.sort_values().unique()
print(len(char_list))
list(char_list)

1533


['AARON',
 'AARON & HELEN',
 'ABBY',
 'ADA',
 'ADAM',
 'ADE',
 'AENT',
 'AGENCY REP',
 'AGENT',
 'AL',
 'AL ROKER',
 'ALAN',
 'ALEC',
 'ALEX',
 'ALEX TREBEK',
 'ALICIA',
 'ALISON',
 'ALL',
 'ALL (EXCEPT MORTY)',
 'ALL (chanting)',
 'ALL (singing)',
 'ALL (singing, competing)',
 'ALL FOUR MEN',
 'ALL THREE',
 'ALLBRIGHT (O.C.)',
 'ALLISON',
 'ALLSION',
 'ALTON',
 'AMANDA',
 'AMY',
 'ANGELA',
 'ANGRY MAN',
 'ANNA',
 'ANNOUNCEMENT',
 'ANNOUNCER',
 'ANNOUNCER #1',
 'ANNOUNCER #2',
 'ANNOUNCER ON P.A. SYSTEM',
 'ANNOUNCER ON TV',
 'ANSWERING MACHINE',
 'ANTONIO',
 'APPLICANT',
 'ARNIE',
 'ARONSON',
 'ARTIE',
 'ASSISTANT',
 'ASSISTANT DRESSER',
 'ATTENDANT',
 'ATTENDANT #1',
 'ATTENDANT #2',
 'AUCTIONEER',
 'AUDIENCE',
 'AUDREY',
 'AUNT MAY',
 'AVA',
 'Announcer',
 'Another man',
 'BABS',
 'BABU',
 "BABU'S BROTHER",
 "BABU'S FRIEND",
 'BAILIFF',
 'BANIA',
 'BANK EMPLOYEE',
 'BARBARA',
 "BARNEY'S SALES ASSOCIATE",
 'BARRY',
 'BARTENDER',
 'BECK',
 'BECKY',
 'BEN',
 'BENES',
 'BETH',
 'BETSY',

In [112]:
# Define a function get_paren_depth to ensure that the paren removal function will work properly
def get_paren_depth(string):
    depth = 0
    for s in string:
        if s == '(':
            depth += 1
        if s == ')':
            depth -= 1
        if depth > 1:
            return 1
        if depth < 0:
            return -1
    
    return depth

# Testing the func
test_cases = ['a (but really b)', '', 'no paren', 'yes (but no (well maybe) ...owl!) yeah!', ' oy ) vey (oops)']
[get_paren_depth(case) for case in test_cases]

[0, 0, 0, 1, -1]

In [34]:
# Look's good!
paren_test_char = [get_paren_depth(char) for char in char_list]
print(max(paren_test_char))
print(min(paren_test_char))

0
0


In [69]:
# Okay, now let's define the paren_removal function
def paren_removal(string):
    beg = string.find('(')
    
    if beg != -1:
        end = string.find(')')
        s_new = string[:beg] + string[end + 1:]
        s_new = s_new.replace('  ', ' ').strip()
        return s_new
    else:
        return string

# This function was defined for cleaining the lines after the above version was used to clean the characters
# It took a while when used on a full dataframe, so it will be avoided for now
def paren_removal_rec(string):
    beg = string.find('(')
    
    if beg != -1:
        end = string.find(')')
        s_new = string[:beg] + string[end + 1:]
        s_new = s_new.replace('  ', ' ').strip()
        return paren_removal_rec(s_new)
    else:
        return string    
# Test the function
test_cases = ['blah(blah)BLAH', 'blah (blah)BLAH', 'blah(blah) BLAH', 'blah (blah) BLAH', 'blah BLAH',
              'blah BLAH (wut)', '(blah) (blah) blah BLAH']
pprint([paren_removal(case) for case in test_cases])
pprint([paren_removal_rec(case) for case in test_cases])

['blahBLAH',
 'blah BLAH',
 'blah BLAH',
 'blah BLAH',
 'blah BLAH',
 'blah BLAH',
 '(blah) blah BLAH']
['blahBLAH',
 'blah BLAH',
 'blah BLAH',
 'blah BLAH',
 'blah BLAH',
 'blah BLAH',
 'blah BLAH']


In [36]:
char_list_2 = [paren_removal(char) for char in char_list]
print(len(char_list_2))
list(zip(char_list, char_list_2))

1533


[('AARON', 'AARON'),
 ('AARON & HELEN', 'AARON & HELEN'),
 ('ABBY', 'ABBY'),
 ('ADA', 'ADA'),
 ('ADAM', 'ADAM'),
 ('ADE', 'ADE'),
 ('AENT', 'AENT'),
 ('AGENCY REP', 'AGENCY REP'),
 ('AGENT', 'AGENT'),
 ('AL', 'AL'),
 ('AL ROKER', 'AL ROKER'),
 ('ALAN', 'ALAN'),
 ('ALEC', 'ALEC'),
 ('ALEX', 'ALEX'),
 ('ALEX TREBEK', 'ALEX TREBEK'),
 ('ALICIA', 'ALICIA'),
 ('ALISON', 'ALISON'),
 ('ALL', 'ALL'),
 ('ALL (EXCEPT MORTY)', 'ALL'),
 ('ALL (chanting)', 'ALL'),
 ('ALL (singing)', 'ALL'),
 ('ALL (singing, competing)', 'ALL'),
 ('ALL FOUR MEN', 'ALL FOUR MEN'),
 ('ALL THREE', 'ALL THREE'),
 ('ALLBRIGHT (O.C.)', 'ALLBRIGHT'),
 ('ALLISON', 'ALLISON'),
 ('ALLSION', 'ALLSION'),
 ('ALTON', 'ALTON'),
 ('AMANDA', 'AMANDA'),
 ('AMY', 'AMY'),
 ('ANGELA', 'ANGELA'),
 ('ANGRY MAN', 'ANGRY MAN'),
 ('ANNA', 'ANNA'),
 ('ANNOUNCEMENT', 'ANNOUNCEMENT'),
 ('ANNOUNCER', 'ANNOUNCER'),
 ('ANNOUNCER #1', 'ANNOUNCER #1'),
 ('ANNOUNCER #2', 'ANNOUNCER #2'),
 ('ANNOUNCER ON P.A. SYSTEM', 'ANNOUNCER ON P.A. SYSTEM'),
 ('A

In [37]:
# Let's update the character column
df9 = df8.copy()
df9['character'] = [paren_removal(char) for char in df8.character]

char_list3 = df9.character.sort_values().unique()
print(len(char_list3))
list(char_list3)

1121


['AARON',
 'AARON & HELEN',
 'ABBY',
 'ADA',
 'ADAM',
 'ADE',
 'AENT',
 'AGENCY REP',
 'AGENT',
 'AL',
 'AL ROKER',
 'ALAN',
 'ALEC',
 'ALEX',
 'ALEX TREBEK',
 'ALICIA',
 'ALISON',
 'ALL',
 'ALL FOUR MEN',
 'ALL THREE',
 'ALLBRIGHT',
 'ALLISON',
 'ALLSION',
 'ALTON',
 'AMANDA',
 'AMY',
 'ANGELA',
 'ANGRY MAN',
 'ANNA',
 'ANNOUNCEMENT',
 'ANNOUNCER',
 'ANNOUNCER #1',
 'ANNOUNCER #2',
 'ANNOUNCER ON P.A. SYSTEM',
 'ANNOUNCER ON TV',
 'ANSWERING MACHINE',
 'ANTONIO',
 'APPLICANT',
 'ARNIE',
 'ARONSON',
 'ARTIE',
 'ASSISTANT',
 'ASSISTANT DRESSER',
 'ATTENDANT',
 'ATTENDANT #1',
 'ATTENDANT #2',
 'AUCTIONEER',
 'AUDIENCE',
 'AUDREY',
 'AUNT MAY',
 'AVA',
 'Announcer',
 'Another man',
 'BABS',
 'BABU',
 "BABU'S BROTHER",
 "BABU'S FRIEND",
 'BAILIFF',
 'BANIA',
 'BANK EMPLOYEE',
 'BARBARA',
 "BARNEY'S SALES ASSOCIATE",
 'BARRY',
 'BARTENDER',
 'BECK',
 'BECKY',
 'BEN',
 'BENES',
 'BETH',
 'BETSY',
 'BETTE',
 'BIKER',
 'BILL',
 'BILLY',
 'BLAINE',
 'BLIND MAN',
 'BLIND VIOLIN PLAYER',
 'BOB',

In [38]:
weird_char_list = [
    "Micheal, Sandi, and Tom, and finally Jerry, and the four of them make a toast while it's written",
    "Inside Kew Gardens Reprinted from an article in the October 1982 issue of Metropolis Magazine. http",
    "GEORGE, ELAINE, KRAMER, JERRY, & STAN",
    "DUSTIN - Locker room Attendant #2",
    "Check #1246, dated Dec. 15 \x9196, Made out to"]

print([len(weirdo) for weirdo in weird_char_list])

[96, 99, 37, 33, 43]


In [39]:
mask = list(map(lambda x: len(x) > 40, df9.character))
df9[mask]

# standup bits without JERRY there: 54-6-1, 57-15-1, 65-16-1
# description with a colon somewhere in there: 64-8-1
# Let's delete all of these rows

Unnamed: 0,ep_num,scene_num,line_num,character,line
635,54,6,1,I always get confused in the movie theater by ...,"``Oh, you mean that was the same guy from the ..."
535,57,15,1,"I am not gay. I am, however, thin, single and ...","""Y'know I think Joe might be a little... [wave..."
434,64,8,1,"Micheal, Sandi, and Tom, and finally Jerry, an...","""Created by Jerry Seinfeld and George Costanza""."
860,65,16,1,"The female orgasm is kinda like the bat cave, ...",the real and the fake. And uh I'll tell you ri...
31,90,2,1,I think the thing I admire most about the Chin...,"thousands of years ago, Chinese farmer gets up..."
826,90,20,9,Inside Kew Gardens Reprinted from an article i...,//www.metropolismag.com/
33,115,2,1,I always feel bad for the silver medal winner ...,- How much did you lose by? - I don't even kno...
974,122,26,14,This is the guy who traded Jay Buhner for Ken ...,"""How could you trade Jay Buhner?!"" ""My people ..."
139,145,4,1,"Check #1246, dated Dec. 15 96, Made out to",Columbus Deli for $40.00


In [40]:
df10 = df9[[not x for x in mask]]

char_list4 = df10.character.sort_values().unique()
print(len(char_list4))
pprint(list(char_list4))

1112
['AARON',
 'AARON & HELEN',
 'ABBY',
 'ADA',
 'ADAM',
 'ADE',
 'AENT',
 'AGENCY REP',
 'AGENT',
 'AL',
 'AL ROKER',
 'ALAN',
 'ALEC',
 'ALEX',
 'ALEX TREBEK',
 'ALICIA',
 'ALISON',
 'ALL',
 'ALL FOUR MEN',
 'ALL THREE',
 'ALLBRIGHT',
 'ALLISON',
 'ALLSION',
 'ALTON',
 'AMANDA',
 'AMY',
 'ANGELA',
 'ANGRY MAN',
 'ANNA',
 'ANNOUNCEMENT',
 'ANNOUNCER',
 'ANNOUNCER #1',
 'ANNOUNCER #2',
 'ANNOUNCER ON P.A. SYSTEM',
 'ANNOUNCER ON TV',
 'ANSWERING MACHINE',
 'ANTONIO',
 'APPLICANT',
 'ARNIE',
 'ARONSON',
 'ARTIE',
 'ASSISTANT',
 'ASSISTANT DRESSER',
 'ATTENDANT',
 'ATTENDANT #1',
 'ATTENDANT #2',
 'AUCTIONEER',
 'AUDIENCE',
 'AUDREY',
 'AUNT MAY',
 'AVA',
 'Announcer',
 'Another man',
 'BABS',
 'BABU',
 "BABU'S BROTHER",
 "BABU'S FRIEND",
 'BAILIFF',
 'BANIA',
 'BANK EMPLOYEE',
 'BARBARA',
 "BARNEY'S SALES ASSOCIATE",
 'BARRY',
 'BARTENDER',
 'BECK',
 'BECKY',
 'BEN',
 'BENES',
 'BETH',
 'BETSY',
 'BETTE',
 'BIKER',
 'BILL',
 'BILLY',
 'BLAINE',
 'BLIND MAN',
 'BLIND VIOLIN PLAYER',
 '

In [41]:
# Found some more 'characters' to remove by manually looking at the character list above
# (Also, The character 'SeConrad' should be 'SECRETARY', but that's fine)
more_chars_to_remove = ['Beginnings', "Jerry's stand-up", 'Opening Monolog', 'Opening scene', 'Performed by', 
    'Published', 'Song over the end credits', 'Sung by',' Turning Point',# "We gotta go! It's 8",
    'Where to Find More', 'Written by', 'http', 'rc', 'so far']

df11 = df10[[char not in more_chars_to_remove for char in df10.character]]

In [120]:
# About 20 lines can't be processed by get_paren_depth
paren_test_line = [get_paren_depth(line) for line in df11.line]
paren_test_line2 = [test != 0 for test in paren_test_line]

df_bad_lines = df11[paren_test_line2]
df_bad_lines

Unnamed: 0,ep_num,scene_num,line_num,character,line
416,12,11,3,VIC,"Hey, you see that sign right there? (Points to..."
128,25,2,47,KRAMER,All right Come on....again attempts to pull he...
477,26,10,8,JERRY,(to himself: Uh oh. My organs are playing ches...
140,27,3,3,GEORGE,(waving: Eva.
180,27,3,20,JERRY,(Patting his head: I'm a comedian.
339,82,5,90,ELAINE,(under her breath) Ya that'll happen)
814,95,19,19,ELAINE,"(Tilts her head down, looking over her glasses..."
92,105,2,29,GEORGE,"""Aahhrgh...it wouldn't take...... long pause, ..."
264,105,5,53,JERRY,"""Kom Pau(sp?"""
750,105,18,14,STEIN,"""Yes it is. Well lets see what I have today. D..."


In [121]:
# Copy and paste
for line in df_bad_lines.line:
    print('"' + line + '",')

"Hey, you see that sign right there? (Points to a sign saying "Not Responsible for Valuables"",
"All right Come on....again attempts to pull headphones off)",
"(to himself: Uh oh. My organs are playing chess again.",
"(waving:  Eva.",
"(Patting his head:  I'm a comedian.",
"(under her breath) Ya that'll happen)",
"(Tilts her head down, looking over her glasses in amazement of",
""Aahhrgh...it wouldn't take...... long pause, audience laughter) ten minutes from now, I'll be sweating all over again ,I can feel it. I'm a human heat pump!"",
""Kom Pau(sp?"",
""Yes it is. Well lets see what I have today. Darn it It's ham & Cheese again and she forgot the fancy mustard. I told her I like that fancy mustard. You could put that fancy mustard on a shoe and it would taste pretty good to me. oh! she made it up with a cupcake though. Hey look at this. you know I got a new system for eating these things. `I used to peel off the chocolate now I turn them upside down , I eat the cake first and save th

In [122]:
good_lines = [
    "Hey, you see that sign right there?",
    "All right Come on...",
    "Uh oh. My organs are playing chess again.",
    "Eva.",
    "I'm a comedian.",
    "Ya that'll happen",
    "", #hmmm
    "Aahhrgh...it wouldn't take...... ten minutes from now, I'll be sweating all over again, I can feel it. I'm a human heat pump!",
    "Kom Pau",
    "Yes it is. Well lets see what I have today. Darn it It's ham & Cheese again and she forgot the fancy mustard. I told her I like that fancy mustard. You could put that fancy mustard on a shoe and it would taste pretty good to me. oh! she made it up with a cupcake though. Hey look at this. you know I got a new system for eating these things. I used to peel off the chocolate now I turn them upside down , I eat the cake first and save the frosting for the end.",
    "Yeah.",
    "It's a long journey from Milan to Minsk. wait wait. Hold it stop, (sob) I'm sorry, I have to start it over, my shoelace. (sob) I can't do it like this. Please let me start over. (sob) Please. (sob) Please. . . .",
    "Gimme that rye!",
    "Now, what are you thinkin'? You think that I'm not able to wear jeans anymore? Is that what you're sayin'? Because if that's what you're sayin', Jerry, I'll go and I'll buy some jeans. I swear to God I will!",
    "I mean all that stuff is so small. stupid.",
    "Yeah. Well, count me out. I'm swimmin'. Old man Leland is bustin' my hump over these reports. If I don't get 'em done by nine, I'm toast.",
    "Oh, right! Right! Hey, hey.. I love the floors in here. It's like a gymnasium in here! Try and guard me!",
    "Uh. Well, I'm sick o' waiting. I am springin' ahead riiight now.",
    "Uh huh.",
    "Spite Never sleeps",
    "Elaine, Elaine"]

In [123]:
# Needed to use assign method to make the code in this cell work for reasons unknown
test_zip = zip(good_lines, df_bad_lines.iterrows())
df12 = df11.copy(deep = True)

for good, (ind, val) in test_zip:
    mask = (df11.ep_num == val.ep_num) & (df11.scene_num == val.scene_num) & (df11.line_num == val.line_num)
#     print(df11[mask].line)
#     df12[mask]['line'] = good
#     df12[mask].loc[:, 'line'] = good
    df12[mask] = df12[mask].assign(line = good)
#     print(df11[mask].line)
    print(good)
    print(df12[mask]['line'])

Hey, you see that sign right there?
416    Hey, you see that sign right there?
Name: line, dtype: object
All right Come on...
128    All right Come on...
Name: line, dtype: object
Uh oh. My organs are playing chess again.
477    Uh oh. My organs are playing chess again.
Name: line, dtype: object
Eva.
140    Eva.
Name: line, dtype: object
I'm a comedian.
180    I'm a comedian.
Name: line, dtype: object
Ya that'll happen
339    Ya that'll happen
Name: line, dtype: object

814    
Name: line, dtype: object
Aahhrgh...it wouldn't take...... ten minutes from now, I'll be sweating all over again, I can feel it. I'm a human heat pump!
92    Aahhrgh...it wouldn't take...... ten minutes f...
Name: line, dtype: object
Kom Pau
264    Kom Pau
Name: line, dtype: object
Yes it is. Well lets see what I have today. Darn it It's ham & Cheese again and she forgot the fancy mustard. I told her I like that fancy mustard. You could put that fancy mustard on a shoe and it would taste pretty good to me. oh! s

In [125]:
# Look's good!
paren_test_line = [get_paren_depth(line) for line in df12.line]
print(max(paren_test_line))
print(min(paren_test_line))

0
0


In [129]:
# Let's update the line column
df13 = df12.copy()
i = 0

while True:
    # Update line column
    df13['line'] = [paren_removal(line) for line in df13.line]
    
    # Update counter
    i += 1
    
    # Tests
    paren_test_line = [get_paren_depth(line) for line in df13.line]
    too_many_begs = max(paren_test_line)
    too_many_ends = min(paren_test_line)
    num_opens = ['(' in line for line in df13.line]
    num_closes = [')' in line for line in df13.line]
    no_closes = [')' not in line for line in df13.line]
#     df_look_at = df13[num_opens and no_closes]
    
    print(i, ' ', too_many_begs, ' ', too_many_ends, ' ', sum(num_opens), ' ', sum(num_closes))
    
    if (sum(num_opens) == 0) & (sum(num_closes) == 0):
        break

1   0   0   1107   1107
2   0   0   203   203
3   0   0   64   64
4   0   0   21   21
5   0   0   12   12
6   0   0   6   6
7   0   0   4   4
8   0   0   3   3
9   0   0   1   1
10   0   0   0   0


In [130]:
df13.head()

Unnamed: 0,ep_num,scene_num,line_num,character,line
28,1,1,1,JERRY,Do you know what this is all about? Do you kno...
36,1,2,1,JERRY,"See, to me, that button is in the worst possib..."
38,1,2,2,GEORGE,Are you through?
40,1,2,3,JERRY,"You do of course try on, when you buy?"
42,1,2,4,GEORGE,"Yes, it was purple, I liked it, I dont actual..."


In [140]:
! pwd

/Users/warren/Data_Science/Metis/github/project-kojak


### Pickle the file

In [141]:
df13.to_pickle('seinfeld_scripts_clean.pkl')

### Read Pickle File

In [4]:
df13 = pickle.load(open('seinfeld_scripts_clean.pkl','rb'))

### Further processing

In [None]:
import string

In [None]:
# Do more standard preproc now!
# Remove the x\'s
# Remove "AUDIENCE" lines

# nltk ipynb stuff?: normalization, punc removal, tokenization (pretty sure yes)

In [6]:
char_x = ['\\x' in char for char in df13.character]
line_x = ['\\x' in line for line in df13.line]

In [7]:
# I'm skeptical but let's move on I suppose
print(sum(char_x))
print(sum(line_x))

0
0


In [8]:
rename = ['CLERK' in char and 'VOICE' in char for char in df13.character]
df13[rename].character

659    CLERKS VOICE
Name: character, dtype: object

In [9]:
rename = ['JERRY' in char and 'MESSAGE' in char for char in df13.character]
df13[rename].character

414             JERRYS MESSAGE
711    JERRYS OUTGOING MESSAGE
Name: character, dtype: object

In [10]:
# Only want to remove the first two of these
rename = ['AUDIENCE' in char for char in df13.character]
df13.reset_index().drop('index', axis = 1)[rename]

Unnamed: 0,ep_num,scene_num,line_num,character,line
212,2,2,2,AUDIENCE,Cheque.
7884,29,10,3,AUDIENCE,Oooooh.
46809,157,22,5,MAN IN AUDIENCE,I have cancer!
49494,165,29,2,AUDIENCE,"Hi, Jason."


In [11]:
df14 = (df13
        .reset_index()
        .drop('index', axis = 1)
        .drop([212, 7884]))

In [12]:
# Only want to remove the first two of these
rename = ['AUDIENCE' in char for char in df14.character]
df14[rename]

Unnamed: 0,ep_num,scene_num,line_num,character,line
46809,157,22,5,MAN IN AUDIENCE,I have cancer!
49494,165,29,2,AUDIENCE,"Hi, Jason."


In [13]:
# Let's remove scenes with only one character in them
# These are presumably most Jerry's stand-up scenes
multi_char_scenes = (df14
          .groupby(['ep_num', 'scene_num'])
          .character
          .nunique()
          .reset_index()
          .rename(index = str, columns = {'character' : 'num_chars'})
          .query('num_chars != 1'))
multi_char_scenes.head()

Unnamed: 0,ep_num,scene_num,num_chars
1,1,2,4
2,1,3,2
4,1,5,2
5,1,6,4
7,1,8,5


In [14]:
df15 = pd.merge(df14, multi_char_scenes[['ep_num', 'scene_num']], how = 'inner', on = ['ep_num', 'scene_num'])
df15.head()

Unnamed: 0,ep_num,scene_num,line_num,character,line
0,1,2,1,JERRY,"See, to me, that button is in the worst possib..."
1,1,2,2,GEORGE,Are you through?
2,1,2,3,JERRY,"You do of course try on, when you buy?"
3,1,2,4,GEORGE,"Yes, it was purple, I liked it, I dont actual..."
4,1,2,5,JERRY,"Oh, you dont recall?"


In [15]:
for line in df15.head(10).line:
    print(line)

See, to me, that button is in the worst possible spot. The second button literally makes or breaks the shirt, look at it. Its too high! Its in no-mans-land. You look like you live with your mother.
Are you through?
You do of course try on, when you buy?
Yes, it was purple, I liked it, I dont actually recall considering the buttons.
Oh, you dont recall?
Uh, no, not at this time.
Well, senator, Id just like to know, what you knew and when you knew it.
Mr. Seinfeld. Mr. Costanza.
Are, are you sure this is decaf? Wheres the orange indicator?
Its missing, I have to do it in my head: decaf left, regular right, decaf left, regular right...its very challenging work.


In [16]:
# Filter out empty lines
nonempty_lines = [len(line) != 0 for line in df15.line]
df16 = df15[nonempty_lines]

print(df15.shape)
print(df16.shape)

(53373, 5)
(53327, 5)


In [None]:
# One issue is when a character is quoting someone else

In [17]:
line_end_chars = {line[-1] for line in df16.line}
len(line_end_chars)

53

In [18]:
df17 = df16.assign(line = df16.line.str.strip())

In [29]:
# If final character in a line is a '/', it should be a '?'
mask = df17.line.apply(lambda x: x[-1] == '/')
df17.line[mask]

9212         Huh. Well this is a little awkward, isn't it/
9260                                   Hello. How are you/
39447    Why don't you just cut the stupid pants and ge...
Name: line, dtype: object

In [33]:
df18 = (df17
       .assign(line = df17.line.apply(lambda x: x[:-1] + '?' if x[-1] == '/' else x)))

In [34]:
df18[mask]

Unnamed: 0,ep_num,scene_num,line_num,character,line
9212,34,17,43,ELAINE,"Huh. Well this is a little awkward, isn't it?"
9260,35,4,2,JERRY,Hello. How are you?
39447,133,21,2,MICKEY,Why don't you just cut the stupid pants and ge...


In [36]:
mask = df18.line.apply(lambda x: '..' in x)
df18.line[mask].head()

9     Its missing, I have to do it in my head: deca...
17    I thought I told you about it, yes, she teache...
19                     Theres no milk in here, what...
20     Wait wait wait, what is she... What is she like?
21    Oh, shes really great. I mean, shes got like...
Name: line, dtype: object

In [39]:
# Reduce multiple dots into a period
temp = df18.line[mask].iloc[0]
temp

'It\x92s missing, I have to do it in my head: decaf left, regular right, decaf left, regular right...it\x92s very challenging work.'

In [45]:
rx = re.compile(r'(\.)\1{2,}')
print(rx.search(temp))
print(temp)
print(re.sub(rx, ".",  temp))

<_sre.SRE_Match object; span=(94, 97), match='...'>
Its missing, I have to do it in my head: decaf left, regular right, decaf left, regular right...its very challenging work.
Its missing, I have to do it in my head: decaf left, regular right, decaf left, regular right.its very challenging work.


In [46]:
# Replace repeated periods with single periods
dots = re.compile(r'(\.)\1{2,}')
df19 = (df18
       .assign(line = df18.line.apply(lambda x: re.sub(dots, '.', x))))

In [71]:
printable = set(string.printable)
pprint(''.join(filter(lambda x: x in printable, temp)))

('Its missing, I have to do it in my head: decaf left, regular right, decaf '
 'left, regular right.its very challenging work.')


In [72]:
# Remove non-ascii characters (e.g. '\x92')
def clean_nonascii(s):
    printable = set(string.printable)
    s_clean = ''.join(filter(lambda char: char in printable, s))
    return s_clean

df20 = (df19
       .assign(line = df19.line.apply(clean_nonascii)))

In [97]:
# Extract first and last sentences
def first_sentence(s, stop_seq = 'XQX'):
    if ('.' not in s) and ('?' not in s) and ('!' not in s):
        return s
    
    s2 = s.replace('.', stop_seq).replace('?', stop_seq).replace('!', stop_seq)
    first_stop = s2.find(stop_seq)
    return s[:first_stop+1].strip()

tests = ['', 'hi', 'hi.' ,'hello. how are you?', df20.line[mask].iloc[0]]
[first_sentence(test) for test in tests]

['',
 'hi',
 'hi.',
 'hello.',
 'Its missing, I have to do it in my head: decaf left, regular right, decaf left, regular right.']

In [119]:
# Extract first and last sentences
def last_sentence(s, stop_seq = 'XQX'):
    if ('.' not in s) and ('?' not in s) and ('!' not in s):
        return s
    
    if s.count('.') + s.count('?') + s.count('!') == 1:
        return s
    
    s2 = s.replace('.', stop_seq).replace('?', stop_seq).replace('!', stop_seq)[::-1]
    first_stop = s2.find(stop_seq)
    sec_stop = s2[first_stop+1:].find(stop_seq) - len(stop_seq) + 1
    return s[::-1][:sec_stop+1].strip()[::-1]
    
tests = ['', 'hi', 'hi.' ,'hello. how are you?', df20.line[mask].iloc[0], df20.line.iloc[0]]
[last_sentence(test) for test in tests]

['',
 'hi',
 'hi.',
 'how are you?',
 'its very challenging work.',
 'You look like you live with your mother.']

In [101]:
df21.line.iloc[0]

'See, to me, that button is in the worst possible spot. The second button literally makes or breaks the shirt, look at it. Its too high! Its in no-mans-land. You look like you live with your mother.'

In [120]:
# Add first and last sentences to the dataframe
df21 = (df20
       .assign(first_sentence = df20.line.apply(first_sentence),
               last_sentence = df20.line.apply(last_sentence)))

In [121]:
df21.head()

Unnamed: 0,ep_num,scene_num,line_num,character,line,first_sentence,last_sentence
0,1,2,1,JERRY,"See, to me, that button is in the worst possib...","See, to me, that button is in the worst possib...",You look like you live with your mother.
1,1,2,2,GEORGE,Are you through?,Are you through?,Are you through?
2,1,2,3,JERRY,"You do of course try on, when you buy?","You do of course try on, when you buy?","You do of course try on, when you buy?"
3,1,2,4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...","Yes, it was purple, I liked it, I dont actuall...","Yes, it was purple, I liked it, I dont actuall..."
4,1,2,5,JERRY,"Oh, you dont recall?","Oh, you dont recall?","Oh, you dont recall?"


### Restructure the data into calls and responses

In [126]:
# Create prompt dataframe
df_prompt = (df21.copy()
               .drop(['line', 'first_sentence'], axis = 1)
               .rename(index = str, columns = {"character": "char_prompt", "last_sentence": "prompt"}))
df_prompt.head()

Unnamed: 0,ep_num,scene_num,line_num,char_prompt,prompt
0,1,2,1,JERRY,You look like you live with your mother.
1,1,2,2,GEORGE,Are you through?
2,1,2,3,JERRY,"You do of course try on, when you buy?"
3,1,2,4,GEORGE,"Yes, it was purple, I liked it, I dont actuall..."
4,1,2,5,JERRY,"Oh, you dont recall?"


In [131]:
# Create response dataframe
df_response = (df21.copy()
               .assign(line_num = df21.line_num - 1)
               .drop(['line', 'last_sentence'], axis = 1)
               .rename(index = str, columns = {"character": "char_response", "first_sentence": "response"}))
df_response.head()

Unnamed: 0,ep_num,scene_num,line_num,char_response,response
0,1,2,0,JERRY,"See, to me, that button is in the worst possib..."
1,1,2,1,GEORGE,Are you through?
2,1,2,2,JERRY,"You do of course try on, when you buy?"
3,1,2,3,GEORGE,"Yes, it was purple, I liked it, I dont actuall..."
4,1,2,4,JERRY,"Oh, you dont recall?"


In [132]:
# Merge the prompt and response dataframes
df_pr_1 = df_prompt.merge(df_response, on = ['ep_num', 'scene_num', 'line_num'])
df_pr_1.head()

Unnamed: 0,ep_num,scene_num,line_num,char_prompt,prompt,char_response,response
0,1,2,1,JERRY,You look like you live with your mother.,GEORGE,Are you through?
1,1,2,2,GEORGE,Are you through?,JERRY,"You do of course try on, when you buy?"
2,1,2,3,JERRY,"You do of course try on, when you buy?",GEORGE,"Yes, it was purple, I liked it, I dont actuall..."
3,1,2,4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",JERRY,"Oh, you dont recall?"
4,1,2,5,JERRY,"Oh, you dont recall?",GEORGE,"Uh, no, not at this time."


In [152]:
# Remove unnecessary characters fix contractions.
def clean_text(text):

    text = text.lower().strip()
    
    text = re.sub(r" {2,}", " ", text)
    text = re.sub(r"\n", "",  text)
    text = re.sub(r"[-()]", "", text)
    text = re.sub(r"\.", " .", text)
    text = re.sub(r"\!", " !", text)
    text = re.sub(r"\?", " ?", text)
    text = re.sub(r"\,", " ,", text)
    text = re.sub(r"\*", "", text)
    text = re.sub(r":", "", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"ahh", "ah", text)
    
    return text

In [153]:
# Apply clean_text
df_clean = (df_pr_1
            .assign(prompt = df_pr_1.prompt.apply(clean_text),
                    response = df_pr_1.response.apply(clean_text)))
df_clean.head()

Unnamed: 0,ep_num,scene_num,line_num,char_prompt,prompt,char_response,response
0,1,2,1,JERRY,you look like you live with your mother .,GEORGE,are you through ?
1,1,2,2,GEORGE,are you through ?,JERRY,"you do of course try on , when you buy ?"
2,1,2,3,JERRY,"you do of course try on , when you buy ?",GEORGE,"yes , it was purple , i liked it , i dont actu..."
3,1,2,4,GEORGE,"yes , it was purple , i liked it , i dont actu...",JERRY,"oh , you dont recall ?"
4,1,2,5,JERRY,"oh , you dont recall ?",GEORGE,"uh , no , not at this time ."


In [159]:
# Find the length of lines
# Could probably get away with only doing this for prompts or responses, but did both
line_length_p = df_clean.prompt.apply(lambda x: len(x.split()))
line_length_r = df_clean.prompt.apply(lambda x: len(x.split()))

pprint(line_length_p.describe())
pprint(line_length_r.describe())

count    50371.000000
mean         7.240714
std          5.129954
min          0.000000
25%          4.000000
50%          6.000000
75%          9.000000
max         59.000000
Name: prompt, dtype: float64
count    50371.000000
mean         7.240714
std          5.129954
min          0.000000
25%          4.000000
50%          6.000000
75%          9.000000
max         59.000000
Name: prompt, dtype: float64


In [160]:
# Can keep up to 99th percentile
pprint([np.percentile(line_length_p, perc) for perc in [80, 85, 90, 95, 99]])
pprint([np.percentile(line_length_r, perc) for perc in [80, 85, 90, 95, 99]])

[10.0, 12.0, 14.0, 17.0, 26.0]
[10.0, 12.0, 14.0, 17.0, 26.0]


In [162]:
# Apply max length limit
max_line_length = 26

shorter_lines = [(p <= max_line_length) & (r <= max_line_length) for p, r in zip(line_length_p, line_length_r)]
df_short = df_clean[shorter_lines]

In [163]:
print(df_clean.shape)
print(df_short.shape)

(50371, 7)
(49928, 7)


### Continue with only Kramer responses

In [184]:
df_kramer_01 = df_short[df_short.char_response == 'KRAMER']
df_kramer_01

Unnamed: 0,ep_num,scene_num,line_num,char_prompt,prompt,char_response,response
137,1,6,30,GEORGE,did you need something .,KRAMER,do you handle any of that commercial .
437,2,9,6,JERRY,"you cant look in there , were playing !",KRAMER,hi .
440,2,9,9,MORTY,kramer !,KRAMER,hey morty !
446,2,9,15,JERRY,"dad , shes cheating !",KRAMER,quo ?
470,2,9,39,HELEN,32 .,KRAMER,"no , you dont have to challenge that ."
472,2,9,41,JERRY,i am challenging .,KRAMER,quone .
478,2,9,47,HELEN,why did you make me put that down ?,KRAMER,"nah , we need a medical dictionary !"
590,3,3,10,JERRY,!,KRAMER,"uh , jer , well you know , i was cookin and i ..."
592,3,3,12,JERRY,"wait , you left the lock open or the door open ?",KRAMER,the door .
594,3,3,14,JERRY,you left the door open ?,KRAMER,"yeah , well , i was gonna bring the spatula ri..."


In [181]:
# Create a dictionary for the frequency of the vocabulary
vocab = {}
for line in df_kramer_01.response:
    for word in line.split():
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1

CPU times: user 32.4 ms, sys: 12.7 ms, total: 45.1 ms
Wall time: 62.3 ms


In [182]:
# Limit the vocabulary to words used more than 3 times.
threshold = 3
count = 0
for k,v in vocab.items():
    if v >= threshold:
        count += 1

In [185]:
print("Size of total vocab:", len(vocab))
print("Size of vocab to use:", count)

Size of total vocab: 4050
Size of vocab to use: 1032


In [186]:
# Create dictionaries to provide a unique integer for each word.
word2int = {}

word_num = 0
for k,v in vocab.items():
    if v >= threshold:
        word2int[k] = word_num
        word_num += 1


In [187]:
# Add the unique tokens to the vocabulary dictionaries.
codes = ['<PAD>','<EOS>','<UNK>','<GO>']

for code in codes:
    word2int[code] = len(word2int)+1


In [188]:
# Create inverse dictionary of word2int: int2word.
int2word = {val: key for key, val in word2int.items()}

In [189]:
# Check the length of the dictionaries.
print(len(word2int))
print(len(int2word))

1036
1036


In [None]:
# Convert the text to integers. 
# Replace any words that are not in the respective vocabulary with <UNK> (unknown)

# Convert to list comprehension?
# Create function then apply to df col?
source_int = []
for line in df_kramer_01.prompt:
    sentence = []
    for word in line.split():
        if word not in word2int:
            sentence.append(word2int['<UNK>'])
        else:
            sentence.append(word2int[word])
    source_int.append(sentence)
    
target_int = []
for line in df_kramer_01.response:
    sentence = []
    for word in line.split():
        if word not in word2int:
            sentence.append(word2int['<UNK>'])
        else:
            sentence.append(word2int[word])
    target_int.append(sentence)

In [None]:
# Check the lengths # necessary?
print(len(source_int))
print(len(target_int))