# Seinfield Analysis:   Data Acquisition

## DATA 620 Web Analytics, CUNY Spring 2018

__Team:__ Andy Carson, Nathan Cooper, Walt Wells

### Data Source:  Seinology Scripts

http://www.seinology.com/scripts.shtml

### Scrape Inspirations

* https://github.com/amanthedorkknight/the-seinfeld-chronicles
* https://data.world/rickyhennessy/seinfeld-scripts/workspace/file?filename=seinfeld_scrape.py

### What else did we do?

In order to facilitate SNA, we extracted additional information regarding scenes and dialogue order in scenes.   The data is organized and saved as follows:  

*  Seinfield_Metadata.csv:   Episode metadata - KEY: Ep/SeasonNum, AirDate, EpTitle, Director
*  Seinfield_Cast.csv:   Cast - KEY Ep/SeasonNum,  Actor Name, Character
*  Seinfield_Writers.csv:   Writers - KEY Ep/SeasonNum,  Writer Name
*  Seinfield_Dialogue.csv:   Dialogue - KEY Ep/SeasonNum,  Character,  Text, SceneNum, DialogueIndex 

Of particular interest here is the organization of the Script data.   The data is organized by scene so that SNA could potentially be performed showing when characters are interacting in a scene.   The Index helps show the order of the dialogue in the scene.  It resets for each scene.  

### Episode Details

We extracted all episodes except 100-101 and 177-178 which were clip shows.   They followed a different format and other than some short perfunctory introductions by main characters, their data is all contained in the original episodes.

In [1]:
from bs4 import BeautifulSoup 
import re
import urllib 
import requests
import string
import pandas as pd

In [13]:
base_url = 'http://www.seinology.com/scripts/script-' 
episode_numbers = (
    list(map(lambda n: '%02d' % n, range(1, 82))) +
    ['82and83'] +
    list(map(lambda n: '%02d' % n, range(84, 100))) +
    list(map(lambda n: '%02d' % n, range(102, 177))) +
    ['179and180']
)

### Get Metadata, Writers, Cast, and Dialogue

In [8]:
def DataGetter(episode_numbers, baseurl):
    '''collect and organize all information around seinfield from seinology.com scripts'''
    
    def get_episode_html(no, base_url):
        ## Helper for getting raw html
        url = base_url + str(no) + '.shtml'
        source_code = requests.get(url)
        html = source_code.text
        return html
    
    # initialize empty objects 
    metadatadf = pd.DataFrame([])
    castdf = pd.DataFrame([])
    writerdf = pd.DataFrame([])
    dialoguedf = pd.DataFrame([])
    
    for episode in episode_numbers:
        
        
        html = get_episode_html(episode, base_url)
        groups = re.search(r'pc: .*? season (\d+), episode (\d+)', html).groups()
        season_num = int(groups[0])
        episode_num = int(groups[1])
        seid = 'S'+ str('%02d' % season_num) + 'E' + str('%02d' % episode_num)
        if episode == '01':
            seid = 'S01E00'
                                                             
        print("Scraping Episode: %s" % seid)                                                    
        
        html_split = re.split(r'={30}.*', html)
        top = html_split[0]
        if episode == '179and180':
            script = html_split[2]
        else:
            script = html_split[1]
        html_split2 = re.split(r'-{30}.*', top)
        cast = html_split2[1]
        
        ### Get Metadata DF
        title = re.search(r'Episode \d+(.*?) - (.*?)<', html).groups()[1]
        title = re.sub(r'[^\x00-\x7f]',r'', title)
        date = re.search(r'Broadcast date: (.*?)<', html).groups()[0]
        director = re.search(r'Directed [bB]y (.*?)<', html).groups()[0]
        
        tempdf = pd.DataFrame({"Season": [season_num], 
                               "Episode": [episode_num],
                               "AirDate": [date], 
                               "Director": [director],
                               "Title": [title],
                               "SEID": [seid]})
        
        metadatadf = metadatadf.append(tempdf, ignore_index=True)
        
        ### Get Writer DF
        writers = re.search(r'Written [bB]y([:]|&nbsp;)? (.*?)<', html).groups()[1]
        writers = ', '.join(tuple([w.strip() for w in re.split(r',|&amp;', writers) if w]))
        writers = ', '.join(tuple([w.strip() for w in re.split(r'\band\b', writers) if w]))
        writers = writers.split(',').strip()
        writers = [''.join(x for x in par if x not in string.punctuation) for par in writers]
        
        tempdf = pd.DataFrame({"Writers": writers})
        tempdf['SEID'] = seid
        
        writerdf = writerdf.append(tempdf, ignore_index=True)
                                                             
        ### Get Cast DF
        castSoup = BeautifulSoup(cast)
        castlist = list(filter(None, castSoup.find('body').text.replace('\t', '').split('\n')))
        castlist = [j for i, j in enumerate(castlist) if j.find('...') > 0]
        
        Actor = []
        Character = []
        for c in castlist:
            pair = c.split('..', 1)
            Actor.append(pair[0].replace(u'\xa0', u' ').replace(u'rc: ', u'').encode('utf-8').strip())
            Character.append(pair[1].replace(u'\xa0', u' ').encode('utf-8').replace('.', '').strip())

        tempdf = pd.DataFrame({"Actor": Actor, "Character": Character})
        tempdf['SEID'] = seid
        
        castdf = castdf.append(tempdf, ignore_index=True)
        
        ### Get Dialogue DF
        soup = BeautifulSoup(script)
        dialogues = list(filter(None, soup.find('body').text.replace('\t', '').split('\n')))
        
        script_df = pd.DataFrame([])
        scene = 0
        dialogueIndex = 1
        sceneID = seid + '_' + str(1)
        
        for dialogue in dialogues:
            if 'The End' in dialogue:
                break
            if episode in episode_numbers[0:11]:
                if dialogue.isupper():
                    scene += 1
                    sceneID = seid + '_' + str(scene)
            else: 
                if '[' in dialogue:
                    scene += 1
                    sceneID = seid + '_' + str(scene)
            if (len(dialogue.split(':')) <= 1):
                continue
            if '[' in dialogue: 
                continue
            if dialogue.isupper():
                continue
            dialogue_split = dialogue.split(':')
            character = dialogue_split.pop(0).encode('utf-8').strip()
            line = ''.join(dialogue_split).strip()
            line = re.sub(r'[^\x00-\x7f]',r'', line).encode('utf-8').strip()
            try: 
                if dialoguedf.SceneNum.iloc[-1] == sceneID:
                    dialogueIndex += 1
                else: 
                    dialogueIndex = 1
            except:
                pass
            tempdf = pd.DataFrame({"Character": [character], 
                                   "Dialogue": [line],
                                   "SEID": seid,
                                   "SceneNum": sceneID,
                                   "DialogueIndex": dialogueIndex})
            dialoguedf = dialoguedf.append(tempdf, ignore_index = True)
        
    return metadatadf, writerdf, castdf, dialoguedf 

In [9]:
metadatadf, writerdf, castdf, dialoguedf = DataGetter(episode_numbers, base_url)

Scraping Episode: S01E00
Scraping Episode: S01E01
Scraping Episode: S01E02
Scraping Episode: S01E03
Scraping Episode: S01E04
Scraping Episode: S02E01
Scraping Episode: S02E02
Scraping Episode: S02E03
Scraping Episode: S02E04
Scraping Episode: S02E05
Scraping Episode: S02E06
Scraping Episode: S02E07
Scraping Episode: S02E08
Scraping Episode: S02E09
Scraping Episode: S02E10
Scraping Episode: S02E11
Scraping Episode: S02E12
Scraping Episode: S03E01
Scraping Episode: S03E02
Scraping Episode: S03E03
Scraping Episode: S03E04
Scraping Episode: S03E05
Scraping Episode: S03E06
Scraping Episode: S03E07
Scraping Episode: S03E08
Scraping Episode: S03E09
Scraping Episode: S03E10
Scraping Episode: S03E11
Scraping Episode: S03E12
Scraping Episode: S03E13
Scraping Episode: S03E14
Scraping Episode: S03E15
Scraping Episode: S03E16
Scraping Episode: S03E17
Scraping Episode: S03E18
Scraping Episode: S03E19
Scraping Episode: S03E20
Scraping Episode: S03E21
Scraping Episode: S03E22
Scraping Episode: S03E23


In [10]:
metadatadf.to_csv('Data/Seinfield_Metadata.csv', index=False)
writerdf.to_csv('Data/Seinfield_Writers.csv', index=False)
castdf.to_csv('Data/Seinfield_Cast.csv', index=False)
dialoguedf.to_csv('Data/Seinfield_Dialogue.csv', index=False)

###  Additional Cleanup

In [14]:
writers = pd.read_csv('Data/Seinfield_Writers.csv')

In [15]:
writers['Writers'] = writers['Writers'].str.strip()

In [16]:
writers['Writers'][writers['Writers'] == 'Larry Charles  Story By Marc Jaffe'] = "Larry Charles"
writers['Writers'][writers['Writers'] == 'Buck Dancer Larry David pseudonym'] = "Larry David"

In [17]:
writers['Writers'].value_counts()

Larry David            58
Peter Mehlman          19
Larry Charles          19
Jerry Seinfeld         18
Alec Berg              14
Jeff Schaffer          14
Tom Gammill            13
Max Pross              13
Andy Robin             13
Gregg Kavet            11
Spike Feresten          9
David Mandel            9
Jennifer Crittenden     6
Carol Leifer            6
Dan OKeefe              5
Steve Koren             5
Marjorie Gross          4
Bill Masters            3
Bruce Kirschbaum        3
Bruce Eric Kaplan       3
Steve ODonnell          2
Elaine Pope             2
Matt Goldman            2
Tom Leopold             2
Larry Levin             2
Bob Shaw                2
Jon Hayman              1
Greg Daniels            1
Andy Cowan              1
Ron Hague               1
Steve Lookner           1
Darin Henry             1
Don McEnery             1
Charlie Rubin           1
Steve Skrovan           1
Jill Franklyn           1
Lawrence H Levy         1
Fred Stoller            1
Sam Kass    

In [18]:
writers.to_csv('Data/Seinfield_Writers.csv', index=False)

In [27]:
metadata = pd.read_csv('Data/Seinfield_Metadata.csv')

In [None]:
metadata['Director'][metadata['Director'] == 'David&nbsp; Steinberg'] = "David Steinberg"

In [32]:
metadata.to_csv('Data/Seinfield_Metadata.csv', index=False)

In [90]:
dialogue = pd.read_csv('Data/Seinfield_Dialogue.csv')

In [77]:
#dialogue = dialogue.drop(dialogue[dialogue['Character']=='(from the movie we hear this dialogue'].index)

In [85]:
dialogue['Character']=dialogue['Character'].str.replace(r"\(.*\)","")

In [89]:
dialogue.to_csv('Data/Seinfield_Dialogue.csv', index=False)

###  Create subset of Dialogue Data

Since this is still a bit messy, let's lop off some of the observations.   Let's just take Characters that spoke more than once.   This removes all the edge cases that didn't parse correctly. 

In [99]:
subset = pd.DataFrame(dialogue['Character'].value_counts())

In [103]:
subset = subset[subset['Character'] > 1]
subsetlist = list(subset.index.values)

In [104]:
subsetlist

['JERRY',
 'GEORGE',
 'ELAINE',
 'KRAMER',
 'NEWMAN',
 'MORTY',
 'HELEN',
 'FRANK',
 'SUSAN',
 'ESTELLE',
 'PETERMAN',
 'PUDDY',
 'WOMAN',
 'MAN',
 'JACK',
 'MICKEY',
 'JERRY ',
 'BANIA',
 'STEINBRENNER',
 'DOCTOR',
 'ELAINE ',
 'WILHELM',
 'CLERK',
 'TIM',
 'LIPPMAN',
 'LEO',
 'LLOYD',
 'KAREN',
 'JACKIE',
 'UNCLE LEO',
 'GEORGE ',
 'WAITRESS',
 'HOYT',
 'CAROL',
 'MIKE',
 'AUDREY',
 'DONNA',
 'JANE',
 'JOEL',
 'NINA',
 'RUSSELL',
 'KRAMER ',
 'KRUGER',
 'MR. ROSS',
 'BABU',
 'RAY',
 'GUY',
 'MARLA',
 'TONY',
 'BOB',
 'MRS. ROSS',
 'RACHEL',
 'DAVID',
 'KEVIN',
 'SALLY',
 'IZZY',
 'MANAGER',
 'COP',
 'MR. PITT',
 'VANESSA',
 'ATTENDANT',
 'POPPIE',
 'MR. LIPPMAN',
 'STEVE',
 'SUE ELLEN',
 'MICHAEL',
 'CYNTHIA',
 'SID',
 'CHERYL',
 'MAESTRO',
 'MERYL',
 'KEITH',
 'GINA',
 'JAKE',
 'CRAIG',
 'JODI',
 'JEAN-PAUL',
 'BEN',
 'SHEILA',
 'CARL',
 'AARON',
 'SOUP NAZI',
 'WENDY',
 'DRIVER',
 'BRETT',
 'FRED',
 'PITT',
 'RABBI',
 'BETH',
 'RICK',
 'Notice',
 'SIDRA',
 'JIMMY',
 'WILLIE',
 'SAL

In [105]:
df = dialogue[dialogue['Character'].isin(subsetlist)]

In [107]:
df.Character.value_counts()

JERRY              14725
GEORGE              9649
ELAINE              7930
KRAMER              6616
NEWMAN               634
MORTY                505
HELEN                468
FRANK                435
SUSAN                375
ESTELLE              286
PETERMAN             191
PUDDY                162
WOMAN                156
MAN                  142
JACK                 124
MICKEY               110
JERRY                104
STEINBRENNER         100
BANIA                100
DOCTOR                91
ELAINE                88
WILHELM               84
CLERK                 81
TIM                   78
LIPPMAN               78
KAREN                 72
LLOYD                 72
LEO                   72
JACKIE                71
UNCLE LEO             70
                   ...  
GROUP                  2
JEANINE                2
SAL BASS               2
AL ROKER               2
NUN                    2
MR.THOMASSOULO         2
C.K.                   2
MOVIE PHONE GUY        2
UNCLE LEO              2


In [108]:
df.to_csv('Data/Seinfield_DialogueSUBSET.csv', index=False)