## Preprocess Scripts from transcripts.wikia

In [15]:
import os
import re
import nltk
import itertools
import csv


# set working directory (folder that holds the raw script text files)
wd = os.path.join(os.getcwd(), 'marvel_scripts')
print('%s \n' % wd)

# view script files
files = os.listdir(wd)
print('%s \n' % (files,))

# get files for scripts from transcripts.wikia
# (names are stored without the '.txt' extension; it is re-added when opening)
scripts_tw = [os.path.splitext(fname)[0] for fname in files if fname.endswith('_tw.txt')]
print('%s \n' % (scripts_tw,))

# get files for scripts from IMSDB
scripts_imsdb = [os.path.splitext(fname)[0] for fname in files if fname.endswith('_imsdb.txt')]
print('%s \n' % (scripts_imsdb,))

# preview scripts: read up to the first 10 lines of each file
for s in scripts_tw + scripts_imsdb:
    with open(os.path.join(wd, s + '.txt')) as script:
        # islice simply stops at end of file, whereas calling next() ten times
        # raised StopIteration on any script shorter than 10 lines
        head = list(itertools.islice(script, 10))
    # uncomment to display the preview:
    # print('%s \n%s \n' % (s, head))


/home/jim_chen/w266_project/marvel_scripts 

['ant-man_tw.txt', 'x-men_the_last_stand_tw.txt', 'the_amazing_spider-man_tw.txt', 'x-men_days_of_future_past_tw.txt', 'avengers_age_of_ultron_tw.txt', 'fantastic_four_imsdb.txt', 'ghost_rider_imsdb.txt', 'captain_america_civil_war_tw.txt', 'thor_the_dark_world_tw.txt', 'the_amazing_spider-man_2_tw.txt', 'iron_man_3_tw.txt', 'thor_tw.txt', 'lego_marvel_super_heroes_tw.txt', 'captain_america_the_winter_soldier_tw.txt', 'spider-man_imsdb.txt', 'the_avengers_tw.txt', 'x-men_first_class_tw.txt', 'x-men_imsdb.txt', 'x-men_origins_wolverine_imsdb.txt', 'captain_america_the_first_avenger_tw.txt', 'no_char_tags', 'guardians_of_the_galaxy_tw.txt', 'the_wolverine_tw.txt', 'x-men_apocalypse_tw.txt'] 

['ant-man_tw', 'x-men_the_last_stand_tw', 'the_amazing_spider-man_tw', 'x-men_days_of_future_past_tw', 'avengers_age_of_ultron_tw', 'captain_america_civil_war_tw', 'thor_the_dark_world_tw', 'the_amazing_spider-man_2_tw', 'iron_man_3_tw', 'thor_tw', 'lego_

## Helper Functions

In [16]:
# encoding: utf-8

### preprocess scripts from transcripts.wikia

### special cases:
# x-men_the_last_stand_tw, x-men_apocalypse_tw
# - new lines after each sentence within dialogue
# - blank lines between dialogues 

# the_amazing_spider-man_2_tw, avengers_age_of_ultron_tw
# - narration in same line as dialogue


### preprocess_line()
# - remove special chars (asterisks, newlines, runs of 2+ apostrophes)
# - add narrator as speaker for scene descriptions (lines starting with '[')
# - return list with speaker and dialogue
def preprocess_line(line):
    """Clean one raw script line and split it into speaker and dialogue.

    Args:
        line: one raw line of a transcript, as a string.

    Returns:
        ['narrator', text] when the cleaned line starts with '[' (all square
        brackets are removed from text); otherwise the cleaned line split on
        the first ': ', i.e. [speaker, dialogue] when a ': ' is present, or a
        one-element list (a dialogue fragment, or [''] for a blank line) when
        it is not.
    """
    # replace special chars
    for c in ['*', '\n']:
        line = line.replace(c, '')
    # drop runs of two or more apostrophes; single apostrophes are kept
    line = re.sub("'[']+", '', line)

    # return narrator dialogue
    if is_narration(line):
        # remove brackets
        for c in ['[', ']']:
            line = line.replace(c, '')
        return ['narrator', line]

    # return character dialogue
    # Split on the first ': ' only: a later ': ' inside the spoken text used to
    # produce 3+ parts, which merge_dialogue() accepts as neither a new turn
    # (2 parts) nor a fragment (1 part), so the whole line was lost.
    return line.split(': ', 1)


### is_dialogue() - checks to see if line contains :, denoting dialogue
def is_dialogue(line):
    """Return True when `line` contains at least one colon, False otherwise."""
    colon_match = re.search(r'[:]', line)
    if colon_match is None:
        return False
    return True


### is_narration() - checks to see if line starts with [, denoting narration
def is_narration(line):
    """Return True when the very first character of `line` is '['."""
    bracket_match = re.search(r'^[\[]', line)
    return bracket_match is not None


### merge_dialogue() - merges fragments of dialogue from preprocessed script
def merge_dialogue(lines):
    """Merge dialogue fragments into the speaker turn they belong to.

    Args:
        lines: list of lists; a two-element entry [speaker, dialogue] starts a
            new turn and a one-element entry [fragment] continues the current
            turn. Entries of any other length are ignored.

    Returns:
        List of [speaker, dialogue] pairs in input order, with each fragment
        appended to the dialogue of its turn, separated by a single space.
        Fragments that come before the first speaker are discarded, and so is
        a turn whose speaker is the empty string.
    """
    merged = []
    speaker, dialogue = '', ''

    for line in lines:
        # if start of dialogue
        if len(line) == 2:

            # the previous turn is complete once a new speaker starts
            if speaker != '':
                merged.append([speaker, dialogue])

            # update speaker, fragment
            speaker, dialogue = line

        # append fragment to current dialogue
        elif len(line) == 1:
            dialogue += ' ' + line[0]

    # Flush the turn still being accumulated when the input ends; without this
    # the final turn of every script was dropped.
    if speaker != '':
        merged.append([speaker, dialogue])

    return merged


### print_script() - print specific lines of script
def print_script(title, script, start=0, end=1):
    """Print `title`, then the entries script[start] .. script[end] inclusive.

    Args:
        title: value printed on the first line.
        script: indexable sequence of entries to print.
        start: index of the first entry to print.
        end: index of the last entry to print (inclusive). Values past the end
            of `script` are clamped to the last entry.
    """
    print(title)
    # Clamp so a short (or empty) script prints what it has instead of raising
    # IndexError.
    last = min(end, len(script) - 1)
    for i in range(start, last + 1):
        print(script[i])


## Preprocess Scripts

In [17]:
# dict for preprocessed scripts: key = file, value = preprocessed script [character, dialogue]
scripts_prep = {}

# Folder holding the raw scripts. The path is rebuilt here rather than read
# from `wd`, because `wd` is repointed at the output folder further down;
# reading from `wd` made a second run of this cell look for the raw scripts
# inside 'prep_scripts'. Keep the folder name in sync with the listing cell.
raw_dir = os.path.join(os.getcwd(), 'marvel_scripts')

# preprocess scripts
for s in scripts_tw:
    with open(os.path.join(raw_dir, s + '.txt')) as script:
        # drop the 1st line (embed image); only the rest is preprocessed
        raw_lines = script.readlines()[1:]

    lines = [preprocess_line(raw) for raw in raw_lines]

    # remove blank lines (preprocess_line returns [''] for them)
    lines = [line for line in lines if line != ['']]

    # merge dialogues
    lines = merge_dialogue(lines)

    # add lines to script dict
    scripts_prep[s] = lines


# print the entries start..end (inclusive) of each preprocessed script
start, end = 0, 20
for k, v in scripts_prep.items():
    # clamp `end` so a script with fewer than end+1 merged entries does not
    # raise IndexError inside print_script
    print_script(k, v, start, min(end, len(v) - 1))
    print('\n')

# set working directory (output folder for the preprocessed CSV files)
wd = os.path.join(os.getcwd(), 'prep_scripts')
print('%s \n' % wd)

# create the output folder on first run; open() below fails if it is missing
if not os.path.isdir(wd):
    os.makedirs(wd)

# write script to CSV files, one row per [character, dialogue] pair
for k, v in scripts_prep.items():
    # binary mode, as the Python 2 csv module expects
    with open(os.path.join(wd, k + '.csv'), 'wb') as f:
        writer = csv.writer(f)
        writer.writerows(v)

thor_the_dark_world_tw
['Odin', '[voice over] Long before the birth of light there was darkness, and from that darkness, came the Dark Elves. Millennia ago, the most ruthless of their kind, Malekith, sought to transform our universe back into one of eternal night.']
['narrator', 'Malekith is looking up at the Convergence']
['Odin', 'Such evil was possible through the power of the Aether, an ancient force of infinite destruction.']
['narrator', 'the Aether is shown']
['Odin', 'The noble armies of Asgard, led by my father, King Bor, waged a mighty war against these creatures.']
['narrator', 'the Dark Elves and Asgardians are shown fighting']
['narrator', 'Kurse walks up to Malekith']
['Kurse', 'Malekith! Asgardian forces are upon us.']
['narrator', 'the Bifrost opens, and bor steps out with Asgardian reinforcements']
['Malekith', 'Send the Kursed.']
['narrator', 'some soldiers crush objects in their hands and become giant hulk-like creatures']
['narrator', 'Malekith looks up at the Conve