## Preprocess Scripts from transcripts.wikia

In [9]:
import os
import re
import nltk
import itertools
import csv


# set working directory
wd = os.path.join(os.getcwd(), 'marvel_scripts')
print wd, '\n'

# view script files
files = os.listdir(wd)
print files, '\n'

# get files for scripts from transcripts.wikia
scripts_tw = [file[:-4] for file in files if file.endswith('_tw.txt')]
print scripts_tw, '\n'

# get files for scripts from IMSDB
scripts_imsdb = [file[:-4] for file in files if file.endswith('_imsdb.txt')]
print scripts_imsdb, '\n'

# preview scripts
for s in scripts_tw + scripts_imsdb:
    with open(os.path.join(wd, s + '.txt')) as script:
        head = [next(script) for x in xrange(10)]
    print s, '\n', head, '\n'


/Users/winlin/Desktop/w266-nlp/w266_project/marvel_scripts 

['.DS_Store', 'ant-man_tw.txt', 'avengers_age_of_ultron_tw.txt', 'captain_america_civil_war_tw.txt', 'captain_america_the_first_avenger_tw.txt', 'captain_america_the_winter_soldier_tw.txt', 'fantastic_four_imsdb.txt', 'ghost_rider_imsdb.txt', 'guardians_of_the_galaxy_tw.txt', 'Icon\r', 'iron_man_3_tw.txt', 'lego_marvel_super_heroes_tw.txt', 'no_char_tags', 'spider-man_imsdb.txt', 'the_amazing_spider-man_2_tw.txt', 'the_amazing_spider-man_tw.txt', 'the_avengers_tw.txt', 'the_wolverine_tw.txt', 'thor_the_dark_world_tw.txt', 'thor_tw.txt', 'x-men_apocalypse_tw.txt', 'x-men_days_of_future_past_tw.txt', 'x-men_first_class_tw.txt', 'x-men_imsdb.txt', 'x-men_origins_wolverine_imsdb.txt', 'x-men_the_last_stand_tw.txt'] 

['ant-man_tw', 'avengers_age_of_ultron_tw', 'captain_america_civil_war_tw', 'captain_america_the_first_avenger_tw', 'captain_america_the_winter_soldier_tw', 'guardians_of_the_galaxy_tw', 'iron_man_3_tw', 'lego_marvel

## Helper Functions

In [10]:
# encoding: utf-8

### preprocess scripts from transcripts.wikia

### special cases:
# x-men_the_last_stand_tw, x-men_apocalypse_tw
# - new lines after each sentence within dialogue
# - blank lines between dialogues 

# the_amazing_spider-man_2_tw, avengers_age_of_ultron_tw
# - narration in same line as dialogue


### preprocess_line()
# - convert to lowercase
# - remove special chars
# - add narrator as speaker for scene descriptions (in bracketed text)
# - return list with speaker and dialogue
def preprocess_line(line):
    """Normalise one raw transcript line.

    The line is lowercased and every asterisk, apostrophe and newline
    character is removed.

    Returns:
        ``['narrator', text]`` when is_narration() accepts the cleaned line
        (all square brackets are stripped from ``text``);
        ``[speaker, dialogue]`` when the line contains ``': '``;
        ``[text]`` (a one-element list) otherwise, i.e. a fragment with no
        speaker tag, or ``['']`` for a blank line.
    """
    # convert to lowercase
    line = line.lower()

    # remove special chars
    for char in ('*', '\'', '\n'):
        line = line.replace(char, '')

    # return narrator dialogue, without the brackets
    if is_narration(line):
        return ['narrator', line.replace('[', '').replace(']', '')]

    # return character dialogue; split on the first ': ' only so that a
    # later ': ' inside the spoken text stays part of the dialogue instead
    # of producing a list with more than two elements
    return line.split(': ', 1)


### is_dialogue() - checks to see if line contains :, denoting dialogue
def is_dialogue(line):
    """Return True when ``line`` contains a colon anywhere, else False."""
    found = re.search(r'[:]', line)
    return found is not None


### is_narration() - checks to see if line starts with [, denoting narration
def is_narration(line):
    """Return True when ``line`` begins with an opening square bracket."""
    found = re.search(r'^[\[]', line)
    return found is not None


### merge_dialogue() - merges fragments of dialogue from preprocessed script
def merge_dialogue(lines):
    """Merge dialogue fragments into one entry per speaker turn.

    Args:
        lines: sequence of preprocessed lines. A two-element item
            ``[speaker, dialogue]`` starts a new turn; a one-element item
            ``[fragment]`` is appended (space-separated) to the current
            turn. Items of any other length are ignored.

    Returns:
        List of ``[speaker, dialogue]`` lists in input order. Fragments seen
        before the first speaker are discarded, and turns whose speaker is
        the empty string are not emitted.
    """
    merged = []
    speaker, dialogue = '', ''

    for line in lines:
        if len(line) == 2:
            # start of a new turn: store the one collected so far
            if speaker != '':
                merged.append([speaker, dialogue])
            speaker, dialogue = line[0], line[1]
        elif len(line) == 1:
            # append fragment to current dialogue
            dialogue += ' ' + line[0]

    # store the final turn, which no following speaker line would flush
    if speaker != '':
        merged.append([speaker, dialogue])
    return merged


### print_script() - print specific lines of script
def print_script(title, script, start=0, end=1):
    """Print ``title`` followed by entries ``start`` through ``end`` of ``script``.

    Args:
        title: heading printed before the entries.
        script: indexable sequence of entries (each is printed as-is).
        start: index of the first entry to print.
        end: index of the last entry to print (inclusive).

    The upper bound is clamped to the length of ``script`` so that a script
    with fewer than ``end + 1`` entries is printed in full instead of
    raising IndexError.
    """
    # single-argument print(...) behaves the same as the print statement
    print(title)
    for i in range(start, min(end + 1, len(script))):
        print(script[i])


## Preprocess Scripts

In [11]:
# dict for preprocessed scripts: key = file, value = preprocessed script [character, dialogue]
scripts_prep = {}

# preprocess scripts
for s in scripts_tw:
    with open(os.path.join(wd, s + '.txt')) as script:

        # store and preprocess lines, remove 1st element (embed image)
        lines = script.readlines()
        lines = [preprocess_line(line) for line in lines][1:]
        
        # remove blank lines
        lines = [line for line in lines if line != ['']]
        
        # merge dialogues
        lines = merge_dialogue(lines)
        
    # add lines to script dict
    scripts_prep[s] = lines


# print first n lines of 
start, end = 0, 20
for k, v in scripts_prep.items():
    print_script(k, v, start, end)
    print '\n'

# set working directory
wd = os.path.join(os.getcwd(), 'prep_scripts')
print wd, '\n'

# write script to CSV files
for k, v in scripts_prep.items():
    with open(os.path.join(wd, k + '.csv'), 'wb') as f:
        writer = csv.writer(f)
        writer.writerows(v)

x-men_the_last_stand_tw
['eric', 'i still dont know why im here. couldnt you just make them say yes?']
['charles', 'yes, i could, but its not my way. and i would expect you, of all people, would understand my feelings on misuse of power.']
['eric', 'ah, power corrupts and all that. yes, i know, charles. - when are you going to stop lecturing me?']
['charles', 'when you start listening. youre here because i need you.']
['eric', 'we dont have to meet every one of them in person?']
['charles', 'no. this ones special.']
['mrs. grey', 'what a beautiful campus. john, dont you think?']
['mr. grey', 'yeah, the brochure is great. but what about jean? what about her illness?']
['eric', 'illness?']
['mrs. grey', 'john.']
['eric', 'you think your daughter is sick, mr. grey?']
['charles', 'perhaps it would be best if we were to speak to her. alone.']
['mrs. grey', 'of course. jean, can you come down, dear?']
['mr. grey', 'well leave you, then.']
['charles', 'its very rude to read my thoughts, or mr