In [None]:
#Cloud Object Storage setup    
import ibm_boto3
from ibm_botocore.client import Config
import os
import json
import warnings
import time

# Create IBM Cloud Object Storage resource. 

****** WARNING ******
It includes your credentials.
You might want to remove those credentials before you share your notebook.
**********************



In [None]:
cos_credentials = {
  "apikey": "",
  "cos_hmac_keys": {
    "access_key_id": "",
    "secret_access_key": ""
  },
  "endpoints": "https://cos-service.bluemix.net/endpoints",
  "iam_apikey_description": "Auto generated apikey during resource-key operation for Instance - crn:v1:bluemix:public:cloud-object-storage:global:a/75029dd30d1da5f954d14288c5faff73:1cf51121-8381-4bc7-a2e5-94147e0b46c2::",
  "iam_apikey_name": "auto-generated-apikey-c91c0624-3d9c-4efd-822c-94781004c258",
  "iam_role_crn": "crn:v1:bluemix:public:iam::::serviceRole:Writer",
  "iam_serviceid_crn": "crn:v1:bluemix:public:iam-identity::a/75029dd30d1da5f954d14288c5faff73::serviceid:ServiceId-cd4d53b8-962e-4527-871e-e760d10aad32",
  "resource_instance_id": "crn:v1:bluemix:public:cloud-object-storage:global:a/75029dd30d1da5f954d14288c5faff73:1cf51121-8381-4bc7-a2e5-94147e0b46c2::"
}

# You need to save the apikey and resource_instance_id to create the COS resource object later.
# Prefer values supplied through environment variables so the secret never has to be
# pasted into (and accidentally shared with) the notebook; fall back to the dict above.
api_key = os.environ.get('COS_API_KEY') or cos_credentials['apikey']
service_instance_id = os.environ.get('COS_RESOURCE_INSTANCE_ID') or cos_credentials['resource_instance_id']

# Define endpoint information.
service_endpoint = 'https://s3-api.us-geo.objectstorage.softlayer.net'
# Define the authorization endpoint.
auth_endpoint = 'https://iam.bluemix.net/oidc/token'
# Create a COS resource.
cos = ibm_boto3.resource ('s3',
                         ibm_api_key_id=api_key,
                         ibm_service_instance_id=service_instance_id,
                         ibm_auth_endpoint=auth_endpoint,
                         config=Config(signature_version='oauth'),
                         endpoint_url=service_endpoint)

# Download Cornell Movie-Dialogs Corpus

In [None]:
#Download training data
!pip install wget --upgrade
import wget

# Cornell Movie-Dialogs Corpus
link = 'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip'
data_dir = 'cornell'
if not os.path.isdir(data_dir):
    os.mkdir(data_dir)

if not os.path.isfile(os.path.join(data_dir, os.path.join(link.split('/')[-1]))):
    wget.download(link, out=data_dir)  
        
!ls cornell         

In [None]:
!unzip cornell/cornell_movie_dialogs_corpus.zip -d cornell_movie_dialogs_corpus

In [None]:
!ls cornell_movie_dialogs_corpus

In [None]:
!cat 'cornell movie-dialogs corpus'/README.txt

In [None]:
def upload_file_cos(data_dir, local_file_name, key,
                    bucket_name='chandlerping-donotdelete-pr-ggdhghghghggfg'):
    """Upload a local file to a Cloud Object Storage bucket, then list the bucket.

    Parameters
    ----------
    data_dir : str
        Directory that contains the file ('' for the current directory).
    local_file_name : str
        Name of the file inside ``data_dir``.
    key : str
        Object key under which the file is stored in the bucket.
    bucket_name : str, optional
        Target bucket. Defaults to the bucket this notebook originally used.

    Returns
    -------
    None
        Progress and any error are reported with ``print``; exceptions are
        not propagated to the caller.
    """
    try:
        local_path = os.path.join(data_dir, local_file_name)
        bucket_obj = cos.Bucket(bucket_name)
        # upload_file is given the path; no separate file handle is needed.
        bucket_obj.upload_file(local_path, key)
        print('{} is uploaded.'.format(local_file_name))
        for obj in bucket_obj.objects.all():
            print('Object key: {}'.format(obj.key))
            print('Object size (kb): {}'.format(obj.size/1024))
    except Exception as e:
        # Report the concrete exception type instead of the generic base class.
        print(type(e).__name__, e)
    else:
        print('File Uploaded')

In [None]:
upload_file_cos('cornell_movie_dialogs_corpus/cornell movie-dialogs corpus', 'movie_conversations.txt', 'movie_conversations.txt')

In [None]:
upload_file_cos('cornell_movie_dialogs_corpus/cornell movie-dialogs corpus', 'movie_lines.txt', 'movie_lines.txt')

# Create training set from Cornell Movie-Dialogs Corpus (uploaded in COS)

In [None]:
#Access objects from cloud storage
import sys
import types
import pandas as pd

# clean up movie_conversations
file = cos.Object('chandlerping-donotdelete-pr-dgfhghhhm','movie_conversations.txt')

In [None]:
# Kaggle (which also hosts this dataset) lists the encoding as ISO-8859-2; the decode below uses ISO-8859-1 (Latin-1) -- TODO confirm which one is intended.
body = file.get()['Body'].read().decode("ISO-8859-1")

In [None]:
# This is the format : u0 +++$+++ u2 +++$+++ m0 +++$+++ [‘L194’, ‘L195’, ‘L196’, ‘L197’]
#"+++$+++" :the separator
#u0, u2 : the IDs of the two characters involved in the discussion, 
#m0 : the ID of the movie 
#[‘L194’, ‘L195’, ‘L196’, ‘L197’] : the list of sentences IDs (or utterances, to be more precise) in chronological order
#What we need is a clean list of sentence IDs
lines = body.split('\n')
lines

In [None]:
import re
#conversations_chunks = [line.split(" +++$+++ ")[-1] for line in lines]
#conversations_chunks = [line[34:].strip(']').replace("'", "").replace(",", "").split(" ") for line in lines]
conversations_chunks = [re.sub(r'u.+\[','',line).strip(']').replace("'", "").replace(",", "").split(" ") for line in lines]
#conversations_chunks = [chunk.strip("[").strip("]").split(',') for chunk in conversations_chunks]
conversations_chunks

In [None]:
conversations_chunks[56]

In [None]:
# clean up movie lines
file = cos.Object('chandlerping-donotdelete-pr-5fdfgfgfhgh','movie_lines.txt')
body = file.get()['Body'].read().decode("ISO-8859-1")
lines = body.split('\n')

In [None]:
lines

In [None]:
lines_chunks = [line.split(" +++$+++ ") for line in lines]
lines_chunks

In [None]:
#strip() with no arguments (or None as the first argument) removes all whitespace at the start and end, including spaces, tabs, newlines and carriage returns. 
#Leaving it in doesn't do any harm, and allows your program to deal with unexpected extra whitespace inserted into the file.
lines_dict = {line[0]: line[-1].strip() for line in lines_chunks}

In [None]:
lines_dict['L1045']

In [None]:
# Pair every line of a conversation with the line that follows it:
# the earlier line is the utterance, the later one the response.
utterances = []
responses = []
for conversation in conversations_chunks:
    for current_id, next_id in zip(conversation, conversation[1:]):
        utterances.append(lines_dict[current_id])
        responses.append(lines_dict[next_id])

In [None]:
print(utterances[0])
print(responses[0])

In [None]:
lines_dict['L194']

In [None]:
lines_dict['L195']

In [None]:
lines_dict['L196']

In [None]:
lines_dict['L197']

In [None]:
print(utterances[1])
print(responses[1])

In [None]:
print(utterances[2])
print(responses[2])

In [None]:
print(utterances[3])
print(responses[3])

In [None]:
lines_dict['L200']

In [None]:
lines_dict['L201']

In [None]:
lines_dict['L202']

In [None]:
lines_dict['L203']

In [None]:
lines_dict['L204']

In [None]:
lines_dict['L205']

In [None]:
lines_dict['L206']

In [None]:
#import os
#data_dir = 'cornell'
#if not os.path.isdir(data_dir):
#    os.mkdir(data_dir)
#utter_cornell = open('utter_corn.txt','w')
#utter_response = open('respo_corn.txt','w')

In [None]:
print(len(utterances))
print(len(responses))

In [None]:
#for i in range(len(utterances)):
#    utter_cornell.write(utterances[i]+'<EOS>')
#    utter_response.write(responses[i]+ '<EOS>')

In [None]:
#upload_file_cos('', 'utter_corn.txt', 'utter_corn.txt')

In [None]:
#upload_file_cos('', 'respo_corn.txt', 'respo_corn.txt')

In [None]:
#file = cos.Object('chandlerping-donotdelete-pr-5d1gylpa2fimey','utter_corn.txt')

In [None]:
#body = file.get()['Body'].read().decode("ISO-8859-2")

In [None]:
#lines = body.split('\n')
#lines

In [None]:
#file = cos.Object('chandlerping-donotdelete-pr-5d1gylpa2fimey','respo_corn.txt')

In [None]:
#body = file.get()['Body'].read().decode("ISO-8859-2")

In [None]:
#lines = body.split('\n')
#lines

# Data Collection - Sitcom Scripts

In [None]:
import bs4 #this is beautiful soup
import requests
import pandas as pd
import re

def get_tree(url, parser=None, timeout=30):
    """
    Fetch ``url`` and return its BeautifulSoup parse tree.

    Parameters
    ----------
    url : str
        Page to download.
    parser : str or None, optional
        Parser name handed to BeautifulSoup. ``None`` keeps BeautifulSoup's
        own default choice, as before.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the scrape indefinitely.

    Returns
    -------
    bs4.BeautifulSoup or None
        The parse tree, or ``None`` if the request failed (the error is printed).
    """
    # using 'requests' instead of urllib coz it takes care of bad url encoding
    try:
        source = requests.get(url, timeout=timeout)
        source.raise_for_status()
    except requests.exceptions.RequestException as err:
        # RequestException is the base class of HTTPError, ConnectionError,
        # Timeout and the invalid-URL errors, so every request failure
        # follows the same "print and return None" path.
        print(err)
        return None
    bs_tree = bs4.BeautifulSoup(source.text, parser)
    return bs_tree

In [None]:
def extract_dialogues_seinfield(text, scene_id):
    """Extract (character, line, scene) rows from one Seinfeld script page.

    Parameters
    ----------
    text : sequence
        Parsed elements; only ``text[0].contents[0]`` is read, and its string
        form is expected to hold the script with ``<br/>`` between lines.
    scene_id : int
        Running scene counter carried over from the previous episode.

    Returns
    -------
    (list, int)
        ``rows`` as ``[character, spoken_text, scene_id]`` lists, and the
        updated scene counter.

    Raises
    ------
    IndexError
        If ``text`` is empty or its first element has no contents.
    """
    raw = text[0].contents
    script_lines = str(raw[0]).replace("\t", "").replace("\n", "").split('<br/>')
    script_lines = [x for x in script_lines if x]  # remove empty strings from the list
    rows = []
    dialogue_pattern = re.compile('[A-Z]*:')  # lines always start with character name in Upper case followed by ':'
    scene_change_pattern = re.compile('INT.*')  # scenes always start with 'INT.*'
    for line in script_lines:
        if scene_change_pattern.match(line):
            scene_id += 1
        if dialogue_pattern.match(line):
            # Split on the FIRST colon only, so colons inside the dialogue
            # (times such as "3:00", etc.) no longer truncate the line.
            character, spoken = line.split(':', 1)
            # Drop parenthesised stage directions.
            spoken = re.sub(r'\([^)]*\)', '', spoken)
            rows.append([character, spoken, scene_id])
    return rows, scene_id

In [None]:
def format_url(sitcom_name, url_format, episode_num):
    """Build the URL of one episode by filling the ``<insert>`` placeholder.

    Parameters
    ----------
    sitcom_name : str
        'Seinfield', 'Fraiser' or 'Friends' (spelled as used in this notebook).
    url_format : str
        URL template containing the literal text ``<insert>``.
    episode_num : str or int
        Value substituted for the placeholder. For 'Fraiser' and 'Friends'
        this is not a number but the variable part of the URL (season and
        episode).

    Returns
    -------
    str or None
        The episode URL, or ``None`` for an unrecognised ``sitcom_name``.
    """
    episode_part = str(episode_num)
    if sitcom_name == 'Seinfield':
        url_parts = url_format.split('<insert>')
        return url_parts[0] + episode_part + url_parts[1]
    # Membership test: the previous `== 'Fraiser' or 'Friends'` was always true.
    if sitcom_name in ('Fraiser', 'Friends'):
        # Plain text replacement; the episode part must not be interpreted
        # as a regular-expression replacement template.
        return url_format.replace('<insert>', episode_part)
    return None

In [None]:
import sys
import time
def scrape_sitcom_scripts(sitcom_name, episodes, url_format):
    '''
    Scrape every episode of a sitcom into one DataFrame.

    sitcom_name : 'Seinfield'/'Friends'/'Fraiser'
    episodes : a list or range of numbers (or URL fragments, see format_url)
    url_format : 'http://www.**********.com/<insert>.html'

    Returns a DataFrame with columns 'Character', 'Line' and 'Scene'. Each
    episode keeps its own 0-based row labels (the frames are concatenated
    without renumbering). Scraping stops at the first episode that cannot
    be fetched; episodes whose parsing raises IndexError are skipped.
    '''
    columns = ['Character', 'Line', 'Scene']
    frames = []
    scene_id = 0
    for episode_num in episodes:
        # Start every episode with an empty result so a failure can never
        # re-use (or reference before assignment) the previous episode's rows.
        rows = []
        try:
            print(episode_num)
            url = format_url(sitcom_name, url_format, episode_num)
            bs_tree = get_tree(url)
            if bs_tree is None:
                print("error fetching :", url)
                break
            if sitcom_name == 'Seinfield':
                rows, scene_id = extract_dialogues_seinfield(bs_tree.findAll("p"), scene_id)
            elif sitcom_name == 'Fraiser':
                rows, scene_id = extract_dialogues_fraiser(bs_tree.findAll('pre'), scene_id)
            elif sitcom_name == 'Friends':
                rows, scene_id = extract_dialogues_friends(bs_tree.findAll("p"), scene_id)
        except IndexError as err:
            print("could not parse episode {}: {}".format(episode_num, err))
        if rows:
            frames.append(pd.DataFrame(rows, columns=columns))
        # Be polite to the server between requests.
        time.sleep(2)
    if not frames:
        # Nothing scraped: return an empty, correctly labelled frame
        # instead of failing on the column assignment.
        return pd.DataFrame(columns=columns)
    # One concat at the end replaces the repeated DataFrame.append calls.
    return pd.concat(frames)

## Seinfeld - scraping 180 episodes with BeautifulSoup and regular expressions

In [None]:
#Scrape http://www.seinology.com for 180 episodes of Seinfield 
l1  = ['01', '02', '03', '04', '05', '06', '07', '08', '09']
l2 = [ str(i) for i in range(10,82)]
l3 = ['82and83']
l4 = [ str(i) for i in range(84,100)]
l5 = ['100and101']
l6 = [ str(i) for i in range(102,177)]
l7 = ['177and178']
l8 = ['179and180']
episodes = l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8 
df = scrape_sitcom_scripts('Seinfield', episodes, 'http://www.seinology.com/scripts/script-<insert>.shtml')

In [None]:
df

In [None]:
set(df['Character'].values)

In [None]:
df[df['Character'] == 'NOTE']

In [None]:
df = df[df['Character'] != 'NOTE']

In [None]:
set(df['Character'].values)

In [None]:
df.to_csv('Seinfield.txt', header=None, index=None, sep='+', mode='a')   

In [None]:
!head 'Seinfield.txt'

In [None]:
upload_file_cos('', 'Seinfield.txt', 'Seinfield.txt')

## 'Frasier' (Screen Scraping)

In [None]:
import re
bs_tree = get_tree('http://www.kacl780.net/frasier/transcripts/')
episodes = []
if bs_tree is None:
    print("error fetching")
else:
    # Collect the part of every transcript link that follows '/transcripts';
    # it is later substituted into the URL template by format_url.
    for li in bs_tree.findAll('li'):
        li_html = str(li)
        if 'episode' in li_html:
            m = re.search('/transcripts(.+?)"', li_html)
            if m:
                episodes.append(m.group(1))

In [None]:
episodes

In [None]:
def clean_lines(element, end_tag_not_found):
    """Remove bracketed asides and markup fragments from one transcript line.

    The delimiter pairs ``[ ]``, ``( )`` and ``< </`` are processed in that
    order. For each pair that occurs in ``element``:

    * both delimiters present at different positions: everything from the
      start delimiter to the nearest end delimiter is removed;
    * only the end delimiter present (or start and end found at the same
      position): the text up to and including the end delimiter is dropped
      and ``end_tag_not_found`` is cleared;
    * only the start delimiter present: the text from the start delimiter
      onwards is dropped and ``end_tag_not_found`` is set, so the caller
      knows the aside continues on a following line.

    ``<u`` / ``</u`` tags are special-cased: only the tag itself is removed.

    Parameters
    ----------
    element : str
        One line of transcript markup.
    end_tag_not_found : bool
        True when an aside opened on an earlier line has not been closed yet.

    Returns
    -------
    (str, bool)
        The cleaned text and the updated ``end_tag_not_found`` flag.

    Notes
    -----
    The character lookups ``element[find(...) + 1]`` / ``[... + 2]`` are not
    bounds-checked, so a line ending right after ``<`` or ``</`` can raise
    IndexError.
    A second, one-argument ``clean_lines`` is defined later in this notebook
    and replaces this one once that cell has been run.
    """
    #print("inside clean lines")
    tag_found = False
    for start, end  in zip(['[', '(', '<'], [']', ')', '</']):

        # neither delimiter of this pair occurs: nothing to do for it
        if (element.find(start) == -1) and (element.find(end) == -1): continue
        tag_found = True
        if (start == '<' and end == '</'):
            # '<u...' or '</u...': strip just the tag (from '<' / '</' to the first '>')
            if (element.find(start) > -1 and element[element.find(start)+ 1] == 'u') or (element.find(end) > -1 and element[element.find(end)+ 2] == 'u'):
                if element.find(start) > -1 : 
                    element = element[0:element.find(start)] + element[element.find('>')+1:]
                if element.find(end) > -1 : 
                    end_tag_not_found = False
                    element = element[0:element.find(end)] + element[element.find('>')+1:]
                continue
      
        # complete aside on this line: remove it (non-greedy, every occurrence)
        if (element.find(start) > -1) and (element.find(end) > -1): 
            if element.find(start) != element.find(end):
                element = re.sub(r'{}.*?{}'.format(re.escape(start),re.escape(end)),'',element)  
                continue
            
        # only the closing delimiter is here: the aside started on an earlier line
        if (element.find(start) == -1) or (element.find(start) == element.find(end)):
            end_tag_not_found = False
            #print("start missing:",element.find(end))
            # NOTE(review): this compares a character with an int, so it is never true;
            # presumably `element.find(end) == len(element)-1` was meant -- confirm.
            if element[element.find(end)] == len(element)-1:
                element = ''
                continue
            if end == '</':
                element = element[element.find('>')+1:] 
            else:
                element =  element[element.find(end)+1:]
            continue
            
        # only the opening delimiter is here: keep the text before it and remember the open aside
        if element.find(end) == -1:
            end_tag_not_found = True
            # NOTE(review): character compared with 0, never true;
            # presumably `element.find(start) == 0` was meant -- confirm.
            if element[element.find(start)] == 0:
                element = ''
                continue
            element =  element[0:element.find(start)]
            continue
            
    # still inside an aside opened earlier and no delimiter on this line: drop the whole line
    if end_tag_not_found and not(tag_found):
        element = ''
    
    return element, end_tag_not_found 

In [None]:
import re
def extract_dialogues_fraiser(text, scene_id):
    """Extract (character, line, scene) rows from one Frasier transcript page.

    Parameters
    ----------
    text : iterable
        Parsed elements; each one is converted with ``str()`` and split into
        lines. Elements that do not contain the word 'Scene' are ignored.
    scene_id : int
        Running scene counter carried over from the previous episode.

    Returns
    -------
    (list, int)
        ``rows`` as ``[character, line, scene_id]`` lists, and the updated
        scene counter.

    Notes
    -----
    A speaker's accumulated line is appended to ``rows`` only when the next
    speaker tag is encountered.
    NOTE(review): as a consequence the final speaker's line of each call is
    never appended -- confirm whether that is intended.
    """
    rows = []
    dialogue = False
    scene_change = False
    line = ""
    character = ""
    end_tag_not_found = False
    # Raw strings avoid the invalid-escape warning that '\s' triggers in a normal literal.
    character_pattern = re.compile(r'\s*<b>[A-Za-z]*:\s*</b>')
    speaker_pattern = re.compile(r'<b>(.+?):\s*</b>')
    italic_pattern = re.compile(r'<i>(.+?)</i>')
    for s in text:
        block_html = str(s)
        # '<i>Scene' itself contains 'Scene', so this single test covers both cases.
        if 'Scene' not in block_html:
            continue
        for element in block_html.split('\n'):
            if element == '': continue

            if element.startswith('Scene') or '<i>Scene' in element:
                # The scene counter is only advanced once the first speaker
                # of the new scene is seen (see below).
                scene_change = True
                dialogue = False
                continue
            if '<center>' in element: continue
            character_flag = character_pattern.match(element)
            if not (character_flag or dialogue):
                # Outside dialogue: only track whether an open aside gets closed.
                if end_tag_not_found and any(char in element for char in [']', ')', '>']):
                    end_tag_not_found = False
                continue

            if character_flag:
                m = speaker_pattern.search(element)
                # Value comparison; `is not ""` tested identity with a literal.
                if line != "": rows.append([character, line, scene_id])
                if scene_change:
                    scene_id += 1
                    scene_change = False
                line = ""
                if m:
                    character = m.group(1)
                dialogue = True
            if dialogue:
                # A line that starts with an italic block is a stage direction.
                if italic_pattern.match(element): continue
                element = speaker_pattern.sub(' ', element)  # remove character name from the line
                # remove html tags , asides, comments etc
                element, end_tag_not_found = clean_lines(element, end_tag_not_found)
                if any(char in element for char in ['[', '(', '<', ']', ')', '</']):
                    # A second pass catches delimiters exposed by the first one.
                    element, end_tag_not_found = clean_lines(element, end_tag_not_found)
                if 'i>' in element: element = re.sub('i>', '', element)
                line = line + element
                line = re.sub(r"\s\s+", " ", line)
                line = line.split(':')[0]
    return rows, scene_id

In [None]:
df = scrape_sitcom_scripts('Fraiser', episodes, 'http://www.kacl780.net/frasier/transcripts<insert>')

In [None]:
df

In [None]:
set(df["Character"].values)

In [None]:
df = df.loc[~df['Character'].isin(['Credits' , 'INSERT'])]

In [None]:
df

In [None]:
df.to_csv('Fraiser.txt', header=None, index=None, sep='+', mode='a')

In [None]:
!head Fraiser.txt

In [None]:
upload_file_cos('', 'Fraiser.txt', 'Fraiser.txt')

## 'How I Met Your Mother' scripts
Luckily, these transcripts are already available as a CSV file on GitHub (mneedham/neo4j-himym).

In [None]:
!wget https://raw.githubusercontent.com/mneedham/neo4j-himym/master/data/import/sentences.csv 

In [None]:
!head sentences.csv

In [None]:
import pandas as pd
df = pd.read_csv('sentences.csv')

In [None]:
df = df.dropna()  

In [None]:
#First two episodes have 'Scene one', 'Scene Two 'etc to mark the beginning of a scene 
from itertools import chain
scene_markers = []
for num in  ['Scene One', 'Scene Two', 'Scene Three', 'Scene Four', 'Scene Five', 'Scene Six', 'Scene Seven']:
   scene_markers.append(list(df[df["Sentence"].str.contains(num)].SentenceId))
scene_markers = list(chain.from_iterable(scene_markers))
len(scene_markers)

In [None]:
df1 = df[df["Sentence"].str.startswith('[')] 

In [None]:
df2 = df[df["Sentence"].str.endswith(']')] 

In [None]:
print(len(df1))

In [None]:
print(len(df2))

In [None]:
1016 - 996

In [None]:
# SentenceIds that appear in only one of df1/df2, i.e. sentences that start with '[' or end with ']' but not both
problem_list = []
import collections
counter=dict(collections.Counter(list(df1.SentenceId) + list(df2.SentenceId)))
for key in counter.keys():
    if counter[key] < 2:
        problem_list.append(key)

In [None]:
len(problem_list)

In [None]:
df[df.SentenceId.isin(problem_list)]

In [None]:
#We can pretend these dont have  problem for the time being

In [None]:
scene_markers = scene_markers + list(df1.SentenceId.values) + list(df2.SentenceId.values)

In [None]:
scene_markers = sorted(set(scene_markers))

In [None]:
df[df.SentenceId.isin(scene_markers)]

In [None]:
df[~df.SentenceId.isin(scene_markers)]

In [None]:
df.EpisodeId.values

In [None]:
#return the first line of every episode 
first_lines = []
for i in set(df.EpisodeId.values):
    first_lines.append(df[df.EpisodeId == i].iloc[0].SentenceId)

In [None]:
df[df.SentenceId.isin(first_lines)] 

In [None]:
df = df[~df.SentenceId.isin(scene_markers)]

In [None]:
df

In [None]:
def split_charslines(sentence):
    """Split a 'Character: spoken text' sentence into its two parts.

    Parameters
    ----------
    sentence : str
        Raw sentence from the transcript.

    Returns
    -------
    pandas.Series
        Indexed by 'Character' and 'Line'. Both values are ``None`` when the
        sentence contains no ':'.
    """
    # Partition on the FIRST colon only, so colons inside the spoken text
    # (times such as "3:00", etc.) no longer truncate the line.
    character, separator, line = sentence.partition(':')
    if not separator:
        character, line = None, None
    return pd.Series([character, line], index=['Character','Line'])

df[['Character','Line']] = df['Sentence'].apply(split_charslines)     

In [None]:
df.head()

In [None]:
s = df['Character'].value_counts() == 1

In [None]:
problem_list2 = list(s[s].index)

In [None]:
'1' in problem_list2

In [None]:
import nltk
nltk.download('punkt')

In [None]:
nltk.download('averaged_perceptron_tagger')

In [None]:
#print(len(problem_list2))
remove_list = []
# Words that show the text before ':' is ordinary speech rather than a speaker name.
# Double-quoted so the apostrophes are real characters (the adjacent-literal form
# 'i''ll' silently produced 'ill'), every entry is separated by a comma, and all
# entries are lower-case because they are compared with word.lower().
non_name_words = {
    'i', 'you', 'our', 'mine', 'me', 'my', 'us', 'your', 'yours', 'ours',
    "i'll", 'okay', 'ok', 'yeah', 'just', 'yes', 'no', "i'm", 'nothing',
    "i've", 'but', 'though', 'although', 'well', 'hey', 'never', 'ever',
    'would', 'should', 'could', 'what', 'why', 'how', 'was', 'were',
}
problem_list2 = list(df["Character"])
for x in problem_list2:
    if not isinstance(x, str):
        # Sentences without a ':' were given Character None by split_charslines.
        continue
    words = x.split()
    if len(words) > 10:
        # Far too long to be a speaker name.
        remove_list.append(x)
        continue
    if len(words) == 1 and x.startswith('(') and x.endswith(')'):
        continue
    if len(words) == 1 and len(x) == 1:
        # A single character is not a name.
        remove_list.append(x)
        continue
    if any(word.lower() in non_name_words for word in words):
        remove_list.append(x)
            
    #tokens = nltk.word_tokenize(x)
    #tagged = nltk.pos_tag(tokens)
    #found = False
    #for word, tag in tagged:
        #if tag in ['NN', 'NNS', 'NNP', 'NNPS']: 
            #found = True
    #if not found: 
        #remove_list.append(problem_list2[i])
        #print("pos tag",problem_list2[i])

In [None]:
remove_list

In [None]:
len(remove_list)

In [None]:
df[df.Character.isin(remove_list)].dropna()

In [None]:
df = df.dropna()

In [None]:
df = df[df.Line != ""]

In [None]:
df = df[~df["Character"].isin(remove_list)] 

In [None]:
def check_numeric(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Values that ``float`` cannot accept at all (for example ``None``) are
    reported as non-numeric instead of raising TypeError.
    """
    try:
        float(x)
    except (TypeError, ValueError):
        return False
    return True
#df = df[~df['Character'].apply(check_numeric)] 
df = df[~df['Line'].apply(check_numeric)]

In [None]:
first_lines

In [None]:
sorted(scene_markers + first_lines)

In [None]:
list(df["SentenceId"])

In [None]:
# Assign a running scene number to every remaining sentence.
# Sets make each membership test O(1) instead of scanning a list per row.
first_line_ids = set(first_lines)
scene_marker_ids = set(scene_markers)
counter = 0
scene = []
for i in df["SentenceId"]:
    # A new scene starts on the first line of an episode, or when the NEXT
    # SentenceId is a scene marker.
    # NOTE(review): markers were removed from df earlier, so `i + 1` flags the
    # line before a marker; confirm that `i - 1` was not intended.
    if (i in first_line_ids) or ((i + 1) in scene_marker_ids):
        counter += 1
    scene.append(counter)
scene = pd.Series(scene)
# .values assigns by position rather than aligning `scene`'s fresh 0..n-1
# labels with df's existing index.
df["Scene"] = scene.values

In [None]:
import re
def clean_lines(line):
    """Return ``line`` with every parenthesised aside ``( ... )`` removed.

    Matching is non-greedy, so each aside is removed separately; an unclosed
    '(' is left untouched.
    Note: this one-argument function shadows the two-argument ``clean_lines``
    defined earlier in the notebook.
    """
    return re.sub(r'\(.*?\)', '', line)
df["Line"] = df["Line"].apply(clean_lines)

In [None]:
df = df[~df.Character.isin(remove_list)]

In [None]:
df.to_csv('HIMYM.txt', header=None, index=None, sep='+', mode='a')

In [None]:
upload_file_cos('', 'HIMYM.txt', 'HIMYM.txt')

## 'Friends' Transcripts 

In [None]:
import re
bs_tree = get_tree('https://fangj.github.io/friends/')
episodes = []
if bs_tree is None:
    print("error fetching")
else:
    # Collect the file name that follows 'season/' in every episode link;
    # it is later substituted into the URL template by format_url.
    for li in bs_tree.findAll('li'):
        li_html = str(li)
        if 'season' in li_html:
            m = re.search('season/(.+?)"', li_html)
            if m:
                episodes.append(m.group(1))

In [None]:
episodes

In [None]:
#text = bs_tree.findAll("p")
def extract_dialogues_friends(text, scene_id): 
    """Extract (character, line, scene) rows from one Friends transcript page.

    Parameters
    ----------
    text : iterable
        Parsed elements; ``get_text()`` is called on each one to obtain a
        script line.
    scene_id : int
        Running scene counter carried over from the previous episode.

    Returns
    -------
    (list, int)
        ``rows`` as ``[character, line, scene_id]`` lists, and the updated
        scene counter.

    Processing per line
    -------------------
    1. Credit lines ('Written by', 'Transcribed by', ...) and 'End'/'END' are skipped.
    2. Square brackets mark scene descriptions: every line containing '['
       increments ``scene_id``; bracketed text is removed and an unclosed '['
       causes following lines to be skipped until a ']' is found.
    3. Parenthesised asides are removed the same way (up to five per line).
    4. What remains is split on ':' into character and spoken text; lines
       without ':' or with empty text are dropped.
    """
    from copy import copy
    rows = []
    scenes = []  # not used below
    lines = []
    scene_change = False  # not used below
    _line = ""
    character = ""
    orphan_bracket = False  # True while a '[' has been seen without its ']'
    orphan_para = False  # True while a '(' has been seen without its ')'
    for p in text: lines.append(p.get_text())
    for line in lines:
        #print(line)
        #print((line == 'End'))
        #print((line == 'END'))
        #if line.startswith('[') or line.endswith(']'):
        #if'Scene' in str(s): print(s)
        if 'Written by' in line: continue
        if 'WRITTEN BY' in line: continue
        if 'Transcribed by' in line :continue
        if 'Originally written by' in line:continue
        #print(line)
        if (line == 'End') or (line == 'END') : continue
        if orphan_bracket:
            # still inside a scene description opened on an earlier line
            #print("Orphan",line)
            if (line.find(']') == -1): 
                #print("no orphan pair found :",line)
                continue
            #print("pair found",line)
            orphan_bracket = False
            
        if (line.find('[') == -1) and (line.find(']') == -1):##no scene change
            _line = copy(line)
            #print("no scene change :",_line)
        else:
            if (line.find('[') > -1): 
                orphan_bracket = True
                scene_id += 1
                #print("open bracket :",line)
            # the whole line is one bracketed scene description: nothing spoken
            if ((line.find('[') > -1 and line.index('[') == 0) and (line.find(']') > -1 and line.index(']') == len(line)-1)):
                #print("whole line scene change:",line)
                orphan_bracket = False
                continue
            # only the first '[' ... first ']' span is cut out here
            if (line.find('[') > -1) and (line.find(']') > -1):
                #print("inbetween scene change:",line)
                orphan_bracket = False
                _line = line[0:line.index('[')] + line[line.index(']')+1:] 
            if (line.find('[') > -1) and (line.find(']') == -1): 
                #print("only beginning scene change:",line)
                _line = line.split('[')[0] 
            if (line.find('[') == -1) and (line.find(']') > -1): 
                #print("only end scene change:",line)
                _line = line.split(']')[1] 
            
        if orphan_para:
            # still inside a parenthesised aside opened on an earlier line
            if (_line.find(')') == -1): 
                continue
                #print("no pair para :",_line)
            orphan_para = False
            #print("pair para :",_line)
        if (_line.find('(') == -1) and (_line.find(')') == -1):
            pass
        else:
            if (_line.find('(') > -1): 
                orphan_para = True
            if ((_line.find('(') > -1 and _line.index('(') == 0) and (_line.find(')') > -1 and _line.index(')') == len(_line)-1)):
                orphan_para = False
                continue
            # remove at most five '(...)' asides from the line
            for _ in range(5):
                if (_line.find('(') > -1) and (_line.find(')') > -1):
                    _line = _line[0:_line.index('(')] + _line[_line.index(')')+1:] 
                    orphan_para = False
            if (_line.find('(') > -1) and (_line.find(')') == -1): 
                _line = _line.split('(')[0] 
            if (_line.find('(') == -1) and (_line.find(')') > -1): 
                _line = _line.split(')')[1] 
                
        # no ':' means there is no "Character: text" structure on this line
        if len(_line.split(':')) < 2: continue
            
        character = _line.split(':')[0]
        #print("character :",character)
        # with more than one ':' only the second and third pieces are kept,
        # joined without the colon; any further pieces are dropped
        if len(_line.split(':')) > 2:
            _line = _line.split(':')[1] + _line.split(':')[2]
        else:
            #print(_line)
            _line = _line.split(':')[1]       
        if _line in  ["" ," "]: continue    
        _line = re.sub('\n',' ',_line)
        _line = re.sub("\s\s+", " ", _line)
        if _line in ["", " "]:continue
        rows.append([character, _line, scene_id])
    return rows, scene_id

In [None]:
sitcom_name = 'Friends'
url_format = 'https://fangj.github.io/friends/season/<insert>'
df = pd.DataFrame()
scene_id = 0
test_episodes = []
test_episodes.append('0101.html')
for episode_num in test_episodes:
    #try:
        #print(episode_num)
        url = format_url(sitcom_name, url_format, episode_num)
        bs_tree = get_tree(url)
        if bs_tree is None:
            print("error fetching :",url)
            break; 
        #print(bs_tree.findAll("p"))
        rows, scene_id = extract_dialogues_friends(bs_tree.findAll("p"), scene_id)
    #except IndexError: print("vhg")
        #print(len(rows))
print(rows)

In [None]:
df = scrape_sitcom_scripts('Friends', episodes, 'https://fangj.github.io/friends/season/<insert>')

In [None]:
df['Character'].value_counts()

In [None]:
df[df["Character"] == 'CHANDLER']

In [None]:
len(df)

In [None]:
df[df["Scene"]==3412]["Line"][1]


In [None]:
s = df[df["Scene"]==3412]["Line"][1]
s.encode('latin1')

In [None]:
s = df[df["Scene"]==3412]["Line"][1]
try:
    s.encode('latin1')
except UnicodeEncodeError:
    print("problem character in ", s)

In [None]:
s_new = ''
for idx,_ in enumerate(s):
    try:
        s[idx].encode('latin1')
        s_new = s_new + s[idx]
    except UnicodeEncodeError:
        print(s[idx], ord(s[idx]))

In [None]:
s_new

In [None]:
df1 = df.copy()

In [None]:
def replace_junk(line):
    """Return ``line`` without U+FFFD replacement characters.

    Lines that can be encoded as latin1 are returned unchanged. Otherwise the
    line is reported with ``print`` and every character with code point 65533
    is removed; other characters are kept.
    """
    try:
        line.encode('latin1')
    except UnicodeEncodeError:
        print("problem character in ", line)
        return ''.join(ch for ch in line if ord(ch) != 65533)
    return line

In [None]:
def replace_junk(line):
    """Return ``line`` with every U+FFFD (code point 65533) character removed.

    The whole line is printed once for each such character found.
    """
    kept = []
    for ch in line:
        if ord(ch) != 65533:
            kept.append(ch)
        else:
            print(line)
    return ''.join(kept)

In [None]:
df1["Line"] = df1["Line"].apply(lambda line : replace_junk(line)) 

In [None]:
df1.to_csv('Friends.txt', header=None, index=None, sep='+', mode='a')

In [None]:
df1[df1["Character"]=='CHANDLER']

In [None]:
upload_file_cos('', 'Friends.txt', 'Friends.txt')

In [None]:
df1

In [None]:
df1.reset_index(inplace=True)

In [None]:
[x for x in range(3850) if x not in set(df1["Scene"])] # some scene nos are missing

In [None]:
df1[df1["Scene"] ==13]

In [None]:
df1[df1["Scene"] ==15]

In [None]:
df2 = df1.copy()

In [None]:
df2

In [None]:
chandler_lines = list(df2[df2.Character.isin(['Chandler','CHANDLER'])].index)

In [None]:
for _,Scene_grp in df2.groupby('Scene'):
    for row in Scene_grp.values:
        print(row[2])

In [None]:
# Build (utterance, response) training pairs per scene: everything said since
# Chandler's previous line is the utterance, Chandler's line is the response.
utterances = []
responses = []
chandler_names = {'Chandler', 'CHANDLER'}
for _, Scene_grp in df2.groupby('Scene'):
    utterance = ''
    # Identify Chandler's rows by the Character column. The previous test compared
    # the first column (the pre-reset 'index' values added by reset_index) with
    # post-reset row labels, which are unrelated numbers.
    for character, spoken in zip(Scene_grp['Character'], Scene_grp['Line']):
        if character in chandler_names:
            utterances.append(utterance)
            utterance = ''
            responses.append(spoken)
        else:
            utterance = utterance + ' ' + spoken

In [None]:
utterances[0]

In [None]:
responses[0]

In [None]:
utterances[55]

In [None]:
responses[55]

In [None]:
len(utterances)

In [None]:
len(responses)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def plotScatterLengths(title, x_title, y_title, x_lengths, y_lengths):
    """Show a scatter plot of ``y_lengths`` against ``x_lengths``.

    Both axes are fixed to the range 0-200; ``title``, ``x_title`` and
    ``y_title`` label the figure and its axes.
    """
    plt.scatter(x_lengths, y_lengths)
    # Same fixed window on both axes.
    plt.xlim(0, 200)
    plt.ylim(0, 200)
    plt.xlabel(x_title)
    plt.ylabel(y_title)
    plt.title(title)
    plt.show()

In [None]:
from keras.preprocessing.text import text_to_word_sequence
utter_lengths = [len(text_to_word_sequence(x)) for x in utterances]
respo_lengths = [len(text_to_word_sequence(x)) for x in responses ]

In [None]:
max(utter_lengths)

In [None]:
max(respo_lengths)

In [None]:
plt.hist(utter_lengths)

In [None]:
long_utterances = [i for i,x in enumerate(utterances) if len(text_to_word_sequence(x)) > 150]
long_responses = [i for i,x in enumerate(responses) if len(text_to_word_sequence(x)) > 150]

In [None]:
[utterances[i] for i in long_utterances]

In [None]:
[responses[i] for i in long_utterances]

In [None]:
[responses[i] for i in long_responses]

In [None]:
chandler_utter = open('chandler_utter.txt','w')
chandler_respo = open('chandler_respo.txt','w')

In [None]:
# Write each utterance/response followed by an '<EOS>' separator.
for i in range(len(utterances)):
    chandler_utter.write(utterances[i]+'<EOS>')
    chandler_respo.write(responses[i]+ '<EOS>')
# Close (and thereby flush) both files so that the upload in the next cell
# sees the complete contents rather than whatever has left the write buffer.
chandler_utter.close()
chandler_respo.close()

In [None]:
upload_file_cos('', 'chandler_utter.txt', 'chandler_utter.txt')
upload_file_cos('', 'chandler_respo.txt', 'chandler_respo.txt')

In [None]:
#text = bs_tree.findAll("p")
def extract_dialogues_friends(text, scene_id): 
    """Extract [character, spoken_text, scene_id] rows from one transcript.

    text     : iterable of objects exposing get_text(); each one contributes one
               raw line (presumably the <p> tags of a parsed page -- confirm with caller)
    scene_id : running scene counter; it is incremented every time a processed
               line contains '['

    Per line, in order: credit/'End' lines are skipped, text inside [...] and
    (...) is removed (including spans that continue over following lines), the
    remainder is split on ':' into speaker and speech, and whitespace in the
    speech is collapsed.

    Returns (rows, scene_id): the collected rows and the updated scene counter.

    NOTE(review): `re` is used below but not imported in this function; it has
    to be available at module level.
    """
    from copy import copy
    rows = []
    scenes = []           # not used below
    lines = []
    scene_change = False  # not used below
    _line = ""
    character = ""
    orphan_bracket = False  # True while inside a '[' that has not been closed yet
    orphan_para = False     # True while inside a '(' that has not been closed yet
    for p in text: lines.append(p.get_text())
    for line in lines:
        # Skip credit lines and end-of-episode markers.
        if 'Written by' in line: continue
        if 'WRITTEN BY' in line: continue
        if 'Transcribed by' in line :continue
        if 'Originally written by' in line:continue
        if (line == 'End') or (line == 'END') : continue
        if orphan_bracket:
            # Still inside a bracketed span opened on an earlier line:
            # drop lines until one contains the closing ']'.
            if (line.find(']') == -1): 
                continue
            orphan_bracket = False
            
        if (line.find('[') == -1) and (line.find(']') == -1):##no scene change
            _line = copy(line)
        else:
            if (line.find('[') > -1): 
                # Any '[' counts as a new scene.
                orphan_bracket = True
                scene_id += 1
            if ((line.find('[') > -1 and line.index('[') == 0) and (line.find(']') > -1 and line.index(']') == len(line)-1)):
                # The first '[' is at position 0 and the first ']' is the last character: nothing to keep.
                orphan_bracket = False
                continue
            if (line.find('[') > -1) and (line.find(']') > -1):
                # Bracketed span inside the line: keep the text before the first '[' and after the first ']'.
                orphan_bracket = False
                _line = line[0:line.index('[')] + line[line.index(']')+1:] 
            if (line.find('[') > -1) and (line.find(']') == -1): 
                # Span opens here and closes on a later line: keep the text before '['.
                _line = line.split('[')[0] 
            if (line.find('[') == -1) and (line.find(']') > -1): 
                # Span closes here: keep the text between the first ']' and the next one (if any).
                _line = line.split(']')[1] 
            
        if orphan_para:
            # Same carry-over handling as above, for an unclosed '('.
            if (_line.find(')') == -1): 
                continue
            orphan_para = False
        if (_line.find('(') == -1) and (_line.find(')') == -1):
            pass
        else:
            if (_line.find('(') > -1): 
                orphan_para = True
            if ((_line.find('(') > -1 and _line.index('(') == 0) and (_line.find(')') > -1 and _line.index(')') == len(_line)-1)):
                # The first '(' is at position 0 and the first ')' is the last character: nothing to keep.
                orphan_para = False
                continue
            # Remove up to five "( ... )" pairs from the line, one per pass.
            for _ in range(5):
                if (_line.find('(') > -1) and (_line.find(')') > -1):
                    _line = _line[0:_line.index('(')] + _line[_line.index(')')+1:] 
                    orphan_para = False
            if (_line.find('(') > -1) and (_line.find(')') == -1): 
                # Parenthesis opens here and closes on a later line: keep the text before '('.
                _line = _line.split('(')[0] 
            if (_line.find('(') == -1) and (_line.find(')') > -1): 
                # Parenthesis closes here: keep the text between the first ')' and the next one (if any).
                _line = _line.split(')')[1] 
                
        # Only "speaker: speech" lines produce rows.
        if len(_line.split(':')) < 2: continue
            
        character = _line.split(':')[0]
        if len(_line.split(':')) > 2:
            # More than one ':' -- the 2nd and 3rd pieces are joined without the
            # colon; any further pieces are discarded.
            _line = _line.split(':')[1] + _line.split(':')[2]
        else:
            _line = _line.split(':')[1]       
        if _line in  ["" ," "]: continue    
        # Replace newlines with spaces and collapse runs of whitespace to a single space.
        _line = re.sub('\n',' ',_line)
        _line = re.sub("\s\s+", " ", _line)
        if _line in ["", " "]:continue
        rows.append([character, _line, scene_id])
    return rows, scene_id

# Data Visualization (Cornell Movie Corpus)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def plotScatterLengths(title, x_title, y_title, x_lengths, y_lengths):
    """Scatter-plot x_lengths against y_lengths with both axes fixed to 0..200.

    title, x_title, y_title : text for the figure title and the two axis labels
    x_lengths, y_lengths    : the values plotted on the x and y axes
    """
    axis_range = (0, 200)
    plt.scatter(x_lengths, y_lengths)
    for apply_label, label in ((plt.title, title), (plt.xlabel, x_title), (plt.ylabel, y_title)):
        apply_label(label)
    plt.ylim(*axis_range)
    plt.xlim(*axis_range)
    plt.show()

In [None]:
from keras.preprocessing.text import text_to_word_sequence
utter_lengths = [len(text_to_word_sequence(x)) for x in utterances]
respo_lengths = [len(text_to_word_sequence(x)) for x in responses ]

In [None]:
max(utter_lengths)

In [None]:
max(respo_lengths)

In [None]:
plt.hist(utter_lengths)

In [None]:
#most of them have less than 200 words

In [None]:
plt.hist(respo_lengths)

In [None]:
long_utterances = [i for i,x in enumerate(utterances) if len(text_to_word_sequence(x)) > 150]
long_responses = [i for i,x in enumerate(responses) if len(text_to_word_sequence(x)) > 150]

In [None]:
len(long_utterances)

In [None]:
len(long_responses)

In [None]:
[utterances[i] for i in long_utterances]

In [None]:
[responses[i] for i in long_responses]

In [None]:
long_list = long_utterances + long_responses

In [None]:
len(long_list)

In [None]:
# Membership is tested once per utterance, so look indices up in a set instead of scanning the list.
long_index_set = set(long_list)
cleaned_utterances = [x for i,x in enumerate(utterances) if i not in long_index_set]

In [None]:
len(cleaned_utterances)

In [None]:
# Number of pairs removed by the length filter, computed rather than typed in by hand
# (presumably the two hard-coded numbers were these lengths -- confirm).
len(utterances) - len(cleaned_utterances)

In [None]:
# Membership is tested once per response, so look indices up in a set instead of scanning the list.
long_index_set = set(long_list)
cleaned_responses = [x for i,x in enumerate(responses) if i not in long_index_set]

In [None]:
len(cleaned_responses)

In [None]:
# For the Cornell database, let's drop every pair in which the utterance or the response is longer than 150 words. For sitcom scripts (especially 'Friends') we may have to employ text summarization to
# condense the utterances that come before Chandler's responses.

# Data Preparation 


In [None]:
from keras.preprocessing.text import text_to_word_sequence
import numpy as np
def tokenize(X, max_len, reverse=False):
    '''
    Split every sentence in X into a list of word tokens.

    X : iterable of sentence strings
    max_len : accepted but currently unused -- no truncation or length filtering happens here
    reverse : when True, the token list of each sentence is returned in reverse word order
    returns : a list holding one token list per sentence
    '''
    tokenized = []
    for sentence in X:
        words = text_to_word_sequence(sentence)
        tokenized.append(words[::-1] if reverse else words)
    return tokenized

In [None]:
from nltk import FreqDist
import numpy as np
def create_vocabulary(X, vocab_size):
    '''
    Count word frequencies and keep the most frequent ones.

    X : list of tokenized sentences (each sentence is a list of words)
    vocab_size : size of the vocabulary to build. Words that appear only a few times in the
    whole text are unlikely to matter for training, so only the (vocab_size - 1) most
    frequent words are kept.
    returns : list of (word, count) tuples, most frequent first; empty when X is empty
    '''
    # Local imports keep this cell self-contained.
    from collections import Counter
    from itertools import chain

    # Stream the tokens into the counter instead of first copying the whole corpus
    # into one flat array.
    word_counts = Counter(chain.from_iterable(X))
    return word_counts.most_common(vocab_size-1)

In [None]:
def create_mapping(vocabulary):
    '''
    Build the two lookups needed to encode words as indices and decode indices back to words.

    vocabulary : sequence of entries whose first element is the word (e.g. (word, count) tuples)
    outputs:
    idx_to_word : index-to-word list; 'EOL' is placed at index 0 and 'UNK' (unknown words) at the end
    word_to_idx : word-to-index dictionary derived from idx_to_word
    '''
    idx_to_word = ['EOL']
    idx_to_word.extend(entry[0] for entry in vocabulary)
    idx_to_word.append('UNK')
    word_to_idx = dict(zip(idx_to_word, range(len(idx_to_word))))
    return idx_to_word, word_to_idx

### Using Padding

In [None]:
from keras.preprocessing.sequence import pad_sequences
def convert_to_numbers(X, word_to_idx, pad=True,  padding='pre'):
    '''
    Replace every word by its index in word_to_idx.

    steps:
    1) Convert each word in each sentence to its index value; words missing from
       word_to_idx are mapped to the index of 'UNK'.
    2) If 'pad = True', pad the sequences to the length of the longest one so that all
       sequences have the same length (this is an alternative to bucketing).

    X : list of tokenized sentences; it is not modified
    word_to_idx : word-to-index dictionary containing an 'UNK' entry
    padding : passed through to pad_sequences ('pre' or 'post')
    returns : (encoded, X_max_len) -- the output of pad_sequences when pad is True, otherwise
              a list of index lists; X_max_len is the longest sentence length (0 for empty X)
    '''
    # Build new lists rather than overwriting the caller's tokens in place.
    encoded = [
        [word_to_idx[word] if word in word_to_idx else word_to_idx['UNK'] for word in sentence]
        for sentence in X
    ]
    # Computed regardless of `pad`, so the return below never hits an unbound name.
    X_max_len = max((len(sentence) for sentence in encoded), default=0)
    if pad:
        encoded = pad_sequences(encoded, maxlen=X_max_len, dtype='int32', padding=padding)
    return encoded, X_max_len

In [None]:
#utterances = cos.Object('chandlerping-donotdelete-pr-5d1gylpa2fimey','utter_corn.txt')
#responses = cos.Object('chandlerping-donotdelete-pr-5d1gylpa2fimey','respo_corn.txt')

In [None]:
#len( utterances.get()['Body'].read().decode("ISO-8859-2").split('<EOS>'))

In [None]:
#len(responses.get()['Body'].read().decode("ISO-8859-2").split('<EOS>'))

In [None]:
#X = cleaned_utterances.copy()

In [None]:
#taking a subsample 
X = cleaned_utterances[:200].copy()
y = cleaned_responses[:200].copy()

In [None]:
#y = cleaned_responses.copy()

In [None]:
#[i for i,x in enumerate(X) if len(x) == 1903]

In [None]:
#X[144063] #135505

In [None]:
#X[135505]

In [None]:
#y[135505]

In [None]:
#y[144063]

In [None]:
X[:10]

In [None]:
y[:10]

In [None]:
#Dataset parameters
MAX_LEN = 50
VOCAB_SIZE = 50000

X = tokenize(X, MAX_LEN, reverse=True)

In [None]:
X[:10]

In [None]:
y = tokenize(y, MAX_LEN, reverse=False)
y

In [None]:
len(X)

In [None]:
len(y)

In [None]:
X_vocab = create_vocabulary(X, VOCAB_SIZE)

In [None]:
len(X_vocab)

In [None]:
X_vocab[0:10]

In [None]:
y_vocab = create_vocabulary(y, VOCAB_SIZE)

In [None]:
len(y_vocab)

In [None]:
y_vocab[0:10]

In [None]:
X_idx_to_word, X_word_to_idx = create_mapping(X_vocab)

In [None]:
X_idx_to_word[0:10]

In [None]:
X_word_to_idx['EOL']

In [None]:
X_word_to_idx['and']

In [None]:
y_idx_to_word, y_word_to_idx = create_mapping(y_vocab)

In [None]:
y_idx_to_word[0:10]

In [None]:
y_word_to_idx['EOL']

In [None]:
y_word_to_idx['special']

In [None]:
y_idx_to_word[661]

In [None]:
X[0]

In [None]:
X, X_max_len = convert_to_numbers(X, X_word_to_idx, pad=True, padding='pre')

In [None]:
X[0]

In [None]:
X_max_len

In [None]:
X_idx_to_word[8860]

In [None]:
y, y_max_len = convert_to_numbers(y, y_word_to_idx, pad=True, padding='post')

In [None]:
y[0]

In [None]:
def prepare_data(X, y, MAX_LEN, VOCAB_SIZE):
    '''
    Run the whole preparation pipeline on raw sentences.

    X : utterances (sentence strings)
    y : responses (sentence strings)
    MAX_LEN : forwarded to tokenize()
    VOCAB_SIZE : forwarded to create_vocabulary()

    1) Tokenize sentences, both X (utterances) and y (responses). X is tokenized in reverse word order.
    An alternative for attention would be to have reversed input.
        When we use padding:
        The end of the input sequences is sometimes not meaningful. It is just PAD, PAD, PAD, and so on.
        If you try to build your thought vector based on it, it will result in poor outcomes;
        when you reverse, you have your words at the end of the sequence.
        When using bucketing:
        The encoder is the utterance by the human, and the decoder is the response.
        We assume that in normal conversations, people listen to the first part and somewhat zone out to think of the answer, so we
        reverse the encoder so that the model can retain more information from the beginning of the utterance.
        From the paper:
        (Ilya Sutskever, Oriol Vinyals and Quoc V. Le. Sequence to Sequence Learning with Neural Networks)
        We found it extremely valuable to reverse the order of the words of the input sentence.
        So for example, instead of mapping the sentence a, b, c to the sentence α, β, γ, the LSTM is asked to map c, b, a to α, β, γ,
        where α, β, γ is the translation of a, b, c. This way, a is in close proximity to α, b is fairly close to β, and so on,
        a fact that makes it easy for SGD to "establish communication" between the input and the output.
        We found this simple data transformation to greatly boost the performance of the LSTM.
    2) Create the vocabularies.
    3) Create mappings for words in the vocabulary (index-to-word and word-to-index).
    4) Convert words in the sentences to their corresponding indices (X padded 'pre', y padded 'post').

    returns : X, X_idx_to_word, X_word_to_idx, X_max_len, y, y_idx_to_word, y_word_to_idx, y_max_len
    '''
    X = tokenize(X, MAX_LEN, reverse=True)
    y = tokenize(y, MAX_LEN, reverse=False)
    X_idx_to_word, X_word_to_idx = create_mapping(create_vocabulary(X, VOCAB_SIZE))
    y_idx_to_word, y_word_to_idx = create_mapping(create_vocabulary(y, VOCAB_SIZE))
    X, X_max_len = convert_to_numbers(X, X_word_to_idx, pad=True, padding='pre')
    y, y_max_len = convert_to_numbers(y, y_word_to_idx, pad=True, padding='post')
    return X, X_idx_to_word, X_word_to_idx, X_max_len, y, y_idx_to_word, y_word_to_idx, y_max_len

In [None]:
#Dataset parameters
MAX_LEN = 50
#VOCAB_SIZE = 50000
VOCAB_SIZE = 500 #smaller vocabulary for the subsample
# Feed prepare_data() the raw sentence strings again: the step-by-step cells above have already
# replaced X and y with padded index arrays, and prepare_data() tokenizes its inputs itself.
X, X_idx_to_word, X_word_to_idx, X_max_len, y, y_idx_to_word, y_word_to_idx, y_max_len = prepare_data(cleaned_utterances[:200], cleaned_responses[:200], MAX_LEN, VOCAB_SIZE)

In [None]:
#X[144063]

In [None]:
X[0]

In [None]:
y[0]

In [None]:
 X_idx_to_word[0:10]

In [None]:
X_word_to_idx['nope']

In [None]:
X_idx_to_word[1479]

In [None]:
X_max_len

In [None]:
#[i for i,x in enumerate(X) if 0 not in x]

In [None]:
#X[135505]

In [None]:
#y[135505]

In [None]:
y_idx_to_word[0:10]

In [None]:
y_word_to_idx['rebel']

In [None]:
y_max_len

In [None]:
y_idx_to_word[8543]

In [None]:
import os
parameters = open('parameters.npy', 'w')
X_idx_to_word_file = open('X_idx_to_word.npy', 'w')
y_idx_to_word_file = open('y_idx_to_word.npy', 'w')
X_word_to_idx_file = open('X_word_to_idx.npy', 'w')
y_word_to_idx_file = open('y_word_to_idx.npy', 'w')
# The np.save() calls that follow are given the file paths, not these handles, so the handles
# only create/truncate the files. Close them right away instead of leaving them open.
for _placeholder in (parameters, X_idx_to_word_file, y_idx_to_word_file,
                     X_word_to_idx_file, y_word_to_idx_file):
    _placeholder.close()

In [None]:
import numpy as np
# 'y_max_len' is the intended key. The historical key with a leading space is kept as well
# so that anything already reading ' y_max_len' from the saved file keeps working.
params = {'MAX_LEN':MAX_LEN, 'VOCAB_SIZE':VOCAB_SIZE, 'X_max_len': X_max_len, 'y_max_len': y_max_len,
          ' y_max_len': y_max_len}
np.save('parameters.npy', params) 

In [None]:
!ls 

In [None]:
#upload_file_cos('', 'parameters.npy', 'parameters.npy')
upload_file_cos('', 'parameters.npy', 'parameters_small.npy')

In [None]:
np.save('X_idx_to_word.npy', X_idx_to_word) 

In [None]:
#upload_file_cos('', 'X_idx_to_word.npy', 'X_idx_to_word.npy')
upload_file_cos('', 'X_idx_to_word.npy', 'X_idx_to_word_small.npy')

In [None]:
np.save('y_idx_to_word.npy', y_idx_to_word) 

In [None]:
#upload_file_cos('', 'y_idx_to_word.npy', 'y_idx_to_word.npy')
upload_file_cos('', 'y_idx_to_word.npy', 'y_idx_to_word_small.npy')

In [None]:
np.save('X_word_to_idx.npy', X_word_to_idx) 
np.save('y_word_to_idx.npy', y_word_to_idx) 
#upload_file_cos('', 'X_word_to_idx.npy', 'X_word_to_idx.npy')
#upload_file_cos('', 'y_word_to_idx.npy', 'y_word_to_idx.npy')
upload_file_cos('', 'X_word_to_idx.npy', 'X_word_to_idx_small.npy')
upload_file_cos('', 'y_word_to_idx.npy', 'y_word_to_idx_small.npy')

In [None]:
len(X)

In [None]:
len(y)

In [None]:
# Hold out the last N_TEST rows for testing and the N_VAL rows before them for validation.
# For 221,475 rows this gives the cut points 221400 / 221425 used previously, and it still
# yields non-empty splits on the 200-row subsample.
N_VAL, N_TEST = 25, 50
train_end = max(len(X) - (N_VAL + N_TEST), 0)
val_end = min(train_end + N_VAL, len(X))
X_train, X_val, X_test = X[:train_end], X[train_end:val_end], X[val_end:]
y_train, y_val, y_test = y[:train_end], y[train_end:val_end], y[val_end:]

In [None]:
len(y_train)

In [None]:
len(y_val)

In [None]:
len(y_test)

In [None]:
#import pickle
#f = open('X_train.pickle', 'wb')
#pickle.dump(X_train, f, pickle.HIGHEST_PROTOCOL)
#f.close()

In [None]:
import pickle
# `with` closes (and flushes) the file even if pickling raises.
with open('X_train_small.pickle', 'wb') as f:
    pickle.dump(X, f, pickle.HIGHEST_PROTOCOL)

In [None]:
#upload_file_cos('', 'X_train.pickle', 'X_train.pickle')
upload_file_cos('', 'X_train_small.pickle', 'X_train_small.pickle')

In [None]:
# `with` closes (and flushes) the file even if pickling raises.
with open('X_test.pickle', 'wb') as f:
    pickle.dump(X_test, f, pickle.HIGHEST_PROTOCOL)

In [None]:
upload_file_cos('', 'X_test.pickle', 'X_test.pickle')

In [None]:
# `with` closes (and flushes) the file even if pickling raises.
with open('X_val.pickle', 'wb') as f:
    pickle.dump(X_val, f, pickle.HIGHEST_PROTOCOL)

In [None]:
upload_file_cos('', 'X_val.pickle', 'X_val.pickle')

In [None]:
#f = open('y_train.pickle', 'wb')
#pickle.dump(y_train, f, pickle.HIGHEST_PROTOCOL)
#f.close()
#upload_file_cos('', 'y_train.pickle', 'y_train.pickle')
# `with` guarantees the pickle is closed and flushed before it is uploaded.
with open('y_train_small.pickle', 'wb') as f:
    pickle.dump(y, f, pickle.HIGHEST_PROTOCOL)
upload_file_cos('', 'y_train_small.pickle', 'y_train_small.pickle')

In [None]:
# `with` guarantees the pickle is closed and flushed before it is uploaded.
with open('y_test.pickle', 'wb') as f:
    pickle.dump(y_test, f, pickle.HIGHEST_PROTOCOL)
upload_file_cos('', 'y_test.pickle', 'y_test.pickle')

In [None]:
# `with` guarantees the pickle is closed and flushed before it is uploaded.
with open('y_val.pickle', 'wb') as f:
    pickle.dump(y_val, f, pickle.HIGHEST_PROTOCOL)
upload_file_cos('', 'y_val.pickle', 'y_val.pickle')

### Use bucketing 

###### Data for Baseline model (Cornell Movie Database alone)

In [None]:
import random
import re
TESTSET_SIZE = 50

In [None]:
def get_lines(movie_lines):
    """Map the first field of every record to its last field.

    Each record is split on the literal separator " +++$+++ "; the last field is
    stored with surrounding whitespace stripped. A later record with the same
    first field overwrites an earlier one.
    """
    id2line = {}
    for record in movie_lines:
        fields = record.split(" +++$+++ ")
        id2line[fields[0]] = fields[-1].strip()
    return id2line

In [None]:
def get_convos(movie_conver):
    """ Get conversations from the raw data.

    For every input line, everything from the first 'u' up to the last '[' is
    removed, then trailing ']' characters, single quotes and commas; what is
    left is split on single spaces. Returns one list of strings per input line.
    """
    convos = []
    for line in movie_conver:
        id_text = re.sub(r'u.+\[','',line).strip(']')
        id_text = id_text.replace("'", "").replace(",", "")
        convos.append(id_text.split(" "))
    return convos

In [None]:
def utterances_responses(id2line, convos):
    """ Divide the dataset into two sets: utterances and their responses.

    Every pair of consecutive ids in a conversation yields one (utterance, response)
    pair, with the texts looked up in id2line. Returns (utterances, responses),
    two lists of equal length.
    """
    utterances, responses = [], []
    for conversation in convos:
        for prompt_id, reply_id in zip(conversation, conversation[1:]):
            utterances.append(id2line[prompt_id])
            responses.append(id2line[reply_id])
    assert len(utterances) == len(responses)
    return utterances, responses

In [None]:
def prepare_dataset(utterances, responses):
    """Write the pairs to train.enc/train.dec and test.enc/test.dec, one sentence per line.

    TESTSET_SIZE randomly chosen pairs go to the test files, all others to the
    train files. The .enc files receive the utterances, the .dec files the responses.
    Raises ValueError (from random.sample) if there are fewer than TESTSET_SIZE pairs.
    """
    # Random pairs to create the test set; a set gives constant-time membership tests below.
    test_ids = set(random.sample(range(len(utterances)), TESTSET_SIZE))

    # The context managers close (and flush) all four files even if a write fails.
    with open('train.enc', 'w') as train_enc, open('train.dec', 'w') as train_dec, \
            open('test.enc', 'w') as test_enc, open('test.dec', 'w') as test_dec:
        for i in range(len(utterances)):
            if i in test_ids:
                test_enc.write(utterances[i] + '\n')
                test_dec.write(responses[i] + '\n')
            else:
                train_enc.write(utterances[i] + '\n')
                train_dec.write(responses[i] + '\n')
        
def prepare_raw_data(movie_conver, movie_lines):
    """Turn the raw conversation and line records into the train/test files.

    Builds the id-to-text lookup and the conversations, pairs them up, prints the
    number of pairs and hands them to prepare_dataset().
    """
    print('Preparing raw data into train set and test set ...')
    utterances, responses = utterances_responses(get_lines(movie_lines), get_convos(movie_conver))
    print(len(utterances))
    prepare_dataset(utterances, responses)

In [None]:
# Fetch both corpus files from Cloud Object Storage and split them into lines.
movie_conver = (
    cos.Object('chandlerping-donotdelete-pr-5d1gylpa2fimey','movie_conversations.txt')
    .get()['Body'].read().decode("ISO-8859-1").split('\n')
)
movie_lines = (
    cos.Object('chandlerping-donotdelete-pr-5d1gylpa2fimey','movie_lines.txt')
    .get()['Body'].read().decode("ISO-8859-1").split('\n')
)

In [None]:
# Preview only the first records instead of rendering the whole corpus into the cell output.
movie_lines[:10]

In [None]:
prepare_raw_data(movie_conver, movie_lines)

In [None]:
!ls

In [None]:
from keras.preprocessing.text import text_to_word_sequence

# Minimum number of occurrences a token needs to be written to a vocab file (see build_vocab).
THRESHOLD = 2

# These match the line positions of '<pad>', '<unk>', '<s>' and the end marker that
# build_vocab writes first in every vocab file. They are not referenced in this cell.
PAD_ID = 0
UNK_ID = 1
START_ID = 2
EOS_ID = 3


# NOTE(review): presumably (encoder length, decoder length) limits per bucket -- not used in
# this section, confirm against the training code.
BUCKETS = [(19, 19), (28, 28), (33, 33), (40, 43), (50, 53), (60, 63)]


# Pairs of (spaced-out contraction, replacement). Not used in this section.
CONTRACTIONS = [("i ' m ", "i 'm "), ("' d ", "'d "), ("' s ", "'s "), ("don ' t ", "do n't "), ("didn ' t ", "did n't "), ("doesn ' t ", "does n't "),
                ("can ' t ", "ca n't "), ("shouldn ' t ", "should n't "), ("wouldn ' t ", "would n't "),("' ve ", "'ve "), ("' re ", "'re "), ("in ' ", "in' ")]

def basic_tokenizer(line):
    """ A basic tokenizer: returns text_to_word_sequence(line), i.e. the tokens of one line of text.
    """
    return text_to_word_sequence(line)

def build_vocab(filename, normalize_digits=True):
    """Build the vocabulary file for one side of the training data.

    filename : input text file; its last three characters ('enc' or 'dec') select the
               output name 'vocab.<suffix>' and the constant written to config.py
    normalize_digits : accepted but not used

    The vocab file starts with the four special tokens, followed by every token that
    occurs at least THRESHOLD times, most frequent first. The resulting vocabulary size
    is appended to config.py as ENC_VOCAB or DEC_VOCAB.
    """
    in_path = filename
    out_path = 'vocab.{}'.format(filename[-3:])

    vocab = {}
    with open(in_path, 'r') as f:
        # Iterate the file lazily instead of loading every line with readlines().
        for line in f:
            for token in basic_tokenizer(line):
                vocab[token] = vocab.get(token, 0) + 1

    sorted_vocab = sorted(vocab, key=vocab.get, reverse=True)
    with open(out_path, 'w') as f:
        # r'<\s>' is the same text as the former '<\s>' literal, written as a raw string
        # so Python no longer reports an invalid escape sequence.
        for special_token in ('<pad>', '<unk>', '<s>', r'<\s>'):
            f.write(special_token + '\n')
        index = 4
        for word in sorted_vocab:
            # sorted_vocab is in descending frequency, so everything after this is rarer too.
            if vocab[word] < THRESHOLD:
                break
            f.write(word + '\n')
            index += 1
    # Appending means a re-run adds another assignment line to config.py.
    with open('config.py', 'a') as cf:
        if filename[-3:] == 'enc':
            cf.write('ENC_VOCAB = ' + str(index) + '\n')
        else:
            cf.write('DEC_VOCAB = ' + str(index) + '\n')
        
def load_vocab(vocab_path):
    """Read a vocab file (one word per line).

    Returns (words, word_to_id): the list of words in file order and a dict mapping
    each word to its line position.
    """
    with open(vocab_path, 'r') as vocab_file:
        words = vocab_file.read().splitlines()
    word_to_id = {word: position for position, word in enumerate(words)}
    return words, word_to_id

def sentence2id(vocab, line):
    """Tokenize `line` and return the vocab id of each token ('<unk>' id for unknown tokens)."""
    ids = []
    for token in basic_tokenizer(line):
        ids.append(vocab.get(token, vocab['<unk>']))
    return ids

def token2id(data, mode):
    """ Convert all the tokens in the data into their corresponding
    index in the vocabulary.

    data : file stem, e.g. 'train' or 'test'
    mode : 'enc' or 'dec'; selects 'vocab.<mode>' and the input file '<data>.<mode>'

    Writes '<data>_ids.<mode>' with one line of space-separated ids per input line.
    """
    vocab_path = 'vocab.' + mode
    in_path = data + '.' + mode
    out_path = data + '_ids.' + mode

    _, vocab = load_vocab(vocab_path)
    # Context managers make sure both files are closed, and the output flushed, on return.
    with open(in_path, 'r') as in_file, open(out_path, 'w') as out_file:
        for line in in_file.read().splitlines():
            # Only decoder sequences are wrapped in the start and end markers.
            if mode == 'dec':
                ids = [vocab['<s>']]
            else:
                ids = []
            ids.extend(sentence2id(vocab, line))
            if mode == 'dec':
                # Same text as the former '<\s>' literal, as a raw string to avoid the
                # invalid-escape warning; it must match what build_vocab writes.
                ids.append(vocab[r'<\s>'])
            out_file.write(' '.join(str(id_) for id_ in ids) + '\n')

In [None]:
def process_data():
    """Build both vocabularies from the training files, then convert the train and test files to ids."""
    print('Preparing data to be model-ready ...')
    for mode in ('enc', 'dec'):
        build_vocab('train.' + mode)
    for split in ('train', 'test'):
        for mode in ('enc', 'dec'):
            token2id(split, mode)

In [None]:
process_data()

In [None]:
!ls

In [None]:
with open('config.py', 'r') as f:
    print(f.read())

In [None]:
with open('vocab.enc', 'r') as f:
    print(f.read())

In [None]:
with open('vocab.dec', 'r') as f:
    print(f.read())

In [None]:
upload_file_cos('', 'config.py', 'config.py')

In [None]:
# Upload every generated artifact under its own file name, in the same order as before.
for artifact in ('test.enc', 'test.dec',
                 'test_ids.enc', 'test_ids.dec',
                 'train_ids.enc', 'train_ids.dec',
                 'vocab.enc', 'vocab.dec',
                 'train.enc', 'train.dec'):
    upload_file_cos('', artifact, artifact)