In [7]:
def load_lines(filename):
    """Read a text file and return its lines as a list.

    The file is decoded as ISO-8859-1. Each returned string keeps its
    trailing newline (when the line has one).

    Args:
        filename: Path of the file to read.

    Returns:
        A list of strings, one per line, in file order.
    """
    with open(filename, 'r', encoding='iso-8859-1') as handle:
        # Iterating a text handle yields the same lines readlines() would.
        return list(handle)

def load_conversations(filename):
    """Read the conversations file and return its raw lines.

    The file is decoded as ISO-8859-1 and no parsing is done here; each
    returned string keeps its trailing newline (when the line has one).

    Args:
        filename: Path of the file to read.

    Returns:
        A list of strings, one per line, in file order.
    """
    with open(filename, 'r', encoding='iso-8859-1') as handle:
        raw_rows = [row for row in handle]
    return raw_rows

def extract_conversations(conversations):
    """Parse the line-id list out of each raw conversation row.

    Each row is split on ' +++$+++ ' and its last field is parsed as a
    Python literal (e.g. "['L194', 'L195']").

    Args:
        conversations: Iterable of raw row strings.

    Returns:
        A list with one parsed value per non-blank row, in input order.

    Raises:
        ValueError, SyntaxError: If a row's last field is not a valid
            Python literal.
    """
    # Local import keeps this definition self-contained within its cell.
    import ast

    conversation_list = []
    for line in conversations:
        # A blank row (e.g. a trailing empty line) carries no id list.
        if not line.strip():
            continue
        id_field = line.split(' +++$+++ ')[-1].strip()
        # literal_eval only accepts literals, so file content can never
        # execute code the way eval() would.
        conversation_list.append(ast.literal_eval(id_field))
    return conversation_list

def create_lines_dictionary(lines):
    """Map each line id to its utterance text.

    A row is split on '+++$+++'. Rows that do not yield exactly five
    fields are ignored. The first field (stripped) becomes the key and
    the fifth field (stripped) becomes the value; a repeated key keeps
    the value from the last row seen.

    Args:
        lines: Iterable of raw row strings.

    Returns:
        A dict from line id to text.
    """
    id_to_text = {}
    for row in lines:
        fields = row.split('+++$+++')
        if len(fields) != 5:
            continue
        key, utterance = fields[0], fields[4]
        id_to_text[key.strip()] = utterance.strip()
    return id_to_text

def create_dialogue_pairs(lines_dict, conversations):
    """Build (input, target) text pairs from consecutive conversation lines.

    The raw rows are parsed with extract_conversations; within each
    resulting id sequence, every id is paired with the id that follows it.
    Both texts are looked up in ``lines_dict`` and stripped.

    Args:
        lines_dict: Mapping from line id to text.
        conversations: Raw conversation rows, passed to
            extract_conversations.

    Returns:
        A list of (input_text, target_text) tuples.

    Raises:
        KeyError: If a referenced line id is not in ``lines_dict``.
    """
    pairs = []
    for line_ids in extract_conversations(conversations):
        # Walk adjacent ids: (first, second), (second, third), ...
        for prompt_id, reply_id in zip(line_ids, line_ids[1:]):
            prompt = lines_dict[prompt_id].strip()
            reply = lines_dict[reply_id].strip()
            pairs.append((prompt, reply))
    return pairs

In [3]:
import re
import nltk
# Fetch the 'punkt' resource; presumably required by the nltk.word_tokenize
# call in tokenize() -- confirm against the installed NLTK version.
nltk.download('punkt')

def clean_text(text):
    """Normalize an utterance: lower-case, expand contractions, drop punctuation.

    The substitutions below are applied one after another in the listed
    order. The patterns carry no word-boundary anchors, so they also match
    inside longer words. Apostrophes are not in the removed punctuation
    set, and only runs of two or more whitespace characters are collapsed
    (to a single space); the result is not stripped.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"[-()\"#/@;:<>{}`+=~|.!?,]", ""),
        (r"\s{2,}", " "),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def tokenize(text):
    """Split ``text`` into tokens by delegating to ``nltk.word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for ``text``.
    """
    tokens = nltk.word_tokenize(text)
    return tokens

def preprocess_dialogue(dialogue_pairs):
    """Clean and tokenize both sides of every dialogue pair.

    For each pair, elements 0 and 1 are passed through clean_text and the
    cleaned strings are then passed through tokenize.

    Args:
        dialogue_pairs: Iterable of indexable pairs (input_text, target_text).

    Returns:
        A list of (input_tokens, target_tokens) tuples, in input order.
    """
    processed = []
    for pair in dialogue_pairs:
        # Clean both sides first, then tokenize them in the same order.
        cleaned_sides = [clean_text(pair[0]), clean_text(pair[1])]
        processed.append(tuple(tokenize(side) for side in cleaned_sides))
    return processed

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\weiyu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [5]:
# Read both input files into lists of raw text lines. The paths are relative,
# so they resolve against the notebook's working directory.
lines = load_lines('../data/movie_lines.txt')
conversations = load_conversations('../data/movie_conversations.txt')

In [9]:
# Pipeline: build the line-id -> text lookup, pair consecutive lines of each
# conversation, then clean and tokenize both sides of every pair.
lines_dict = create_lines_dictionary(lines)
dialogue_pairs = create_dialogue_pairs(lines_dict, conversations)
preprocessed_dialogue_pairs = preprocess_dialogue(dialogue_pairs)

In [10]:
import pickle

# Save the preprocessed dialogue pairs as a pickle file next to the input data.
# NOTE(review): pickle files should only be loaded back from trusted sources.
with open('../data/preprocessed_dialogue_pairs.pkl', 'wb') as file:
    pickle.dump(preprocessed_dialogue_pairs, file)

In [11]:
# Print the first ten (input_tokens, target_tokens) pairs as a quick visual check.
for pair in preprocessed_dialogue_pairs[:10]:
    print(pair)

(['can', 'we', 'make', 'this', 'quick', 'roxanne', 'korrine', 'and', 'andrew', 'barrett', 'are', 'having', 'an', 'incredibly', 'horrendous', 'public', 'break', 'up', 'on', 'the', 'quad', 'again'], ['well', 'i', 'thought', 'we', 'would', 'start', 'with', 'pronunciation', 'if', 'that', 'is', 'okay', 'with', 'you'])
(['well', 'i', 'thought', 'we', 'would', 'start', 'with', 'pronunciation', 'if', 'that', 'is', 'okay', 'with', 'you'], ['not', 'the', 'hacking', 'and', 'gagging', 'and', 'spitting', 'part', 'please'])
(['not', 'the', 'hacking', 'and', 'gagging', 'and', 'spitting', 'part', 'please'], ['okay', 'then', 'how', "'bout", 'we', 'try', 'out', 'some', 'french', 'cuisine', 'saturday', 'night'])
(['you', 'are', 'asking', 'me', 'out', 'that', 'is', 'so', 'cute', 'what', 'is', 'your', 'name', 'again'], ['forget', 'it'])
(['no', 'no', 'it', "'s", 'my', 'fault', 'we', 'did', "n't", 'have', 'a', 'proper', 'introduction'], ['cameron'])
(['cameron'], ['the', 'thing', 'is', 'cameron', 'i', 'am',