In [23]:
import ast
import re
from string import punctuation

import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\weiyu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\weiyu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\weiyu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
def load_lines(filename, encoding='iso-8859-1'):
    """Return every line of the text file at ``filename`` as a list of strings.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            ``'iso-8859-1'``, the encoding this function previously hard-coded.

    Returns:
        list[str]: The file's lines in order, each keeping its trailing
        newline as produced by ``readlines``.
    """
    with open(filename, 'r', encoding=encoding) as file:
        return file.readlines()

def load_conversations(filename, encoding='iso-8859-1'):
    """Return every line of the conversations file at ``filename``.

    Reading is identical to ``load_lines``, so this delegates to it instead
    of duplicating the file-handling code.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            ``'iso-8859-1'``.

    Returns:
        list[str]: The file's lines in order, each keeping its trailing newline.
    """
    return load_lines(filename, encoding=encoding)

def create_lines_dictionary(lines):
    """Map each line ID to its utterance text.

    Every element of ``lines`` is split on the ``+++$+++`` delimiter. Records
    that yield five fields contribute one entry: the first field (stripped)
    is the key and the fifth field (stripped) is the value. Records with
    fewer fields are skipped. If an ID occurs more than once, the last
    occurrence wins.

    Args:
        lines: Iterable of raw record strings.

    Returns:
        dict[str, str]: Line ID -> utterance text.
    """
    lines_dict = {}
    for line in lines:
        # maxsplit=4 keeps a delimiter that happens to appear inside the
        # utterance as part of the text; an unbounded split would produce
        # more than five parts and the record would be silently dropped.
        parts = line.split('+++$+++', 4)
        if len(parts) == 5:
            lines_dict[parts[0].strip()] = parts[4].strip()
    return lines_dict

def create_dialogue_pairs(lines_dict, conversations):
    """Build (input, target) utterance pairs from adjacent conversation lines.

    ``conversations`` is passed to ``extract_conversations`` to obtain one
    sequence of line IDs per conversation. Within each sequence, every ID is
    paired with the ID that follows it, and both are looked up in
    ``lines_dict`` and stripped of surrounding whitespace.

    Args:
        lines_dict: Mapping of line ID to utterance text.
        conversations: Raw conversation records.

    Returns:
        list[tuple[str, str]]: One ``(input_line, target_line)`` tuple per
        adjacent pair of IDs, in file order.

    Raises:
        KeyError: If a referenced line ID is not present in ``lines_dict``.
    """
    pairs = []
    for line_ids in extract_conversations(conversations):
        # Walk the conversation as overlapping (current, next) windows.
        for current_id, next_id in zip(line_ids, line_ids[1:]):
            pairs.append(
                (lines_dict[current_id].strip(), lines_dict[next_id].strip())
            )
    return pairs

def extract_conversations(conversations):
    """Parse the list of line IDs stored in the last field of each record.

    Each record is split on ``' +++$+++ '`` and its final field is parsed as
    a Python literal (for example ``"['L194', 'L195']"``).

    Args:
        conversations: Iterable of raw conversation record strings.

    Returns:
        list: One parsed value per non-blank record, in input order.

    Raises:
        ValueError: If the final field is not a plain Python literal.
        SyntaxError: If the final field is not valid Python syntax.
    """
    conversation_list = []
    for line in conversations:
        id_field = line.split(' +++$+++ ')[-1].strip()
        if not id_field:
            # Blank records (such as a trailing empty line) carry no IDs.
            continue
        # literal_eval only accepts literals, so unlike eval() it cannot
        # execute code that might be embedded in the data file.
        conversation_list.append(ast.literal_eval(id_field))
    return conversation_list


In [25]:

def clean_and_tokenize_text(text):
    """Normalise ``text`` and split it into word tokens.

    Steps, in order:
      1. lower-case the text;
      2. expand contractions with ``contractions.fix``;
      3. delete every character listed in ``string.punctuation``;
      4. tokenize with ``word_tokenize``;
      5. strip each token and drop any that end up empty.

    No stop-word removal or lemmatization is performed.

    Args:
        text: The raw utterance string.

    Returns:
        list[str]: The cleaned tokens.
    """
    expanded = contractions.fix(text.lower())
    # Punctuation is removed only after expansion -- presumably so that
    # apostrophes are still present when contractions.fix runs.
    without_punctuation = expanded.translate(str.maketrans('', '', punctuation))
    stripped_tokens = (token.strip() for token in word_tokenize(without_punctuation))
    return [token for token in stripped_tokens if token]

def preprocess_dialogue(dialogue_pairs, min_length=2, max_length=100):
    """Tokenize every dialogue pair and keep those within the length bounds.

    Both sides of each pair are run through ``clean_and_tokenize_text``. A
    pair is kept only when the token count of each side lies in
    ``[min_length, max_length]`` (inclusive). Progress is shown with a
    ``tqdm`` bar labelled "Preprocessing".

    Args:
        dialogue_pairs: Sized sequence of ``(input_text, target_text)`` pairs.
        min_length: Smallest allowed number of tokens per side.
        max_length: Largest allowed number of tokens per side.

    Returns:
        list[tuple[list[str], list[str]]]: The surviving
        ``(input_tokens, target_tokens)`` pairs, in input order.
    """
    kept_pairs = []
    progress = tqdm(dialogue_pairs, total=len(dialogue_pairs), desc="Preprocessing")
    for pair in progress:
        # Tokenize both sides up front; the bar advances once per pair
        # regardless of whether the pair is kept.
        tokenized = (
            clean_and_tokenize_text(pair[0]),
            clean_and_tokenize_text(pair[1]),
        )
        if all(min_length <= len(tokens) <= max_length for tokens in tokenized):
            kept_pairs.append(tokenized)
    return kept_pairs

In [13]:
# Read both raw corpus files into memory as lists of lines.
# NOTE(review): this cell's execution count is out of sequence with the
# cells around it -- restart the kernel and run all cells to confirm the
# notebook still works from a fresh state.
lines = load_lines('../data/movie_lines.txt')
conversations = load_conversations('../data/movie_conversations.txt')
# Report how many records each file contains.
print(len(lines))
print(len(conversations))

304713
83097


In [26]:
# Index utterances by line ID, then pair each utterance with the one that
# follows it in the same conversation.
lines_dict = create_lines_dictionary(lines)
dialogue_pairs = create_dialogue_pairs(lines_dict, conversations)
# Size check: number of indexed lines and number of (input, target) pairs.
print(len(lines_dict))
print(len(dialogue_pairs))

304713
221616


In [27]:
# Preview the first ten raw (input, target) pairs.
dialogue_pairs[:10]

[('Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."),
 ("Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.'),
 ('Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"),
 ("You're asking me out.  That's so cute. What's your name again?",
  'Forget it.'),
 ("No, no, it's my fault -- we didn't have a proper introduction ---",
  'Cameron.'),
 ('Cameron.',
  "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does."),
 ("The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
  'Seems like she could get a date easy enough...'),
 (

In [28]:
# Tokenize every pair and drop those outside the default length bounds.
preprocessed_dialogue_pairs = preprocess_dialogue(dialogue_pairs)
# Number of pairs that passed the length filter.
print(len(preprocessed_dialogue_pairs))

Preprocessing: 100%|██████████| 221616/221616 [01:13<00:00, 3013.54it/s]

187805





In [29]:
# Preview the first ten tokenized (input_tokens, target_tokens) pairs.
for pair in preprocessed_dialogue_pairs[:10]:
    print(pair)

(['can', 'we', 'make', 'this', 'quick', 'roxanne', 'korrine', 'and', 'andrew', 'barrett', 'are', 'having', 'an', 'incredibly', 'horrendous', 'public', 'break', 'up', 'on', 'the', 'quad', 'again'], ['well', 'i', 'thought', 'we', 'would', 'start', 'with', 'pronunciation', 'if', 'that', 'is', 'okay', 'with', 'you'])
(['well', 'i', 'thought', 'we', 'would', 'start', 'with', 'pronunciation', 'if', 'that', 'is', 'okay', 'with', 'you'], ['not', 'the', 'hacking', 'and', 'gagging', 'and', 'spitting', 'part', 'please'])
(['not', 'the', 'hacking', 'and', 'gagging', 'and', 'spitting', 'part', 'please'], ['okay', 'then', 'how', 'bout', 'we', 'try', 'out', 'some', 'french', 'cuisine', 'saturday', 'night'])
(['you', 'are', 'asking', 'me', 'out', 'that', 'is', 'so', 'cute', 'what', 'is', 'your', 'name', 'again'], ['forget', 'it'])
(['the', 'thing', 'is', 'cameron', 'i', 'am', 'at', 'the', 'mercy', 'of', 'a', 'particularly', 'hideous', 'breed', 'of', 'loser', 'my', 'sister', 'i', 'can', 'not', 'date', 

In [30]:
import pickle

# Persist the preprocessed dialogue pairs to disk as a pickle file.
# NOTE: only unpickle this file from a trusted location -- pickle.load can
# execute arbitrary code.
with open('../data/preprocessed_dialogue_pairs.pkl', 'wb') as file:
    pickle.dump(preprocessed_dialogue_pairs, file)