In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load pre-trained model and tokenizer
model_name = "microsoft/DialoGPT-medium"  # Other published sizes: 'DialoGPT-small' (faster) or 'DialoGPT-large' (bigger)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Token budget. `max_length` in generate() counts history + reply together, so
# the history fed to the model must stay below it or there is no room to answer.
max_total_tokens = 1000
reply_token_budget = 200
max_history_tokens = max_total_tokens - reply_token_budget

# Start conversation
chat_history_ids = None

# Loop for interactive conversation
while True:
    user_input = input("You: ")  # Take user input
    if user_input.strip().lower() == "exit":  # Exit condition (tolerates stray whitespace)
        break

    # Tokenize input and add eos token (DialoGPT separates turns with eos)
    new_user_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')

    # If there's a chat history, concatenate new input with previous history
    if chat_history_ids is None:
        bot_input_ids = new_user_input_ids
    else:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)

    # Keep only the most recent tokens so a long chat never exhausts max_length
    bot_input_ids = bot_input_ids[:, -max_history_tokens:]

    # Every position is a real token (no padding), so the mask is all ones
    attention_mask = torch.ones_like(bot_input_ids)

    # Generate response with attention mask
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=max_total_tokens,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=attention_mask,
        do_sample=True,  # Required for temperature/top_k to take effect; otherwise decoding is greedy
        no_repeat_ngram_size=3,  # Prevents repeating 3-gram sequences
        temperature=0.7,  # Controls the randomness of the model's responses
        top_k=50,  # Limits the sampling pool to the top 50 tokens
    )

    # Decode only the newly generated tokens (everything after the input) and print them
    response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print("Bot:", response)


You:  how are you?


Bot: I'm good, how are you?


You:  good. can you tell me a joke?


Bot: What's the heaviest soup in Asia? One ton.


You:  tell me another joke


Bot: What is the heaviest soap in the world? One Ton.


You:  exit


In [12]:
import os
import re
import json

# Location of the corpus files (adjust `data_dir` to where the files actually live)
data_dir = 'data'
lines_file, conversations_file = (
    os.path.join(data_dir, corpus_name)
    for corpus_name in ('movie_lines.txt', 'movie_conversations.txt')
)

# Load the lines from the movie_lines.txt file
def load_lines(file_path):
    """Read a ' +++$+++ '-delimited lines file into a dict.

    Args:
        file_path: Path to the file, read as ISO-8859-1 text.

    Returns:
        dict mapping the first field of each row (the line ID) to the
        whitespace-stripped fifth field (the dialogue text). Rows that do
        not split into exactly five fields are ignored.
    """
    separator = " +++$+++ "
    id_to_text = {}
    with open(file_path, 'r', encoding='iso-8859-1') as handle:
        for raw_row in handle:
            fields = raw_row.split(separator)
            if len(fields) != 5:
                continue  # malformed row: skip it
            id_to_text[fields[0]] = fields[4].strip()
    return id_to_text

# Load the conversations from the movie_conversations.txt file
def load_conversations(file_path):
    """Read a ' +++$+++ '-delimited conversations file into lists of line IDs.

    Args:
        file_path: Path to the file, read as ISO-8859-1 text.

    Returns:
        list with one entry per row that has exactly four fields; each entry
        is the fourth field parsed as a list (single quotes are swapped for
        double quotes so the field can be decoded as JSON).
    """
    separator = " +++$+++ "
    with open(file_path, 'r', encoding='iso-8859-1') as handle:
        split_rows = (raw_row.split(separator) for raw_row in handle)
        return [
            json.loads(fields[3].replace("'", '"'))
            for fields in split_rows
            if len(fields) == 4
        ]

# Parse both corpus files: utterance text keyed by line ID, then the ID lists per conversation
lines = load_lines(file_path=lines_file)
conversations = load_conversations(file_path=conversations_file)

# Create input-output pairs based on conversations
def create_conversation_pairs(conversations, lines):
    """Turn conversations into aligned (input, target) utterance lists.

    Each pair of consecutive line IDs in a conversation yields one training
    example: the earlier utterance is the input, the following one the target.

    Args:
        conversations: Iterable of sequences of line IDs, in spoken order.
        lines: Mapping from line ID to utterance text.

    Returns:
        Tuple ``(input_texts, target_texts)`` of equal-length lists.
        Pairs where either line ID is absent from ``lines`` are skipped
        instead of raising KeyError, because the loader drops malformed rows
        and so cannot guarantee that every referenced ID is present.
    """
    input_texts = []
    target_texts = []
    for conv in conversations:
        # zip over the sequence and its one-step shift walks adjacent turns;
        # conversations with fewer than two lines naturally yield nothing.
        for prompt_id, reply_id in zip(conv, conv[1:]):
            prompt_text = lines.get(prompt_id)
            reply_text = lines.get(reply_id)
            if prompt_text is None or reply_text is None:
                continue
            input_texts.append(prompt_text)
            target_texts.append(reply_text)
    return input_texts, target_texts

input_texts, target_texts = create_conversation_pairs(conversations=conversations, lines=lines)

# Sanity check: report how many pairs were built and show the first one
print(f"Total conversation pairs: {len(input_texts)}")
print("Sample input:", input_texts[0])
print("Sample target:", target_texts[0])


def preprocess_sentence(sentence):
    """Normalize an utterance to lowercase letters and spaced-out punctuation.

    Steps:
      1. Lowercase the text.
      2. Surround each of ``? . ! ,`` with spaces so they become separate tokens.
      3. Replace every run of other non-letter characters (digits, quotes,
         hyphens, whitespace, ...) with a single space.
      4. Strip leading/trailing whitespace.

    Note that apostrophes fall under step 3, so contractions are split
    (``"we'd"`` becomes ``"we d"``).

    Args:
        sentence: Raw utterance text.

    Returns:
        The normalized string, with no leading or trailing whitespace.
    """
    sentence = sentence.lower()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)  # Add space around punctuation
    # Collapses any run of disallowed characters, including repeated spaces
    # and quotes, so no separate whitespace-squeezing pass is needed.
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
    # Strip last: the padding above adds a space after trailing punctuation.
    return sentence.strip()

# Normalize both sides of every conversation pair
input_texts = list(map(preprocess_sentence, input_texts))
target_texts = list(map(preprocess_sentence, target_texts))

# Show the first pair again so the effect of preprocessing is visible
print("Preprocessed Sample input:", input_texts[0])
print("Preprocessed Sample target:", target_texts[0])


Total conversation pairs: 221616
Sample input: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Sample target: Well, I thought we'd start with pronunciation, if that's okay with you.
Preprocessed Sample input: can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again . 
Preprocessed Sample target: well , i thought we d start with pronunciation , if that s okay with you . 


In [14]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 ships without a padding token, so any call using padding="max_length"
# fails until one is assigned. Reusing eos is the usual choice for causal LMs.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)
# Keep the model config in sync so generate() knows which id is padding.
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenize the input and target texts
def tokenize_function(examples):
    """Tokenize ``examples`` with the module-level ``tokenizer``.

    Output is padded/truncated to 128 tokens and returned as PyTorch tensors.
    """
    encode_options = dict(
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    return tokenizer(examples, **encode_options)

# Prepare dataset for training
class CustomDataset(torch.utils.data.Dataset):
    """Causal-LM dataset of (input utterance, target utterance) pairs.

    Each example is encoded as one sequence,
    ``input + eos + target + eos``, and the labels are that same sequence.
    A causal LM shifts labels internally, so labels must be position-aligned
    with ``input_ids``; using a separately tokenized target as labels would
    pair each input position with an unrelated token.

    Args:
        input_texts: Sequence of prompt utterances.
        target_texts: Sequence of reply utterances, same length as ``input_texts``.
        tokenizer: Callable tokenizer exposing ``eos_token`` and ``pad_token``.
        max_length: Fixed length every example is padded/truncated to.

    Raises:
        ValueError: If the two text sequences differ in length.
    """

    def __init__(self, input_texts, target_texts, tokenizer, max_length=128):
        if len(input_texts) != len(target_texts):
            raise ValueError(
                f"input_texts ({len(input_texts)}) and target_texts "
                f"({len(target_texts)}) must have the same length"
            )
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Padding to max_length needs a pad token; fall back to eos when the
        # tokenizer has none. No-op if one is already configured.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        """Return 1-D ``input_ids``, ``attention_mask`` and ``labels`` tensors for one pair."""
        eos = self.tokenizer.eos_token
        dialogue = f"{self.input_texts[idx]}{eos}{self.target_texts[idx]}{eos}"
        encoded = self.tokenizer(
            dialogue,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch dim of 1; drop it so that
        # batching produces (batch, seq) rather than (batch, 1, seq).
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)
        # -100 is the index ignored by the loss, so padding is not trained on.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Create the dataset
train_dataset = CustomDataset(input_texts, target_texts, tokenizer)

# Define training arguments.
# No evaluation strategy is set: the Trainer below receives no eval_dataset,
# and requesting per-epoch evaluation without one makes the Trainer fail.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,  # Keep only the newest checkpoints so a long run does not fill the disk
)

# Data collator for causal language modeling: with mlm=False it batches the
# examples and sets labels to a copy of input_ids with pad positions ignored.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False  # GPT-2 doesn't use masked language modeling (MLM)
)

# Trainer for GPT-2.
# NOTE: Trainer with PyTorch needs the `accelerate` package installed
# (accelerate>=0.21.0 per the ImportError this cell previously produced).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
def generate_response(prompt, model, tokenizer, max_length=100):
    """Generate the model's continuation of ``prompt``.

    Args:
        prompt: Text to condition generation on.
        model: Causal LM exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``.
        max_length: Upper bound on total sequence length (prompt tokens
            plus generated tokens), passed through to ``generate``.

    Returns:
        The decoded newly generated text only; the prompt is not echoed back.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    # Move inputs to wherever the model lives: after training on a GPU the
    # model is on CUDA, and CPU inputs would raise a device-mismatch error.
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,  # explicit mask: pad id equals eos id here, so it cannot be inferred
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )
    # The output starts with the prompt tokens; keep only what was generated.
    generated_ids = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

# Smoke-test generation with a single open-ended question
prompt = "What do you think about artificial intelligence?"
response = generate_response(prompt=prompt, model=model, tokenizer=tokenizer)
print("Bot:", response)
