In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os
import json 
import pandas as pd
import random
import sys
import time 
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
#import ipywidgets as widgets
#from IPython.display import display, clear_output
import gradio as gr

In [2]:


# Print Python version and working directory for troubleshooting
print(f"Python version: {sys.version}")
print(f"Current working directory: {os.getcwd()}")

# Define the NLTK data directory in the current working directory
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
print(f"Using NLTK data directory: {nltk_data_dir}")

# Add our directory to the beginning of NLTK's search path
nltk.data.path.insert(0, nltk_data_dir)

# Function to download and verify a specific NLTK resource
def download_and_verify(package_name):
    print(f"\nDownloading {package_name}...")
    download_result = nltk.download(package_name, download_dir=nltk_data_dir, quiet=False)
    print(f"Download result: {download_result}")
    
    # Give filesystem time to update
    time.sleep(1)
    
    # Verify the package exists in our directory
    package_path = os.path.join(nltk_data_dir, *package_name.split('/'))
    if os.path.exists(package_path) or os.path.exists(package_path + '.zip'):
        print(f"✓ Verified {package_name} exists at {package_path}")
        return True
    else:
        print(f"✗ Could not find {package_name} at {package_path}")
        return False

# Download both resources
punkt_success = download_and_verify("tokenizers/punkt")
stopwords_success = download_and_verify("corpora/stopwords")

# Now try to use the resources
if punkt_success:
    try:
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize("This is a test. This is another test.")
        print(f"✓ Punkt tokenizer working! Found {len(sentences)} sentences.")
    except Exception as e:
        print(f"✗ Error using punkt tokenizer: {str(e)}")

if stopwords_success:
    try:
        from nltk.corpus import stopwords
        stop_words = stopwords.words('english')
        print(f"✓ Stopwords working! Found {len(stop_words)} stopwords.")
    except Exception as e:
        print(f"✗ Error using stopwords: {str(e)}")

print("\nFinal NLTK data path:")
for path in nltk.data.path:
    print(f"- {path}")

Python version: 3.11.0 (main, Oct 24 2022, 18:26:48) [MSC v.1933 64 bit (AMD64)]
Current working directory: c:\GitHub\peckham_daz_natural_language_processing\supacell_chatbot
Using NLTK data directory: c:\GitHub\peckham_daz_natural_language_processing\supacell_chatbot\nltk_data

Downloading tokenizers/punkt...


[nltk_data] Error loading tokenizers/punkt: Package 'tokenizers/punkt'
[nltk_data]     not found in index


Download result: False
✓ Verified tokenizers/punkt exists at c:\GitHub\peckham_daz_natural_language_processing\supacell_chatbot\nltk_data\tokenizers\punkt

Downloading corpora/stopwords...
Download result: False


[nltk_data] Error loading corpora/stopwords: Package
[nltk_data]     'corpora/stopwords' not found in index


✓ Verified corpora/stopwords exists at c:\GitHub\peckham_daz_natural_language_processing\supacell_chatbot\nltk_data\corpora\stopwords
✓ Punkt tokenizer working! Found 2 sentences.
✓ Stopwords working! Found 198 stopwords.

Final NLTK data path:
- c:\GitHub\peckham_daz_natural_language_processing\supacell_chatbot\nltk_data
- C:\Users\toyin/nltk_data
- c:\GitHub\peckham_daz_natural_language_processing\supacell_chatbot\venv\nltk_data
- c:\GitHub\peckham_daz_natural_language_processing\supacell_chatbot\venv\share\nltk_data
- c:\GitHub\peckham_daz_natural_language_processing\supacell_chatbot\venv\lib\nltk_data
- C:\Users\toyin\AppData\Roaming\nltk_data
- C:\nltk_data
- D:\nltk_data
- E:\nltk_data


In [3]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\toyin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:

def parse_character_profiles(file_path):
    """Parse character profiles from the text file"""
    characters = {}
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        # Split by character names
        character_blocks = re.split(r'(Michael Lasaki-Brown|Tayo "Tazer" Amusan|Sabrina Clarke|Andre Simpson|Rodney Cullen)', content)
        
        # Process each block
        current_character = None
        for block in character_blocks:
            block = block.strip()
            if block in ["Michael Lasaki-Brown", "Tayo \"Tazer\" Amusan", "Sabrina Clarke", "Andre Simpson", "Rodney Cullen"]:
                current_character = block
                characters[current_character] = ""
            elif current_character and block:
                characters[current_character] = block
        
        return characters
        
    except Exception as e:
        print(f"Error parsing profiles: {str(e)}")
        return {}

# Parse profiles
profiles_file = "character_profiles.txt"
if os.path.exists(profiles_file):
    character_profiles = parse_character_profiles(profiles_file)
    print(f"Loaded profiles for {len(character_profiles)} characters")
else:
    print(f"Profile file not found: {profiles_file}")
    character_profiles = {}

Loaded profiles for 5 characters


In [5]:
# Create character info dictionary
character_info = {}

for character, profile in character_profiles.items():
    # Format profile for better prompting
    formatted_profile = f"Character: {character}\n\n{profile}"
    
    # Simplify character name for display
    display_name = character.split()[0]  # Just use first name
    
    # Store in dictionary
    character_info[character] = formatted_profile
    
    print(f"Processed profile for {display_name}")

# If no profiles were loaded, use defaults
if not character_info:
    character_info = {
        'Michael Lasaki-Brown': 'Character who can teleport and time travel, trying to save Dionne',
        'Tayo "Tazer" Amusan': 'Gang leader with invisibility powers',
        'Sabrina Clarke': 'Nurse with telekinesis abilities',
        'Andre Simpson': 'Father with super strength trying to protect his son',
        'Rodney Cullen': 'Drug dealer with super-speed and healing powers'
    }

# List available characters
characters_available = list(character_info.keys())
print(f"Available characters: {', '.join(characters_available)}")

Processed profile for Michael
Processed profile for Tayo
Processed profile for Rodney
Processed profile for Sabrina
Processed profile for Andre
Available characters: Michael Lasaki-Brown, Tayo "Tazer" Amusan, Rodney Cullen, Sabrina Clarke, Andre Simpson


In [6]:
# Load character dialogue from text files
def load_character_dialogues():
    """Load dialogue files for each character."""
    dialogue_files = {
        "Michael Lasaki-Brown": "michael_dialogue.txt",
        "Tayo \"Tazer\" Amusan": "tazer_dialogue.txt",
        "Sabrina Clarke": "sabrina_dialogue.txt",
        "Andre Simpson": "andre_dialogue.txt",
        "Rodney Cullen": "rodney_dialogue.txt"
    }
    
    character_dialogues = {}
    
    for character, filename in dialogue_files.items():
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                dialogue = file.read()
                character_dialogues[character] = dialogue
                print(f"✓ Loaded dialogue for {character} ({len(dialogue)} characters)")
        except FileNotFoundError:
            print(f"✗ Warning: {filename} not found")
            character_dialogues[character] = ""
        except Exception as e:
            print(f"✗ Error loading {filename}: {str(e)}")
            character_dialogues[character] = ""
    
    return character_dialogues

def get_relevant_dialogue_samples(character, query, num_samples=3):
    """Get relevant dialogue samples based on query keywords."""
    dialogue = character_dialogues.get(character, "")
    
    if not dialogue:
        return ""
    
    # Split dialogue into lines
    lines = [line.strip() for line in dialogue.split('\n') if line.strip()]
    
    # If query contains keywords, try to find relevant lines
    query_lower = query.lower()
    query_words = set(query_lower.split())
    
    # Score each line based on keyword matches
    scored_lines = []
    for line in lines:
        line_lower = line.lower()
        matches = sum(1 for word in query_words if word in line_lower)
        if matches > 0:
            scored_lines.append((matches, line))
    
    # Sort by relevance and get top samples
    if scored_lines:
        scored_lines.sort(reverse=True, key=lambda x: x[0])
        relevant_lines = [line for _, line in scored_lines[:num_samples]]
    else:
        # If no relevant matches, get random samples
        relevant_lines = random.sample(lines, min(num_samples, len(lines)))
    
    return "\n".join(relevant_lines)

# Load all dialogues
print("\nLoading character dialogue files...")
character_dialogues = load_character_dialogues()

print(f"\nDialogue loaded for {len([d for d in character_dialogues.values() if d])} characters")


Loading character dialogue files...
✓ Loaded dialogue for Michael Lasaki-Brown (1838 characters)
✓ Loaded dialogue for Tayo "Tazer" Amusan (1532 characters)
✓ Loaded dialogue for Sabrina Clarke (1601 characters)
✓ Loaded dialogue for Andre Simpson (2043 characters)
✓ Loaded dialogue for Rodney Cullen (1992 characters)

Dialogue loaded for 5 characters


In [7]:
# Define character speech patterns for more authentic responses
character_speech = {
    "Michael Lasaki-Brown": {
        "prefixes": ["Listen, ", "I need to tell you, ", ""],
        "fillers": [", yeah?", ", I swear", ""],
        "topics": ["Dionne", "future", "teleportation", "powers", "saving"]
    },
    "Tayo \"Tazer\" Amusan": {
        "prefixes": ["", "Yo, ", "Look man, "],
        "fillers": [", bruv", ", fam", ", man", ", yeah?"],
        "topics": ["gang", "invisibility", "Tower Boys", "respect", "money"]
    },
    "Sabrina Clarke": {
        "prefixes": ["", "I ", ""],
        "fillers": ["", ", seriously", ", honestly"],
        "topics": ["telekinesis", "Sharleen", "nurse", "control", "patient"]
    },
    "Andre Simpson": {
        "prefixes": ["", "Look, ", "Man, "],
        "fillers": [", man", ", innit", ", yeah?"],
        "topics": ["AJ", "son", "strength", "job", "money"]
    },
    "Rodney Cullen": {
        "prefixes": ["Mate, ", "Listen, ", ""],
        "fillers": [", bruv", ", yeah?", ", you get me?"],
        "topics": ["speed", "deal", "business", "run", "Spud"]
    }
}

# Function to make responses sound like the character
def apply_speech_pattern(character, text):
    """Apply character's speech pattern to make response more authentic"""
    
    if character not in character_speech:
        return text
    
    speech = character_speech[character]
    
    # 30% chance to add a prefix if text doesn't already have one
    if random.random() < 0.3 and not any(text.startswith(p) for p in speech["prefixes"] if p):
        prefix = random.choice(speech["prefixes"])
        if prefix:
            text = prefix + text[0].lower() + text[1:]
    
    # 40% chance to add a filler
    if random.random() < 0.4:
        filler = random.choice(speech["fillers"])
        if filler:
            # Find a sentence break to insert the filler
            sentences = text.split(". ")
            if len(sentences) > 1:
                pos = random.randint(0, len(sentences)-1)
                sentences[pos] = sentences[pos] + filler
                text = ". ".join(sentences)
            else:
                text = text + filler
    
    return text

In [8]:

print("Loading dialogue model...")
model_name = "google/flan-t5-small" # Model that runs fast on CPU and doesn't require me to install CUDA Toolkit 

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move to GPU if available
if torch.cuda.is_available():
    model = model.to("cuda")
    print("Model moved to GPU")
else:
    print("Using CPU for inference (this might be slower)")

print(f"Model loaded: {model_name}")



Loading dialogue model...
Using CPU for inference (this might be slower)
Model loaded: google/flan-t5-small


In [9]:
# Function to get pre-defined responses for common questions
def get_templated_response(character, query):
    """Get pre-defined responses for common questions"""
    query_lower = query.lower()
    
    # Character powers
    if any(word in query_lower for word in ["power", "ability", "superpower", "what can you do"]):
        power_responses = {
            "Michael Lasaki-Brown": "I can teleport - move from one place to another in seconds. I can also manipulate time, but I don't have full control over that yet. It started when I got stabbed by Tazer, and I rewound time without even meaning to. My eyes glow yellow when I use my powers, yeah?",
            
            "Tayo \"Tazer\" Amusan": "I can turn invisible, fam. Nobody can see me when it happens. First time was like a week ago in my room. My eyes started tingling, checked the mirror, there was nobody there. I use it when I need to, yeah? Powers can run out though if I overuse them.",
            
            "Sabrina Clarke": "I have telekinesis - I can move objects without touching them. I discovered it when I found out my boyfriend Kevin was cheating. I was so angry that I threw him against a wall without even touching him. I'd rather not have these abilities. I've already killed someone by accident.",
            
            "Andre Simpson": "I've got super strength, man. First happened at that ATM when I was desperate for cash. I can bend metal, lift cars, things like that. Still figuring out what my limits are. My eyes go yellow when I use it.",
            
            "Rodney Cullen": "I've got super-speed, mate! I accidentally ran all the way to Scotland in like a minute. And I heal fast too. The speed thing gives me proper munchies though, you get me? Always starving after using it."
        }
        return power_responses.get(character, "")
    
    # Origin stories
    elif any(word in query_lower for word in ["discover", "first time", "how did you get", "when did you", "find out"]):
        origin_responses = {
            "Michael Lasaki-Brown": "It happened when I got stabbed by Tazer. Somehow, I rewound time and avoided it completely. Then I started teleporting without even trying to. I even saw the future - July 9, 2024. That's when... Listen, I need to find the others with powers to prevent something terrible from happening.",
            
            "Tayo \"Tazer\" Amusan": "First time was in my room, bruv. Eyes started tingling, looked in the mirror, and couldn't see myself. It was mad. Then I used it to take care of some Sixers, know what I'm saying? Now I can control it proper.",
            
            "Sabrina Clarke": "I discovered it in the worst possible way. I found out Kevin was cheating on me. I was so angry, and suddenly he was thrown against the door without me touching him. I've never called in sick before that day. I just wanted things to go back to normal.",
            
            "Andre Simpson": "It happened at the ATM, man. Got fired from my call centre job because of my record. Couldn't get money for my son AJ. I got so frustrated I punched the ATM, and my fist went straight through it. Took all the cash to provide for my boy.",
            
            "Rodney Cullen": "Listen, I was late for a drop, yeah? Needed to be quick, and suddenly I'm in Scotland, bruv! Took me like a minute to get there! At first, I couldn't control it, but now I'm using it for business. Five minutes or free delivery, you get me?"
        }
        return origin_responses.get(character, "")
    
    # Relationships with other characters
    elif any(name.lower() in query_lower for name in ["michael", "tazer", "sabrina", "andre", "rodney", "dionne", "spud", "aj", "sharleen", "krazy"]):
        # Find which character is mentioned
        mentioned_chars = [name for name in ["michael", "tazer", "sabrina", "andre", "rodney", "dionne", "spud", "aj", "sharleen", "krazy"] if name.lower() in query_lower]
        if mentioned_chars:
            mentioned = mentioned_chars[0]
            
            # Define relationship responses
            relationships = {
                ("Michael Lasaki-Brown", "tazer"): "Tazer stabbed me in a timeline that got erased when my powers first manifested. I remember it, but he doesn't. Now I'm trying to convince him to join me, but he's suspicious and hostile.",
                ("Michael Lasaki-Brown", "sabrina"): "I've warned Sabrina about the hooded figures and told her we need to work together, but she's reluctant. She has telekinesis and could fly in the future. She says she can't fly now though.",
                ("Michael Lasaki-Brown", "andre"): "I haven't properly met Andre yet. He once held a door open for me, but I don't think he even remembers that. I know he has super strength but that's it.",
                ("Michael Lasaki-Brown", "rodney"): "Rodney tried selling me drugs before his powers manifested. Later, I saved him from one of those hooded figures with pyrokinesis. He's dismissive about my warnings though - thinks it's a 'me problem'.",
                ("Michael Lasaki-Brown", "dionne"): "Dionne is my fiancée. I love her more than anything. In the future I saw, she dies on July 9, 2024. That's why I'm trying to find everyone with powers - to prevent that from happening. I haven't told her about it.",
                
                ("Tayo \"Tazer\" Amusan", "michael"): "That delivery guy? He came to me talking about powers and shit. Don't know how he knew I have powers. Saw him teleport in front of me. Had me shook for a bit. He's on about some hooded figures coming after us. Don't trust him, to be honest.",
                ("Tayo \"Tazer\" Amusan", "krazy"): "Krazy? He used to run the Tower Boys before me. I looked up to him, fam. But then he came back and shot Tiny - one of my brothers. The doctor said Tiny might not walk again. Krazy's not my brother no more. Next time I see him, I'm gonna kill him.",
                ("Sabrina Clarke", "michael"): "Michael approached me claiming he has powers too. I didn't believe him until I saw his eyes glow yellow like mine. I saw him teleport out of my car, it was mad. He warned me about some hooded figures hunting people with abilities. I'm skeptical, but after what happened with Kadeem, I don't know what to believe anymore.",
                ("Sabrina Clarke", "sharleen"): "Sharleen is my sister. I'm very protective of her - especially after what happened with Kadeem. I wish she'd stay away from guys like Krazy. She deserves better. I'd do anything to keep her safe.",
                ("Andre Simpson", "michael"): "Don't really know him. Think I held a door for him once. Got my own problems to deal with, trying to provide for AJ and stay out of trouble. ",
                ("Andre Simpson", "aj"): "AJ is my son, man. The most important person in my life. I was in prison for a while, so I'm trying to make up for lost time. His mother Aisha's with a guy named Dwayne now. I'm determined to keep AJ away from gang life.",
                ("Rodney Cullen", "michael"): "That guy? Tried to sell him some product before all this power madness. Then he comes saving me from some fire-throwing nutter in a hood. Now he's on about needing my help with some mission. Sounds like a 'him' problem, you get me?",
                ("Rodney Cullen", "spud"): "Spud's my best mate and business partner. We run the drug game together. He's the one who doubted my superspeed at first but now he's treating me like I'm the fucking Flash. It's proper annoying but I can't blame him. I mean, I did run to Scotland in a minute. He's a muppet but I love him."
            }
            
            return relationships.get((character, mentioned), "")
        
    elif any(word in query_lower for word in ["yellow eyes", "eyes glow", "eyes turn"]):
            # Define eye color responses
        eye_color_responses = {
            "Michael Lasaki-Brown": "My eyes glow yellow when I use my powers. It's a sign that I'm teleporting or manipulating time. It happens to all of us with powers.",
            "Tayo \"Tazer\" Amusan": "My eyes go yellow when I turn invisible, fam. First time it happened, they were just tingling. Next thing I know, I can't see myself in the mirror. It's a bit mad, innit?",
            "Sabrina Clarke": "My eyes glow yellow when I use my telekinesis. It's a bit scary, to be honest. I think it happens to everyone with powers. Kadeem and Michael's eyes glow yellow and they have powers. Those guys at Krazy's warehouse had yellow eyes too.",
            "Andre Simpson": "My eyes go yellow when I use my strength. Don't really understand why, but it happens every time.",
            "Rodney Cullen": "My eyes go proper yellow when I'm using my speed, bruv! Like a warning light or something, innit? Happens to all of us with these freaky powers as far as I can tell."
        }
        
        return eye_color_responses.get(character, "")
        
    elif any(phrase in query_lower for phrase in ["hooded", "figures", "fire", "pyrokinesis"]):
            # Define hooded figures responses
        hooded_responses = {
            "Michael Lasaki-Brown": "They're dangerous. I first saw them when I went to the future and now they're in the present. They have powers too - saw one that can create fire, another one can make portals. I need to find the others before it's too late.",
            "Tayo \"Tazer\" Amusan": "Some weird guys in hoods? Haven't seen them myself, fam. That Michael guy was going on about them coming after people with powers. Don't know if I believe that.",
            "Sabrina Clarke": "Michael mentioned something about hooded figures hunting people with powers. I haven't seen them personally and I'm not sure I believe everything he says. But after what happened with Kadeem, I don't know what to think.",
            "Andre Simpson": "Hooded figures? What are you talking about, man? I've got enough real problems without worrying about some made-up threat.",
            "Rodney Cullen": "Yeah, I saw one of those nutters! Had some fire powers or something. Would've been burnt to a crisp if Michael hadn't shown up, to be fair. Still not getting involved in all that. Got a business to run, you get me?"
        }
        
        return hooded_responses.get(character, "")

    
    return ""  # No template matched

# Modify response generation function to use templates
def generate_response(character, query, history=""):
    """Generate an in-character response with templates for common questions"""
    
    # Check for templated responses first
    templated_response = get_templated_response(character, query)
    if templated_response:
        # Apply speech pattern to ensure it sounds authentic
        return apply_speech_pattern(character, templated_response)
    
    # If no template matches the user's query, use the model to generate a response
    try:
       
        info = character_info.get(character, "")
        relevant_dialogue = get_relevant_dialogue_samples(character, query)
        prompt = f""" 
Generate a response as {character} from Supacell.

Character Information:
{info[:2000]}

Relevant Dialogue Samples:
{relevant_dialogue}

Previous conversation:
{history[-200:] if history else ""}

User: {query}

{character} should respond:"""
        
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs, 
                max_length=150, 
                num_return_sequences=1, 
                do_sample=True, 
                top_p=0.9, 
                temperature=0.7
                )
            
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Apply character's speech pattern to their response
        response = apply_speech_pattern(character, response)
        
        return response
        
    except Exception as e:
        print(f"Error: {str(e)}")
        return "Sorry, I'm having trouble responding right now."

In [10]:
# Create Gradio interface

def respond(message, history, character):
    """Handle chatbot response generation"""
    if not message.strip():
        return "", history
    
    # Build conversation history text
    history_text = ""
    for human, ai in history:
        history_text += f"User: {human}\n{character}: {ai}\n"
    
    # Generate response
    response = generate_response(character, message, history_text)
    
    # Update history
    history.append((message, response))
    return "", history

# Create Gradio interface to interact with Supacell characters
with gr.Blocks() as demo:
    gr.Markdown("# Supacell Character Chatbot")
    
    character_dropdown = gr.Dropdown(
        choices=characters_available,
        label="Select Character",
        value=characters_available[0] if characters_available else None
    )
    
    chatbot = gr.Chatbot(height=500)
    message = gr.Textbox(placeholder="Type your message here...")
    send = gr.Button("Send")
    clear = gr.Button("Clear")
    
    # Set up interactions
    send.click(respond, [message, chatbot, character_dropdown], [message, chatbot])
    message.submit(respond, [message, chatbot, character_dropdown], [message, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)
    
    # Add example questions
    gr.Examples(
        examples=[
            ["What's your power?"],
            ["How did you discover your abilities?"],
            ["What's your biggest fear?"],
            ["Tell me about the other characters"]
        ],
        inputs=message
    )

# Launch the interface
demo.launch()

  chatbot = gr.Chatbot(height=500)


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


