In [52]:
# Pip install required dependencies
! pip install openai
! pip install pinecone-client[grpc]

import os
import csv

import openai
import pinecone 
from kaggle_secrets import UserSecretsClient

# Load OpenAI and Pinecone API keys via Kaggle secrets, to avoid leaking secrets via Jupyter notebook
OPENAI_API_KEY = UserSecretsClient().get_secret("OPENAI_API_KEY")
PINECONE_API_KEY = UserSecretsClient().get_secret("PINECONE_API_KEY")

[0m

In [53]:
# Install requests, beautifulsoup and langchain
! pip install requests
! pip install bs4
! pip install langchain

import requests
from bs4 import BeautifulSoup

[0m

In [54]:
# Helper function to retrieve the summary content from the wiki page. Handles 
# non-uniform HTML - in particular, a case for the office wiki where this content 
# might exist under any of three similar headers: Summary, Plot and Synopsis
def get_summary_paragraphs(soup): 
    possible_summary_headers = ['Summary', 'Plot', 'Synopsis']
    
    paragraphs = []
    
    for h in possible_summary_headers: 
        try : 
            span = soup.select(f'span#{h}')[0]
            paragraphs = span.find_all_next("p")
        except: 
            continue
            #print(f'Could not find header with id: {h}')

    return paragraphs

In [55]:
# Fetches the summary from the supplied wiki page, returning the summary and 
# necessary metadata that will be used by the the file writing function as well
# as data-processing functions that will upload metadata to our Pinecone index
def get_summary_from_wiki_page(season_num: int, url: str):
    html = requests.get(url).text 
    soup = BeautifulSoup(html, features="html.parser")
    
    paragraphs = get_summary_paragraphs(soup)
    
    episode_num = "?"    
    h3_elem = soup.find_all('h3', {'class': 'pi-data-label'})
    for h3 in h3_elem: 
        if h3.get_text().strip() == "Episode": 
            episode_num = h3.find_next("div").get_text()
            # Handle the double-feature case - when a given episode in the wiki 
            # represents two distinct episodes within the series, leading to a value 
            # such as 11/12 or 22/23
            if '/' in episode_num: 
                print(f'Found combo episode number in {episode_num}')
                episode_num = episode_num.replace('/', '_and_')
                print(f'Converted compound episode_num to {episode_num}')
                

    full_summary = ""

    for p in paragraphs: 
        full_summary += p.get_text()

    return season_num, episode_num, slug, full_summary

In [56]:
# Persists a given summary to a specifically named file that can be read later by 
# langchain DocumentLoaders
def write_episode_summary_to_file(season: int, episode: int, slug:str, summary: str): 
    filename = f'{season}_{episode}_{slug}.md' 
    with open(filename, "w") as f: 
        f.write(summary)
        print(f'write_episode_summary_to_file - wrote filename {filename}')

In [57]:
# Define all episodes in a dict of lists to make it easier to iterate through

seasons = {
    1: [
        'https://theoffice.fandom.com/wiki/Pilot',
        'https://theoffice.fandom.com/wiki/Diversity_Day',
        'https://theoffice.fandom.com/wiki/Health_Care', 
        'https://theoffice.fandom.com/wiki/The_Alliance',
        'https://theoffice.fandom.com/wiki/Basketball',
        'https://theoffice.fandom.com/wiki/Hot_Girl'
    ],
    2: [
        'https://theoffice.fandom.com/wiki/The_Dundies',
        'https://theoffice.fandom.com/wiki/Sexual_Harassment',
        'https://theoffice.fandom.com/wiki/Office_Olympics', 
        'https://theoffice.fandom.com/wiki/The_Fire',
        'https://theoffice.fandom.com/wiki/Halloween',
        'https://theoffice.fandom.com/wiki/The_Fight',
        'https://theoffice.fandom.com/wiki/The_Client',
        'https://theoffice.fandom.com/wiki/Performance_Review',
        'https://theoffice.fandom.com/wiki/Email_Surveillance',
        'https://theoffice.fandom.com/wiki/Christmas_Party',
        'https://theoffice.fandom.com/wiki/Booze_Cruise',
        'https://theoffice.fandom.com/wiki/The_Injury',
        'https://theoffice.fandom.com/wiki/The_Secret',
        'https://theoffice.fandom.com/wiki/The_Carpet',
        'https://theoffice.fandom.com/wiki/Boys_and_Girls',
        'https://theoffice.fandom.com/wiki/Valentine%27s_Day',
        'https://theoffice.fandom.com/wiki/Dwight%27s_Speech',
        'https://theoffice.fandom.com/wiki/Take_Your_Daughter_to_Work_Day',
        'https://theoffice.fandom.com/wiki/Michael%27s_Birthday',
        'https://theoffice.fandom.com/wiki/Drug_Testing',
        'https://theoffice.fandom.com/wiki/Conflict_Resolution',
        'https://theoffice.fandom.com/wiki/Casino_Night'
    ], 
    3: [
        'https://theoffice.fandom.com/wiki/Gay_Witch_Hunt',
        'https://theoffice.fandom.com/wiki/The_Convention',
        'https://theoffice.fandom.com/wiki/The_Coup',
        'https://theoffice.fandom.com/wiki/Grief_Counseling',
        'https://theoffice.fandom.com/wiki/Initiation',
        'https://theoffice.fandom.com/wiki/Diwali',
        'https://theoffice.fandom.com/wiki/Branch_Closing',
        'https://theoffice.fandom.com/wiki/The_Merger',
        'https://theoffice.fandom.com/wiki/The_Convict',
        'https://theoffice.fandom.com/wiki/A_Benihana_Christmas',
        'https://theoffice.fandom.com/wiki/Back_From_Vacation',
        'https://theoffice.fandom.com/wiki/Traveling_Salesmen',
        'https://theoffice.fandom.com/wiki/The_Return',
        'https://theoffice.fandom.com/wiki/Ben_Franklin',
        'https://theoffice.fandom.com/wiki/Phyllis%27_Wedding',
        'https://theoffice.fandom.com/wiki/Business_School',
        'https://theoffice.fandom.com/wiki/Cocktails',
        'https://theoffice.fandom.com/wiki/The_Negotiation',
        'https://theoffice.fandom.com/wiki/Safety_Training',
        'https://theoffice.fandom.com/wiki/Product_Recall',
        'https://theoffice.fandom.com/wiki/Women%27s_Appreciation',
        'https://theoffice.fandom.com/wiki/Beach_Games',
        'https://theoffice.fandom.com/wiki/The_Job'
    ], 
    4: [
        'https://theoffice.fandom.com/wiki/Fun_Run',
        'https://theoffice.fandom.com/wiki/Dunder_Mifflin_Infinity',
        'https://theoffice.fandom.com/wiki/Launch_Party',
        'https://theoffice.fandom.com/wiki/Money',
        'https://theoffice.fandom.com/wiki/Local_Ad',
        'https://theoffice.fandom.com/wiki/Branch_Wars',
        'https://theoffice.fandom.com/wiki/Survivor_Man',
        'https://theoffice.fandom.com/wiki/The_Deposition',
        'https://theoffice.fandom.com/wiki/Dinner_Party',
        'https://theoffice.fandom.com/wiki/Chair_Model',
        'https://theoffice.fandom.com/wiki/Night_Out',
        'https://theoffice.fandom.com/wiki/Did_I_Stutter%3F',
        'https://theoffice.fandom.com/wiki/Job_Fair',
        'https://theoffice.fandom.com/wiki/Goodbye,_Toby'
    ], 
    5: [
        'https://theoffice.fandom.com/wiki/Weight_Loss',
        'https://theoffice.fandom.com/wiki/Business_Ethics',
        'https://theoffice.fandom.com/wiki/Baby_Shower',
        'https://theoffice.fandom.com/wiki/Crime_Aid',
        'https://theoffice.fandom.com/wiki/Employee_Transfer',
        'https://theoffice.fandom.com/wiki/Customer_Survey',
        'https://theoffice.fandom.com/wiki/Business_Trip',
        'https://theoffice.fandom.com/wiki/Frame_Toby',
        'https://theoffice.fandom.com/wiki/The_Surplus',
        'https://theoffice.fandom.com/wiki/Moroccan_Christmas',
        'https://theoffice.fandom.com/wiki/The_Duel',
        'https://theoffice.fandom.com/wiki/Prince_Family_Paper',
        'https://theoffice.fandom.com/wiki/Stress_Relief',
        'https://theoffice.fandom.com/wiki/Lecture_Circuit_Part_1',
        'https://theoffice.fandom.com/wiki/Lecture_Circuit_Part_2',
        'https://theoffice.fandom.com/wiki/Blood_Drive',
        'https://theoffice.fandom.com/wiki/Golden_Ticket',
        'https://theoffice.fandom.com/wiki/New_Boss',
        'https://theoffice.fandom.com/wiki/Two_Weeks',
        'https://theoffice.fandom.com/wiki/Dream_Team',
        'https://theoffice.fandom.com/wiki/Michael_Scott_Paper_Company',
        'https://theoffice.fandom.com/wiki/Heavy_Competition',
        'https://theoffice.fandom.com/wiki/Broke',
        'https://theoffice.fandom.com/wiki/Casual_Friday',
        'https://theoffice.fandom.com/wiki/Caf%C3%A9_Disco',
        'https://theoffice.fandom.com/wiki/Company_Picnic'
    ], 
    6: [
        'https://theoffice.fandom.com/wiki/Gossip',
        'https://theoffice.fandom.com/wiki/The_Meeting',
        'https://theoffice.fandom.com/wiki/The_Promotion',
        'https://theoffice.fandom.com/wiki/Niagara',
        'https://theoffice.fandom.com/wiki/Mafia',
        'https://theoffice.fandom.com/wiki/The_Lover',
        'https://theoffice.fandom.com/wiki/Koi_Pond',
        'https://theoffice.fandom.com/wiki/Double_Date',
        'https://theoffice.fandom.com/wiki/Murder',
        'https://theoffice.fandom.com/wiki/Shareholder_Meeting',
        'https://theoffice.fandom.com/wiki/Scott%27s_Tots',
        'https://theoffice.fandom.com/wiki/Secret_Santa',
        'https://theoffice.fandom.com/wiki/The_Banker',
        'https://theoffice.fandom.com/wiki/Sabre',
        'https://theoffice.fandom.com/wiki/Manager_and_Salesman',
        'https://theoffice.fandom.com/wiki/The_Delivery',
        'https://theoffice.fandom.com/wiki/St._Patrick%27s_Day',
        'https://theoffice.fandom.com/wiki/New_Leads',
        'https://theoffice.fandom.com/wiki/Happy_Hour',
        'https://theoffice.fandom.com/wiki/Secretary%27s_Day',
        'https://theoffice.fandom.com/wiki/Body_Language',
        'https://theoffice.fandom.com/wiki/The_Cover-Up',
        'https://theoffice.fandom.com/wiki/The_Chump',
        'https://theoffice.fandom.com/wiki/Whistleblower'
    ],
    7: [
        'https://theoffice.fandom.com/wiki/Nepotism',
        'https://theoffice.fandom.com/wiki/Counseling',
        'https://theoffice.fandom.com/wiki/Andy%27s_Play',
        'https://theoffice.fandom.com/wiki/Sex_Ed',
        'https://theoffice.fandom.com/wiki/The_Sting',
        'https://theoffice.fandom.com/wiki/Costume_Contest',
        'https://theoffice.fandom.com/wiki/Christening',
        'https://theoffice.fandom.com/wiki/Viewing_Party',
        'https://theoffice.fandom.com/wiki/WUPHF.com',
        'https://theoffice.fandom.com/wiki/China',
        'https://theoffice.fandom.com/wiki/Classy_Christmas',
        'https://theoffice.fandom.com/wiki/The_Ultimatum',
        'https://theoffice.fandom.com/wiki/The_Seminar',
        'https://theoffice.fandom.com/wiki/The_Search',
        'https://theoffice.fandom.com/wiki/PDA',
        'https://theoffice.fandom.com/wiki/Threat_Level_Midnight',
        'https://theoffice.fandom.com/wiki/Todd_Packer_(episode)',
        'https://theoffice.fandom.com/wiki/Garage_Sale',
        'https://theoffice.fandom.com/wiki/Training_Day',
        'https://theoffice.fandom.com/wiki/Michael%27s_Last_Dundies',
        'https://theoffice.fandom.com/wiki/Goodbye,_Michael',
        'https://theoffice.fandom.com/wiki/The_Inner_Circle',
        'https://theoffice.fandom.com/wiki/Dwight_K._Schrute,_(Acting)_Manager',
        'https://theoffice.fandom.com/wiki/Search_Committee'
    ],
    8: [
        'https://theoffice.fandom.com/wiki/The_List',
        'https://theoffice.fandom.com/wiki/The_Incentive',
        'https://theoffice.fandom.com/wiki/Lotto',
        'https://theoffice.fandom.com/wiki/Garden_Party',
        'https://theoffice.fandom.com/wiki/Spooked',
        'https://theoffice.fandom.com/wiki/Doomsday',
        'https://theoffice.fandom.com/wiki/Pam%27s_Replacement',
        'https://theoffice.fandom.com/wiki/Gettysburg',
        'https://theoffice.fandom.com/wiki/Mrs._California',
        'https://theoffice.fandom.com/wiki/Christmas_Wishes',
        'https://theoffice.fandom.com/wiki/Trivia',
        'https://theoffice.fandom.com/wiki/Pool_Party',
        'https://theoffice.fandom.com/wiki/Jury_Duty',
        'https://theoffice.fandom.com/wiki/Special_Project',
        'https://theoffice.fandom.com/wiki/Tallahassee',
        'https://theoffice.fandom.com/wiki/After_Hours',
        'https://theoffice.fandom.com/wiki/Test_the_Store',
        'https://theoffice.fandom.com/wiki/Last_Day_in_Florida',
        'https://theoffice.fandom.com/wiki/Get_the_Girl',
        'https://theoffice.fandom.com/wiki/Welcome_Party',
        'https://theoffice.fandom.com/wiki/Angry_Andy',
        'https://theoffice.fandom.com/wiki/Fundraiser',
        'https://theoffice.fandom.com/wiki/Turf_War',
        'https://theoffice.fandom.com/wiki/Free_Family_Portrait_Studio'
    ], 
    9: [
        'https://theoffice.fandom.com/wiki/New_Guys',
        'https://theoffice.fandom.com/wiki/Roy%27s_Wedding',
        'https://theoffice.fandom.com/wiki/Andy%27s_Ancestry',
        'https://theoffice.fandom.com/wiki/Work_Bus',
        'https://theoffice.fandom.com/wiki/Here_Comes_Treble_(Episode)',
        'https://theoffice.fandom.com/wiki/The_Boat',
        'https://theoffice.fandom.com/wiki/The_Whale',
        'https://theoffice.fandom.com/wiki/The_Target',
        'https://theoffice.fandom.com/wiki/Dwight_Christmas',
        'https://theoffice.fandom.com/wiki/Lice',
        'https://theoffice.fandom.com/wiki/Suit_Warehouse',
        'https://theoffice.fandom.com/wiki/Customer_Loyalty',
        'https://theoffice.fandom.com/wiki/Junior_Salesman',
        'https://theoffice.fandom.com/wiki/Vandalism',
        'https://theoffice.fandom.com/wiki/Couples_Discount',
        'https://theoffice.fandom.com/wiki/Moving_On',
        'https://theoffice.fandom.com/wiki/The_Farm',
        'https://theoffice.fandom.com/wiki/Promos',
        'https://theoffice.fandom.com/wiki/Stairmageddon',
        'https://theoffice.fandom.com/wiki/Paper_Airplane',
        'https://theoffice.fandom.com/wiki/Livin%27_the_Dream',
        'https://theoffice.fandom.com/wiki/A.A.R.M.',
        'https://theoffice.fandom.com/wiki/Finale'
    ]
}

In [58]:
# Main data processing loop: iterate through every season and every episode, reading 
# the summary and persisting it to a file that is named with the season and episode 
# numbers for easier future parsing
for season in seasons:
    urls = seasons[season]
    for url in urls: 
        slug = url.split('/')[-1]
        season_num, episode_num, slug, summary = get_summary_from_wiki_page(season, url)

        write_episode_summary_to_file(season_num, episode_num, slug ,summary)

        #print(f'summary for Season: {season_num} episode_num: {episode_num}, a.k.a: {slug} is: {summary}')

write_episode_summary_to_file - wrote filename 1_1_Pilot.md
write_episode_summary_to_file - wrote filename 1_2_Diversity_Day.md
write_episode_summary_to_file - wrote filename 1_3_Health_Care.md
write_episode_summary_to_file - wrote filename 1_4_The_Alliance.md
write_episode_summary_to_file - wrote filename 1_5_Basketball.md
write_episode_summary_to_file - wrote filename 1_6_Hot_Girl.md
write_episode_summary_to_file - wrote filename 2_1_The_Dundies.md
write_episode_summary_to_file - wrote filename 2_2_Sexual_Harassment.md
write_episode_summary_to_file - wrote filename 2_3_Office_Olympics.md
write_episode_summary_to_file - wrote filename 2_4_The_Fire.md
write_episode_summary_to_file - wrote filename 2_5_Halloween.md
write_episode_summary_to_file - wrote filename 2_6_The_Fight.md
write_episode_summary_to_file - wrote filename 2_7_The_Client.md
write_episode_summary_to_file - wrote filename 2_8_Performance_Review.md
write_episode_summary_to_file - wrote filename 2_9_Email_Surveillance.md
w

In [59]:
# Load langchain DocumentLoaders and use them to read all the data from the local 
# directory where we just wrote all the episode summaries
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

loader = DirectoryLoader('../', glob="**/*.md", loader_cls=TextLoader)

data = loader.load()

for doc in data: 
    print(f'doc: {doc.metadata}')
    print(f'doc: {doc}')

doc: {'source': '../working/3_19_The_Negotiation.md'}
doc: page_content='The episode opens with Roy waiting outside the building, presumably for Jim. Inside, Jim is talking to Karen on what they should do for the night. An infuriated Roy makes his way up to the office, first to speak to Pam until he sees Jim with Karen. He calls Jim by his last name, and advances on him with his fist raised. Pam screams at Roy to stop and Jim pushes Karen out of the way. As soon as Roy gets his hands on Jim, Dwight sprays large amounts of pepper spray at Roy (it spreads throughout the front of the office as well). In an interview, Dwight is seen crying, from the pepper spray, explaining that everybody laughed at him for bringing pepper spray in to the office. "Who\'s laughing now?" he asks smugly.\nA week later, Michael and Toby are talking to Jan over the speaker phone, and it is revealed that Roy has been fired, and that Darryl now wants a raise. In separate interviews, Pam says that the attack on Ji

In [60]:
# Helper functions to go from filename to metadata that can be used when upserting
# our summary content. From each filename we can parse the season and episode 
# numbers as well as the human-legible title
def extract_title_from_file(target: str): 
    filename = target.split('/')[-1]
    title_comps = filename.split('_')
    title_comps = [x.replace('.md', '') for x in title_comps]
    if 'and' in title_comps: 
        return ' '.join(title_comps[4:])
    else: 
        return ' '.join(title_comps[2:])

def extract_season_from_file(target: str):
    filename = target.split('/')[-1]
    return filename.split('_')[0]

def extract_episode_from_file(target: str):
    filename = target.split('/')[-1]
    comps = filename.split('_')
    comps = [x.replace('.md', '') for x in comps]
    if 'and' in comps: 
        return f'{comps[1]}and{comps[3]}'
    else: 
        return comps[1]
    

In [61]:
# Install and configure tiktoken for token length counting. This function will be used
# by our text splitting routine a bit below
! pip install tiktoken

import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)


[0m

In [62]:
# Install Pinecone and check if the target index already exists. If it does not, then
# create it. Otherwise, this cell would be a no-op, if the index exists
import pinecone
import sys

index_name = 'the-office-oracle'

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment='us-west4-gcp-free'
)

if index_name not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=1536  # 1536 dim of text-embedding-ada-002
    )

In [63]:
# Configure the text_splitter for chunking our saved content into correctly shaped
# blocks that can converted into embeddings and upserted into our index
# Note the use of the tikentoken_len function we defined above
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [64]:
# Configure OpenAI's text embedding model, text-embedding-ada-002
# Load our OPENAI_API_KEY securely from the environment
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

In [65]:
# A nice sanity check to run after setting up your Pinecone vector DB 
# You can run this cell at any point to get back the number of dimensions configured
# for your Pinecone index, as well as the number of vectors it currently contains
index = pinecone.Index(index_name)

print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [66]:
# Our main upserting loop: for every document in the data list we created earlier, 
# loop through the document and format its metadata as a dictionary. Split its 
# source text using our text_splitter configured above, and convert each chunk 
# into an embedding. Upsert each embedding to our Pinecone index, effectively creating
# the recent-context-enhanced "brain" for our AI chatbot 
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 100

texts = []
metadatas = []

for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        'source': record.metadata["source"],
        'title': extract_title_from_file(record.metadata["source"]),
        'season': extract_season_from_file(record.metadata["source"]), 
        'episode': extract_episode_from_file(record.metadata["source"])
    }

     # now we create chunks from the record text
    record_texts = text_splitter.split_text(record.page_content)
    # create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        texts = []
        metadatas = []

if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embed.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))

  0%|          | 0/188 [00:00<?, ?it/s]