In [278]:
!pip install tiktoken
!pip install openai



### Data Cleaning

In [279]:
import pandas as pd
import tiktoken
from openai import OpenAI
import numpy as np
from typing import List


In [280]:
# The API key is taken from the OPENAI_API_KEY environment variable (the client's
# default lookup when api_key is omitted) so no credential is stored in the notebook.
client = OpenAI(max_retries=5)


In [281]:
# NOTE(review): get_embedding() further down hard-codes the same model name as its
# default instead of reading this variable.
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"  # encoding name later passed to tiktoken.get_encoding
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

In [282]:
# Read the raw CSV (relative path, resolved against the working directory).
input_datapath = "imdb_top_1000.csv"
df = pd.read_csv(input_datapath)

# Merge the three star columns into a single comma-separated "Cast" field.
star_columns = [df[name].str.strip() for name in ("Star1", "Star2", "Star3")]
df["Cast"] = star_columns[0] + ", " + star_columns[1] + ", " + star_columns[2]

# Keep only the fields used downstream and drop any row with a missing value.
keep_columns = [
    "Series_Title",
    "Released_Year",
    "Overview",
    "Runtime",
    "Genre",
    "IMDB_Rating",
    "Director",
    "Cast",
]
df = df[keep_columns].dropna()
df.head()

Unnamed: 0,Series_Title,Released_Year,Overview,Runtime,Genre,IMDB_Rating,Director,Cast
0,The Shawshank Redemption,1994,Two imprisoned men bond over a number of years...,142 min,Drama,9.3,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton"
1,The Godfather,1972,An organized crime dynasty's aging patriarch t...,175 min,"Crime, Drama",9.2,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan"
2,The Dark Knight,2008,When the menace known as the Joker wreaks havo...,152 min,"Action, Crime, Drama",9.0,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart"
3,The Godfather: Part II,1974,The early life and career of Vito Corleone in ...,202 min,"Crime, Drama",9.0,Francis Ford Coppola,"Al Pacino, Robert De Niro, Robert Duvall"
4,12 Angry Men,1957,A jury holdout attempts to prevent a miscarria...,96 min,"Crime, Drama",9.0,Sidney Lumet,"Henry Fonda, Lee J. Cobb, Martin Balsam"


In [283]:
# Expose the movie name under the shorter "Title" label.
df = df.rename(columns={"Series_Title": "Title"})

df.head()

Unnamed: 0,Title,Released_Year,Overview,Runtime,Genre,IMDB_Rating,Director,Cast
0,The Shawshank Redemption,1994,Two imprisoned men bond over a number of years...,142 min,Drama,9.3,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton"
1,The Godfather,1972,An organized crime dynasty's aging patriarch t...,175 min,"Crime, Drama",9.2,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan"
2,The Dark Knight,2008,When the menace known as the Joker wreaks havo...,152 min,"Action, Crime, Drama",9.0,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart"
3,The Godfather: Part II,1974,The early life and career of Vito Corleone in ...,202 min,"Crime, Drama",9.0,Francis Ford Coppola,"Al Pacino, Robert De Niro, Robert Duvall"
4,12 Angry Men,1957,A jury holdout attempts to prevent a miscarria...,96 min,"Crime, Drama",9.0,Sidney Lumet,"Henry Fonda, Lee J. Cobb, Martin Balsam"


In [284]:
# Tokenizer used to measure the length of each overview.
encoding = tiktoken.get_encoding(embedding_encoding)


def _count_tokens(text):
    """Return how many tokens `encoding` produces for `text`."""
    return len(encoding.encode(text))


# Record the token count per overview, then keep only rows within the embedding limit.
df["n_tokens"] = df["Overview"].apply(_count_tokens)
df = df.loc[df["n_tokens"] <= max_tokens]
len(df)

1000

In [285]:
# Display the filtered frame (pandas elides the middle rows) to confirm the n_tokens column.
df

Unnamed: 0,Title,Released_Year,Overview,Runtime,Genre,IMDB_Rating,Director,Cast,n_tokens
0,The Shawshank Redemption,1994,Two imprisoned men bond over a number of years...,142 min,Drama,9.3,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton",22
1,The Godfather,1972,An organized crime dynasty's aging patriarch t...,175 min,"Crime, Drama",9.2,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan",19
2,The Dark Knight,2008,When the menace known as the Joker wreaks havo...,152 min,"Action, Crime, Drama",9.0,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart",36
3,The Godfather: Part II,1974,The early life and career of Vito Corleone in ...,202 min,"Crime, Drama",9.0,Francis Ford Coppola,"Al Pacino, Robert De Niro, Robert Duvall",41
4,12 Angry Men,1957,A jury holdout attempts to prevent a miscarria...,96 min,"Crime, Drama",9.0,Sidney Lumet,"Henry Fonda, Lee J. Cobb, Martin Balsam",21
...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,A young New York socialite becomes interested ...,115 min,"Comedy, Drama, Romance",7.6,Blake Edwards,"Audrey Hepburn, George Peppard, Patricia Neal",30
996,Giant,1956,Sprawling epic covering the life of a Texas ca...,201 min,"Drama, Western",7.6,George Stevens,"Elizabeth Taylor, Rock Hudson, James Dean",19
997,From Here to Eternity,1953,"In Hawaii in 1941, a private is cruelly punish...",118 min,"Drama, Romance, War",7.6,Fred Zinnemann,"Burt Lancaster, Montgomery Clift, Deborah Kerr",36
998,Lifeboat,1944,Several survivors of a torpedoed merchant ship...,97 min,"Drama, War",7.6,Alfred Hitchcock,"Tallulah Bankhead, John Hodiak, Walter Slezak",35


In [286]:
# Persist the cleaned table without the index column; the embedding section
# reloads this same file ("imdb_cleaned.csv").
output_path = './imdb_cleaned.csv'
df.to_csv(output_path, index=False)

### Generate Embedding

In [287]:
!pip install langchain-community



In [288]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_community.document_loaders.csv_loader import CSVLoader

In [289]:
# Reload the cleaned CSV through LangChain's CSVLoader.
input_datapath = "imdb_cleaned.csv"
loader = CSVLoader(file_path=input_datapath)
# NOTE(review): `df` is rebound to the loader's output here and is no longer the
# pandas DataFrame from the cleaning section. Downstream code enumerates it and
# reads `.page_content` from each item — presumably one document per CSV row; confirm.
df = loader.load()

In [290]:
def get_embedding(text, model="text-embedding-3-small"):
    """Request embeddings for ``text`` and return them as a NumPy array.

    Args:
        text: Value forwarded unchanged as ``input`` to ``client.embeddings.create``.
        model: Name of the embedding model to request.

    Returns:
        ``np.ndarray`` built from the ``embedding`` attribute of every item in the
        response's ``data``, in response order.
    """
    response = client.embeddings.create(input=text, model=model)
    vectors = [item.embedding for item in response.data]
    return np.array(vectors)


# Compute Embedding from the imdb movie dataset
def compute_embeddings_for_document(doc, doc_index):
    """Embed one document's text and package the result as a DataFrame.

    Args:
        doc: Object exposing its text as ``page_content``.
        doc_index: Position of the document; used in the error message and in the
            ``page`` label (``document_<doc_index>``).

    Returns:
        A DataFrame with columns ``chunk_id``, ``chunk``, ``embedding`` and ``page``,
        or ``None`` when the text is empty or the embedding call raises.
    """
    text = doc.page_content
    if not len(text):
        return None

    # The whole text is sent as a single-item batch.
    try:
        vectors = get_embedding([text])
    except Exception as e:
        print(f"Error computing embeddings for document {doc_index}: {e}")
        return None

    # One row per returned vector; the scalar text and page label repeat on every row.
    n_vectors = vectors.shape[0]
    columns = {
        "chunk_id": list(range(n_vectors)),
        "chunk": text,
        "embedding": vectors.tolist(),
        "page": f"document_{doc_index}",
    }
    return pd.DataFrame(columns)

def compute_embeddings_from_documents(documents, cache_file, max_workers=4):
    """Embed every document concurrently and write the results as JSON lines.

    Each document is passed to ``compute_embeddings_for_document`` on a thread pool.
    Documents that yield ``None`` or raise are skipped (a raised error is printed).
    The surviving frames are written to ``cache_file`` in document order, one JSON
    record per line, replacing any previous content of that file.

    Args:
        documents: Iterable of documents accepted by ``compute_embeddings_for_document``.
        cache_file: Path of the JSON-lines file to (re)create.
        max_workers: Number of worker threads issuing embedding requests.

    Returns:
        None.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(compute_embeddings_for_document, doc, i)
            for i, doc in enumerate(documents)
        ]

        # Collect in submission order rather than completion order so the output
        # file is deterministic; the requests themselves still run concurrently.
        results = []
        for doc_index, future in enumerate(futures):
            try:
                result = future.result()
            except Exception as e:
                print(f"Error processing document {doc_index}: {e}")
                continue
            if result is not None:
                results.append(result)

    # Mode "w" truncates any stale file in one step (no separate delete + append).
    with open(cache_file, "w", encoding="utf-8") as f:
        for result in results:
            # Depending on the pandas version, to_json(lines=True) may or may not end
            # with a newline; normalise so every record line is terminated exactly once.
            payload = result.to_json(orient="records", lines=True).rstrip("\n")
            if payload:
                f.write(payload)
                f.write("\n")

In [268]:
# Embed every loaded document and (re)write embeddings.json; this calls the embeddings API
# once per document.
compute_embeddings_from_documents(df, "embeddings.json")

### Generate Answer

In [269]:
!pip install scipy



In [270]:
from scipy import spatial  # for calculating vector similarities for search

In [271]:
# Load the JSON-lines file written by compute_embeddings_from_documents.
# "records" is the documented orient name; the previous "record" spelling is not
# one of the values read_json documents.
df = pd.read_json("embeddings.json", orient="records", lines=True)
# Stack the per-row embedding lists into one array (2-D, assuming every embedding
# has the same length — TODO confirm).
embeddings = np.array(df["embedding"].tolist())

In [272]:
# search function
EMBEDDING_MODEL = "text-embedding-3-small"  # embeds the user's query in strings_ranked_by_relatedness
GPT_MODEL = "gpt-4"  # default model for ask() and for the tokenizer lookup in num_tokens()

def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[tuple[str, ...], tuple[float, ...]]:
    """Rank the stored text chunks by similarity to ``query``.

    Args:
        query: Free-text question to embed with ``EMBEDDING_MODEL``.
        df: Table with a ``chunk`` column (text) and an ``embedding`` column (vector).
        relatedness_fn: Callable ``(query_embedding, row_embedding) -> score``; higher
            means more related. Defaults to cosine similarity.
        top_n: Maximum number of results to return.

    Returns:
        Two parallel tuples ``(strings, relatednesses)`` sorted from most to least
        related and truncated to ``top_n``. Both are empty when ``df`` has no rows.
    """
    # Nothing to rank: return early so the zip(*...) unpacking below never sees an
    # empty list (which raises ValueError), and no embedding request is made.
    if len(df) == 0:
        return (), ()

    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding

    # Walk the two columns directly instead of materialising a Series per row.
    scored = [
        (chunk, relatedness_fn(query_embedding, embedding))
        for chunk, embedding in zip(df["chunk"], df["embedding"])
    ]
    # list.sort is stable, so ties keep their original row order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    strings, relatednesses = zip(*scored)
    return strings[:top_n], relatednesses[:top_n]

In [273]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens in ``text`` using the encoding tiktoken associates with ``model``."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the user prompt: instructions, the most relevant movie records, then the question.

    Records are taken in relatedness order and appended until adding the next one
    would push the prompt (including the question) past ``token_budget``.

    Args:
        query: The user's question.
        df: Table of chunks and embeddings passed to ``strings_ranked_by_relatedness``.
        model: Model name forwarded to ``num_tokens`` for token counting.
        token_budget: Maximum number of tokens the returned message may occupy.

    Returns:
        The assembled prompt text ending with ``"\\n\\nQuestion: <query>"``.
    """
    strings, _ = strings_ranked_by_relatedness(query, df)
    # The dataset is the top-1000 list (imdb_top_1000.csv), so the prompt says so.
    introduction = """
      You will be given the dataset from Top 1000 IMDB Movies.
      Use the below movies information to answer the subsequent question.
      If the answer cannot be found in the data given, write "I could not find an answer."
      The answer must be given based on relevance from the user questions!
    """
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        # A blank line between records stops the last field of one movie from
        # fusing with the first field of the next.
        candidate = f"{message}\n\n{string}"
        if num_tokens(candidate + question, model=model) > token_budget:
            break
        message = candidate
    return message + question

In [274]:
def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer ``query`` with the chat model, grounded in records retrieved from ``df``.

    Args:
        query: The user's question.
        df: Table of chunks and embeddings to search. The default is whatever the
            module-level ``df`` was bound to when this function was defined.
        model: Chat model name; also used for token counting while building the prompt.
        token_budget: Token limit handed to ``query_message`` for the prompt.
        print_message: When true, print the assembled prompt before sending it.

    Returns:
        The content of the first choice in the chat completion response.
    """
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)

    system_turn = {
        "role": "system",
        "content": "You answer questions about the top rated movie from IMDB.",
    }
    user_turn = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model=model,
        messages=[system_turn, user_turn],
        temperature=0,
    )
    return completion.choices[0].message.content

In [294]:
# Example: a thematic question that also asks for the rating, plot and cast.
ask('Which movie that about the mob with highest rating? please describe the movie and the cast')

'The highest-rated movie about the mob is "The Godfather" with an IMDB rating of 9.2. The movie was released in 1972 and is about an organized crime dynasty\'s aging patriarch who transfers control of his clandestine empire to his reluctant son. The movie was directed by Francis Ford Coppola and the main cast includes Marlon Brando, Al Pacino, and James Caan.'

In [295]:
# Example: lookup by cast member.
ask('Which movie that have andrew garfield in it?')

'Andrew Garfield appears in the movies "The Social Network" and "Hacksaw Ridge".'

In [296]:
# Example: lookup by (partial) title.
ask("What is shawshank redemption?")

'"Shawshank Redemption" is a movie released in 1994. It is a drama directed by Frank Darabont. The movie is about two imprisoned men who bond over a number of years, finding solace and eventual redemption through acts of common decency. The main cast includes Tim Robbins, Morgan Freeman, and Bob Gunton. The movie has a runtime of 142 minutes and an IMDB rating of 9.3.'