## CSV data into embedding

#### imports

In [1]:
import ast
import os
import random
import time

import numpy as np
import openai
import pandas as pd
import tiktoken

from dotenv import load_dotenv
from openai.embeddings_utils import cosine_similarity, get_embedding

In [2]:
# Load settings from a local .env file (if present) into the process environment.
load_dotenv()
# os.getenv returns None for any variable that is not set, so a missing
# setting only surfaces later, when the first API call is made.
OPENAI_API_KEY                       = os.getenv("OPENAI_API_KEY") 
OPENAI_DEPLOYMENT_ENDPOINT           = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME               = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME                    = os.getenv("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION            = os.getenv("OPENAI_DEPLOYMENT_VERSION")

# Embedding deployment: the deployment name is the `engine` passed to
# get_embedding in the cells below.
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME      = os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME")

# Read here but not referenced by any other cell visible in this notebook.
OPENAI_DAVINCI_DEPLOYMENT_NAME       = os.getenv("OPENAI_DAVINCI_DEPLOYMENT_NAME")
OPENAI_DAVINCI_MODEL_NAME            = os.getenv("OPENAI_DAVINCI_MODEL_NAME")

# Configure OpenAI API
# These are module-level settings on the openai package, so every later
# call made through it (including get_embedding) uses them.
openai.api_type    = "azure"
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_base    = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_key     = OPENAI_API_KEY

#### Utility class for progress bar display purpose

In [3]:
# A singleton counter used only to drive the progress display.

class Singleton:
    """Process-wide counter: every construction returns the same instance.

    Constructor arguments are accepted and ignored, so ``Singleton()`` and
    ``Singleton(anything)`` both hand back the one shared object.
    """

    _instance = None
    _counter = 0

    def __new__(cls, *args, **kwargs):
        # Compare against None explicitly rather than relying on the
        # truthiness of the stored instance.
        if cls._instance is None:
            # object.__new__ takes only the class; forwarding extra
            # arguments to it raises TypeError.
            cls._instance = super().__new__(cls)
        return cls._instance

    def inc(self):
        """Increment the counter by one and return the new value."""
        # The first call reads the class-level 0 and stores the running
        # total on the (single) instance.
        self._counter += 1
        return self._counter

# Shared counter instance; rate_limited_get_embedding calls s1.inc() once
# per embedded row to decide when to print a progress line.
s1 = Singleton()


#### Read CSV using pandas

In [4]:
# Relative path: resolved against the notebook's current working directory.
input_datapath = "./data/IMDB-Movie-Data.csv"  
# Raw movie table; later cells add the `All`, `n_tokens` and `embedding` columns to it.
df = pd.read_csv(input_datapath)

In [9]:
# NOTE(review): the saved output below already contains the `All` and
# `n_tokens` columns, which are only created in later cells, and this cell's
# execution count is out of sequence. It appears to have been run after those
# cells; confirm with Restart Kernel -> Run All.
df

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,All,n_tokens
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,"Movie title: Guardians of the Galaxy, Movie G...",46
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,"Movie title: Prometheus, Movie Genre: Adventu...",45
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,"Movie title: Split, Movie Genre: Horror,Thril...",49
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,"Movie title: Sing, Movie Genre: Animation,Com...",63
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,"Movie title: Suicide Squad, Movie Genre: Acti...",51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,Secret in Their Eyes,"Crime,Drama,Mystery","A tight-knit team of rising investigators, alo...",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,27585,,45.0,"Movie title: Secret in Their Eyes, Movie Genr...",53
996,997,Hostel: Part II,Horror,Three American college students studying abroa...,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,73152,17.54,46.0,"Movie title: Hostel: Part II, Movie Genre: Ho...",40
997,998,Step Up 2: The Streets,"Drama,Music,Romance",Romantic sparks occur between two dance studen...,Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,70699,58.01,50.0,"Movie title: Step Up 2: The Streets, Movie Ge...",42
998,999,Search Party,"Adventure,Comedy",A pair of friends embark on a mission to reuni...,Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,4881,,22.0,"Movie title: Search Party, Movie Genre: Adven...",39


#### Create a column with the text prepared for OpenAI

In [6]:
# Text that gets embedded for each movie: title, genre and description,
# each introduced by a short label. The other columns (Director, Actors,
# Year, Runtime, Rating, Revenue) are currently not part of this text.
df['All'] = (
    " Movie title: " + df['Title']
    + ", Movie Genre: " + df['Genre']
    + ", Movie description: " + df['Description']
)


### Using OpenAI model ada for embeddings

OpenAI offers one second-generation embedding model (denoted by -002 in the model ID) and 16 first-generation models (denoted by -001 in the model ID).
We recommend using text-embedding-ada-002 for nearly all use cases. It’s better, cheaper, and simpler to use. 

#### Initialize Embeddings model 

In [7]:
# embedding model parameters
# tokenizer encoding used by text-embedding-ada-002
embedding_encoding = "cl100k_base"  
# the maximum for text-embedding-ada-002 is 8191; 8000 stays below that limit
max_tokens = 8000 
# maximum number of rows (movies) to embed
top_n = 1000
# tokenizer object, used below to count the tokens of each row's text
encoding = tiktoken.get_encoding(embedding_encoding)

#### Count number of tokens in each row

In [8]:
# Token count of the text to embed, one value per movie.
df["n_tokens"] = df["All"].apply(lambda text: len(encoding.encode(text)))
# Drop rows whose text exceeds max_tokens, then keep at most the last top_n rows.
df = df.loc[df["n_tokens"] <= max_tokens].tail(top_n)

#### Create embeddings

In [10]:
def rate_limited_get_embedding(x, engine):
    """Embed one piece of text, printing progress and pausing briefly first.

    Args:
        x: the text handed to ``get_embedding``.
        engine: the engine / deployment name handed to ``get_embedding``.

    Returns:
        Whatever ``get_embedding`` returns for ``x``.

    Raises:
        Any exception raised by ``get_embedding``; it is printed and then
        re-raised unchanged.

    Relies on the module-level counter ``s1`` for the progress display.
    """
    # Progress display only: one dot per call, and a running total that
    # also ends the line after every 80th call.
    print('.', end='', flush=True)
    counter = s1.inc()
    end_of_row = counter % 80 == 0
    if end_of_row:
        print(f' [{counter=}]')

    # Tiny pause between requests; raise it if real rate limiting is needed.
    time.sleep(0.001)

    try:
        embedding = get_embedding(x, engine)
    except Exception as e:
        print(f"Error: {e}")
        raise
    else:
        return embedding

# One dot is printed per embedded row, so the number of dots to expect is the
# row count. len(df) gives that directly, independent of nulls in any column
# and without positional integer indexing into a label-indexed Series.
print(f"The embedding will complete after {len(df)} dots")
# One API call per row; the result is stored alongside the source text.
df["embedding"] = df.All.apply(lambda x: rate_limited_get_embedding(x, OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME))
# Persist the embeddings so they can be reloaded without calling the API again.
# NOTE: the index is written as an unnamed first column, which shows up as
# `Unnamed: 0` when this file is read back with pd.read_csv.
df.to_csv("./data/imdb_movies_with_embeddings.csv")


The embedding will complete after 1000 dots
................................................................................ [counter=80]
................................................................................ [counter=160]
................................................................................ [counter=240]
................................................................................ [counter=320]
................................................................................ [counter=400]
................................................................................ [counter=480]
................................................................................ [counter=560]
................................................................................ [counter=640]
................................................................................ [counter=720]
................................................................................ [counter=800]
.......

#### Read embeddings

In [9]:
datafile_path = "./data/imdb_movies_with_embeddings.csv"
df = pd.read_csv(datafile_path)

# Each embedding was saved as the text form of a list of floats. Parse it with
# ast.literal_eval, which only accepts Python literals, instead of eval, which
# would execute arbitrary code found in the file. Then convert to a numpy array.
df["embedding"] = df.embedding.apply(ast.literal_eval).apply(np.array)

#### Define a function to ask a question and print the most similar movies (embedding search; no GPT-3.5 call is made)

In [11]:
def ask_question(question, n=1):
    """Print the ``n`` movies whose embedded text is closest to ``question``.

    The question is embedded with the same deployment used for the movies,
    every row of the global ``df`` is scored by cosine similarity against it,
    and the ``All`` text of the best-scoring rows is printed.

    Args:
        question: free-text query.
        n: number of matches to print (default 1).

    Returns:
        None. The result is printed, not returned.

    Side effects:
        Writes or overwrites the ``similarity`` column of the global ``df``.
    """
    query_vector = get_embedding(question, engine=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME)

    def _score(row_vector):
        # Similarity between one stored movie embedding and the query.
        return cosine_similarity(row_vector, query_vector)

    df["similarity"] = df.embedding.apply(_score)

    # Best matches first, then keep the top n.
    ranked = df.sort_values("similarity", ascending=False)
    best_matches = ranked.head(n)

    # With n >= 2 the matches are joined into a single multi-line string.
    answer = ' \n'.join(best_matches.All.tolist())
    print("Answer:\n", answer)

#### Find a movie about kidnapping

In [12]:
# Single-keyword query; prints the three closest movies by cosine similarity.
ask_question("kidnapping", n=3)

Answer:
  Movie title: Taken, Movie Genre: Action,Thriller, Movie description: A retired CIA agent travels across Europe and relies on his old skills to save his estranged daughter, who has been kidnapped while on a trip to Paris. 
 Movie title: 31, Movie Genre: Horror,Thriller, Movie description: Five carnival workers are kidnapped and held hostage in an abandoned, Hell-like compound where they are forced to participate in a violent game, the goal of which is to survive twelve hours against a gang of sadistic clowns. 
 Movie title: Gone Baby Gone, Movie Genre: Crime,Drama,Mystery, Movie description: Two Boston area detectives investigate a little girl's kidnapping, which ultimately turns into a crisis both professionally and personally.


#### Find a movie set in space

In [13]:
# Short phrase query; prints the five closest movies.
ask_question("a movie in space", n=5)

Answer:
  Movie title: Gravity, Movie Genre: Drama,Sci-Fi,Thriller, Movie description: Two astronauts work together to survive after an accident which leaves them alone in space. 
 Movie title: Interstellar, Movie Genre: Adventure,Drama,Sci-Fi, Movie description: A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival. 
 Movie title: Moon, Movie Genre: Drama,Mystery,Sci-Fi, Movie description: Astronaut Sam Bell has a quintessentially personal encounter toward the end of his three-year stint on the Moon, where he, working alongside his computer, GERTY, sends back to Earth parcels of a resource that has helped diminish our planet's power problems. 
 Movie title: The Martian, Movie Genre: Adventure,Drama,Sci-Fi, Movie description: An astronaut becomes stranded on Mars after his team assume him dead, and must rely on his ingenuity to find a way to signal to Earth that he is alive. 
 Movie title: Pandorum, Movie Genre: Action,Horror,Mystery, Movie 

In [14]:
# Query combining a theme and a place; prints the five closest movies.
ask_question("romance in new york", n=5)

Answer:
  Movie title: Brooklyn, Movie Genre: Drama,Romance, Movie description: An Irish immigrant lands in 1950s Brooklyn, where she quickly falls into a romance with a local. When her past catches up with her, however, she must choose between two countries and the lives that exist within. 
 Movie title: Sex and the City, Movie Genre: Comedy,Drama,Romance, Movie description: A New York writer on sex and love is finally getting married to her Mr. Big. But her three best girlfriends must console her after one of them inadvertently leads Mr. Big to jilt her. 
 Movie title: Before We Go, Movie Genre: Comedy,Drama,Romance, Movie description: Two strangers stuck in Manhattan for the night grow into each other's most trusted confidants when an evening of unexpected adventure forces them to confront their fears and take control of their lives. 
 Movie title: How to Be Single, Movie Genre: Comedy,Romance, Movie description: A group of young adults navigate love and relationships in New York Ci

In [15]:
# Looser thematic query; prints the five closest movies.
ask_question("young people and love", n=5)


Answer:
  Movie title: Endless Love, Movie Genre: Drama,Romance, Movie description: The story of a privileged girl and a charismatic boy whose instant desire sparks a love affair made only more reckless by parents trying to keep them apart. 
 Movie title: Hounds of Love, Movie Genre: Crime,Drama,Horror, Movie description: A cold-blooded predatory couple while cruising the streets in search of their next victim, will stumble upon a 17-year-old high school girl, who will be sedated, abducted and chained in the strangers' guest room. 
 Movie title: Youth, Movie Genre: Comedy,Drama,Music, Movie description: A retired orchestra conductor is on holiday with his daughter and his film director best friend in the Alps when he receives an invitation from Queen Elizabeth II to perform for Prince Philip's birthday. 
 Movie title: Lovesong, Movie Genre: Drama, Movie description: The relationship between two friends deepens during an impromptu road trip. 
 Movie title: Love & Other Drugs, Movie Genr

In [16]:
# Phrase query; prints the three closest movies.
ask_question("a journey to america", n=3)

Answer:
  Movie title: American Honey, Movie Genre: Drama, Movie description: A teenage girl with nothing to lose joins a traveling magazine sales crew, and gets caught up in a whirlwind of hard partying, law bending and young love as she criss-crosses the Midwest with a band of misfits. 
 Movie title: Into the Wild, Movie Genre: Adventure,Biography,Drama, Movie description: After graduating from Emory University, top student and athlete Christopher McCandless abandons his possessions, gives his entire $24,000 savings account to charity and hitchhikes to Alaska to live in the wilderness. Along the way, Christopher encounters a series of characters that shape his life. 
 Movie title: Idiocracy, Movie Genre: Adventure,Comedy,Sci-Fi, Movie description: Private Joe Bauers, the definition of "average American", is selected by the Pentagon to be the guinea pig for a top-secret hibernation program. Forgotten, he awakes five centuries in the future. He discovers a society so incredibly dumbed 

In [30]:
# The `similarity` column shown here holds the scores from the most recent
# ask_question call, which writes that column onto the global df.
df

Unnamed: 0.1,Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,All,n_tokens,embedding,similarity
0,0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,"Movie title: Guardians of the Galaxy, Movie G...",69,"[0.0034464721102267504, -0.034149687737226486,...",0.724889
1,1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,"Movie title: Prometheus, Movie Genre: Adventu...",75,"[0.021496623754501343, -0.05055924132466316, -...",0.708119
2,2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,"Movie title: Split, Movie Genre: Horror,Thril...",82,"[-0.005913888104259968, -0.012753144837915897,...",0.706143
3,3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,"Movie title: Sing, Movie Genre: Animation,Com...",97,"[0.005802715662866831, -0.032581549137830734, ...",0.715434
4,4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,"Movie title: Suicide Squad, Movie Genre: Acti...",76,"[-0.01627175137400627, -0.03590480238199234, -...",0.718452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,996,Secret in Their Eyes,"Crime,Drama,Mystery","A tight-knit team of rising investigators, alo...",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,27585,,45.0,"Movie title: Secret in Their Eyes, Movie Genr...",80,"[0.012722349725663662, -0.020892666652798653, ...",0.711059
996,996,997,Hostel: Part II,Horror,Three American college students studying abroa...,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,73152,17.54,46.0,"Movie title: Hostel: Part II, Movie Genre: Ho...",64,"[0.009888299740850925, -0.01366265770047903, -...",0.720850
997,997,998,Step Up 2: The Streets,"Drama,Music,Romance",Romantic sparks occur between two dance studen...,Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,70699,58.01,50.0,"Movie title: Step Up 2: The Streets, Movie Ge...",72,"[0.004272955935448408, -0.012731133960187435, ...",0.723805
998,998,999,Search Party,"Adventure,Comedy",A pair of friends embark on a mission to reuni...,Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,4881,,22.0,"Movie title: Search Party, Movie Genre: Adven...",66,"[-0.0004308541538193822, -0.013530106283724308...",0.722942
