In [None]:
!pip install -qU \
    datasets==2.14.5 \
    pinecone-client[grpc]

In [None]:
# Mount Google Drive; the data files read later live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Show the entries of the current working directory (e.g. to check that
# the 'drive' mount point is there).
os.listdir()

['.config', 'drive', 'sample_data']

In [None]:
# Download the spaCy pipeline that is loaded later with spacy.load("en_core_web_md").
!python -m spacy download en_core_web_md

2023-12-04 20:53:15.408389: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-04 20:53:15.408455: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-04 20:53:15.408502: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now l

In [None]:
import nltk
# Fetch NLTK resources; 'stopwords' backs the nltk.corpus.stopwords import below.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import unicodedata
import string
from gensim.parsing.preprocessing import remove_stopwords
import spacy
# Load the spaCy pipeline once and keep a module-level handle on its tokenizer,
# which spacy_tokenize() uses.
nlp = spacy.load("en_core_web_md")
tokenizer = nlp.tokenizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Data importing and cleaning

In [None]:
import pandas as pd
import re
import ast
import numpy as np

# Folder on the mounted Drive that holds the three tab-separated data files.
# Defined once so the location only has to be changed in one place.
DATA_DIR = '/content/drive/MyDrive/NLP Project/MovieSummaries'

# Plot summaries: one (Wikipedia_movie_ID, summary) row per movie, no header row.
# The separator is written as the '\t' escape so it cannot be silently turned
# into spaces by an editor or an export step.
df = pd.read_csv(f'{DATA_DIR}/plot_summaries.txt', sep='\t', header=None)
df.columns = ['Wikipedia_movie_ID', 'summary']

# Movie-level metadata, no header row.
# NOTE(review): 'relase_date' is a typo for "release_date"; it is kept because
# later cells reference the column under this exact name.
# NOTE(review): the labels given to the columns between 'relase_date' and
# 'genres' are not used below - confirm them against the dataset's README
# before relying on them.
meta_data = pd.read_csv(f'{DATA_DIR}/movie.metadata.tsv', sep='\t', header=None)
meta_data.columns = ['Wikipedia_movie_ID', 'Freebase_movie_ID', 'movie_name',
                     'relase_date', 'box_office_revenue', 'language', 'countries',
                     'country', 'genres']

# Attach release date, title and genres to every summary. This is an inner join
# (pandas' default), so summaries without a metadata row are dropped.
df = pd.merge(
    df,
    meta_data[['Wikipedia_movie_ID', 'relase_date', 'movie_name', 'genres']],
    on='Wikipedia_movie_ID',
)

# Character/actor table, no header row.
actors_info = pd.read_csv(f'{DATA_DIR}/character.metadata.tsv', sep='\t', header=None)
actors_info.columns = ["Wikipedia_movie_ID", "Freebase Movie ID", "Release Date",
                       "Character Name", "Actor DOB", "Actor gender", "Actor height",
                       "Actor ethnicity", "Actor Name", "Actor age at movie release",
                       "Freebase character map", "Temp1", "Temp2"]

# Collapse the actor rows into one list of actor names per movie, then left-join
# so that movies without any cast row are kept (their 'Actor Name' is NaN).
actors_per_movie = (
    actors_info.groupby('Wikipedia_movie_ID')['Actor Name'].agg(list).reset_index()
)
df = pd.merge(df, actors_per_movie, how='left', on='Wikipedia_movie_ID')

In [None]:
# extract genres
def extract_genres(genres):
    """Parse a genre mapping literal into a de-duplicated list of genre names.

    Args:
        genres: String holding a Python/JSON dict literal whose values are
            genre names, e.g. '{"/m/07s9rl0": "Drama"}'.

    Returns:
        List of genre names. Compound names are split on '/' and '&', each
        part is stripped of surrounding whitespace, empty parts are dropped,
        and duplicates are removed while keeping first-seen order, so the
        result is the same on every run.

    Raises:
        ValueError / SyntaxError: if `genres` is not a valid literal
            (propagated from `ast.literal_eval`).
    """
    genre_names = ast.literal_eval(genres).values()
    parts = (
        part.strip()
        for name in genre_names
        for part in re.split('/|&', name)
    )
    # dict.fromkeys de-duplicates like a set but preserves insertion order,
    # unlike list(set(...)) whose order for strings varies between runs.
    return list(dict.fromkeys(part for part in parts if part))

# Replace the raw genre literals with parsed lists, in place.
# Not idempotent: on a second run the column already holds lists, which
# ast.literal_eval (inside extract_genres) rejects.
df['genres'] = df['genres'].apply(extract_genres)

In [None]:
def expand_summary(summary, movie_name, genres, actors_names):
    """Build one descriptive sentence block for a movie.

    The result has the shape
    "the movie <name> is a movie [of the actors <a, b, ...> ]in which
    <summary> the movie has the genres of <g1, g2, ...>".

    Args:
        summary: Plot summary text.
        movie_name: Movie title.
        genres: Iterable of genre names.
        actors_names: List of actor names (entries may be NaN), or NaN / any
            non-list value when the movie has no cast information.

    Returns:
        The expanded summary string. Actor names are lower-cased; the title,
        summary and genres are left as given. When there is no usable actor
        name the actor clause is omitted entirely, so the sentence still reads
        "... is a movie in which ...".
    """
    intro = 'the movie ' + movie_name + ' is a movie '
    genres_clause = " the movie has the genres of " + ', '.join(genres)

    # Keep only real names; a missing cast arrives as a float NaN instead of a
    # list, and individual entries inside a list can be NaN as well.
    valid_actors = []
    if isinstance(actors_names, list):
        valid_actors = [str(name) for name in actors_names if not pd.isna(name)]

    if valid_actors:
        # `intro` already ends with a space, so the clause must not start with one.
        actors_clause = ('of the actors ' + ', '.join(valid_actors) + ' in which ').lower()
    else:
        actors_clause = 'in which '

    return intro + actors_clause + summary + genres_clause
# Try the formatter on the row labelled 0 and display the resulting text.
expand_summary(df['summary'][0],df['movie_name'][0],df['genres'][0],df['Actor Name'][0])

"the movie Taxi Blues is a movie  of the actors natalia koliakanova, pyotr mamonov, hal singer, vladimir kashpur, pyotr zaychenko, elena saphonova in which Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all. the movie has the genres of Drama, World cinema"

In [None]:
# Build the enriched description of every movie, row by row, from its
# summary, title, genre list and actor list.
df['expanded_summary'] = [
    expand_summary(plot, title, genre_list, cast)
    for plot, title, genre_list, cast in zip(
        df['summary'], df['movie_name'], df['genres'], df['Actor Name']
    )
]

In [None]:
def spacy_tokenize(text):
    """Tokenize `text` with the module-level spaCy `tokenizer`.

    Returns:
        List with the text of each token, in order.
    """
    return list(map(lambda tok: tok.text, tokenizer(text)))

def punctuation_removal(messy_str):
    """Return `messy_str` without any character listed in `string.punctuation`.

    Only ASCII punctuation is affected; letters, digits, whitespace and
    non-ASCII symbols pass through unchanged.
    """
    return ''.join(ch for ch in messy_str if ch not in string.punctuation)

def remove_accented_chars(text):
    """Reduce `text` to plain ASCII.

    The text is decomposed with Unicode NFKD normalisation so that accented
    letters become a base letter plus combining marks; every character that
    cannot be encoded as ASCII (the marks, and any other non-ASCII symbol)
    is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_special_characters(text):
    """Strip a small set of special characters and detach periods.

    Removes double quotes, '?', ',', '$' and '!'. An apostrophe is removed
    too, unless it is followed by 't' or 's' and is not immediately preceded
    by a space (so "don't" and "John's" keep theirs). Afterwards every '.'
    is replaced by ' .', i.e. separated from the preceding text by a space.
    """
    without_specials = re.sub(r"""["?,$!]|'(?!(?<! ')[ts])""", "", text)
    # Applies to every period in the text, not only a trailing one.
    return re.sub(r"\.", " .", without_specials)
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def remove__stopwords(text):
    """Lower-case and tokenize `text`, then drop NLTK English stop words.

    Note the double underscore: this is the notebook's own helper, distinct
    from gensim's `remove_stopwords` imported at the top of the notebook.
    gensim's function takes a string, not a token list, so the tokens are
    filtered here against `stop_words` directly.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined with single spaces.
    """
    tokens = spacy_tokenize(text.lower())
    kept_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(kept_tokens)

# Build the text that gets embedded, overwriting df['summary'] with a cleaned
# version of df['expanded_summary'].
#
# Order matters: gensim's remove_stopwords compares whitespace-separated words
# against a lower-case list, so case and punctuation are normalised first.
# Otherwise capitalised stop words ("The") and ones glued to punctuation
# ("all.") slip through.
summary_text = df['expanded_summary'].str.lower()
# Accents are folded before punctuation stripping so that any ASCII
# punctuation produced by the Unicode decomposition is removed as well.
summary_text = summary_text.apply(remove_accented_chars)
summary_text = summary_text.apply(punctuation_removal)
# After punctuation_removal none of the characters targeted by
# remove_special_characters remain; the call is kept as a harmless safeguard.
summary_text = summary_text.apply(remove_special_characters)
df['summary'] = summary_text.apply(remove_stopwords)

In [None]:
from datasets import Dataset
# Convert the DataFrame to a Hugging Face Dataset. The DataFrame index can be
# carried over as an extra '__index_level_0__' column (a later cell drops it).
data_dict = Dataset.from_pandas(df)

In [None]:
# Inspect the first record to check the columns and the cleaned text.
data_dict[0]

{'Wikipedia_movie_ID': 23890098,
 'summary': 'movie taxi blues movie actors natalia koliakanova pyotr mamonov hal singer vladimir kashpur pyotr zaychenko elena saphonova shlykov hardworking taxi driver lyosha saxophonist develop bizarre lovehate relationship despite prejudices realize arent different all movie genres drama world cinema',
 'relase_date': '1990-09-07',
 'movie_name': 'Taxi Blues',
 'genres': ['Drama', 'World cinema'],
 'Actor Name': ['Natalia Koliakanova',
  'Pyotr Mamonov',
  'Hal Singer',
  'Vladimir Kashpur',
  'Pyotr Zaychenko',
  'Elena Saphonova'],
 'expanded_summary': "the movie Taxi Blues is a movie  of the actors natalia koliakanova, pyotr mamonov, hal singer, vladimir kashpur, pyotr zaychenko, elena saphonova in which Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all. the movie has the genres of Drama, World cinema",
 '__index_level_0_

In [None]:
def _to_index_record(row):
    """Derive the `id`, `text` and `metadata` fields for one dataset row.

    `text` is the cleaned summary (the string that gets embedded), while
    `metadata["text"]` keeps the readable expanded summary.
    """
    metadata = {
        # "release_date": row["relase_date"],
        "movie_name": row["movie_name"],
        "genres": row["genres"],
        "text": row["expanded_summary"],
    }
    return {
        "id": f'{row["Wikipedia_movie_ID"]}',
        "text": row["summary"],
        "metadata": metadata,
    }


# Add the three derived columns to every row; existing columns are kept.
data_dict = data_dict.map(_to_index_record)

Map:   0%|          | 0/42204 [00:00<?, ? examples/s]

In [None]:
# Keep only the fields needed for indexing and drop every other column.
# The columns to drop are computed from the dataset itself rather than listed
# by hand: remove_columns raises on a name that is not present, and the
# '__index_level_0__' column only exists when the DataFrame index was
# materialised by Dataset.from_pandas.
columns_to_keep = {"id", "text", "metadata"}
data = data_dict.remove_columns(
    [name for name in data_dict.column_names if name not in columns_to_keep]
)

In [None]:
# Display the dataset summary (feature names and row count).
data

Dataset({
    features: ['id', 'text', 'metadata'],
    num_rows: 42204
})

In [None]:
pip install sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used below for both the indexed texts and the queries.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

### Vector database initialization

In [None]:
import os
# import openai
import getpass

In [None]:
# !pip uninstall pinecone-client
# !pip install pinecone-client[grpc]

In [None]:
import pinecone

# Initialize the connection to Pinecone (get an API key at app.pinecone.io).
# The key is read from the PINECONE_API_KEY environment variable, or typed at
# a hidden prompt. It must never be written into the notebook itself, because
# notebooks are shared and committed together with their source.
api_key = os.getenv("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
# The environment is shown next to the API key in the Pinecone console.
env = os.getenv("PINECONE_ENVIRONMENT") or "gcp-starter"

pinecone.init(api_key=api_key, environment=env)

In [None]:
import time
import pinecone
# Name of the Pinecone index to use; nothing in this cell creates it.
index_name = "llama2rag"
# connect to index (gRPC client)
index = pinecone.GRPCIndex(index_name)
# short pause before the first request
time.sleep(1)
# view index stats (dimension, fullness, vector counts)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

In [None]:
pip install openai



In [None]:
from tqdm.auto import tqdm
import openai  # NOTE(review): not needed by this loop; left in place in case other cells use it

batch_size = 100  # how many embeddings we create and insert at once

# Embed the cleaned texts batch by batch and upsert them into the index.
for i in tqdm(range(0, len(data), batch_size)):
    # find end of batch
    i_end = min(len(data), i + batch_size)
    # slicing a Dataset yields a dict of column name -> list of values
    batch = data[i:i_end]
    # The embedding model runs locally, so each batch is encoded exactly once:
    # there is no remote rate limit to back off from, and any failure should
    # surface immediately with its own traceback.
    embeds = embed_model.encode(batch["text"]).tolist()
    # one (id, vector, metadata) tuple per record
    to_upsert = list(zip(batch["id"], embeds, batch["metadata"]))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

  0%|          | 0/423 [00:00<?, ?it/s]

In [None]:
def get_docs(query: str, top_k: int):
    """Return the `top_k` nearest documents for `query`.

    Superseded by later definitions of `get_docs` in this notebook.

    Returns:
        Dict mapping each match's metadata text to its rank in the response.
    """
    # encode query; the NumPy result is converted to a list
    query_vector = embed([query]).tolist()
    response = index.query(query_vector, top_k=top_k, include_metadata=True)
    # get doc text
    docs = {}
    for rank, match in enumerate(response["matches"]):
        docs[match["metadata"]['text']] = rank
    return docs

In [None]:
def get_docs(query: str, top_k: int):
    """Return up to `top_k` nearest documents, skipping titles that contain the queried one.

    The query must contain "recommend me a movie similar to "; the text after
    it is taken as the queried movie name. Superseded by a later definition
    of `get_docs` in this notebook.

    Returns:
        Dict mapping each kept match's metadata text to its rank in the response.
    """
    # encode query
    query_vector = embed([query]).tolist()
    # fetch one extra result so that dropping the queried movie still leaves top_k
    response = index.query(query_vector, top_k=top_k + 1, include_metadata=True)
    queried_movie_name = query.split("recommend me a movie similar to ")[1].lower()

    # collect doc text, excluding movies whose name contains the queried name
    docs = {}
    for rank, match in enumerate(response["matches"]):
        if queried_movie_name in match["metadata"]['movie_name'].lower():
            continue
        docs[match["metadata"]['text']] = rank

    # keep only the first top_k survivors
    return dict(list(docs.items())[:top_k])

In [None]:
def embed(docs: "str | list[str]") -> "np.ndarray":
  """Encode `docs` with the module-level `embed_model`.

  Callers in this notebook pass either a single string or a list of strings
  and call `.tolist()` on the returned array.
  """
  embeddings = embed_model.encode(docs)
  return embeddings

In [None]:
# query = "recommend me a movie similar to charlie chan secret"
# docs = get_docs(query, top_k=10)
# print("\n---\n".join(docs.keys()))

In [None]:
query = "recommend me a movie similar to Ghost In The Noonday Sun"
top_k = 5
# encode query
xq = embed(query).tolist()
res = index.query(xq, top_k=top_k + 1, include_metadata=True)  # Fetch one extra result
queried_movie_name = punctuation_removal(query.split("similar to ")[1].lower())  # Extract the queried movie name
# Retrieve all movie names, lower-cased and without punctuation
movie_names = [punctuation_removal(x["metadata"]['movie_name'].lower()) for x in res["matches"]]
# Positions of the matches whose name differs from the queried movie.
# Positions are taken with enumerate rather than movie_names.index(name):
# .index() returns the first occurrence, so two matches sharing a title would
# both resolve to the same position and one of them would be lost.
indices = [i for i, name in enumerate(movie_names) if name != queried_movie_name]
filtered_movie_names = [movie_names[i] for i in indices]
# Take the top_k results excluding the queried movie itself
docs = {res["matches"][i]["metadata"]['text']: i for i in indices[:top_k]}

In [None]:
# Display the retrieved texts with their rank in the response.
docs

{'the movie Annoyed is a movie in which A mute young girl was selected by a relentless ghost as her messenger in order to find the man who is responsible for her brutal death. A new supernatural bone chilling film that is rife with suspense and rich in atmosphere. the movie has the genres of Horror': 0,
 'the movie Ghost Chase is a movie  of the actors jason lively, jill whitlow, toby kaye, leonard lansink in which In an old Hollywood mansion, the spirit of an old family retainer inhabits an old grandfather clock. When a movie company uses the mansion for a film, the spirit inhabits the body of an alien and persuades the two film-makers to track down an old house that will resolve a family scandal. the movie has the genres of Fantasy Comedy, World cinema, Fantasy, Comedy, Teen, Science Fiction': 1,
 "the movie I Downloaded A Ghost is a movie  of the actors ellen page, vince corazza, carlos alazraqui, krista mitchell, landon peters in which Stella Blackstone  and her best friend Albert 

In [None]:
def get_docs(query: str, top_k: int):
    """Return up to `top_k` nearest documents, excluding the queried movie itself.

    Args:
        query: Free text of the form "... similar to <movie name>". Everything
            after the first "similar to " is taken as the queried movie name.
            If the phrase is absent, no queried name is derived and only
            matches with an empty normalised title would be excluded.
        top_k: Maximum number of documents to return.

    Returns:
        Dict mapping each kept match's metadata text to its rank (position in
        the response). Names are compared lower-cased and without punctuation.
    """
    # encode query
    xq = embed(query).tolist()
    res = index.query(xq, top_k=top_k + 1, include_metadata=True)  # Fetch one extra result
    # partition() keeps the whole remainder even when the title itself contains
    # "similar to ", and yields '' instead of raising when the phrase is missing.
    queried_movie_name = punctuation_removal(query.partition("similar to ")[2].lower())

    # Pair every match with its own position. Looking positions up by name
    # (list.index) would collapse matches that share a title onto the first one.
    kept = [
        (rank, match)
        for rank, match in enumerate(res["matches"])
        if punctuation_removal(match["metadata"]['movie_name'].lower()) != queried_movie_name
    ]
    # Take the top_k results excluding the queried movie
    return {match["metadata"]['text']: rank for rank, match in kept[:top_k]}

In [None]:
# Run the retrieval helper on one example query and display the result.
query = "recommend me a movie similar to Ghost In The Noonday Sun"
docs = get_docs(query, top_k=5)
docs

{'the movie Annoyed is a movie in which A mute young girl was selected by a relentless ghost as her messenger in order to find the man who is responsible for her brutal death. A new supernatural bone chilling film that is rife with suspense and rich in atmosphere. the movie has the genres of Horror': 0,
 'the movie Ghost Chase is a movie  of the actors jason lively, jill whitlow, toby kaye, leonard lansink in which In an old Hollywood mansion, the spirit of an old family retainer inhabits an old grandfather clock. When a movie company uses the mansion for a film, the spirit inhabits the body of an alien and persuades the two film-makers to track down an old house that will resolve a family scandal. the movie has the genres of Fantasy Comedy, World cinema, Fantasy, Comedy, Teen, Science Fiction': 1,
 "the movie I Downloaded A Ghost is a movie  of the actors ellen page, vince corazza, carlos alazraqui, krista mitchell, landon peters in which Stella Blackstone  and her best friend Albert 

In [None]:
# Build one instruction/output record per movie: the instruction asks for
# movies similar to the title, the output is what get_docs retrieves for it.
# The titles are iterated directly instead of via df["movie_name"][i], which
# is a label lookup and only works while the index is exactly 0..n-1.
# This issues one index query per movie, so the cell is slow on the full frame.
dataset = []
for movie_name in tqdm(df["movie_name"]):
    query = f"recommend me a movie similar to {movie_name}"
    docs = get_docs(query, top_k=5)
    dataset.append({"MovieName": movie_name, "Instruction": query, "Output": docs})

  0%|          | 0/42204 [00:00<?, ?it/s]

In [None]:
# Turn the collected records into a DataFrame (built from a shallow copy of the list).
result_df = pd.DataFrame(dataset.copy())

In [None]:
# Display the instruction/output table.
result_df

Unnamed: 0,MovieName,Instruction,Output
0,Taxi Blues,recommend me a movie similar to Taxi Blues,{'the movie Taxi 2 is a movie of the actors f...
1,The Hunger Games,recommend me a movie similar to The Hunger Games,{'the movie The Hunger is a movie of the acto...
2,Narasimham,recommend me a movie similar to Narasimham,{'the movie Narasimhudu is a movie of the act...
3,The Lemon Drop Kid,recommend me a movie similar to The Lemon Drop...,{'the movie Now That Summer is Gone is a movie...
4,A Cry in the Dark,recommend me a movie similar to A Cry in the Dark,"{'the movie After Dark, My Sweet is a movie o..."
...,...,...,...
42199,Oomakkuyil Padumbol,recommend me a movie similar to Oomakkuyil Pad...,{'the movie Oktapodi is a movie in which Two o...
42200,The Last Command,recommend me a movie similar to The Last Command,{'the movie The Last Eve is a movie of the ac...
42201,Randy Parsons: American Luthier,recommend me a movie similar to Randy Parsons:...,"{'the movie Lenexa, 1 Mile is a movie of the ..."
42202,Kabuliwala,recommend me a movie similar to Kabuliwala,{'the movie Dhuaan is a movie of the actors a...


In [None]:
# Persist the table to Drive without the index column. The 'Output' column
# holds dicts, which are written to the CSV as their string representation.
result_df.to_csv('/content/drive/MyDrive/NLP Project/Doc/MoviesDataPre_k_5.csv', index=False)

In [None]:
# Spot-check: rows whose title contains "spider-man" (case-insensitive).
result_df[result_df['MovieName'].str.lower().str.contains('spider-man')]

Unnamed: 0,MovieName,Instruction,Output
21929,Spider-Man 3,recommend me a movie similar to Spider-Man 3,{'the movie The Amazing Spiderman is a movie ...
30301,Spider-Man 2,recommend me a movie similar to Spider-Man 2,{'the movie The Amazing Spiderman is a movie ...
33028,Spider-Man,recommend me a movie similar to Spider-Man,{'the movie The Amazing Spiderman is a movie ...
40981,The Amazing Spider-Man,recommend me a movie similar to The Amazing Sp...,{'the movie The Spider Returns is a movie of ...


In [None]:
# Same "spider-man" spot-check as the previous cell.
result_df[result_df['MovieName'].str.lower().str.contains('spider-man')]

Unnamed: 0,MovieName,Instruction,Output
21929,Spider-Man 3,recommend me a movie similar to Spider-Man 3,{'the movie The Amazing Spiderman is a movie ...
30301,Spider-Man 2,recommend me a movie similar to Spider-Man 2,{'the movie The Amazing Spiderman is a movie ...
33028,Spider-Man,recommend me a movie similar to Spider-Man,{'the movie The Amazing Spiderman is a movie ...
40981,The Amazing Spider-Man,recommend me a movie similar to The Amazing Sp...,{'the movie The Spider Returns is a movie of ...


In [None]:
def chat_template(instruction):
    """Wrap `instruction` in the "### Instruction / ### Response" prompt layout.

    The returned prompt ends right after the response header.
    """
    prompt = f"### Instruction:\n {instruction}\n\n### Response:\n"
    return prompt

# Wrap every instruction in the prompt template, in place.
# Not idempotent: running this cell again wraps the already-wrapped text.
result_df['Instruction'] = result_df['Instruction'].apply(chat_template)

In [None]:
# Save the templated instruction dataset to a new CSV file.
# `result_df` (whose 'Instruction' column now carries the prompt template) is
# written here; `df` is the pre-processed movie table and does not contain the
# instruction/output pairs.
# NOTE(review): confirm that this file is meant to hold the instruction
# dataset rather than the movie table.
result_df.to_csv('/content/drive/MyDrive/NLP Project/Doc/new_dataset.csv', index=False)