In [None]:
!pip install -qU \
  transformers==4.31.0 \
  sentence-transformers==2.2.2 \
  pinecone-client[grpc] \
  datasets==2.14.0 \
  accelerate==0.21.0 \
  einops==0.6.1 \
  langchain \
  xformers==0.0.20 \
  bitsandbytes==0.41.0 \
  peft

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.4/179.4 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m94.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m16.9 

## Initializing the Hugging Face Embedding Pipeline


In [None]:
import os
import pinecone
import time
import pandas as pd
import re
import ast
import numpy as np
import torch
from torch import cuda, bfloat16
import transformers
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM

from langchain.chains import RetrievalQA
from langchain import LLMChain, OpenAI, PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Pinecone
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.evaluation.qa import QAGenerateChain , QAEvalChain

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import unicodedata
import string
from gensim.parsing.preprocessing import remove_stopwords
import spacy

import warnings
warnings.filterwarnings('ignore')

  from tqdm.autonotebook import tqdm
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32})

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## Connecting to the Vector Index

In [None]:
api_key = 'pinecone api key here'
env = "gcp-starter"
pinecone.init(api_key=api_key, environment=env)
index_name = "llama2rag"
# connect to index
index = pinecone.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.42204,
 'namespaces': {'': {'vector_count': 42204}},
 'total_vector_count': 42204}

## Initializing the Hugging Face Pipeline

In [None]:
model_id = 'meta-llama/Llama-2-7b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
# begin initializing HF items, need auth token for these
hf_auth = 'hf key here'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
model.eval()
print(f"Model loaded on {device}")

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded on cuda:0


In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    temperature=0.9,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # mex number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)
llama2_base = HuggingFacePipeline(pipeline=generate_text)
text_field = 'text'  # field in metadata that contains text content
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
def llama2_base_prompted(question):
  prompt = f"""
            You are an assistant for question-answering tasks about movies, and basic
            conversation openers (use emojis also to make it friendly).
            You have no preferences.
            Your job to assist human to find the movie that suits their taste and not your taste.
            If you don't know the answer, just say that you don't know.
            Keep the answer concise.
            Use three sentences maximum and keep the answer concise.
            Question: {question}
            Answer:
            """
  return llama2_base(prompt)

text = 'Recommend me a movie silmilar to Spider-man'
llama2_base_prompted(text)

"🤔 Hmm... I'm glad you asked! Based on your interest in Spider-Man, I would recommend The Avengers! It's an action-packed superhero movie with great characters and exciting fight scenes. 💥 Would you like to watch it? 🎬"

In [None]:
prompt = """
You are an assistant for question-answering tasks about movies, and basic
conversation openers (use emojis also to make it friendly).
Use the following pieces of retrieved context to answer the question.
You have no preferences.
Your job to assist human to find the movie that suits their taste and not your taste.
If you don't know the answer, just say that you don't know.
Don't give any information about a movie that is not in the context.
Use five sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""

In [None]:
# Your prompt template string
prompt_template_string = prompt

# Create a PromptTemplate object
prompt_template = PromptTemplate(
    template=prompt_template_string,
    input_variables=["question"]
)

In [None]:
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llama2_base,
    chain_type='stuff',
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    chain_type_kwargs={"prompt": prompt_template},
    # return_source_documents=True
    )

In [None]:
''.join(rag_pipeline("hello how are you?")['result'])

"Hello! How are you? 😊 I'm here to help you find a movie that suits your taste! 🎬 Can you tell me more about what you're in the mood for? For example, are you feeling romantic or funny? 💕😂"

In [None]:
torch.cuda.empty_cache()

In [None]:
''.join(rag_pipeline("What tasks can you assist me with?")['result'])

'🤔 Hi there! As an AI assistant, I can help you find a movie that suits your taste. Can you tell me a bit more about what you\'re looking for? Are you in the mood for something lighthearted and funny like "I Enjoy the World With You"? Or perhaps something more serious and thought-provoking like "Out of the Shadows"? Or maybe something entirely different like "Mine Games" or "24 Hours on Craigslist"? Let me know and I\'ll do my best to help! 🎬'

In [None]:
response = rag_pipeline("Recommend me a movie similar to Superman")
''.join(response['result'])

'Superman vs. The Elite (2012) - This movie features Superman facing off against a new group of superpowered villains known as "The Elite." It has elements of action, science fiction, and superhero movies.\nSuperman Classic (2017) - This short film showcases Superman\'s origins and early days as a crime-fighter in Metropolis. It has elements of animation, superhero, and science fiction genres.\nSuperman (1978) - This movie follows Superman as he battles various villains and saves the day in Metropolis. It has elements of superhero, action, and science fiction genres.\nSuperman II (1980) - This movie sees Superman facing off against three Kryptonian criminals who have been freed from their prison and are wreaking havoc on Earth. It has elements of superhero, action, and science fiction genres.\nSuperman vs. The Elite (2012) - This movie has a similar plot to Superman II, with Superman facing off against a new group of superpowered villains. However, it has a more modern take on the clas

In [None]:
# ''.join(rag_pipeline("What does the movie interstellar talk about? when was it released and who is the director?")['result'])

In [None]:
# ''.join(rag_pipeline("Do you a movie that is called Interstellar?")['result'])

In [None]:
# while True:
#     user_input = input("You: ")  # Get user input
#     if user_input.lower() in ["exit", "quit", "bye"]:
#         print("\nGoodbye!")
#         break  # Exit the loop if the user wants to end the conversationNT:
#     response = ''.join(rag_pipeline(user_input)['result'])
#     print(f"\nAssistant: {response}\n")

# Model Evaluation

## Evaluation dataset preparation

In [None]:
torch.cuda.empty_cache()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
evaluation_df = pd.read_csv('//content//drive/MyDrive/NLP Project/Doc/MoviesDataPre_final.csv')
evaluation_df.head()

Unnamed: 0,MovieName,Instruction,Output
0,Taxi Blues,recommend me a movie similar to Taxi Blues,"{""the movie Taxi 2 is a movie of the actors f..."
1,The Hunger Games,recommend me a movie similar to The Hunger Games,{'the movie The Hunger is a movie of the acto...
2,Narasimham,recommend me a movie similar to Narasimham,"{""the movie Narasimhudu is a movie of the act..."
3,The Lemon Drop Kid,recommend me a movie similar to The Lemon Drop...,{'the movie Now That Summer is Gone is a movie...
4,A Cry in the Dark,recommend me a movie similar to A Cry in the Dark,"{'the movie After Dark, My Sweet is a movie o..."


In [None]:
evaluation_df['Output'] = evaluation_df.Output.str[2:-5]
evaluation_df = evaluation_df[(evaluation_df.Output != '{}')]

In [None]:
# Check if backslashes exist in 'Column1' and replace if found
mask = evaluation_df['MovieName'].str.contains(r'\\')
if mask.any():
    evaluation_df.loc[mask, 'MovieName'] = evaluation_df.loc[mask, 'MovieName'].str.replace('\\', '')
mask = evaluation_df['Instruction'].str.contains(r'\\')
if mask.any():
    evaluation_df.loc[mask, 'Instruction'] = evaluation_df.loc[mask, 'Instruction'].str.replace('\\', '')

In [None]:
rand_indices = np.random.choice(evaluation_df.shape[0],20)
examples = [{"query":row['Instruction'], "answer":row['Output']} for i,row in evaluation_df.iloc[rand_indices].iterrows()]

array([20022, 17359, 21953,  8198,  3776, 37058,  2532,  3135, 13351,
        6271, 20135, 23582, 41686, 39184,   664, 11876,   893, 17102,
       21795,  2275]) rand_indices for reproducibility£

In [None]:
basellama_predictions = examples.copy()
for i in basellama_predictions:
  i['result'] = llama2_base_prompted(i)
print('Llama Predicted')
rag_predictions = rag_pipeline.apply(examples)
print('RAG Predicted')

Llama Predicted
RAG Predicted


In [None]:
predictions_answers = pd.DataFrame(np.array([[i['result'] for i in (basellama_predictions)],
                                   [i['result'] for i in (rag_predictions)],
                                   [i['answer'] for i in (examples)]]).T,
                                   columns = ['base_llama2_pred','rag_pred', 'movie_summary'])

In [None]:
predictions_answers.head()

Unnamed: 0,base_llama2_pred,rag_pred,movie_summary
0,🎬 What's your favorite movie genre? 🤔 Let me s...,"Based on the information provided, I would rec...",the movie I Met My Love Again is a movie in wh...
1,🎬 Recommended movie: Simanaheen 💕\n ...,"Based on the information provided, I would rec...",the movie Simanaheen is a movie in which {{Mor...
2,"🎬 Sure! Based on your interest in ""It Happened...","Recommendation: For a movie similar to ""It Hap...",the movie Somewhere in the Night is a movie o...
3,🎬 I can definitely help you find a movie simil...,"Based on the information provided, I would rec...",the movie Children is a movie in which The plo...
4,"🤔 Hmm, I'm not familiar with that movie. Can y...",I cannot recommend a movie similar to The Mome...,the movie The Awakening is a movie of the act...


In [None]:
def spacy_tokenize(text):
  # fonction qui tokenize avec le tokenizer spacy
  return [token.text for token in tokenizer(text)]

# remove Punctuations from the Reviews
def punctuation_removal(messy_str):
    clean_list = [char for char in messy_str if char not in string.punctuation]
    clean_str = ''.join(clean_list)
    return clean_str

def remove_accented_chars(text):
    new_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return new_text

def remove_special_characters(text):
    new_s = re.sub(r"""["?,$!]|'(?!(?<! ')[ts])""", "", text)

    # Deal with the final dot
    new_s = re.sub(r"\.", " .", new_s)

    return new_s
stop_words = set(stopwords.words('english'))
# Tokenize and remove stopwords
def remove__stopwords(text):
    tokens = spacy_tokenize(text.lower())
    filtered_tokens = remove_stopwords(tokens)
    return ' '.join(filtered_tokens)

predictions_answers['base_llama2_pred'] = predictions_answers['base_llama2_pred'].apply(remove_stopwords)
predictions_answers['base_llama2_pred'] = predictions_answers['base_llama2_pred'].str.lower()
predictions_answers['base_llama2_pred'] = predictions_answers['base_llama2_pred'].apply(punctuation_removal)
predictions_answers['base_llama2_pred'] = predictions_answers.apply(lambda x: remove_accented_chars(x['base_llama2_pred']), axis = 1)
predictions_answers['base_llama2_pred'] = predictions_answers.apply(lambda x: remove_special_characters(x['base_llama2_pred']), axis = 1)

predictions_answers['rag_pred'] = predictions_answers['rag_pred'].apply(remove_stopwords)
predictions_answers['rag_pred'] = predictions_answers['rag_pred'].str.lower()
predictions_answers['rag_pred'] = predictions_answers['rag_pred'].apply(punctuation_removal)
predictions_answers['rag_pred'] = predictions_answers.apply(lambda x: remove_accented_chars(x['rag_pred']), axis = 1)
predictions_answers['rag_pred'] = predictions_answers.apply(lambda x: remove_special_characters(x['rag_pred']), axis = 1)

predictions_answers['movie_summary'] = predictions_answers['movie_summary'].apply(remove_stopwords)
predictions_answers['movie_summary'] = predictions_answers['movie_summary'].str.lower()
predictions_answers['movie_summary'] = predictions_answers['movie_summary'].apply(punctuation_removal)
predictions_answers['movie_summary'] = predictions_answers.apply(lambda x: remove_accented_chars(x['movie_summary']), axis = 1)
predictions_answers['movie_summary'] = predictions_answers.apply(lambda x: remove_special_characters(x['movie_summary']), axis = 1)

## EmbeddingDistanceEvalChain evaluation

In [None]:
from langchain.evaluation.embedding_distance.base import EmbeddingDistance, EmbeddingDistanceEvalChain

chain = EmbeddingDistanceEvalChain(embeddings = embed_model,
                                  distance_metric=EmbeddingDistance.COSINE)

In [None]:
base_euc_dist = round(np.array([chain.evaluate_strings(prediction=example['base_llama2_pred'], reference=example['movie_summary'])['score']
                       for i,example in predictions_answers.iterrows()]).mean(),2)
rag_euc_dist = round(np.array([chain.evaluate_strings(prediction=example['rag_pred'], reference=example['movie_summary'])['score']
                       for i,example in predictions_answers.iterrows()]).mean(),2)

In [None]:
pd.DataFrame(np.array(
    [['llama2-7b-chat-base', base_euc_dist],
     ['llama2-7b-chat-RAG', rag_euc_dist]]),
    columns = ['Model', 'Embedding Average Cosine similarity']
)

Unnamed: 0,Model,Embedding Average Cosine similarity
0,llama2-7b-chat-base,0.45
1,llama2-7b-chat-RAG,0.49


## QAeval on recommendation question and summary as answer dataset

In [None]:
eval_chain = QAEvalChain.from_llm(llama2_base)
llama2Base_graded_outputs = eval_chain.evaluate(examples, basellama_predictions)
rag_graded_outputs = eval_chain.evaluate(examples, rag_predictions)

In [None]:
binary_base_output = np.array([1 if ' correct' in i['results'].lower() else 0 for i in llama2Base_graded_outputs]).mean()
binary_rag_output = np.array([1 if ' correct' in i['results'].lower() else 0 for i in rag_graded_outputs]).mean()

In [None]:
pd.DataFrame(np.array(
    [['llama2-7b-chat-base', binary_base_output],
     ['llama2-7b-chat-RAG', binary_rag_output]]),
    columns = ['Model', 'QAeval accuracy']
)

Unnamed: 0,Model,QAeval accuracy
0,llama2-7b-chat-base,0.7
1,llama2-7b-chat-RAG,0.95
