# Importing necessary libraries and connecting to the vector store

In [None]:
!pip install -qU \
  transformers==4.31.0 \
  sentence-transformers==2.2.2 \
  pinecone-client[grpc] \
  datasets==2.14.0 \
  accelerate==0.21.0 \
  einops==0.6.1 \
  langchain \
  xformers==0.0.20 \
  bitsandbytes==0.41.0 \
  peft

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.4/179.4 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m3.6 M

In [None]:

import os
import pinecone
import time
import pandas as pd
import re
import ast
import numpy as np
import torch
from torch import cuda, bfloat16
import transformers
from peft import AutoPeftModelForCausalLM,PeftModel, PeftConfig
from transformers import AutoTokenizer, AutoModelForCausalLM

from langchain.chains import RetrievalQA
from langchain import LLMChain, OpenAI, PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Pinecone
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.evaluation.qa import QAGenerateChain , QAEvalChain

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import unicodedata
import string
from gensim.parsing.preprocessing import remove_stopwords
import spacy

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive')

  from tqdm.autonotebook import tqdm
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive


## Initializing the Hugging Face Embedding Pipeline


In [None]:
# Sentence-embedding model; it is used for vector-store queries and, later on,
# for the embedding-distance evaluation.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Use the current CUDA device when one is available, otherwise fall back to CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## Connecting to the Vector Index

In [None]:
# Credentials are read from the environment so real keys are never saved in the
# notebook; the literals below are only placeholders / defaults.
api_key = os.environ.get('PINECONE_API_KEY', 'pinecone api key here')
env = os.environ.get('PINECONE_ENVIRONMENT', 'gcp-starter')
pinecone.init(api_key=api_key, environment=env)
index_name = "llama2rag"
# connect to index
index = pinecone.Index(index_name)
time.sleep(1)
# view index stats (printed explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed)
print(index.describe_index_stats())
text_field = 'text'  # field in metadata that contains text content
# LangChain wrapper that embeds incoming queries with `embed_model` and reads
# the document text from the `text_field` metadata entry.
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

# Initializing the Hugging Face and LangChain RAG pipelines for the base-model RAG

In [None]:
model_id = 'meta-llama/Llama-2-7b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
# begin initializing HF items, need auth token for these.
# The token is taken from the environment so it is never stored in the notebook;
# the literal is only a placeholder for manual runs.
hf_auth = (
    os.environ.get('HF_TOKEN')
    or os.environ.get('HUGGING_FACE_HUB_TOKEN')
    or 'hf key here'
)
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
# inference only: switch off training-time behaviour such as dropout
model.eval()
print(f"Model loaded on {device}")
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    temperature=0.9,  # sampling temperature; higher values give more random outputs
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)
# LangChain LLM wrapper around the generation pipeline, used by the RAG chains below
llama2_base = HuggingFacePipeline(pipeline=generate_text)
torch.cuda.empty_cache()

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded on cuda:0


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

# Models Retrieval Evaluation

In [None]:
torch.cuda.empty_cache()

In [None]:
# Load the evaluation set (columns used below: MovieName, Instruction, Output).
evaluation_df = pd.read_csv('//content//drive/MyDrive/NLP Project/Doc/MoviesDataPre_final.csv')
# Trim the first 2 and last 5 characters of every Output string.
# NOTE(review): presumably these are serialisation wrappers around the summary — confirm against the CSV.
evaluation_df['Output'] = evaluation_df.Output.str[2:-5]
# Drop rows whose trimmed output is just '{}'; copy so the column assignments
# below modify an independent frame rather than a slice of the original.
evaluation_df = evaluation_df[(evaluation_df.Output != '{}')].copy()
# Strip literal backslashes from the text columns. `regex=False` makes the
# literal match explicit, and missing values simply stay missing instead of
# breaking a boolean mask.
for text_col in ['MovieName', 'Instruction']:
    evaluation_df[text_col] = evaluation_df[text_col].str.replace('\\', '', regex=False)
evaluation_df.head()

Unnamed: 0,MovieName,Instruction,Output
0,Taxi Blues,recommend me a movie similar to Taxi Blues,the movie Taxi 2 is a movie of the actors fré...
1,The Hunger Games,recommend me a movie similar to The Hunger Games,the movie The Hunger is a movie of the actors...
2,Narasimham,recommend me a movie similar to Narasimham,the movie Narasimhudu is a movie of the actor...
3,The Lemon Drop Kid,recommend me a movie similar to The Lemon Drop...,the movie Now That Summer is Gone is a movie ...
4,A Cry in the Dark,recommend me a movie similar to A Cry in the Dark,"the movie After Dark, My Sweet is a movie of ..."


In [None]:
# First ten evaluation rows as {"query", "answer"} records: the instruction is
# the query sent to the RAG chains, the stored output is the reference answer.
first_rows = evaluation_df.iloc[:10]
examples = []
for instruction, reference in zip(first_rows['Instruction'], first_rows['Output']):
    examples.append({"query": instruction, "answer": reference})

### Prompt template creation

In [None]:
# Zero-shot prompt: instructions only, followed by the user question and the
# retrieved context that the 'stuff' chain injects.
prompt = """
You are an assistant for question-answering tasks about movies, and basic
conversation openers (use emojis also to make it friendly).
Use the following pieces of retrieved context to answer the question.
You have no preferences.
Your job to assist human to find the movie that suits their taste and not your taste.
If you don't know the answer, just say that you don't know.
Don't give any information about a movie that is not in the context.
Use five sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""
# Your prompt template string
zeroshot_prompt_template_string = prompt
# Create a PromptTemplate object.
# Both placeholders used in the template must be declared: `context` receives
# the retrieved documents and `question` the user query.
zeroshot_prompt_template = PromptTemplate(
    template=zeroshot_prompt_template_string,
    input_variables=["context", "question"]
)

# Retrieval-augmented QA chain: fetch the 5 nearest documents from the vector
# store and stuff them into the prompt above.
zero_shot_Rag_pipeline = RetrievalQA.from_chain_type(
    llm=llama2_base,
    chain_type='stuff',
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    chain_type_kwargs={"prompt": zeroshot_prompt_template}    )

In [None]:
# Pick one random evaluation row to serve as the in-prompt demonstration.
one_shot_example_ind = np.random.randint(len(evaluation_df))
one_shot_inst = evaluation_df.iloc[one_shot_example_ind].Instruction
one_shot_answer = evaluation_df.iloc[one_shot_example_ind].Output

# The prompt is later parsed as a format-style template, so any literal braces
# in the demonstration text must be doubled to avoid being read as placeholders.
one_shot_inst_escaped = one_shot_inst.replace('{', '{{').replace('}', '}}')
one_shot_answer_escaped = one_shot_answer.replace('{', '{{').replace('}', '}}')

# The demonstration question, its answer and the real question each sit on
# their own line so the model can tell them apart.
one_shot_prompt = """
You are an assistant for question-answering tasks about movies, and basic
conversation openers (use emojis also to make it friendly).
Use the following pieces of retrieved context to answer the question.
You have no preferences.
Your job to assist human to find the movie that suits their taste and not your taste.
If you don't know the answer, just say that you don't know.
Don't give any information about a movie that is not in the context.
Use five sentences maximum and keep the answer concise.
Question: """ + one_shot_inst_escaped + """
Answer: """ + one_shot_answer_escaped + """
Question: {question}
Context: {context}
Answer:
"""
# Your prompt template string
oneshot_prompt_template_string = one_shot_prompt
# Create a PromptTemplate object; both placeholders used in the template are declared.
oneshot_prompt_template = PromptTemplate(
    template=oneshot_prompt_template_string,
    input_variables=["context", "question"]
)

# Same retrieval setup as the other pipelines (top-5 documents, 'stuff' chain),
# differing only in the prompt.
one_shot_Rag_pipeline = RetrievalQA.from_chain_type(
    llm=llama2_base,
    chain_type='stuff',
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    chain_type_kwargs={"prompt": oneshot_prompt_template})

In [None]:
# Draw up to ten *distinct* evaluation rows to use as in-prompt demonstrations
# (sampling without replacement avoids repeating the same example).
n_few_shot = min(10, len(evaluation_df))
few_shot_example_ind = np.random.choice(len(evaluation_df), n_few_shot, replace=False)
few_shot_inst = evaluation_df.iloc[few_shot_example_ind].Instruction.values
few_shot_answer = evaluation_df.iloc[few_shot_example_ind].Output.values
fewshot_str = ""
for i in range(n_few_shot):
  fewshot_str+= f"Question: {few_shot_inst[i]} \nAnswer: Based on the information provided, I can recommend you {few_shot_answer[i]}\n"

# The prompt is parsed as a format-style template, so literal braces in the
# demonstrations are doubled; `fewshot_str` itself stays unescaped for display.
fewshot_str_escaped = fewshot_str.replace('{', '{{').replace('}', '}}')

# NOTE(review): ten full-length demonstrations plus five retrieved documents
# make a long prompt — confirm it fits the model's context window.
few_shot_prompt = """
You are an assistant for question-answering tasks about movies, and basic
conversation openers (use emojis also to make it friendly).
Use the following pieces of retrieved context to answer the question.
You have no preferences.
Your job to assist human to find the movie that suits their taste and not your taste.
If you don't know the answer, just say that you don't know.
Don't give any information about a movie that is not in the context.
Use five sentences maximum and keep the answer concise.
""" + fewshot_str_escaped + """Question: {question}
Context: {context}
Answer:
"""
# Your prompt template string
fewshot_prompt_template_string = few_shot_prompt
# Create a PromptTemplate object from the FEW-shot template (not the one-shot
# one), declaring both placeholders it contains.
fewshot_prompt_template = PromptTemplate(
    template=fewshot_prompt_template_string,
    input_variables=["context", "question"]
)
# Same retrieval setup as the other pipelines (top-5 documents, 'stuff' chain),
# differing only in the prompt.
few_shot_Rag_pipeline = RetrievalQA.from_chain_type(
    llm=llama2_base,
    chain_type='stuff',
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    chain_type_kwargs={"prompt": fewshot_prompt_template})

In [None]:
print(fewshot_str)

Question: recommend me a movie similar to Peacemaker 
Answer: Based on the information provided, I can recommend you the movie Peace, Love & Misunderstanding is a movie  of the actors jeffrey dean morgan, elizabeth olsen, chace crawford, catherine keener, kyle maclachlan, jane fonda, rosanna arquette, nat wolff, maddie corman, poorna jagannathan, marissa o'donnell, wayne pyle, april crisafulli, sean marrinan, terry mckenna, joseph dunn, laurent rejto, michael patrick burke, teri gibson, nicholas karoly in which When her husband tells her he wants a divorce, devastated Manhattan lawyer Diane  heads upstate with her two teens to Woodstock to stay with her estranged hippie mother . In this charming village, Diane and her city kids get a new perspective on life: poetry-reading daughter Zoe  becomes interested in a sensitive young butcher Cole , nerdy son Jake  finds material for his first film project, and Diane herself grows close to a handsome carpenter/singer Jude . Most importantly, Di

In [None]:
from tqdm.notebook import tqdm
torch.cuda.empty_cache()

In [None]:
# Run the same evaluation examples through each RAG pipeline, in the order
# zero-shot, one-shot, few-shot.
rag_pipelines = (zero_shot_Rag_pipeline, one_shot_Rag_pipeline, few_shot_Rag_pipeline)
predictions_list = []
for rag_pipeline in rag_pipelines:
    predictions_list.append(rag_pipeline.apply(examples))
zero_shot_predictions, one_shot_predictions, few_shot_predictions = predictions_list
# Release cached GPU memory once generation has finished.
torch.cuda.empty_cache()

In [None]:
# One row per evaluation example: the reference summary next to the answer
# produced by each pipeline. Column names are reused by the cleaning and
# scoring cells, so they are kept exactly as they are.
reference_answers = [example['answer'] for example in examples]
zero_shot_results, one_shot_results, few_shot_results = (
    [prediction['result'] for prediction in predictions]
    for predictions in predictions_list[:3]
)
answer_matrix = np.array([reference_answers,
                          zero_shot_results,
                          one_shot_results,
                          few_shot_results])
predictions_answers = pd.DataFrame(
    answer_matrix.T,
    columns = ['movie_summary','zero_shot' ,'one_shot','few_shot(5)'])

In [None]:
predictions_answers.head()

Unnamed: 0,movie_summary,zero_shot,one_shot,few_shot(5)
0,the movie Taxi 2 is a movie of the actors fré...,"Based on the provided context, I would recomme...","\nFor the movie Project Shadowchaser IV, I wou...","\nFor the movie Project Shadowchaser IV, I wou..."
1,the movie The Hunger is a movie of the actors...,"Based on the context provided, I would recomme...","Based on the context provided, I would recomme...","Based on the context provided, I would recomme..."
2,the movie Narasimhudu is a movie of the actor...,"Based on the provided context, I would recomme...","Based on the provided context, I would recomme...","Based on the provided context, I would recomme..."
3,the movie Now That Summer is Gone is a movie ...,"Based on the provided context, I would recomme...","Based on the provided context, I would recomme...","Based on the provided context, I would recomme..."
4,"the movie After Dark, My Sweet is a movie of ...","Based on the provided context, I would recomme...","For a movie similar to Artifact, I would recom...","For a movie similar to Artifact, I would recom..."


In [None]:
_SPACY_TOKENIZER = None

def spacy_tokenize(text):
  """Split `text` into a list of token strings using spaCy's English tokenizer.

  The tokenizer is created lazily from a blank English pipeline (no model
  download needed) and cached. The previous version called the global
  `tokenizer`, which in this notebook is the Hugging Face tokenizer and does
  not yield objects with a `.text` attribute.
  """
  global _SPACY_TOKENIZER
  if _SPACY_TOKENIZER is None:
    _SPACY_TOKENIZER = spacy.blank('en').tokenizer
  return [token.text for token in _SPACY_TOKENIZER(text)]

def punctuation_removal(messy_str):
    """Return `messy_str` with every character found in `string.punctuation` dropped."""
    kept_chars = []
    for ch in messy_str:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

def remove_accented_chars(text):
    """Return an ASCII-only version of `text`.

    The text is NFKD-normalised, then encoded to ASCII with unencodable
    characters ignored, and decoded back to `str`.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_special_characters(text):
    """Strip selected symbols from `text` and detach periods from words.

    Removes the characters `" ? , $ !` and apostrophes, except an apostrophe
    that is immediately followed by `t` or `s` and is not itself preceded by a
    space. Afterwards every period is replaced by a space followed by a period.
    """
    symbol_pattern = re.compile(r"""["?,$!]|'(?!(?<! ')[ts])""")
    period_pattern = re.compile(r"\.")

    without_symbols = symbol_pattern.sub("", text)
    # Put a space in front of each dot so it stands apart from the preceding word
    return period_pattern.sub(" .", without_symbols)
# English stop words from NLTK, used by remove__stopwords below
stop_words = set(stopwords.words('english'))
# Tokenize and remove stopwords
def remove__stopwords(text):
    """Lower-case `text`, tokenize it, and drop NLTK English stop words.

    Returns the remaining tokens joined by single spaces. Tokens are filtered
    against the `stop_words` set; the previous version passed the token list to
    gensim's `remove_stopwords`, which is called on plain strings elsewhere in
    this notebook.
    """
    tokens = spacy_tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)

# Apply the same text normalisation to the reference summaries and to every
# model's answers so that they are compared on equal footing.
for col in ['movie_summary','zero_shot' ,'one_shot','few_shot(5)']:
  # Lower-case first: the stop-word filter is applied after this so that
  # capitalised stop words (e.g. at the start of a sentence) are removed too.
  cleaned = predictions_answers[col].str.lower()
  cleaned = cleaned.apply(remove_stopwords)
  cleaned = cleaned.apply(punctuation_removal)
  cleaned = cleaned.apply(remove_accented_chars)
  cleaned = cleaned.apply(remove_special_characters)
  predictions_answers[col] = cleaned

## EmbeddingDistanceEvalChain evaluation

In [None]:
from langchain.evaluation.embedding_distance.base import (
    EmbeddingDistance,
    EmbeddingDistanceEvalChain,
)

# Evaluator that scores a prediction against a reference by the cosine distance
# between their embeddings, computed with the notebook's embedding model.
chain = EmbeddingDistanceEvalChain(
    distance_metric=EmbeddingDistance.COSINE,
    embeddings=embed_model,
)

In [None]:
# Mean embedding distance (rounded to 2 decimals) between each pipeline's
# answers and the reference summaries, in the order zero-, one-, few-shot.
distances = []
for col in ['zero_shot' ,'one_shot','few_shot(5)']:
  scores = []
  for prediction, reference in zip(predictions_answers[col], predictions_answers['movie_summary']):
    outcome = chain.evaluate_strings(prediction=prediction, reference=reference)
    scores.append(outcome['score'])
  distances.append(round(np.mean(scores), 2))

In [None]:
[['llama2-7b-chat-RAG',distances[0],distances[1],distances[2]]]

[['llama2-7b-chat-RAG', 0.4, 0.49, 0.49]]

In [None]:
# Summary table of the mean embedding distances per prompting strategy.
# The row is passed as a plain list (not via np.array) so the scores stay
# numeric instead of being coerced to strings alongside the model name.
pd.DataFrame(
    [['llama2-7b-chat-RAG',distances[0],distances[1],distances[2]]],
    columns = ['Model', 'zero shot', 'one shot', 'few shot (10)'])

Unnamed: 0,Model,zero shot,one shot,few shot (10)
0,llama2-7b-chat-RAG,0.4,0.49,0.49


## QAEvalChain evaluation: recommendation questions with movie summaries as reference answers

In [None]:
# Grade each pipeline's predictions against the reference answers, using the
# base Llama-2 model itself as the grader.
eval_chain = QAEvalChain.from_llm(llama2_base)
eval_list = []
zeroshot_pred, oneshot_pred, fewshot_pred = (
    eval_chain.evaluate(examples, pipeline_predictions)
    for pipeline_predictions in predictions_list[:3]
)


In [None]:
def _fraction_graded_correct(graded_outputs):
  """Return the share of graded items whose verdict is marked correct.

  An item counts as correct when ' correct' occurs within the first 20
  characters of its lower-cased 'results' text.
  """
  flags = []
  for graded in graded_outputs:
    verdict_head = graded['results'].lower()[:20]
    flags.append(1 if ' correct' in verdict_head else 0)
  return np.array(flags).mean()

eval_list = []
binary_zeroshot_output = _fraction_graded_correct(zeroshot_pred)
binary_oneshot_output = _fraction_graded_correct(oneshot_pred)
binary_fewshot_output = _fraction_graded_correct(fewshot_pred)

# One row holding the three accuracies, in the order zero-, one-, few-shot
eval_list.append([binary_zeroshot_output,binary_oneshot_output,binary_fewshot_output])

In [None]:
np.array(eval_list).T

array([[0.9],
       [0.8],
       [0.8]])

In [None]:
# Summary table of the QAEvalChain accuracies per prompting strategy.
# `eval_list[0]` is the single row [zero-shot, one-shot, few-shot]; the row is
# passed as a plain list so the scores stay numeric instead of being coerced to
# strings alongside the model name.
pd.DataFrame(
    [['llama2-7b-chat-base', eval_list[0][0], eval_list[0][1], eval_list[0][2]]],
    columns = ['Model', 'zero shot', 'one shot', 'few shot (10)']
)

Unnamed: 0,Model,zero shot,one shot,few shot (10)
0,llama2-7b-chat-base,0.9,0.8,0.8
