In [None]:
#PREPARE DATASET

#import logging
#logging.basicConfig(level=logging.DEBUG)

import os

import numpy as np
import pandas as pd

# SELECT THE APPROPRIATE DATA

# Only these columns of the review dump are loaded.
selected_cols = [
    'product_id',
    'star_rating',
    'helpful_votes',
    'total_votes',
    'review_headline',
    'review_body',
]

# Read the TSV file (malformed rows are skipped) and keep only the reviews
# where more than half of the votes are helpful and there are at least two
# helpful votes; the index is renumbered afterwards.
df = (
    pd.read_csv(
        'amazon_reviews_us_Mobile_Electronics_v1_00.tsv',
        sep='\t',
        on_bad_lines='skip',
        usecols=selected_cols,
    )
    .loc[
        lambda reviews: (reviews.helpful_votes > reviews.total_votes / 2)
        & (reviews.helpful_votes > 1)
    ]
    .reset_index(drop=True)
)

# Select a specific product and show its review texts.
product = df.loc[df.product_id == "B002MWYUFU"]
product["review_body"]


In [None]:
import pinecone

#We initialize a document store in order to create embeddings of the reviews.
#We do this to perform semantic search (retrieval step).
#If computation time is an issue, we can use BM25 instead.

from haystack.document_stores import PineconeDocumentStore
from haystack.nodes import PreProcessor

# The Pinecone API key is read from the environment so that it is never
# committed with the notebook.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated in the Pinecone console.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )

# Document store backed by the 'generative-review' Pinecone index, using
# cosine similarity over 768-dimensional embeddings.
document_store = PineconeDocumentStore(
    api_key=pinecone_api_key,
    index='generative-review',
    similarity="cosine",
    embedding_dim=768,
    environment='asia-southeast1-gcp-free',
)

In [None]:
from haystack import Document
from haystack.nodes import PreProcessor
from tqdm.auto import tqdm  # progress bar

#This cell is used to pass some data in Pinecone to experiment
#A preprocessor is also initialized, to split reviews in appropriate lengths.
#The deep learning model can process a limited number of tokens per review.

total_doc_count = product.shape[0]
batch_size = 50

# Splits each review into chunks of at most 170 words (15 words of overlap),
# respecting sentence boundaries, after cleaning whitespace, headers/footers
# and empty lines.
preprocessor = PreProcessor(
    clean_whitespace=True,
    clean_header_footer=True,
    clean_empty_lines=True,
    split_by="word",
    split_length=170,
    split_overlap=15,
    split_respect_sentence_boundary=True,
)

# Accumulate reviews and flush them to the document store in batches of
# `batch_size`. Every batch (including the final, partial one) goes through
# the preprocessor before being written.
docs = []
for _, review in tqdm(product.iterrows(), total=total_doc_count):
    # Rows without a review text cannot be turned into a text document.
    if pd.isna(review["review_body"]):
        continue

    # Create a haystack document with the review text and the product id as metadata.
    docs.append(
        Document(
            content=review["review_body"],
            meta={
                "product_id": review["product_id"],
            },
        )
    )

    if len(docs) == batch_size:
        split_documents = preprocessor.process(docs)
        document_store.write_documents(split_documents)
        docs = []

# Flush whatever is left after the last full batch.
if docs:
    split_documents = preprocessor.process(docs)
    document_store.write_documents(split_documents)

# NOTE(review): documents are written here without embeddings; presumably
# `document_store.update_embeddings(...)` has to be run once the retriever
# exists before semantic search returns anything - confirm.


       

In [None]:
#CREATE EMBEDDING RETRIEVER
from haystack.nodes import EmbeddingRetriever

 

# Dense retriever over the Pinecone document store; it returns the 30 best
# matching documents per query by default.
# NOTE(review): the document store is configured with embedding_dim=768, which
# should match the output size of this sentence-transformers model - confirm.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-mpnet-base-v2",
    model_format="sentence_transformers",
    top_k=30,
)



In [None]:
# #TEST RETRIEVER

# from haystack.pipelines import DocumentSearchPipeline
# from haystack.utils import print_documents

# search_pipe = DocumentSearchPipeline(retriever)
# result = search_pipe.run(
#     query="What are some negative comments about the characteristics?",
#     params={"Retriever": {"top_k": 5}}
# )

# result

In [None]:
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser
#"""Synthesize a comprehensive answer from the following  most relevant paragraphs and the given question. Provide a clear and concise response that summarizes the key points and information presented in the paragraphs.

#Initialize prompt to be passed to GPT 3.5, to get an answer

# The OpenAI API key is read from the environment instead of being pasted
# into the notebook.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )

# Prompt sent to the model: the instructions, the '$summary' field of every
# document passed to the node, and the user question. The generated text is
# parsed into Answer objects by AnswerParser.
rag_prompt = PromptTemplate(
    prompt="""Given a question related to a product, please use the information from a collection of product reviews to generate a concise and informative answer. 
              The system should consider multiple perspectives and opinions expressed in the reviews to provide a balanced response. Ensure that the answer is coherent, relevant, and based on the sentiments and insights gathered from the reviews. 
                              
                             \n\n Paragraphs: {join(documents, pattern='$summary')} \n\n Question: {query} \n\n Answer:""",
    output_parser=AnswerParser(),
)

# GPT-3.5 node that uses the prompt above by default; max_length is set to 200.
prompt_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo",
    api_key=openai_api_key,
    default_prompt_template=rag_prompt,
    max_length=200,
)


#{join(documents)}
#{join(documents, pattern='$summary')}

In [None]:
from haystack.nodes import TransformersSummarizer

# Summarizer node applied to the retrieved reviews; summaries are limited to
# between 4 and 100 (min_length / max_length).
summarizer = TransformersSummarizer(
    model_name_or_path="ThanosAng/Product_Review_Summary",
    max_length=100,
    min_length=4,
)

In [None]:
def foreign_input(question, to_eng, to_foreign):
    """Answer a question that was asked in a language other than English.

    This function is called when the question is not in English. The idea is
    that the user can simply write in their own language without performing
    any other action: the question is translated to English, answered by the
    English pipeline, and the answer is translated back. Which models to use
    is decided by the caller from the output of a language classification
    model. It can be any language supported by that classification model, but
    it was only tested with French (search
    "papluca/xlm-roberta-base-language-detection" for more information).

    The pipeline stages are: Translator_Input -> Retriever -> Summarizer ->
    prompt_node. The notebook-level `retriever`, `summarizer` and
    `prompt_node` objects are used for the last three stages.

    Parameters
    ----------
    question : str
        The original question from the user.
    to_eng : str
        Model that translates the question to English.
    to_foreign : str
        Model that translates the answer back to the original language.

    Returns
    -------
    The output of translating the first generated answer back to the original
    language. It is also printed.
    """
    translator_input = TransformersTranslator(model_name_or_path=to_eng)
    translator_output = TransformersTranslator(model_name_or_path=to_foreign)

    # Named `qa_pipeline` so it cannot be confused with `transformers.pipeline`.
    qa_pipeline = Pipeline()

    # Translator for the input query
    qa_pipeline.add_node(component=translator_input, name="Translator_Input", inputs=["Query"])

    # Retriever
    qa_pipeline.add_node(component=retriever, name='Retriever', inputs=['Translator_Input'])

    # Summarizer
    qa_pipeline.add_node(component=summarizer, name="Summarizer", inputs=["Retriever"])

    # Answer generation
    qa_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["Summarizer"])

    # Run the pipeline on the user's question
    res = qa_pipeline.run(query=question)

    # Finally, translate the first answer back to the original language
    answer_docs = [
        Document(
            content=res["answers"][0].answer
        )
    ]

    translated = translator_output.translate(documents=answer_docs, query=None)
    print(translated)
    return translated

In [None]:
def QA_english(question):
    """Answer a question that was classified as English.

    The pipeline stages are: Retriever -> Summarizer -> prompt_node, built
    from the notebook-level `retriever`, `summarizer` and `prompt_node`
    objects.

    Parameters
    ----------
    question : str
        The question from the user.

    Returns
    -------
    The output of the pipeline run. It is also printed.
    """
    # Named `qa_pipeline` so it cannot be confused with `transformers.pipeline`.
    qa_pipeline = Pipeline()

    # Retriever
    qa_pipeline.add_node(component=retriever, name='Retriever', inputs=['Query'])

    # Summarizer
    qa_pipeline.add_node(component=summarizer, name="Summarizer", inputs=["Retriever"])

    # Answer generation
    qa_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["Summarizer"])

    # Run the pipeline
    res = qa_pipeline.run(query=question)
    print(res)
    return res

In [None]:
from transformers import pipeline

#Initialize the classification model
# Text-classification pipeline used to detect the language of a question.
pipe = pipeline(
    "text-classification",
    model="papluca/xlm-roberta-base-language-detection",
)


In [None]:
from haystack import Document
from haystack.pipelines import Pipeline
from haystack.nodes import TransformersTranslator

# We initialize dictionaries whose keys are the outputs of the classification model (language codes)
# and whose values are the models to be used. For demonstration purposes we only added English, German and French, but we could insert
# any other language (as long as it is supported by the classification model).

#The classification model will run, classify the language of the question, and choose the appropriate model, which will be loaded
#automatically in the pipeline.

# Available languages are: arabic (ar), bulgarian (bg), german (de), modern greek (el), english (en), spanish (es), french (fr), hindi (hi), 
#                          italian (it), japanese (ja), dutch (nl), polish (pl), portuguese (pt), russian (ru), swahili (sw), thai (th), turkish (tr), 
#                          urdu (ur), vietnamese (vi), and chinese (zh)


# Models that translate from the detected language to English.
# The "en" entry is kept from the original mapping; English questions never
# look it up because they are routed to QA_english below.
lang_to_eng = {
    "fr": "Helsinki-NLP/opus-mt-fr-en",
    "de": "Helsinki-NLP/opus-mt-de-en",
    "en": "en",
}

# Models that translate from English back to the detected language.
lang_to_foreign = {
    "fr": "Helsinki-NLP/opus-mt-en-fr",
    "de": "Helsinki-NLP/opus-mt-en-de",
}

# Set a question
question = "L'écran reflète-t-il"

# Classify the language (identify it)
language = pipe(question)[0]["label"]

# Call the matching function
if language == "en":
    QA_english(question)
elif language in lang_to_eng and language in lang_to_foreign:
    model_to_english = lang_to_eng[language]      # translates the question to English
    model_to_foreign = lang_to_foreign[language]  # translates the answer back
    foreign_input(question, model_to_english, model_to_foreign)
else:
    raise ValueError(
        f"No translation models configured for detected language '{language}'."
    )
    

In [None]:
# #DELETE CONTENTS OF INDEX

# pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment='asia-southeast1-gcp-free') 
# index = pinecone.Index('generative-review') 

# delete_response = index.delete(deleteAll=True,namespace="no-vectors")

In [None]:
# import torch
# # confirm GPU is available, outputs True if so
# torch.cuda.is_available()