<a href="https://colab.research.google.com/github/zxb-97/NLP1/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import random as rand

def scrape_reviews(base_url, headers, max_pages=20):
    all_reviews = []
    for page in range(1, max_pages + 1): #Change here for scraping other pages
        print(f"Fetching page {page}")
        response = requests.get(base_url + f"&pageNumber={page}", headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch page {page}, status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'lxml')
        review_elements = soup.select("div[data-hook='review']")

        if not review_elements:
            print(f"No reviews found on page {page}, stopping.")
            break

        for review in review_elements:
            r_author_element = review.select_one("span.a-profile-name")
            r_author = r_author_element.text.strip() if r_author_element else None

            r_rating_element = review.select_one("i.review-rating span.a-icon-alt")
            r_rating = r_rating_element.text.replace(" out of 5 stars", "") if r_rating_element else None

            r_title_element = review.select_one("a.review-title span")
            r_title = r_title_element.text.strip() if r_title_element else None

            r_content_element = review.select_one("span.review-text-content span")
            r_content = r_content_element.text.strip() if r_content_element else None

            r_date_element = review.select_one("span.review-date")
            r_date = r_date_element.text.strip() if r_date_element else None

            r_verified_element = review.select_one("span.a-declarative span.a-size-mini")
            r_verified = r_verified_element.text.strip() if r_verified_element else "Not Verified"

            review_data = {
                "author": r_author,
                "rating": r_rating,
                "title": r_title,
                "content": r_content,
                "date": r_date,
                "verified": r_verified
            }

            all_reviews.append(review_data)

        # To avoid hitting the server too frequently
        time.sleep(rand.uniform(5,10))

    return all_reviews

# Base URL of the Amazon product reviews page (without page number)
base_url = 'https://www.amazon.co.uk/Lenovo-ThinkPad-Windows-Professional-Renewed/product-reviews/B07CSSF72G/ref=cm_cr_getr_d_paging_btm_prev_1?ie=UTF8&reviewerType=all_reviews&pageNumber=1'
custom_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}

# Fetch all reviews
all_reviews = scrape_reviews(base_url, custom_headers, max_pages=20)
print(all_reviews); exit;
# Save reviews to a JSONL file
file_path = "thinkpad_reviews_1-20.jsonl"
with open(file_path, "w") as f:
    for review in all_reviews:
        json.dump(review, f)
        f.write("\n")

print(f"Total number of reviews fetched: {len(all_reviews)}")
print(f"Reviews saved to {file_path}")

Fetching page 1
Fetching page 2
Fetching page 3
Fetching page 4
Fetching page 5
Fetching page 6
Fetching page 7
Fetching page 8
Fetching page 9
Fetching page 10
Fetching page 11
Fetching page 12
Fetching page 13
Fetching page 14
Fetching page 15
Fetching page 16
Fetching page 17
Fetching page 18
Fetching page 19
Fetching page 20
[{'author': 'Amazon Customer', 'rating': '5.0', 'title': '5.0 out of 5 stars', 'content': 'Just received the laptop two days ago so very early days as I haven’t given it a thorough test drive.However on face value, appears to be an excellent deal.Firstly the delivery process could have been slicker.  I knew roughly delivery date - it was delivered bang on time by the way - but would have been great given the delivery details, tracker number etc.  I chased this and was given day before delivery date - perhaps me being impatient and a very, very minor quibble.Laptop arrived in very secure packaging and, with the exception that the box was plain rather than Lenovo

Load the Json

In [1]:
!pip install -U -q "langchain" "transformers==4.31.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.41.3" "trl==0.4.7" "safetensors>=0.3.1"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m91.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

In [2]:
import json
from langchain.docstore.document import Document

def load_reviews(file_path):
    reviews = []
    with open(file_path, "r") as f:
        for line in f:
            reviews.append(json.loads(line))
    return reviews

# Load reviews from the file
file_path = "thinkpad_reviews_1-20.jsonl"
reviews = load_reviews(file_path)
print(f"Loaded {len(reviews)} reviews.")

Loaded 200 reviews.


Clean Up the Data

In [3]:
import re

def clean_text(text):
    if text:
        text = text.strip()  # Remove leading and trailing white spaces
        text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text

def preprocess_reviews(reviews):
    cleaned_reviews = []
    for review in reviews:
        cleaned_review = {
            "author": clean_text(review.get("author", "")),
            "rating": clean_text(review.get("rating", "")),
            "title": clean_text(review.get("title", "")),
            "content": clean_text(review.get("content", "")),
            "date": clean_text(review.get("date", "")),
            "verified": clean_text(review.get("verified", "Not Verified"))
        }
        cleaned_reviews.append(cleaned_review)
    return cleaned_reviews

# Clean the reviews
cleaned_reviews = preprocess_reviews(reviews)
print(f"Cleaned {len(cleaned_reviews)} reviews.")

Cleaned 200 reviews.


Splitting Document into chunks


In [35]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define the splitter with the maximum chunk length
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500, # the character length of the chunk
    chunk_overlap = 30, # the character length of the overlap between chunks
    length_function = len,  # the length function - in this case, character length (aka the python len() fn.)
    separators=[". ", ", ", " "]
)

def chunk_reviews(reviews):
    chunked_reviews = []
    for review in reviews:
        # Split the content into chunks using the CharacterTextSplitter
        content_chunks = text_splitter.split_text(review["content"])
        for i, chunk in enumerate(content_chunks):
            chunked_review = review.copy()
            chunked_review["content"] = chunk
            chunked_review["chunk_id"] = f"{review['author']}_{i}" #Unique id for each chunk should work better
            chunked_reviews.append(chunked_review)
    return chunked_reviews

# Split the reviews into chunks
chunked_reviews = chunk_reviews(cleaned_reviews)
print(f"Chunked reviews count: {len(chunked_reviews)}")
for i, chunk in enumerate(chunked_reviews): #Nicer print
    print(f"Chunk {i+1}:\n{chunk}\n")

Chunked reviews count: 380
Chunk 1:
{'author': 'Amazon Customer', 'rating': '5.0', 'title': '5.0 out of 5 stars', 'content': 'Just received the laptop two days ago so very early days as I haven’t given it a thorough test drive.However on face value, appears to be an excellent deal.Firstly the delivery process could have been slicker. I knew roughly delivery date - it was delivered bang on time by the way - but would have been great given the delivery details, tracker number etc', 'date': 'Reviewed in the United Kingdom on 20 March 2020', 'verified': 'Not Verified', 'chunk_id': 'Amazon Customer_0'}

Chunk 2:
{'author': 'Amazon Customer', 'rating': '5.0', 'title': '5.0 out of 5 stars', 'content': '. I chased this and was given day before delivery date - perhaps me being impatient and a very, very minor quibble.Laptop arrived in very secure packaging and, with the exception that the box was plain rather than Lenovo branded, you could easily be fooled this was a brand new device. Not a scr

In [36]:
from langchain.docstore.document import Document
import pandas as pd

def create_documents(chunked_reviews):
    documents = []
    for review in chunked_reviews:
        metadata = {
            "author": review["author"],
            "rating": review["rating"],
            "title": review["title"],
            "date": review["date"],
            "verified": review["verified"],
            "chunk_id": review["chunk_id"]
        }
        document = Document(
            page_content=review["content"],
            metadata=metadata
        )
        documents.append(document)
    return documents

# Create documents from chunked reviews
documents = create_documents(chunked_reviews)
print(f"Created {len(documents)} documents.")
for i, doc in enumerate(documents): #Nicer print
    print(f"Chunk {i+1}:\n{doc}\n")

Created 380 documents.
Chunk 1:
page_content='Just received the laptop two days ago so very early days as I haven’t given it a thorough test drive.However on face value, appears to be an excellent deal.Firstly the delivery process could have been slicker. I knew roughly delivery date - it was delivered bang on time by the way - but would have been great given the delivery details, tracker number etc' metadata={'author': 'Amazon Customer', 'rating': '5.0', 'title': '5.0 out of 5 stars', 'date': 'Reviewed in the United Kingdom on 20 March 2020', 'verified': 'Not Verified', 'chunk_id': 'Amazon Customer_0'}

Chunk 2:
page_content='. I chased this and was given day before delivery date - perhaps me being impatient and a very, very minor quibble.Laptop arrived in very secure packaging and, with the exception that the box was plain rather than Lenovo branded, you could easily be fooled this was a brand new device. Not a scratch, no wear on the keys, screen looks perfect and mint condition. De

# Load documents into FAISS vector store


In [37]:
!pip install -q -U faiss-cpu tiktoken sentence-transformers

In [8]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.21.2-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensi

In [57]:
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore

store = LocalFileStore("./cache/")
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

core_embeddings_model = HuggingFaceEmbeddings(
    model_name=embed_model_id
)

embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)

vector_store = FAISS.from_documents(documents, embedder)



# Alternative embedding using page content + metadata

In [58]:
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore

store = LocalFileStore("./cache/")
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

class CustomHuggingFaceEmbeddings(HuggingFaceEmbeddings):
    def embed(self, documents):
        embeddings = []
        for doc in documents:
            # Combine text content and metadata
            content = doc['page_content']
            metadata = doc['metadata']
            chunk_id = metadata.get('chunk_id','')
            combined_text = f"{content} {chunk_id}"  # Adjust as needed

            # Generate embeddings for combined text
            embedding = super().embed(combined_text)
            embeddings.append(embedding)
        return embeddings

core_embeddings_model = CustomHuggingFaceEmbeddings(
    model_name=embed_model_id
)

embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)

vector_store = FAISS.from_documents(documents, embedder)

# Check if vector store works with a query

In [66]:
def retrieve_unique_chunks(query, vector_store, core_embeddings_model, k):
    embedding_vector = core_embeddings_model.embed_query(query)
    docs = vector_store.similarity_search_by_vector(embedding_vector, k )  # Retrieve more to filter later

    unique_chunks = []
    seen_chunk_ids = set()
    for doc in docs:
        if doc.metadata["chunk_id"] not in seen_chunk_ids:
            unique_chunks.append(doc)
            seen_chunk_ids.add(doc.metadata["chunk_id"])
        if len(unique_chunks) == k:
            break
    return unique_chunks

# Example usage
query = "What processor does the computer have?"
top_k_chunks = retrieve_unique_chunks(query, vector_store, core_embeddings_model, k=4)

for i, page in enumerate(top_k_chunks):
    print(f"Unique Chunk {i+1}:\n{page.page_content}\nMetadata: {page.metadata}\n")

Unique Chunk 1:
Absolutely Amazing Laptop and with an i5 processor,Great hard drive capacity and I'm very pleased with this well priced item.
Metadata: {'author': 'AbsoAbsolutely rubbish', 'rating': '5.0', 'title': '5.0 out of 5 stars', 'date': 'Reviewed in the United Kingdom on 1 May 2024', 'verified': 'Not Verified', 'chunk_id': 'AbsoAbsolutely rubbish_0'}



In [72]:
query = "How is the performance of this computer?"
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

for page in docs:
  print(page.page_content)

Absolutely Amazing Laptop and with an i5 processor,Great hard drive capacity and I'm very pleased with this well priced item.
Absolutely Amazing Laptop and with an i5 processor,Great hard drive capacity and I'm very pleased with this well priced item.
Absolutely Amazing Laptop and with an i5 processor,Great hard drive capacity and I'm very pleased with this well priced item.
Absolutely Amazing Laptop and with an i5 processor,Great hard drive capacity and I'm very pleased with this well priced item.


# Build Retrival chain

In [49]:
!pip install -q -U transformers huggingface_hub torch

In [12]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [50]:
import torch
import transformers

model_id = "meta-llama/Llama-2-13b-chat-hf"

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_config = transformers.AutoConfig.from_pretrained(
    model_id
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto'
)

model.eval()
# Need to save the model to avoid downloading it

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

#Tokenizer

In [60]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id
)

#Pack it into a pipeline for compatibility with LangChain

In [67]:
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    return_full_text=True,
    temperature=0.2,
    max_new_tokens=256
)

In [68]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

In [69]:
retriever = vector_store.as_retriever()

In [70]:
from langchain.chains import RetrievalQA
from langchain.callbacks import StdOutCallbackHandler

handler = StdOutCallbackHandler()

qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    callbacks=[handler],
    return_source_documents=True
)

# Tests

In [71]:
qa_with_sources_chain({"query" : "Is this computer cheap?"})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'Is this computer cheap?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n. The computer is very compact and is in excellent to nearly new condition. I am having to get used to a smaller keyboard but that also is no problem. The computer is also a lot faster than my old one, especially booting up and shutting down. I have to say that the sellers also have very high standards of customer care and highly recommend anyone looking to buy a computer to buy from them.\n\n. The computer is very compact and is in excellent to nearly new condition. I am having to get used to a smaller keyboard but that also is no problem. The computer is also a lot faster than my old one, especially booting up and shutting down. I have to say that the sellers also have very high standards of customer care and highly recommend anyone looking to buy a computer to buy from th