<a href="https://colab.research.google.com/github/zxb-97/NLP1/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import random as rand

def scrape_reviews(base_url, headers, max_pages=20):
    all_reviews = []
    for page in range(1, max_pages + 1): #Change here for scraping other pages
        print(f"Fetching page {page}")
        response = requests.get(base_url + f"&pageNumber={page}", headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch page {page}, status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'lxml')
        review_elements = soup.select("div[data-hook='review']")

        if not review_elements:
            print(f"No reviews found on page {page}, stopping.")
            break

        for review in review_elements:
            r_author_element = review.select_one("span.a-profile-name")
            r_author = r_author_element.text.strip() if r_author_element else None

            r_rating_element = review.select_one("i.review-rating span.a-icon-alt")
            r_rating = r_rating_element.text.replace(" out of 5 stars", "") if r_rating_element else None

            r_title_element = review.select_one("a.review-title span")
            r_title = r_title_element.text.strip() if r_title_element else None

            r_content_element = review.select_one("span.review-text-content span")
            r_content = r_content_element.text.strip() if r_content_element else None

            r_date_element = review.select_one("span.review-date")
            r_date = r_date_element.text.strip() if r_date_element else None

            r_verified_element = review.select_one("span.a-declarative span.a-size-mini")
            r_verified = r_verified_element.text.strip() if r_verified_element else "Not Verified"

            review_data = {
                "author": r_author,
                "rating": r_rating,
                "title": r_title,
                "content": r_content,
                "date": r_date,
                "verified": r_verified
            }

            all_reviews.append(review_data)

        # To avoid hitting the server too frequently
        time.sleep(rand.uniform(5,10))

    return all_reviews

# Base URL of the Amazon product reviews page (without page number)
base_url = 'https://www.amazon.co.uk/Lenovo-ThinkPad-Windows-Professional-Renewed/product-reviews/B07CSSF72G/ref=cm_cr_getr_d_paging_btm_prev_1?ie=UTF8&reviewerType=all_reviews&pageNumber=1'
custom_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}

# Fetch all reviews
all_reviews = scrape_reviews(base_url, custom_headers, max_pages=20)
print(all_reviews); exit;
# Save reviews to a JSONL file
file_path = "thinkpad_reviews_1-20.jsonl"
with open(file_path, "w") as f:
    for review in all_reviews:
        json.dump(review, f)
        f.write("\n")

print(f"Total number of reviews fetched: {len(all_reviews)}")
print(f"Reviews saved to {file_path}")

Fetching page 1
No reviews found on page 1, stopping.
[]
Total number of reviews fetched: 0
Reviews saved to thinkpad_reviews_1-20.jsonl


Load the Json

In [None]:
!pip install -U -q "langchain" "transformers==4.31.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.41.3" "trl==0.4.7" "safetensors>=0.3.1"
!pip install jq
!pip install -U langchain-community


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.6/973.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
import json
from pprint import pprint
from pathlib import Path
from langchain_community.document_loaders import JSONLoader
from langchain.docstore.document import Document





def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["author"] = record["author"]
    metadata["rating"] = record["rating"]
    return metadata


file_path = "thinkpad_reviews_1-20.jsonl"
loader = JSONLoader(
      file_path = "thinkpad_reviews_1-20.jsonl",
      jq_schema = '.content',
      #metadata_func = metadata_func,
      text_content = True,
      json_lines = True)

reviews = loader.load()
pprint(reviews)

[Document(page_content='Just received the laptop two days ago so very early days as I haven’t given it a thorough test drive.However on face value, appears to be an excellent deal.Firstly the delivery process could have been slicker.  I knew roughly delivery date - it was delivered bang on time by the way - but would have been great given the delivery details, tracker number etc.  I chased this and was given day before delivery date - perhaps me being impatient and a very, very minor quibble.Laptop arrived in very secure packaging and, with the exception that the box was plain rather than Lenovo branded, you could easily be fooled this was a brand new device.  Not a scratch, no wear on the keys, screen looks perfect and mint condition.  Device was extremely clean and smelled fresh... I know odd comment but I guess it was wiped thoroughly with a citrus wipe or similar perhaps due to Covid19.On booting up, was really fast and had the advertised storage space, RAM etc.  Perhaps this is me

In [None]:
import json
import csv

file_path = "thinkpad_reviews_1-20.jsonl"
csv_file_path = "thinkpad_reviews_1-20.csv"

# Read the JSONL file and convert each line to a dictionary
with open(file_path, 'r') as jsonl_file:
    data = [json.loads(line) for line in jsonl_file]

# Write the dictionaries to a CSV file
with open(csv_file_path, 'w', newline='') as csv_file:
    if data:
        writer = csv.DictWriter(csv_file, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)

In [None]:
import csv
from langchain.docstore.document import Document

csv_file_path = "thinkpad_reviews_1-20.csv"
reviews = []
# Need to remove content from metadata
# Read the CSV file
with open(csv_file_path, 'r') as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:

        if 'rating' in row:
            del row['rating']
            del row['title']


        document = Document(
            page_content=row['content'],
            metadata=row
        )
        reviews.append(document)


pprint(reviews)


[Document(page_content='Just received the laptop two days ago so very early days as I haven’t given it a thorough test drive.However on face value, appears to be an excellent deal.Firstly the delivery process could have been slicker.  I knew roughly delivery date - it was delivered bang on time by the way - but would have been great given the delivery details, tracker number etc.  I chased this and was given day before delivery date - perhaps me being impatient and a very, very minor quibble.Laptop arrived in very secure packaging and, with the exception that the box was plain rather than Lenovo branded, you could easily be fooled this was a brand new device.  Not a scratch, no wear on the keys, screen looks perfect and mint condition.  Device was extremely clean and smelled fresh... I know odd comment but I guess it was wiped thoroughly with a citrus wipe or similar perhaps due to Covid19.On booting up, was really fast and had the advertised storage space, RAM etc.  Perhaps this is me

Clean Up the Data

In [None]:
import re
import json

def clean_text(text):
    if isinstance(text, dict):
        text = json.dumps(text)
    if text:
        text = text.strip()  # Remove leading and trailing white spaces
        text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text

def preprocess_reviews(reviews):
    cleaned_reviews = []
    for review in reviews:

        cleaned_review = {
            "page_content": clean_text(review.page_content),
            "metadata": clean_text(review.metadata)

        }
        cleaned_reviews.append(cleaned_review)
    return cleaned_reviews

# Clean the reviews
cleaned_reviews = preprocess_reviews(reviews) # List of dictionaries
pprint(cleaned_reviews)
#return ;
print(f"Cleaned {len(cleaned_reviews)} reviews.")
pprint(cleaned_reviews)


[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
 {'metadata': '{"author": "Susan", "content": "This is a beautiful little '
              'laptop computer I am very pleased with it. My husband bought it '
              'for me to replace an old Lenovo I had had since 2014 and '
              'Windows 8 support was being withdrawn. I was not sure about '
              'Windows 10 but it is no problem. Prior to despatch the seller '
              'very kindly offered an upgrade free of charge but including all '
              'the things the original order included. I accepted and we '
              'received the PC in as new condition and very well packaged two '
              'days later. The computer is very compact and is in excellent to '
              'nearly new condition. I am having to get used to a smaller '
              'keyboard but that also is no problem. The computer is also a '
              'lot faster than my old one, especially booting up and shutting 

Splitting Document into chunks


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define the splitter with the maximum chunk length
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 300, # the character length of the chunk
    chunk_overlap = 50, # the character length of the overlap between chunks
    length_function = len,  # the length function - in this case, character length (aka the python len() fn.)
    separators=[". ", ", ", " "]
)
'''
Many chunks start with "." , maybe this makes embeddings too similar
'''

def chunk_reviews(reviews):
    chunked_reviews = []
    for review in reviews:
        # Split the content into chunks using the CharacterTextSplitter
        content_chunks = text_splitter.split_text(review.page_content)
        for i, chunk in enumerate(content_chunks):
            chunked_review = review.copy()
            chunked_review.page_content = chunk
            #chunked_review["chunk_id"] = f"{review['author']}_{i}" #Unique id for each chunk should work better
            chunked_reviews.append(chunked_review)
    return chunked_reviews

# Split the reviews into chunks
chunked_reviews = chunk_reviews(reviews) # Forget about cleaned_reviews for now
print(f"Chunked reviews count: {len(chunked_reviews)}")
for i, chunk in enumerate(chunked_reviews): #Nicer print
    print(f"Chunk {i+1}:\n{chunk}\n")

Chunked reviews count: 640
Chunk 1:
page_content='Just received the laptop two days ago so very early days as I haven’t given it a thorough test drive.However on face value, appears to be an excellent deal.Firstly the delivery process could have been slicker' metadata={'author': 'Amazon Customer', 'content': 'Just received the laptop two days ago so very early days as I haven’t given it a thorough test drive.However on face value, appears to be an excellent deal.Firstly the delivery process could have been slicker.  I knew roughly delivery date - it was delivered bang on time by the way - but would have been great given the delivery details, tracker number etc.  I chased this and was given day before delivery date - perhaps me being impatient and a very, very minor quibble.Laptop arrived in very secure packaging and, with the exception that the box was plain rather than Lenovo branded, you could easily be fooled this was a brand new device.  Not a scratch, no wear on the keys, screen l

In [None]:
from langchain.docstore.document import Document
import pandas as pd

def create_documents(chunked_reviews):
    documents = []
    for review in chunked_reviews:
        metadata = {
            "author": review["author"],
            "title": review["title"],
            "chunk_id": review["chunk_id"]
        }
        document = Document(
            page_content=review["content"],
            metadata=metadata
        )
        documents.append(document)
    return documents

# Create documents from chunked reviews
documents = create_documents(chunked_reviews)
print(f"Created {len(documents)} documents.")
for i, doc in enumerate(documents): #Nicer print
    print(f"Chunk {i+1}:\n{doc}\n")

TypeError: 'Document' object is not subscriptable

# Load documents into FAISS vector store


In [None]:
!pip install -q -U faiss-cpu tiktoken sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m101.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m99.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
pip install -U langchain-community



In [None]:
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore

store = LocalFileStore("./cache/")
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

core_embeddings_model = HuggingFaceEmbeddings(
    model_name=embed_model_id
)

embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)

vector_store = FAISS.from_documents(chunked_reviews, embedder)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Alternative embedding using page content + metadata

In [None]:
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore

store = LocalFileStore("./cache/")
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

class CustomHuggingFaceEmbeddings(HuggingFaceEmbeddings):
    def embed(self, documents):
        embeddings = []
        for doc in documents:
            # Combine text content and metadata
            content = doc['page_content']
            metadata = doc['metadata']
            chunk_id = metadata.get('chunk_id','')
            author = metadata.get('author','')
            combined_text = f"{content}{author} {chunk_id}"  # Adjust as needed

            # Generate embeddings for combined text
            embedding = super().embed(combined_text)
            embeddings.append(embedding)
        return embeddings

core_embeddings_model = CustomHuggingFaceEmbeddings(
    model_name=embed_model_id
)

embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)

vector_store = FAISS.from_documents(chunked_reviews, embedder)

# Check if vector store works with a query

In [None]:
query = "How much RAM ?"
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)
print(f"Number of documents indexed: {len(docs)}")



for page in docs:
  print(page.page_content)


Number of documents indexed: 4
, 16gb RAM and i5 processor is an absolute bargain and one I would definitely recommend!
, 16gb RAM and i5 processor is an absolute bargain and one I would definitely recommend!
, 16gb RAM and i5 processor is an absolute bargain and one I would definitely recommend!
, 16gb RAM and i5 processor is an absolute bargain and one I would definitely recommend!


# Build Retrival chain

In [None]:
!pip install -q -U transformers huggingface_hub torch

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
import transformers

model_id = "meta-llama/Llama-2-13b-chat-hf"

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_config = transformers.AutoConfig.from_pretrained(
    model_id
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto'
)

model.eval()
# Need to save the model to avoid downloading it



config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

#Tokenizer

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id
)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

#Pack it into a pipeline for compatibility with LangChain

In [None]:
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    return_full_text=True,
    temperature=0.2,
    max_new_tokens=256
)

In [None]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

In [None]:
retriever = vector_store.as_retriever()

In [None]:
from langchain.chains import RetrievalQA
from langchain.callbacks import StdOutCallbackHandler

handler = StdOutCallbackHandler()

qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    callbacks=[handler],
    return_source_documents=True
)

# Tests

In [None]:
qa_with_sources_chain({"query" : "How much RAM?"})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'How much RAM?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n, 16gb RAM and i5 processor is an absolute bargain and one I would definitely recommend!\n\n, 16gb RAM and i5 processor is an absolute bargain and one I would definitely recommend!\n\n, 16gb RAM and i5 processor is an absolute bargain and one I would definitely recommend!\n\n, 16gb RAM and i5 processor is an absolute bargain and one I would definitely recommend!\n\nQuestion: How much RAM?\nHelpful Answer: 16GB",
 'source_documents': [Document(page_content=', 16gb RAM and i5 processor is an absolute bargain and one I would definitely recommend!', metadata={'author': 'Amazon Customer', 'content': 'Just received the laptop two days ago so very early days as I haven’t given it a thorough test drive.However on face value, appears to be an excellent deal.Firstly the delivery process could h