<a href="https://colab.research.google.com/github/zxb-97/NLP1/blob/main/NLPcsv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import random as rand

def scrape_reviews(base_url, headers, max_pages=20):
    all_reviews = []
    for page in range(1, max_pages + 1): #Change here for scraping other pages
        print(f"Fetching page {page}")
        response = requests.get(base_url + f"&pageNumber={page}", headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch page {page}, status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'lxml')
        review_elements = soup.select("div[data-hook='review']")

        if not review_elements:
            print(f"No reviews found on page {page}, stopping.")
            break

        for review in review_elements:
            r_author_element = review.select_one("span.a-profile-name")
            r_author = r_author_element.text.strip() if r_author_element else None

            r_rating_element = review.select_one("i.review-rating span.a-icon-alt")
            r_rating = r_rating_element.text.replace(" out of 5 stars", "") if r_rating_element else None

            r_title_element = review.select_one("a.review-title span")
            r_title = r_title_element.text.strip() if r_title_element else None

            r_content_element = review.select_one("span.review-text-content span")
            r_content = r_content_element.text.strip() if r_content_element else None

            r_date_element = review.select_one("span.review-date")
            r_date = r_date_element.text.strip() if r_date_element else None

            r_verified_element = review.select_one("span.a-declarative span.a-size-mini")
            r_verified = r_verified_element.text.strip() if r_verified_element else "Not Verified"

            review_data = {
                "author": r_author,
                "rating": r_rating,
                "title": r_title,
                "content": r_content,
                "date": r_date,
                "verified": r_verified
            }

            all_reviews.append(review_data)

        # To avoid hitting the server too frequently
        time.sleep(rand.uniform(5,10))

    return all_reviews

# Base URL of the Amazon product reviews page (without page number)
base_url = 'https://www.amazon.co.uk/Lenovo-ThinkPad-Windows-Professional-Renewed/product-reviews/B07CSSF72G/ref=cm_cr_getr_d_paging_btm_prev_1?ie=UTF8&reviewerType=all_reviews&pageNumber=1'
custom_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}

# Fetch all reviews
all_reviews = scrape_reviews(base_url, custom_headers, max_pages=20)
print(all_reviews); exit;
# Save reviews to a JSONL file
file_path = "thinkpad_reviews_1-20.jsonl"
with open(file_path, "w") as f:
    for review in all_reviews:
        json.dump(review, f)
        f.write("\n")

print(f"Total number of reviews fetched: {len(all_reviews)}")
print(f"Reviews saved to {file_path}")

Fetching page 1
No reviews found on page 1, stopping.
[]
Total number of reviews fetched: 0
Reviews saved to thinkpad_reviews_1-20.jsonl


Load the Json

In [3]:
!pip install -U -q "langchain" "transformers==4.31.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.41.3" "trl==0.4.7" "safetensors>=0.3.1"

!pip install -U langchain-community


Collecting langchain-community
  Downloading langchain_community-0.2.3-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.21.3-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensi

# Data parsing


In [4]:
from langchain.document_loaders.csv_loader import CSVLoader

reviews_loader = CSVLoader(file_path = "parsedReviews.csv")

reviews_data = reviews_loader.load()

len(reviews_data)

10966

# Splitting the data

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000, # the character length of the chunk
    chunk_overlap = 100, # the character length of the overlap between chunks
    length_function = len, # the length function - in this case, character length (aka the python len() fn.)
)

# Create documents

In [6]:
reviews_documents = text_splitter.transform_documents(reviews_data)
len(reviews_documents)


11093

# Clean the data
*To be done for csv files*

In [None]:
import re
import json

def clean_text(text):
    if isinstance(text, dict):
        text = json.dumps(text)
    if text:
        text = text.strip()  # Remove leading and trailing white spaces
        text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text

def preprocess_reviews(reviews):
    cleaned_reviews = []
    for review in reviews:

        cleaned_review = {
            "page_content": clean_text(review.page_content),
            "metadata": clean_text(review.metadata)

        }
        cleaned_reviews.append(cleaned_review)
    return cleaned_reviews

# Clean the reviews
cleaned_reviews = preprocess_reviews(reviews) # List of dictionaries
pprint(cleaned_reviews)
#return ;
print(f"Cleaned {len(cleaned_reviews)} reviews.")
pprint(cleaned_reviews)


[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
 {'metadata': '{"author": "Susan", "content": "This is a beautiful little '
              'laptop computer I am very pleased with it. My husband bought it '
              'for me to replace an old Lenovo I had had since 2014 and '
              'Windows 8 support was being withdrawn. I was not sure about '
              'Windows 10 but it is no problem. Prior to despatch the seller '
              'very kindly offered an upgrade free of charge but including all '
              'the things the original order included. I accepted and we '
              'received the PC in as new condition and very well packaged two '
              'days later. The computer is very compact and is in excellent to '
              'nearly new condition. I am having to get used to a smaller '
              'keyboard but that also is no problem. The computer is also a '
              'lot faster than my old one, especially booting up and shutting 

In [None]:
from langchain.docstore.document import Document
import pandas as pd

def create_documents(chunked_reviews):
    documents = []
    for review in chunked_reviews:
        metadata = {
            "author": review["author"],
            "title": review["title"],
            "chunk_id": review["chunk_id"]
        }
        document = Document(
            page_content=review["content"],
            metadata=metadata
        )
        documents.append(document)
    return documents

# Create documents from chunked reviews
documents = create_documents(chunked_reviews)
print(f"Created {len(documents)} documents.")
for i, doc in enumerate(documents): #Nicer print
    print(f"Chunk {i+1}:\n{doc}\n")

TypeError: 'Document' object is not subscriptable

# Load documents into FAISS vector store


In [7]:
!pip install -q -U faiss-cpu tiktoken sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m95.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
pip install -U langchain-community



In [9]:
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore

store = LocalFileStore("./cache/")
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

core_embeddings_model = HuggingFaceEmbeddings(
    model_name=embed_model_id
)

embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)

vector_store = FAISS.from_documents(reviews_documents, embedder)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Check if vector store works with a query

In [10]:
query = "How much RAM ?"
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)
print(f"Number of documents indexed: {len(docs)}")



for page in docs:
  print(page.page_content)


Number of documents indexed: 4
Author: Pete
Rating: 5
Review_Title: Great for the price
Review: I bought for my niece and she loves it. The fact that you can add memory is good because the 16gb will fill up fast...
Author: Susan
Rating: 4
Review_Title: Great price
Review: My only complaint is that 8GB is really not enough memory if you do audiobooks or videos.
Author: glc1213
Rating: 4
Review_Title: Good product for the price
Review: This was a gift to my sister in law and she loves it. Only comes with 8GB of memory but you can upgrade to 128 and she did.
Author: Viju
Rating: 4
Review_Title: Maximum memory used
Review: Best Amazon app for purchasing thinks. More memory card use 64 gb or more


# Build Retrival chain

In [11]:
# Ignore this
!pip install -q -U transformers huggingface_hub torch

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.7/401.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.1/168.1 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.3.0+cu121 requires torch==2.3.0, but you have torch 2.3.1 which is incompatible.
torchvision 0.18.0+cu121 requires torch==2.3.0, but you have torch 2.3.1 which is incompatible.[0m[31m
[0m

In [12]:
!pip install huggingface-hub -q

In [13]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
import torch
import transformers

model_id = "meta-llama/Llama-2-13b-chat-hf"

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_config = transformers.AutoConfig.from_pretrained(
    model_id
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto'
)


model.eval()
# Need to save the model to avoid downloading it



config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

#Tokenizer

In [15]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id
)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

#Pack it into a pipeline for compatibility with LangChain

In [16]:
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    return_full_text=True,
    temperature=0.2,
    max_new_tokens=256
)

In [17]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

  warn_deprecated(


In [18]:
retriever = vector_store.as_retriever()

In [19]:
from langchain.chains import RetrievalQA
from langchain.callbacks import StdOutCallbackHandler

handler = StdOutCallbackHandler()

qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    callbacks=[handler],
    return_source_documents=True
)

# Tests

In [23]:
qa_with_sources_chain({"query" : "Is this tablet for kids too?"})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'Is this tablet for kids too?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nAuthor: Jay323\nRating: 5\nReview_Title: great gift idea\nReview: This item was purchased as a gift. I heard nothing bad about this tablet so i can say it is a good tablet for kids.\n\nAuthor: Tbaby22386\nRating: 4\nReview_Title: Great tablet for kids ...\nReview: This product is great for children. Easy to use and very inexpensive. Allows for enough storage and very durable!\n\nAuthor: j2jwhitaker\nRating: 5\nReview_Title: great tablet for kids and adults\nReview: This tablet is the perfect size for kids. It's great for adults too if your looking for something not so big.\n\nAuthor: BBCustomer\nRating: 4\nReview_Title: Good tablet for the kids.\nReview: It is reasonably priced so you can replace if need be. The small size is perfect for kids especially when travelling.