## Data Preparation

In [22]:
import jsonlines, json

file_path = "Training Data/Laptop_training.jsonl"
laptop_classes = ["RAM","STORAGE","SCREEN SIZE","PROCESSOR","PROCESSOR_ZEN"]
with jsonlines.open(file_path) as reader:
    output_json = {"classes": laptop_classes, "annotations": []}
    # Iterate over each line (JSON object)
 
    for obj in reader:
        processed_obj = [obj["text"],{"entities":obj["label"]}]
       
        output_json["annotations"].append(processed_obj)
        
# Save the output JSON to a new file
with open('Training Data/Laptop/Laptop_annotations.json', 'w') as f:
    json.dump(output_json, f, indent=None)

In [114]:
## Loading the annotated data
laptop_data = json.load(open("Training Data/Laptop/Laptop_annotations.json",'r',encoding='utf-8'))

print(laptop_data["annotations"])


[['ASUS FX506LHB-HN355W i5 10300H/ GTX1650- 4GB/ 8G/ 512G SSD/ 15.6 FHD-144hz/ Backlit KB- 1 Zone RGB/ 48Whr/ Win 11/ / / McAfee(1 Year)/ 1B-Black Plastic', {'entities': [[0, 23, 'PROCESSOR'], [41, 44, 'RAM'], [46, 48, 'RAM'], [50, 58, 'STORAGE'], [60, 74, 'SCREEN_SIZE']]}], ['Microsoft Surface Laptop Go 2 - i5/8GB/128GB + Arc Mouse (Poppy Red)', {'entities': [[31, 34, 'PROCESSOR'], [35, 38, 'RAM'], [39, 44, 'STORAGE']]}], ['HP Omen 16,AMD Ryzen 7 6800H,16.1 inch(40.9cm) QHD Gaming Laptop & HyperX Cloud III Red', {'entities': [[11, 28, 'PROCESSOR'], [29, 47, 'SCREEN_SIZE']]}], ['ASUS TUF Gaming A15 (2022) FA506IC-HN100W R7 4800H/ RTX3050- 4GB/ 8G/ 1T SSD/ 15.6 FHD-144hz/ Backlit KB- 1 Zone RGB/ 90Whr/ Win 11/ / / McAfee(1 Year)/ 2B-Graphite Black (Plastic)', {'entities': [[42, 45, 'PROCESSOR'], [61, 64, 'RAM'], [66, 68, 'RAM'], [70, 76, 'STORAGE'], [78, 92, 'SCREEN_SIZE']]}], ['HP (Refurbished) THIN CLIENT T420 Android (AMD-GX 1st GEN/2GB RAM/7GB SSD)Black', {'entities': [[50, 57, 'PRO

In [115]:
import pandas as pd
# Convert the JSON data into a pandas DataFrame
df = pd.DataFrame([{"Description": item[0], "Annotations": item[1]["entities"]} for item in laptop_data["annotations"]])
df.head()

Unnamed: 0,Description,Annotations
0,ASUS FX506LHB-HN355W i5 10300H/ GTX1650- 4GB/ ...,"[[0, 23, PROCESSOR], [41, 44, RAM], [46, 48, R..."
1,Microsoft Surface Laptop Go 2 - i5/8GB/128GB +...,"[[31, 34, PROCESSOR], [35, 38, RAM], [39, 44, ..."
2,"HP Omen 16,AMD Ryzen 7 6800H,16.1 inch(40.9cm)...","[[11, 28, PROCESSOR], [29, 47, SCREEN_SIZE]]"
3,ASUS TUF Gaming A15 (2022) FA506IC-HN100W R7 4...,"[[42, 45, PROCESSOR], [61, 64, RAM], [66, 68, ..."
4,HP (Refurbished) THIN CLIENT T420 Android (AMD...,"[[50, 57, PROCESSOR_GEN], [58, 61, RAM], [66, ..."


In [116]:
df["Description"].isnull().sum()

0

In [117]:
zero_length_count = df[df["Annotations"].apply(len) == 0].shape[0]
zero_length_count

## There are 4 zero length

4

In [118]:
# Remove rows where the length of the "Annotations" list is zero
df = df[df["Annotations"].apply(len) != 0]
zero_length_count = df[df["Annotations"].apply(len) == 0].shape[0]
zero_length_count

0

In [119]:
if all(isinstance(x, list) for x in df['Annotations']):
    print(True)
else:
    print(False)

True


In [120]:
### Split the data
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)
train.head()

Unnamed: 0,Description,Annotations
66,Apple 2023 MacBook Pro Apple M2 Max - (32 GB/1...,"[[29, 31, PROCESSOR], [39, 44, RAM], [45, 53, ..."
22,Lenovo V15 Laptop Ryzen 5-5500U|8Gb 3200Mhz Dd...,"[[18, 25, PROCESSOR], [32, 35, RAM], [49, 60, ..."
14,Asus ROG Zephyrus G14 R7-4800HS/ GTX1660Ti Max...,"[[22, 31, PROCESSOR], [49, 52, RAM], [54, 59, ..."
35,(Refurbished) HP 15q APU Dual Core A6 - (4 GB/...,"[[21, 37, PROCESSOR], [41, 45, RAM], [46, 54, ..."
84,ASUS TUF Gaming A15 (2022) FA506IC-HN100W R7 4...,"[[42, 45, PROCESSOR], [61, 64, RAM], [66, 68, ..."


In [133]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(data):
  # Create a blank spaCy pipeline
  nlp = spacy.blank('en')
  db = DocBin()
  
  # Initialize a counter for None spans
  none_spans = 0
  spans = 0
  for index, row in data.iterrows():
    # Get the text and annotations
    text = row["Description"]
    annotations = row["Annotations"]

    # Check if the text is not empty
    if not text:
      continue

    # Process the text and annotations
    doc = nlp(text)
    if doc is None:
            print(f"Failed to process text: {text}")
            continue
    
    ents = []
    for start, end, label in annotations:
        if start < 0 or end < 0:
                print(f"Invalid annotation: {start}, {end}, {label}")
                continue
        #print(text)
        span = doc.char_span(start, end, label=label)

        
        if span is None:
          print(f"Failed to create span for annotation: {start}, {end}, {label}")
          none_spans += 1
          continue
        else:
          spans+=1
          ents.append(span)

    print(ents)
    doc.ents = ents

    #Add the processed document to the DocBin
    db.add(doc)

  print(f"Number of None spans: {none_spans}")
  print(f"Number of spans: {spans}")




  return db

In [134]:
# # Open a file to log errors during annotation processing
# #file = open('Training Data/Laptop/train_file.txt','w')
# # Create spaCy DocBin objects for training and testing data
# db = get_spacy_doc(train)
# db.to_disk("Training Data/Laptop/train_data.spacy")

Failed to create span for annotation: 39, 44, RAM
Failed to create span for annotation: 45, 53, STORAGE
2
[M2, 14 Inch]
Failed to create span for annotation: 32, 35, RAM
Failed to create span for annotation: 49, 60, STORAGE
Failed to create span for annotation: 65, 74, SCREEN_SIZE
1
[Ryzen 5]
Failed to create span for annotation: 22, 31, PROCESSOR
Failed to create span for annotation: 49, 52, RAM
Failed to create span for annotation: 54, 59, RAM
Failed to create span for annotation: 60, 69, STORAGE
Failed to create span for annotation: 71, 85, SCREEN_SIZE
0
[]
Failed to create span for annotation: 41, 45, RAM
Failed to create span for annotation: 46, 54, STORAGE
1
[APU Dual Core A6]
Failed to create span for annotation: 42, 45, PROCESSOR
Failed to create span for annotation: 61, 64, RAM
Failed to create span for annotation: 66, 68, RAM
Failed to create span for annotation: 70, 76, STORAGE
Failed to create span for annotation: 78, 92, SCREEN_SIZE
0
[]
Failed to create span for annotatio

In [135]:


# Open a file to log errors during annotation processing
# file = open('Training Data/Laptop/train_file.txt','w')

# Create spaCy DocBin objects for training and testing data
db = get_spacy_doc(train)
db.to_disk("Training Data/Laptop/train_data.spacy")

db = get_spacy_doc(test)
db.to_disk('Training Data/Laptop/test_data.spacy')



Failed to create span for annotation: 39, 44, RAM
Failed to create span for annotation: 45, 53, STORAGE
2
[M2, 14 Inch]
Failed to create span for annotation: 32, 35, RAM
Failed to create span for annotation: 49, 60, STORAGE
Failed to create span for annotation: 65, 74, SCREEN_SIZE
1
[Ryzen 5]
Failed to create span for annotation: 22, 31, PROCESSOR
Failed to create span for annotation: 49, 52, RAM
Failed to create span for annotation: 54, 59, RAM
Failed to create span for annotation: 60, 69, STORAGE
Failed to create span for annotation: 71, 85, SCREEN_SIZE
0
[]
Failed to create span for annotation: 41, 45, RAM
Failed to create span for annotation: 46, 54, STORAGE
1
[APU Dual Core A6]
Failed to create span for annotation: 42, 45, PROCESSOR
Failed to create span for annotation: 61, 64, RAM
Failed to create span for annotation: 66, 68, RAM
Failed to create span for annotation: 70, 76, STORAGE
Failed to create span for annotation: 78, 92, SCREEN_SIZE
0
[]
Failed to create span for annotatio

## Spacy Model Training

In [25]:
# install any necessary packages
!pip3 install -U spacy
!pip3 install spacy_transformers


Collecting spacy
  Downloading spacy-3.7.5-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp311-cp311-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp311-cp311-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.4-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.4.8-cp311-cp311-win_amd64.

  You can safely remove it manually.


In [140]:
### Generating config.cfg
!python -m spacy init fill-config "Training Data/base_config_ner.cfg" "Training Data/config_ner.cfg"

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
Training Data\config_ner.cfg
You can now add your data and train your pipeline:
python -m spacy train config_ner.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [91]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.3/587.7 MB 3.1 MB/s eta 0:03:09
     ---------------------------------------- 0.8/587.7 MB 6.1 MB/s eta 0:01:36
     ---------------------------------------- 1.7/587.7 MB 9.9 MB/s eta 0:01:00
     --------------------------------------- 2.7/587.7 MB 12.5 MB/s eta 0:00:47
     --------------------------------------- 3.5/587.7 MB 14.0 MB/s eta 0:00:42
     --------------------------------------- 4.5/587.7 MB 14.5 MB/s eta 0:00:41
     --------------------------------------- 5.9/587.7 MB 16.3 MB/s eta 0:00:36
     --------------------------------------- 7.4/587.7 MB 18.1 MB/s eta 0:00:33
      -------------------------------

In [141]:

!# Train a spaCy NER model using the provided configuration and data
!python -m spacy train "Training Data/config_ner.cfg"  --output "Training Data/Laptop/output"  --paths.train "Training Data/Laptop/train_data.spacy"  --paths.dev "Training Data/Laptop/test_data.spacy"

'#' is not recognized as an internal or external command,
operable program or batch file.


[38;5;4mℹ Saving to output directory: Training Data\Laptop\output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     40.36    0.00    0.00    0.00    0.00
  4     200         88.38   1605.06   48.15   43.33   54.17    0.48
  8     400         23.58    439.22   59.26   53.33   66.67    0.59
 14     600        104.49    289.68   61.90   72.22   54.17    0.62
 20     800         39.03    196.97   65.22   68.18   62.50    0.65
 27    1000         48.17    170.84   60.00   75.00   50.00    0.60
 34    1200         31.62    165.31   69.77   78.95   62.50    0.70
 42    1400         65.98    204.34   69.39   68.00   70.83    0.69
 53    1600         36.93    173.93   72.73   80.00   66.67    0.73
 64    1800        118.11

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [4]:
### Testing
def test(text):
# Load the trained spaCy NER model from the specified path
  nlp = spacy.load("Training Data/Laptop/output/model-best")

  #text = "ASUS FX506LHB-HN355W i5 10300H\/ GTX1650- 4GB\/ 8G\/ 512G SSD\/ 15.6 FHD-144hz\/ Backlit KB- 1 Zone RGB\/ 48Whr\/ Win 11\/ \/ \/ McAfee(1 Year)\/ 1B-Black Plastic"

  docs = nlp(text)
  attributes = {}
  for ent in docs.ents:
    # Print the recognized text and its corresponding label
    #print(ent.text, "  ->>>>  ", ent.label_)
    attributes[ent.label_] = ent.text
  return attributes




In [5]:
test("ASUS FX506LHB-HN355W i5 10300H\/ GTX1650- 4GB\/ 8G\/ 512G SSD\/ 15.6 FHD-144hz\/ Backlit KB- 1 Zone RGB\/ 48Whr\/ Win 11\/ \/ \/ McAfee(1 Year)\/ 1B-Black Plastic")

NameError: name 'spacy' is not defined

In [7]:
import json
import pandas as pd
import spacy
## Loading the model writing the batches of text from a csv file with its attributes in a column
laptop_data = json.load(open("Training Data/Laptop/Laptop_annotations.json",'r',encoding='utf-8'))

texts = []
for text, entities in laptop_data["annotations"]:
    texts.append(text)

batch_size =5
Laptop_input_data = pd.DataFrame({'Text': texts})
Laptop_input_data = Laptop_input_data.sample(batch_size)
Laptop_input_data["Attributes"] = Laptop_input_data["Text"].apply(test)
Laptop_input_data.to_csv("Training Data/Laptop/Final_Result_Laptop.csv")
Laptop_input_data.head()

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Unnamed: 0,Text,Attributes
28,"""Asus Vivobook K513EP-BQ513TSi5-1135G7/MX330/8...",{}
43,Asus Vivobook 16X K3605ZU-MBN742WS Intel®Core™...,"{'PROCESSOR': 'Intel®Core™ i7-12650H', 'RAM': ..."
161,[{'key1': '【15.6 Inch Laptop】15.6 inch FHD LED...,{'SCREEN_SIZE': '15.6 Inch'}
127,"(Refurbished) MSI Raider GE68HX, Intel 13th Ge...",{}
138,Acer Nitro 16 Gaming Laptop (AMD Ryzen 7 7840H...,{'RAM': '16GB'}


In [None]:
### Function for taking data as csv giving the output with a column attributes
### data should be a pandas dataframe with batch of texts
def test_csv(data):
    data["Attributes"] = data["Text"].apply(test)
    return data
