## Data Preparation

In [1]:
import jsonlines, json

file_path = "Training Data/Mobile/Mobile_training.jsonl"

# Entity labels used by the mobile annotations. The labels must match the
# strings that appear in the data exactly: the annotations use
# "BATTERY_CAPACITY" (underscore), and also contain "COLOR" and
# "EXPANDABLE_STORAGE". The variable keeps its historical name so any other
# cell that refers to it keeps working.
laptop_classes = [
    "RAM",
    "STORAGE",
    "BATTERY_CAPACITY",
    "PROCESSOR_TYPE",
    "SCREEN_SIZE",
    "REFRESH_RATE",
    "SCREEN_TYPE",
    "BACK_CAMERA",
    "FRONT_CAMERA",
    "COLOR",
    "EXPANDABLE_STORAGE",
]

# Convert every JSON-lines record into a [text, {"entities": [...]}] pair.
output_json = {"classes": laptop_classes, "annotations": []}
with jsonlines.open(file_path) as reader:
    for obj in reader:
        processed_obj = [obj["text"], {"entities": obj["label"]}]
        output_json["annotations"].append(processed_obj)

# Save the output JSON to a new file. The encoding is stated explicitly
# because the file is read back as UTF-8 later in the notebook.
with open('Training Data/Mobile/Mobile_annotations.json', 'w', encoding='utf-8') as f:
    json.dump(output_json, f, indent=None)

In [2]:
## Loading the annotated data
# Use a context manager so the file handle is closed as soon as parsing ends.
with open("Training Data/Mobile/Mobile_annotations.json", 'r', encoding='utf-8') as annotations_file:
    mobile_data = json.load(annotations_file)

print(mobile_data["annotations"])


[["Xiaomi 11i 5G Hypercharge (Stealth Black, 6GB RAM, 128GB Storage),Medium,MZB0A55IN ['6 GB RAM | 128 GB ROM | Expandable Upto 1 TB  ', '16.94 cm (6.67 inch) Full HD+ AMOLED Display  ', '108MP + 8MP + 2MP | 16MP Front Camera  ', '4500 mAh Li-Polymer Battery  ', 'Mediatek Dimensity 920 Processor  ']", {'entities': [[27, 40, 'COLOR'], [42, 50, 'RAM'], [51, 64, 'STORAGE'], [85, 94, 'RAM'], [96, 106, 'STORAGE'], [109, 129, 'EXPANDABLE_STORAGE'], [135, 154, 'SCREEN_SIZE'], [164, 180, 'SCREEN_TYPE'], [185, 202, 'BACK_CAMERA'], [205, 222, 'FRONT_CAMERA'], [228, 237, 'BATTERY_CAPACITY'], [261, 293, 'PROCESSOR_TYPE']]}], ["EL D68 (Green, 32 GB) 3 GB RAM ['3 GB RAM | 32 GB ROM | Expandable Upto 128 GB', '15.46 cm (6.088 inch) Display', '13MP Rear Camera | 8MP Front Camera', '4000 mAh Battery', 'Quad-Core Processor']", {'entities': [[8, 13, 'COLOR'], [15, 20, 'STORAGE'], [22, 30, 'RAM'], [33, 41, 'RAM'], [44, 53, 'STORAGE'], [56, 78, 'EXPANDABLE_STORAGE'], [81, 102, 'SCREEN_SIZE'], [114, 132, 'B

In [3]:
import pandas as pd
# One row per annotated example: the raw text and its list of entity spans
records = []
for entry in mobile_data["annotations"]:
    records.append({"Description": entry[0], "Annotations": entry[1]["entities"]})
df = pd.DataFrame(records)
df.head()

Unnamed: 0,Description,Annotations
0,"Xiaomi 11i 5G Hypercharge (Stealth Black, 6GB ...","[[27, 40, COLOR], [42, 50, RAM], [51, 64, STOR..."
1,"EL D68 (Green, 32 GB) 3 GB RAM ['3 GB RAM | 32...","[[8, 13, COLOR], [15, 20, STORAGE], [22, 30, R..."
2,"Zoom Me ME-M1 (Scarlet Red, 32 GB) 3 GB RAM ['...","[[15, 26, COLOR], [28, 33, STORAGE], [35, 43, ..."
3,"vivo Y20 (Purist Blue, 64 GB) 4 GB RAM ['4 GB ...","[[10, 21, COLOR], [23, 28, STORAGE], [29, 40, ..."
4,"SAMSUNG Galaxy M42 (Prism Dot Gray, 128 GB) 8 ...","[[20, 34, COLOR], [36, 42, STORAGE], [44, 52, ..."


In [4]:
# Number of rows whose description is missing
df["Description"].isna().sum()

0

In [5]:
# Count the rows whose annotation list is empty
# (the recorded run of this cell reports 10 such rows)
has_no_annotations = df["Annotations"].apply(len) == 0
zero_length_count = len(df.loc[has_no_annotations])
zero_length_count

10

In [6]:
# Keep only the rows that carry at least one annotation ...
df = df.loc[df["Annotations"].apply(len) > 0]
# ... then re-count the empty ones to confirm that none are left
zero_length_count = len(df.loc[df["Annotations"].apply(len) == 0])
zero_length_count

0

In [7]:
# Sanity check: every entry of the "Annotations" column must be a list
print(all(isinstance(entry, list) for entry in df['Annotations']))

True


In [8]:
### Split the data
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for evaluation. A fixed random_state makes the
# split (and everything trained or evaluated on it) reproducible across runs.
train, test = train_test_split(df, test_size=0.1, random_state=42)
train.head()

Unnamed: 0,Description,Annotations
1128,Vibrant Display Enjoy uninterrupted binge-watc...,"[[201, 218, REFRESH_RATE], [705, 727, PROCESSO..."
370,"Vivo T2 5G (Velocity Wave, 128 GB) (6 GB RAM) ...","[[12, 25, COLOR], [26, 33, STORAGE], [36, 44, ..."
682,"vivo V27 Pro 5G (Noble Black, 256 GB) 8 GB RAM...","[[17, 28, COLOR], [30, 37, STORAGE], [38, 57, ..."
834,"HP Slate 6 (Graphite, 16 GB) 1 GB RAM ['1 GB R...","[[12, 20, COLOR], [22, 27, STORAGE], [29, 37, ..."
1228,"""A54 (Starry Blue, 6GB RAM, 128GB Storage) [{'...","[[361, 419, BACK_CAMERA], [421, 438, FRONT_CAM..."


In [9]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(data, alignment_mode="strict"):
    """Convert annotated rows into a spaCy ``DocBin``.

    Args:
        data: DataFrame with a "Description" column (the text) and an
            "Annotations" column holding ``[start, end, label]`` character
            offsets for each entity.
        alignment_mode: Passed to ``Doc.char_span``. The default "strict"
            keeps the original behaviour: an annotation whose offsets do not
            fall on token boundaries is reported and skipped. "contract" or
            "expand" can be passed to snap such offsets to token boundaries.

    Returns:
        A ``DocBin`` with one ``Doc`` per row that has non-empty text.

    Rows with empty text are skipped. Annotations with negative offsets and
    annotations that cannot be turned into a span are printed and skipped.
    """
    # Create a blank spaCy pipeline (tokenizer only)
    nlp = spacy.blank('en')
    db = DocBin()

    # Counters reported once all rows have been processed
    none_spans = 0
    spans = 0
    overlapping_spans = 0

    for index, row in data.iterrows():
        text = row["Description"]
        annotations = row["Annotations"]

        # Nothing to tokenize for an empty description
        if not text:
            continue

        doc = nlp(text)

        ents = []
        for start, end, label in annotations:
            if start < 0 or end < 0:
                print(f"Invalid annotation: {start}, {end}, {label}")
                continue

            span = doc.char_span(start, end, label=label, alignment_mode=alignment_mode)
            if span is None:
                print(f"Failed to create span for annotation: {start}, {end}, {label}")
                none_spans += 1
                continue

            spans += 1
            ents.append(span)

        # Assigning overlapping or duplicate spans to doc.ents raises an
        # error, so reduce them to a non-overlapping set first.
        kept = spacy.util.filter_spans(ents)
        overlapping_spans += len(ents) - len(kept)
        doc.ents = kept

        # Add the processed document to the DocBin
        db.add(doc)

    print(f"Number of None spans: {none_spans}")
    print(f"Number of spans: {spans}")
    if overlapping_spans:
        print(f"Number of overlapping spans dropped: {overlapping_spans}")

    return db

In [11]:


# Convert each split into a spaCy DocBin and write it to disk
for split_frame, output_path in (
    (train, "Training Data/Mobile/train_data.spacy"),
    (test, 'Training Data/Mobile/test_data.spacy'),
):
    db = get_spacy_doc(split_frame)
    db.to_disk(output_path)



  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


[90Hz refresh rate, MediaTek Dimensity 810, 8MP front camera, expand the memory by up to 1TB, 'Maximum Card Capacity': '1TB', 'Refresh Rate': '90 Hz', 90 Hz Refresh Rate, microSD Card upto 1TB, 50 MP + 2 MP + 2 MP Triple Rear, 8 MP Front Camera, 18W]
Failed to create span for annotation: 26, 33, STORAGE
Failed to create span for annotation: 68, 86, FRONT_CAMERA
Failed to create span for annotation: 91, 100, BATTERY_CAPACITY
Failed to create span for annotation: 184, 195, RAM
Failed to create span for annotation: 197, 208, STORAGE
[Velocity Wave, 6 GB RAM, '64 MP (OIS) + 2MP, '16.21 cm (6.38 inch), Snapdragon 695 Processor]
Failed to create span for annotation: 115, 138, BACK_CAMERA
[Noble Black, 256 GB), 8 GB RAM ['8 GB RAM, 256 GB ROM, '17.22 cm (6.78 inch, HD+ Display', 50MP Front Camera', 4600 mAh Battery', Mediatek Dimensity 8200 Processor']
Failed to create span for annotation: 62, 84, EXPANDABLE_STORAGE
Failed to create span for annotation: 89, 105, SCREEN_SIZE
[Graphite, 16 GB, 

## spaCy Model Training

In [12]:
# install any necessary packages
!pip3 install -U spacy
!pip3 install spacy_transformers




In [13]:
### Generate the full training config "Training Data/config_ner.cfg" by
### auto-filling the base config "Training Data/base_config_ner.cfg"
!python -m spacy init fill-config "Training Data/base_config_ner.cfg" "Training Data/config_ner.cfg"

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
Training Data\config_ner.cfg
You can now add your data and train your pipeline:
python -m spacy train config_ner.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [91]:
# Download the en_core_web_lg pipeline package through spaCy's CLI.
# NOTE(review): the training log in this notebook lists only ['tok2vec', 'ner'];
# confirm that the training config actually depends on this package.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.3/587.7 MB 3.1 MB/s eta 0:03:09
     ---------------------------------------- 0.8/587.7 MB 6.1 MB/s eta 0:01:36
     ---------------------------------------- 1.7/587.7 MB 9.9 MB/s eta 0:01:00
     --------------------------------------- 2.7/587.7 MB 12.5 MB/s eta 0:00:47
     --------------------------------------- 3.5/587.7 MB 14.0 MB/s eta 0:00:42
     --------------------------------------- 4.5/587.7 MB 14.5 MB/s eta 0:00:41
     --------------------------------------- 5.9/587.7 MB 16.3 MB/s eta 0:00:36
     --------------------------------------- 7.4/587.7 MB 18.1 MB/s eta 0:00:33
      -------------------------------

In [14]:

!# Train a spaCy NER model using the provided configuration and data
!python -m spacy train "Training Data/config_ner.cfg"  --output "Training Data/Mobile/output"  --paths.train "Training Data/Mobile/train_data.spacy"  --paths.dev "Training Data/Mobile/test_data.spacy"

'#' is not recognized as an internal or external command,
operable program or batch file.


[38;5;2m✔ Created output directory: Training Data\Mobile\output[0m
[38;5;4mℹ Saving to output directory: Training Data\Mobile\output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    106.69    0.00    0.00    0.00    0.00
  0     200        414.20   4348.63   44.95   58.20   36.61    0.45
  0     400        328.51   2794.96   41.46   47.00   37.09    0.41
  0     600         58.27   3250.15   41.52   45.59   38.13    0.42
  0     800         55.10   3031.42   52.04   49.57   54.78    0.52
  1    1000         60.28   3333.52   39.04   36.83   41.53    0.39
  1    1200         94.13   4071.16   54.20   50.66   58.28    0.54
  2    1400         82.30   4359.56   52.53   49.46   56.01    0.53
  2    1600         89.3

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [21]:
### Testing
def test(text):
  # Load the trained spaCy NER model from the specified path
  nlp = spacy.load("Training Data/Mobile/output/model-best")

  #text = "EL D68 (Green, 32 GB) 3 GB RAM ['3 GB RAM | 32 GB ROM | Expandable Upto 128 GB', '15.46 cm (6.088 inch) Display', '13MP Rear Camera | 8MP Front Camera', '4000 mAh Battery', 'Quad-Core Processor']"
  docs = nlp(text)
  attributes = {}
  for ent in docs.ents:
    # Print the recognized text and its corresponding label
    attributes[ent.label_] = ent.text
  return attributes



In [None]:
# Run the extractor on a single product description
sample_description = "EL D68 (Green, 32 GB) 3 GB RAM ['3 GB RAM | 32 GB ROM | Expandable Upto 128 GB', '15.46 cm (6.088 inch) Display', '13MP Rear Camera | 8MP Front Camera', '4000 mAh Battery', 'Quad-Core Processor']"
test(sample_description)

In [25]:
## Reload the annotated texts, run the trained model on a small random sample
## of them, and save each sampled text with its extracted attributes to a CSV file
# Use a context manager so the file handle is closed as soon as parsing ends.
with open("Training Data/Mobile/Mobile_annotations.json", 'r', encoding='utf-8') as annotations_file:
    mobile_data = json.load(annotations_file)

# Only the text of each [text, entities] pair is needed here
texts = [text for text, entities in mobile_data["annotations"]]

batch_size = 5
Mobile_input_data = pd.DataFrame({'Text': texts})
# Cap the sample size at the number of available rows (sampling more rows than
# exist raises an error) and fix the seed so the saved CSV is reproducible.
Mobile_input_data = Mobile_input_data.sample(
    min(batch_size, len(Mobile_input_data)), random_state=42
)
Mobile_input_data["Attributes"] = Mobile_input_data["Text"].apply(test)
Mobile_input_data.to_csv("Training Data/Mobile/Final_Result_Mobile.csv")
Mobile_input_data.head()

Unnamed: 0,Text,Attributes
512,"Infinix Hot S3 (Blush Gold, 32 GB) 3 GB RAM ['...","{'COLOR': '(Blush Gold', 'STORAGE': '32 GB ROM..."
1113,[{'key1': 'Take great low-light images with th...,"{'BACK_CAMERA': '50MP Dual Rear AI Camera', 'F..."
200,"MOTOROLA G30 (Dark Pearl, 64 GB) 4 GB RAM ['4 ...","{'COLOR': 'Dark Pearl', 'STORAGE': '64 GB ROM'..."
463,"realme C30 (Denim Black, 32 GB) 2 GB RAM ['2 G...","{'COLOR': 'Denim Black', 'STORAGE': '32 GB ROM..."
368,"OPPO A83 (Red, 16 GB) 2 GB RAM ['2 GB RAM | 16...","{'COLOR': 'Red', 'STORAGE': '16 GB ROM', 'RAM'..."


In [26]:
### Function for taking data as csv giving the output with a column attributes
### data should be a pandas dataframe with batch of texts
def test_csv(data):
    data["Attributes"] = data["Text"].apply(test)
    return data
