In [3]:
import cv2
from PIL import Image
import easyocr
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras import layers, models, Model
import numpy as np
import os
import pandas as pd
text_input = layers.Input(shape=(128,), dtype='int32')
attention_mask_input = layers.Input(shape=(128,), dtype='int32')

In [5]:
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')
sample_test=pd.read_csv('/content/sample_test.csv')
sample_test_out = pd.read_csv('/content/sample_test_out.csv')

In [6]:
def val_unit_split(entity):
    entity= entity.split()
    if(entity[0][0]=='['):
        value=None
    else:
        value=float(entity[0])
    if(len(entity)==2):
        unit=entity[1]
    else:
        unit = entity[1]+" "+entity[2]
    return pd.Series([value, unit])

In [7]:
train[['value','unit']]=train['entity_value'].apply(val_unit_split)

In [11]:
# Initialize EasyOCR reader
reader = easyocr.Reader(['en'])

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Text input for tokenized text and attention mask (after extracting text from OCR)
# Define input layers as TensorFlow tensors
text_input = tf.keras.Input(shape=(128,), dtype='int32')
attention_mask_input = tf.keras.Input(shape=(128,), dtype='int32')

# Create a custom layer to wrap the BERT model
class BertLayer(layers.Layer):
    def __init__(self, bert_model, **kwargs):
        super(BertLayer, self).__init__(**kwargs)
        self.bert_model = bert_model

    def call(self, inputs, attention_mask):
        return self.bert_model(inputs, attention_mask=attention_mask).last_hidden_state

# Pass tokenized text through BERT model using the custom layer
bert_output = BertLayer(bert_model)(text_input, attention_mask=attention_mask_input)
bert_output = layers.Flatten()(bert_output)

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the 

In [12]:
# Load VGG16 without top layer and freeze layers
model = VGG16(weights="imagenet", include_top=False)
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

for layer in model.layers:
    layer.trainable = False

# Preprocessing layers: Resize to 224x224 and rescale
resize_and_rescale = tf.keras.Sequential([
    layers.Resizing(224, 224),
    layers.Rescaling(1.0 / 255)
])


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


In [14]:
# Define new inputs with shape (256, 256, 3)
img_inputs = layers.Input(shape=(256, 256, 3))

# Apply resizing and rescaling
img = resize_and_rescale(img_inputs)

# Pass the processed input to the modified VGG16 model
img = model(img, training=False)
img = layers.Flatten()(img) # Flatten the output of VGG16

# Define the single input (an integer ranging from 0 to 7)
entity_input = layers.Input(shape=(1,), dtype='int32')  # Integer input

# Embedding the integer input (for values 0-7)
embedded_int = layers.Embedding(input_dim=8, output_dim=4)(entity_input)  # Embedding layer for 8 categories
embedded_int = layers.Flatten()(embedded_int)  # Flatten the embedding output

# Placeholder for x3 (assuming it's another input; define the actual shape)
# You need to define the shape of x3
x3 = layers.Input(shape=(1,))  # Replace some_shape with actual shape of x3

# Concatenate the outputs of VGG16, embedded integer, and x3 input
concatenated = layers.Concatenate()([img, embedded_int, bert_output, x3])

In [15]:
# First dense layer
x1 = layers.Dense(512, activation='relu')(concatenated)
x1 = layers.BatchNormalization()(x1)  # Add normalization layer
x1 = layers.Dropout(0.5)(x1)          # Dropout for regularization

# Second dense layer
x1 = layers.Dense(256, activation='relu')(x1)
x1 = layers.BatchNormalization()(x1)  # Add normalization layer
x1 = layers.Dropout(0.5)(x1)

# Third dense layer
x1 = layers.Dense(128, activation='relu')(x1)
x1 = layers.BatchNormalization()(x1)  # Add normalization layer
x1 = layers.Dropout(0.5)(x1)


In [16]:
# Output layer for continuous value (0, e+17)
out1 = layers.Dense(64, activation='relu')(x1)
out1 = layers.BatchNormalization()(out1)  # Add normalization layer
out1 = layers.Dropout(0.5)(out1)
out1 = layers.Dense(1, activation='exponential')(out1)  # Exponential activation for large continuous values

# Output layer for categorical classification (0–34)
out2 = layers.Dense(64, activation='relu')(x1)
out2 = layers.BatchNormalization()(out2)  # Add normalization layer
out2 = layers.Dropout(0.5)(out2)
out2 = layers.Dense(35, activation='softmax')(out2)  # Softmax for multi-class classification (0-34)



In [17]:
# Final model
final_model = models.Model(inputs=[img_inputs, entity_input, text_input, attention_mask_input], outputs=[out1, out2])


In [18]:
# Compile the model
final_model.compile(optimizer='adam',
                    loss=['mean_squared_error', 'sparse_categorical_crossentropy'],
                    metrics=['mae', 'accuracy'])

# Print the model summary
print(final_model.summary())

None


In [19]:
train.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value,value,unit
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,500.0,gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,1.0,cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,0.709,gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,0.709,gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,1400.0,milligram


In [23]:
clean_train = train.dropna()
clean_train.shape

(260583, 6)

In [24]:
entity_names=list(train['entity_name'].unique())
print(entity_names)

['item_weight', 'item_volume', 'voltage', 'wattage', 'maximum_weight_recommendation', 'height', 'depth', 'width']


In [25]:
def allot(entity):
    try:
        return entity_names.index(entity)
    except ValueError:
        return -1
allot('height')

5

In [26]:
clean_train['entity_name_no']=clean_train['entity_name'].apply(allot)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_train['entity_name_no']=clean_train['entity_name'].apply(allot)


In [27]:
clean_train['image_link'][0]

'https://m.media-amazon.com/images/I/61I9XdN6OFL.jpg'

In [28]:
def extract_filename(url):
    return os.path.basename(url)
clean_train['image_name']=clean_train['image_link'].apply(extract_filename)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_train['image_name']=clean_train['image_link'].apply(extract_filename)


In [30]:
clean_train['unit'].unique()

array(['gram', 'cup', 'milligram', 'kilogram', 'kilogram to', 'ounce',
       'gallon', 'volt', 'watt', 'pound', 'millilitre', 'cubic foot',
       'fluid ounce', 'ton', 'decilitre', 'cubic inch', 'litre',
       'microgram', 'centimetre', 'quart', 'horsepower', 'kilowatt',
       'kilowatt hour', 'gigabyte', 'millimetre', 'pint', 'gram to',
       'centilitre', 'candela', 'inch', 'person', 'ounce to', 'metre',
       'pound to', 'milligram to', 'foot', 'carat', 'e+17 pound',
       'milliampere hour', 'nits', 'millilitre to'], dtype=object)

In [31]:
units=clean_train['unit'].unique().tolist()

In [32]:
clean_train['group_id'].describe()

Unnamed: 0,group_id
count,260583.0
mean,546107.317039
std,249310.025721
min,101697.0
25%,311997.0
50%,524635.0
75%,752266.0
max,998545.0


In [35]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
predata=clean_train
group_id_reshaped = predata['group_id'].values.reshape(-1, 1)

# Fit and transform the data
scaled_group_id = scaler.fit_transform(group_id_reshaped)

# Add the scaled data as a new column
predata['scaled_group_id'] = scaled_group_id

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predata['scaled_group_id'] = scaled_group_id


In [37]:
predata.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value,value,unit,entity_name_no,image_name,scaled_group_id
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,500.0,gram,0,61I9XdN6OFL.jpg,0.721663
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,1.0,cup,1,71gSRbyXmoL.jpg,0.908817
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,0.709,gram,0,61BZ4zrjZXL.jpg,0.398974
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,0.709,gram,0,612mrlqiI4L.jpg,0.398974
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,1400.0,milligram,0,617Tl40LOXL.jpg,0.702165


In [46]:
import requests
from PIL import Image
import io

In [49]:
def process_image_and_text(image_path):
    print(image_path)
    # Fetch the image from the URL
    response = requests.get(image_path)
    image = Image.open(io.BytesIO(response.content))

    # Initialize the OCR reader
    reader = easyocr.Reader(['en'])

    # Perform OCR on the image
    result = reader.readtext(np.array(image))

    # Extract and return the text
    texti =  [text for _, text, _ in result]
    extracted_text = ' '.join(texti).lower()
    # Tokenize the extracted text using BERT tokenizer
    tokenized_text = tokenizer.encode_plus(
        extracted_text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf'
    )

    input_ids = tokenized_text['input_ids']
    attention_mask = tokenized_text['attention_mask']
    return pd.Series({'tokenized_text':tokenized_text, 'attention_mask':attention_mask})

In [50]:
tt=predata.sample(2)
tt[['tokenized_text','attention_mask']]=tt['image_link'].apply(process_image_and_text)



https://m.media-amazon.com/images/I/812S9rfFnAL.jpg


  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)


https://m.media-amazon.com/images/I/816iPvHtxZL.jpg


In [51]:
tt.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value,value,unit,entity_name_no,image_name,scaled_group_id,tokenized_text,attention_mask
105456,https://m.media-amazon.com/images/I/812S9rfFnA...,524635,item_weight,10.0 milligram,10.0,milligram,0,812S9rfFnAL.jpg,0.471583,"[input_ids, token_type_ids, attention_mask]","((tf.Tensor(1, shape=(), dtype=int32), tf.Tens..."
76853,https://m.media-amazon.com/images/I/816iPvHtxZ...,375816,item_weight,92.0 gram,92.0,gram,0,816iPvHtxZL.jpg,0.305647,"[input_ids, token_type_ids, attention_mask]","((tf.Tensor(1, shape=(), dtype=int32), tf.Tens..."
