In [None]:
import pandas as pd
import numpy as np
from PIL import Image
import easyocr
import requests
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the CSV data and use only the first 10 rows
data = pd.read_csv('train.csv').head(100)

# Define a function to extract text from images using OCR
def extract_text_from_image(image_url):
    try:
        # Open image from the URL
        image = Image.open(requests.get(image_url, stream=True).raw)
        # Use easyocr to extract text
        ocr = easyocr.Reader(['en'])
        results = ocr.readtext(image)
        # Combine all detected text into a single string
        text = " ".join([result[1] for result in results])
        return text
    except Exception as e:
        print(f"Error in processing image: {e}")
        return ""

# Apply the function to the 'image_link' column
data['extracted_text'] = data['image_link'].apply(extract_text_from_image)

# Encode the 'entity_name' for training the model
encoder = LabelEncoder()
data['entity_name_encoded'] = encoder.fit_transform(data['entity_name'])

# Convert 'entity_value' to numeric (assuming it has text like '1400 milligram')
def extract_numeric_value(value):
    try:
        # Extract the numeric part from the string
        return float(''.join(filter(str.isdigit, value)))
    except ValueError:
        return np.nan  # Use NaN for non-numeric values

data['entity_value_numeric'] = data['entity_value'].apply(extract_numeric_value)

# Drop rows with NaN values in target
data = data.dropna(subset=['entity_value_numeric'])

# Convert text data to numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X_text = vectorizer.fit_transform(data['extracted_text']).toarray()

# Combine text features and encoded entity names
X = np.hstack([X_text, data[['entity_name_encoded']].values])

# Prepare y for the model
y = data['entity_value_numeric']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RandomForest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Evaluate the model
score = rf_model.score(X_test, y_test)
print(f"Model accuracy: {score * 100:.2f}%")

# Function to predict entity_value from text and entity_name
def predict_entity_value(text, entity_name):
    # Transform text and entity_name to match the training data format
    text_features = vectorizer.transform([text]).toarray()
    entity_name_encoded = encoder.transform([entity_name])[0]
    X_input = np.hstack([text_features, [[entity_name_encoded]]])
    prediction = rf_model.predict(X_input)
    return prediction

# Example prediction
example_text = "Extracted text from some image 1400g"
example_entity_name = "item_weight"
predicted_value = predict_entity_value(example_text, example_entity_name)
print(f"Predicted entity value: {predicted_value}")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting 

In [18]:
example_text = "Extracted text from some image 1400g"
example_entity_name = "item_weight"
predicted_value = predict_entity_value(example_text, example_entity_name)
print(f"Predicted entity value: {predicted_value}")

Predicted entity value: [1123.6]


In [10]:
pip install easyocr

^C
Note: you may need to restart the kernel to use updated packages.
Collecting easyocr
  Using cached easyocr-1.7.1-py3-none-any.whl.metadata (11 kB)
Collecting torch (from easyocr)
  Using cached torch-2.4.1-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting torchvision>=0.5 (from easyocr)
  Using cached torchvision-0.19.1-cp312-cp312-win_amd64.whl.metadata (6.1 kB)
Collecting opencv-python-headless (from easyocr)
  Using cached opencv_python_headless-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting scikit-image (from easyocr)
  Using cached scikit_image-0.24.0-cp312-cp312-win_amd64.whl.metadata (14 kB)
Collecting python-bidi (from easyocr)
  Using cached python_bidi-0.6.0-cp312-none-win_amd64.whl.metadata (4.7 kB)
Collecting Shapely (from easyocr)
  Using cached shapely-2.0.6-cp312-cp312-win_amd64.whl.metadata (7.2 kB)
Collecting pyclipper (from easyocr)
  Using cached pyclipper-1.3.0.post5-cp312-cp312-win_amd64.whl.metadata (9.2 kB)
Collecting ninja (from easyocr)
 

ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Python312\\Lib\\site-packages\\sympy\\utilities\\decorator.py'
Consider using the `--user` option or check the permissions.



In [11]:
pip uninstall easyocr


Note: you may need to restart the kernel to use updated packages.




In [12]:
pip install easyocr

Collecting easyocr
  Using cached easyocr-1.7.1-py3-none-any.whl.metadata (11 kB)
Collecting torch (from easyocr)
  Using cached torch-2.4.1-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting torchvision>=0.5 (from easyocr)
  Using cached torchvision-0.19.1-cp312-cp312-win_amd64.whl.metadata (6.1 kB)
Collecting opencv-python-headless (from easyocr)
  Using cached opencv_python_headless-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting scikit-image (from easyocr)
  Using cached scikit_image-0.24.0-cp312-cp312-win_amd64.whl.metadata (14 kB)
Collecting python-bidi (from easyocr)
  Using cached python_bidi-0.6.0-cp312-none-win_amd64.whl.metadata (4.7 kB)
Collecting Shapely (from easyocr)
  Using cached shapely-2.0.6-cp312-cp312-win_amd64.whl.metadata (7.2 kB)
Collecting pyclipper (from easyocr)
  Using cached pyclipper-1.3.0.post5-cp312-cp312-win_amd64.whl.metadata (9.2 kB)
Collecting ninja (from easyocr)
  Using cached ninja-1.11.1.1-py2.py3-none-win_amd64.whl.metadata (5.4