In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import easyocr
import os
from tqdm import tqdm
from pathlib import Path
import re
import urllib.request
import time
import aiohttp
import aiofiles
import asyncio
from tqdm.asyncio import tqdm
import io
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial


In [2]:

async def create_placeholder_image(image_save_path):
    try:
        placeholder_image = Image.new('RGB', (100, 100), color='black')
        buffer = io.BytesIO()
        placeholder_image.save(buffer, format='PNG')
        buffer.seek(0)
        async with aiofiles.open(image_save_path, 'wb') as f:
            await f.write(buffer.getvalue())
    except Exception as e:
        print(f"Error creating placeholder image: {e}")

async def download_image(session, image_link, save_folder, semaphore, retries=3, delay=1):
    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    async with semaphore:
        for attempt in range(retries):
            try:
                async with session.get(image_link, timeout=10) as response:
                    if response.status == 200:
                        content = await response.read()
                        async with aiofiles.open(image_save_path, 'wb') as f:
                            await f.write(content)
                        return
                    else:
                        print(f"Failed to download {image_link}: HTTP {response.status}")
            except Exception as e:
                if attempt < retries - 1:
                    await asyncio.sleep(delay)
                else:
                    print(f"Error downloading {image_link}: {e}")
        
        await create_placeholder_image(image_save_path)

async def download_images(image_links, download_folder, max_concurrency=100):
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    semaphore = asyncio.Semaphore(max_concurrency)
    async with aiohttp.ClientSession() as session:
        tasks = [download_image(session, link, download_folder, semaphore) for link in image_links]
        await tqdm.gather(*tasks, desc="Downloading images", total=len(image_links))

def run_async_download(image_links, download_folder):
    loop = asyncio.get_event_loop()
    if loop.is_running():
        # We're in an environment with an existing event loop (e.g., Jupyter)
        asyncio.create_task(download_images(image_links, download_folder))
    else:
        # We're in a regular Python environment
        loop.run_until_complete(download_images(image_links, download_folder))

In [3]:
from constants import entity_unit_map, allowed_units

In [4]:
torch.cuda.is_available()

True

In [5]:
# Load the ResNet model
resnet50 = models.resnet50(pretrained=True)
resnet50.eval()
# Define transforms for image preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])



# Initialize the OCR reader
reader = easyocr.Reader(['en'])

# def process_image(image_path, transform, resnet50, reader):
#     try:
#         # Extract image features
#         image = Image.open(image_path).convert('RGB')
#         image_tensor = transform(image).unsqueeze(0)
#         image_tensor = image_tensor.to(resnet50.device)
#         with torch.no_grad():
#             image_features = resnet50(image_tensor).squeeze().cpu().numpy()

#         # Extract text features
#         result = reader.readtext(str(image_path))
#         extracted_text = ' '.join([item[1] for item in result])

#         return image_features, extracted_text
#     except Exception as e:
#         print(f"Error processing image {image_path}: {str(e)}")
#         return np.zeros(2048), ""

def preprocess_image(image_path):
    image = Image.open(image_path).convert('RGB')
    return transform(image).unsqueeze(0)


def extract_features(image_path):
    # Extract image features
    image_tensor = preprocess_image(image_path)
    with torch.no_grad():
        image_features = resnet50(image_tensor).squeeze().numpy()
    
    # Extract text features
    result = reader.readtext(image_path)
    extracted_text = ' '.join([item[1] for item in result])
    
    return image_features, extracted_text

def process_dataset(df, image_folder):
    image_features_list = []
    extracted_texts = []
    
    for _, row in tqdm(df.iterrows(), total=len(df)):
        image_filename = Path(row['image_link']).name
        image_path = os.path.join(image_folder, image_filename)
        
        if os.path.exists(image_path):
            image_features, extracted_text = extract_features(image_path)
            image_features_list.append(image_features)
            extracted_texts.append(extracted_text)
        else:
            print(f"Image not found: {image_path}")
            image_features_list.append(np.zeros(2048))  # ResNet50 output size
            extracted_texts.append("")
    
    df['image_features'] = image_features_list
    df['extracted_text'] = extracted_texts
    
    return df



class DeepEntityExtractor(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(DeepEntityExtractor, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

def train_deep_model(X_train, y_train, X_val, y_val, input_dim, hidden_dim, output_dim, num_epochs=50, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = DeepEntityExtractor(input_dim, hidden_dim, output_dim).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=5)

    train_dataset = TensorDataset(torch.FloatTensor(X_train).to(device), torch.LongTensor(y_train).to(device))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    val_dataset = TensorDataset(torch.FloatTensor(X_val).to(device), torch.LongTensor(y_val).to(device))
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    best_val_accuracy = 0
    best_model_path = 'best_model.pth'

    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

        model.eval()
        val_accuracy = 0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                outputs = model(batch_X)
                _, predicted = torch.max(outputs, 1)
                val_accuracy += (predicted == batch_y).sum().item()

        val_accuracy /= len(val_dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {val_accuracy:.4f}")

        scheduler.step(val_accuracy)

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), best_model_path)

    model.load_state_dict(torch.load(best_model_path))
    return model


def parse_prediction(pred, entity_name):
    try:
        value, unit = pred.split()
        value = float(value)
        if unit in entity_unit_map.get(entity_name, set()):
            return f"{value} {unit}"
    except:
        pass
    return ""



  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


In [6]:
# Load and process data
train_df = pd.read_csv('dataset/trial_train.csv')
test_df = pd.read_csv('dataset/trial_test.csv')

In [7]:
train_df.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram


In [8]:


train_download_path = 'images/train_img'
test_download_path = 'images/test_img'

image_links_train = train_df['image_link'].tolist()
image_links_test = test_df['image_link'].tolist()

print("Downloading train images...")
run_async_download(image_links_train, train_download_path)
print("Downloading test images...")
run_async_download(image_links_test, test_download_path)
print("Download complete!")


Downloading train images...
Downloading test images...
Download complete!


Downloading images: 100%|██████████| 99/99 [00:00<00:00, 131.04it/s]
Downloading images: 100%|██████████| 256/256 [00:03<00:00, 83.73it/s] 


In [9]:
train_df = process_dataset(train_df, 'images/train_img')
test_df = process_dataset(test_df, 'images/test_img')



100%|██████████| 256/256 [09:02<00:00,  2.12s/it]
100%|██████████| 99/99 [00:24<00:00,  4.11it/s]


In [10]:
print([train_df.head(), test_df.head()])

[                                          image_link  group_id  entity_name  \
0  https://m.media-amazon.com/images/I/61I9XdN6OF...    748919  item_weight   
1  https://m.media-amazon.com/images/I/71gSRbyXmo...    916768  item_volume   
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516  item_weight   
3  https://m.media-amazon.com/images/I/612mrlqiI4...    459516  item_weight   
4  https://m.media-amazon.com/images/I/617Tl40LOX...    731432  item_weight   

     entity_value                                     image_features  \
0      500.0 gram  [-1.5245771, -2.7847428, -4.1273065, -4.761484...   
1         1.0 cup  [-4.910853, -2.370376, -1.9847587, -4.453485, ...   
2      0.709 gram  [-1.7896006, -1.1231364, -0.6803558, -4.866802...   
3      0.709 gram  [-1.6998017, 1.7567462, -2.383329, -2.7924333,...   
4  1400 milligram  [-2.1520514, 0.33016944, -3.4435117, -3.421331...   

                                      extracted_text  
0  PROPOS' NATUREJ INGREDIENT MENAGE

In [11]:

# Prepare features and labels
X = train_df[['extracted_text', 'entity_name', 'image_features']]
y = train_df['entity_value']

In [12]:
# Text feature extraction
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(X['extracted_text']).toarray()

In [13]:


# Entity name encoding
entity_encoder = LabelEncoder()
X_entity = entity_encoder.fit_transform(X['entity_name']).reshape(-1, 1)


In [14]:
# Combine features
X_combined = np.hstack((X_text, X_entity, np.vstack(X['image_features'])))


In [15]:

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [16]:


# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_combined)

In [17]:
# Prepare test data
X_test_text = tfidf.transform(test_df['extracted_text']).toarray()
X_test_entity = np.array([entity_encoder.transform([entity]) if entity in entity_encoder.classes_ else [-1] for entity in test_df['entity_name']]).reshape(-1, 1)
X_test_combined = np.hstack((X_test_text, X_test_entity, np.vstack(test_df['image_features'])))
X_test_scaled = scaler.transform(X_test_combined)

In [43]:
# Cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
deep_predictions = np.zeros((len(X_test_scaled), len(le.classes_)))

for fold, (train_index, val_index) in enumerate(skf.split(X_scaled, y_encoded)):
    print(f"Training fold {fold + 1}/{n_splits}")
    
    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    y_train, y_val = y_encoded[train_index], y_encoded[val_index]

    input_dim = X_train.shape[1]
    hidden_dim = 256
    output_dim = len(le.classes_)

    model = train_deep_model(X_train, y_train, X_val, y_val, input_dim, hidden_dim, output_dim)

    # Predict on test data
    model.eval()
    with torch.no_grad():
        test_outputs = model(torch.FloatTensor(X_test_scaled).to(device))
        deep_predictions += torch.softmax(test_outputs, dim=1).cpu().numpy()



Training fold 1/5
Epoch 1/50, Validation Accuracy: 0.0769
Epoch 2/50, Validation Accuracy: 0.0962
Epoch 3/50, Validation Accuracy: 0.1346
Epoch 4/50, Validation Accuracy: 0.1538
Epoch 5/50, Validation Accuracy: 0.1923
Epoch 6/50, Validation Accuracy: 0.2308
Epoch 7/50, Validation Accuracy: 0.2692
Epoch 8/50, Validation Accuracy: 0.2692
Epoch 9/50, Validation Accuracy: 0.2692
Epoch 10/50, Validation Accuracy: 0.3077
Epoch 11/50, Validation Accuracy: 0.3077
Epoch 12/50, Validation Accuracy: 0.3077
Epoch 13/50, Validation Accuracy: 0.3077
Epoch 14/50, Validation Accuracy: 0.2885
Epoch 15/50, Validation Accuracy: 0.2885
Epoch 16/50, Validation Accuracy: 0.2885
Epoch 17/50, Validation Accuracy: 0.2885
Epoch 18/50, Validation Accuracy: 0.2885
Epoch 19/50, Validation Accuracy: 0.2885
Epoch 20/50, Validation Accuracy: 0.2885
Epoch 21/50, Validation Accuracy: 0.2885
Epoch 22/50, Validation Accuracy: 0.2885
Epoch 23/50, Validation Accuracy: 0.2885
Epoch 24/50, Validation Accuracy: 0.2885
Epoch 2

  model.load_state_dict(torch.load(best_model_path))


Epoch 1/50, Validation Accuracy: 0.0588
Epoch 2/50, Validation Accuracy: 0.1373
Epoch 3/50, Validation Accuracy: 0.1765
Epoch 4/50, Validation Accuracy: 0.2157
Epoch 5/50, Validation Accuracy: 0.2157
Epoch 6/50, Validation Accuracy: 0.2157
Epoch 7/50, Validation Accuracy: 0.2157
Epoch 8/50, Validation Accuracy: 0.2549
Epoch 9/50, Validation Accuracy: 0.2549
Epoch 10/50, Validation Accuracy: 0.2353
Epoch 11/50, Validation Accuracy: 0.2353
Epoch 12/50, Validation Accuracy: 0.2353
Epoch 13/50, Validation Accuracy: 0.2353
Epoch 14/50, Validation Accuracy: 0.2353
Epoch 15/50, Validation Accuracy: 0.2353
Epoch 16/50, Validation Accuracy: 0.2353
Epoch 17/50, Validation Accuracy: 0.2353
Epoch 18/50, Validation Accuracy: 0.2353
Epoch 19/50, Validation Accuracy: 0.2353
Epoch 20/50, Validation Accuracy: 0.2353
Epoch 21/50, Validation Accuracy: 0.2353
Epoch 22/50, Validation Accuracy: 0.2353
Epoch 23/50, Validation Accuracy: 0.2353
Epoch 24/50, Validation Accuracy: 0.2353
Epoch 25/50, Validation A

In [44]:
deep_predictions /= n_splits
y_pred_encoded = np.argmax(deep_predictions, axis=1)
y_pred = le.inverse_transform(y_pred_encoded)

In [50]:
# Check if there are any non-numeric values in the 'index' column
print(test_df['index'].unique())  # This will show you any problematic values

['index' '11' '12' '13' '14' '15' '16' '17' '18' '19' '20' '21' '22' '23'
 '24' '25' '26' '27' '28' '29' '30' '31' '32' '33' '34' '35' '36' '37'
 '38' '39' '40' '41' '42' '43' '44' '45' '46' '47' '48' '49' '50' '51'
 '52' '53' '54' '55' '56' '57' '58' '59' '60' '61' '62' '63' '64' '65'
 '66' '67' '68' '69' '70' '71' '72' '73' '74' '75' '76' '77' '78' '79'
 '80' '81' '82' '83' '84' '85' '86' '87' '88' '89' '90' '91' '92' '93'
 '94' '95' '96' '97' '98']


In [51]:

# Remove rows where 'index' is not a valid number
test_df = test_df[test_df['index'].apply(lambda x: str(x).isdigit())]  # Keep only rows with numeric 'index'


In [52]:

# Now convert 'index' to int safely
test_df['index'] = test_df['index'].astype(int)

In [48]:
# Post-process predictions
final_predictions = []
for pred, entity_name in zip(y_pred, test_df['entity_name']):
    final_predictions.append(parse_prediction(pred, entity_name))

In [53]:
# Ensure 'y_pred' is in the correct format as well
y_pred = pd.Series(y_pred, name='prediction')

In [54]:
# Create the final output DataFrame
output_df = pd.DataFrame({
    'index': test_df['index'],
    'prediction': y_pred
})

In [55]:
# Save the output
output_df.to_csv('test_out.csv', index=False)
