In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import easyocr
from tqdm import tqdm
import random
from sklearn.model_selection import train_test_split

In [2]:
def set_seed(seed=42):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), puts cuDNN into deterministic mode with
    benchmarking disabled, and exports ``PYTHONHASHSEED``.

    Args:
        seed: Value handed to every seeding call (default 42).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN autotuning for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # NOTE(review): presumably only influences interpreters started after this
    # point (the running kernel has already fixed its hash seed) - confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)


set_seed(42)

In [3]:
# Directory holding train.csv / test.csv. Set the PRICING_DATA_DIR environment
# variable to run the notebook on another machine; the default is the
# original author's local path, so existing behaviour is unchanged.
DATA_DIR = os.environ.get(
    "PRICING_DATA_DIR",
    "C:/Users/jain2/Downloads/68e8d1d70b66d_student_resource/student_resource/dataset",
)

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [4]:
# Show whether PyTorch reports a usable CUDA device (rendered as the cell output).
torch.cuda.is_available()

True

In [5]:
import os
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

# Module-level EasyOCR reader configured for English ('en'); the GPU is
# requested only when PyTorch reports CUDA as available.
reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())


def extract_text_from_image(url):
    """Run OCR on a single image and return its text as one string.

    Args:
        url: Image reference passed unchanged to ``reader.readtext`` (the
            batch helpers in this notebook feed it ``image_link`` values).

    Returns:
        The OCR results joined with single spaces, or ``""`` when OCR raises,
        so that one unreadable image never aborts a whole batch.
    """
    import logging  # local import keeps this cell independent of the import cell

    try:
        # detail=0: the results are joined directly, so they must be strings.
        fragments = reader.readtext(url, detail=0)
        return " ".join(fragments)
    except Exception as exc:
        # Still best-effort, but leave a trace instead of silently hiding
        # download/decoding failures behind an empty string.
        logging.getLogger(__name__).warning("OCR failed for %r: %s", url, exc)
        return ""


def _ocr_worker_count():
    """Return the OCR pool size: half the CPU cores, clamped to the range 1..4."""
    # cpu_count() // 2 is 0 on a single-core machine and Pool rejects
    # processes < 1, hence the lower bound.
    return max(1, min(4, cpu_count() // 2))


def _process_batch(df, batch_id, split, save_dir):
    """OCR every ``image_link`` in ``df`` and cache the result as a CSV.

    Args:
        df: Frame with an ``image_link`` column; an ``ocr_text`` column is
            added to it in place before saving.
        batch_id: Batch number used in the cache file name.
        split: File-name prefix, ``"train"`` or ``"test"``.
        save_dir: Directory for the cache files (created if missing).

    Returns:
        None. When ``<save_dir>/<split>_batch_<batch_id>.csv`` already exists
        the batch is skipped and ``df`` is left untouched.
    """
    os.makedirs(save_dir, exist_ok=True)
    out_path = os.path.join(save_dir, f"{split}_batch_{batch_id}.csv")
    if os.path.exists(out_path):  # skip if already done
        print(f"Skipping batch {batch_id}, already processed.")
        return

    # NOTE(review): with a spawn start method (e.g. Windows) the workers must be
    # able to import `extract_text_from_image` and build `reader` themselves -
    # confirm this works when launched from a notebook kernel.
    with Pool(processes=_ocr_worker_count()) as pool:
        # imap keeps results in input order, so texts line up with df rows.
        texts = list(tqdm(pool.imap(extract_text_from_image, df["image_link"]), total=len(df)))
    df["ocr_text"] = texts
    df.to_csv(out_path, index=False)
    print(f"Saved {out_path}")


def process_batch_train(df, batch_id, save_dir="ocr_cache"):
    """OCR one training batch and cache it as ``train_batch_<batch_id>.csv``.

    Args:
        df: Training rows with an ``image_link`` column (mutated in place).
        batch_id: Batch number used in the cache file name.
        save_dir: Cache directory (default ``"ocr_cache"``).
    """
    _process_batch(df, batch_id, "train", save_dir)


def process_batch_test(df, batch_id, save_dir="ocr_cache"):
    """OCR one test batch and cache it as ``test_batch_<batch_id>.csv``.

    Args:
        df: Test rows with an ``image_link`` column (mutated in place).
        batch_id: Batch number used in the cache file name.
        save_dir: Cache directory (default ``"ocr_cache"``).
    """
    _process_batch(df, batch_id, "test", save_dir)


# Run OCR in chunks to avoid memory overload. Each loop is bounded by the
# frame's real length rather than a fixed row count, so no rows are silently
# dropped from a larger file and no empty batches are written for a smaller one.
batch_size = 2000
for i in range(0, len(train_df), batch_size):
    batch_id = i // batch_size
    # .copy() because the batch helper adds an `ocr_text` column in place.
    batch_df_train = train_df.iloc[i:i + batch_size].copy()
    process_batch_train(batch_df_train, batch_id)
for i in range(0, len(test_df), batch_size):
    batch_id = i // batch_size
    batch_df_test = test_df.iloc[i:i + batch_size].copy()
    process_batch_test(batch_df_test, batch_id)

Skipping batch 0, already processed.
Skipping batch 1, already processed.
Skipping batch 2, already processed.
Skipping batch 3, already processed.
Skipping batch 4, already processed.
Skipping batch 5, already processed.
Skipping batch 6, already processed.
Skipping batch 7, already processed.
Skipping batch 8, already processed.
Skipping batch 9, already processed.
Skipping batch 10, already processed.
Skipping batch 11, already processed.
Skipping batch 12, already processed.
Skipping batch 13, already processed.
Skipping batch 14, already processed.
Skipping batch 15, already processed.
Skipping batch 16, already processed.
Skipping batch 17, already processed.
Skipping batch 18, already processed.
Skipping batch 19, already processed.
Skipping batch 20, already processed.
Skipping batch 21, already processed.
Skipping batch 22, already processed.
Skipping batch 23, already processed.
Skipping batch 24, already processed.
Skipping batch 25, already processed.
Skipping batch 26, alr

In [6]:
import glob
def _batch_number(path):
    """Return the integer batch id encoded in a ``*_batch_<id>.csv`` path."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return int(stem.rsplit("_", 1)[1])


# Sort by numeric batch id: a plain sorted() is lexicographic and would place
# batch_10 before batch_2, scrambling the original row order.
train_parts = [
    pd.read_csv(f)
    for f in sorted(glob.glob("ocr_cache/train_batch_*.csv"), key=_batch_number)
]
train_df = pd.concat(train_parts, ignore_index=True)
test_parts = [
    pd.read_csv(f)
    for f in sorted(glob.glob("ocr_cache/test_batch_*.csv"), key=_batch_number)
]
test_df = pd.concat(test_parts, ignore_index=True)

In [7]:
# Build the model input from the catalog text plus the OCR text.
# Empty OCR strings written to the cache CSVs are read back as NaN, and
# astype(str) alone would turn them into the literal word "nan" inside the
# model input, so missing values are replaced with "" first.
train_df["combined_text"] = (
    train_df["catalog_content"].fillna("").astype(str)
    + " "
    + train_df["ocr_text"].fillna("").astype(str)
)
test_df["combined_text"] = (
    test_df["catalog_content"].fillna("").astype(str)
    + " "
    + test_df["ocr_text"].fillna("").astype(str)
)

In [8]:
class PricingDataset(Dataset):
    """Torch dataset that tokenizes ``combined_text`` rows on access.

    Args:
        df: Frame with a ``combined_text`` column and, when ``is_train`` is
            true, a ``price`` column.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` that
            returns a mapping of tensors with a leading batch dimension of 1.
        is_train: When true, each item also carries a float ``labels`` tensor
            taken from ``price``.
        max_length: Length every sequence is padded/truncated to (default 256,
            the value previously hard-coded).
    """

    def __init__(self, df, tokenizer, is_train=True, max_length=256):
        self.texts = df["combined_text"].tolist()
        self.is_train = is_train
        if is_train:
            self.labels = df["price"].values.astype(float)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of text rows."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return a dict of per-example tensors."""
        inputs = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Drop the leading batch dimension added by return_tensors="pt".
        item = {k: v.squeeze(0) for k, v in inputs.items()}
        if self.is_train:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

In [9]:
# Checkpoint used for both the tokenizer and the model weights.
model_name = "huawei-noah/TinyBERT_General_4L_312D"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# num_labels=1 gives the classification head a single output (regression).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Hold out 10% of the training rows for validation (fixed seed for a stable split).
train_data, val_data = train_test_split(
    train_df,
    test_size=0.1,
    random_state=42,
)

# Labelled datasets for fitting/evaluation; the test set carries no labels.
train_dataset, val_dataset = (
    PricingDataset(split_frame, tokenizer) for split_frame in (train_data, val_data)
)
test_dataset = PricingDataset(test_df, tokenizer, is_train=False)

In [11]:
import evaluate
import torch.nn.functional as F

def smape(preds, labels):
    """Symmetric mean absolute percentage error, in percent.

    Each term is ``100 * |pred - label| / ((|label| + |pred|) / 2)``. A term
    whose prediction and label are both zero would be 0/0; it is counted as 0
    (a perfect prediction) instead of turning the whole mean into NaN.

    Args:
        preds: Array-like of predictions.
        labels: Array-like of targets, broadcastable against ``preds``.

    Returns:
        The mean of the per-element terms as a float.
    """
    preds = np.asarray(preds, dtype=float)
    labels = np.asarray(labels, dtype=float)
    abs_error = np.abs(preds - labels)
    scale = (np.abs(labels) + np.abs(preds)) / 2
    # Divide only where the denominator is non-zero; other slots keep the 0
    # they were initialised with.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(100 * ratio)

def compute_metrics(eval_pred):
    """Metric hook: score the flattened predictions with ``smape``.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` must
            provide ``.flatten()``.

    Returns:
        A one-entry dict ``{"smape": value}``.
    """
    predictions, targets = eval_pred
    flat_predictions = predictions.flatten()
    score = smape(flat_predictions, targets)
    return {"smape": score}

In [12]:
# Training configuration for the price-regression fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",  # where checkpoints are written
    eval_strategy="epoch",  # evaluate once per epoch ...
    save_strategy="epoch",  # ... and checkpoint on the same schedule
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    warmup_ratio=0.1,
    save_total_limit=2,  # cap on the number of checkpoints kept
    load_best_model_at_end=True,
    metric_for_best_model="smape",  # the key returned by compute_metrics
    greater_is_better=False,  # a lower SMAPE is the better model
    seed=42,  # same value as set_seed(42) above
    data_seed=42
)

In [None]:
from transformers import EarlyStoppingCallback

# Wire model, arguments, datasets and the SMAPE metric into a Trainer and fit.
# NOTE(review): EarlyStoppingCallback is imported in this cell but never passed
# via `callbacks=` - confirm whether early stopping was intended.
# NOTE(review): the truncated warning in this cell's output points at this
# constructor call; presumably a deprecation notice for one of the keyword
# arguments (possibly `tokenizer=`) - verify against the installed
# transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Runs the fine-tuning loop configured by training_args.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


In [17]:
# Score the unlabeled test set and flatten the model output to one value per row.
test_output = trainer.predict(test_dataset)
raw_preds = test_output.predictions.flatten()

# Clamp at zero so no predicted price is negative.
preds = np.maximum(raw_preds, 0)

KeyboardInterrupt: 

In [15]:
# Pair each test sample id with its predicted price and write the submission file.
# NOTE(review): uses `preds` from the prediction cell - make sure that cell
# completed in this kernel session before running this one.
submission_columns = {
    "sample_id": test_df["sample_id"],
    "price": preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("test_out7.csv", index=False)
print("Submission saved to test_out7.csv")

Submission saved to test_out7.csv
