In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from scipy.sparse import hstack
from tqdm import tqdm
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from preprocessing import load_and_combine_csv_files,clean_and_label_data
from PIL import Image
import requests
from io import BytesIO
from torchvision import transforms
from torchvision.models import resnet50, ResNet50_Weights
import os

# Select the compute backend once: CUDA when PyTorch can see a GPU, CPU otherwise.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


In [2]:


class GoodwillDataset(Dataset):
    """Map-style dataset yielding tokenized text, an image tensor and a price per row.

    Each item is built from the ``title``, ``description``, ``imageUrls`` and
    ``currentPrice`` columns of ``df``. The image is downloaded when the item is
    accessed, so iterating the dataset needs network connectivity.

    Args:
        df: DataFrame holding the four columns listed above.
        tokenizer: Callable invoked with ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors='pt'``; its result must expose
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Token length every text field is padded/truncated to.
        request_timeout: Seconds to wait for an image download before giving
            up and falling back to a blank image.
    """

    # Shape of the tensor produced by ``image_transforms`` for an RGB image.
    IMAGE_SHAPE = (3, 224, 224)

    def __init__(self, df, tokenizer, max_length=128, request_timeout=10.0):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.request_timeout = request_timeout

        # Image preprocessing: fixed 224x224 size, then per-channel normalization.
        self.image_transforms = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def load_image_from_url(self, url):
        """Download ``url`` and return it as a preprocessed image tensor.

        Any failure (network error, timeout, HTTP error status, undecodable
        image) yields an all-zero tensor of ``IMAGE_SHAPE`` so that one bad row
        does not abort a whole epoch.
        """
        try:
            # Without a timeout a stalled server would block the worker forever.
            response = requests.get(url, timeout=self.request_timeout)
            response.raise_for_status()
            with Image.open(BytesIO(response.content)) as img:
                return self.image_transforms(img.convert('RGB'))
        except Exception:
            # Deliberate best-effort fallback. ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt / SystemExit propagate.
            return torch.zeros(*self.IMAGE_SHAPE)

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def _encode(self, text):
        """Tokenize one string to fixed-length ``input_ids`` / ``attention_mask``."""
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # squeeze(0) drops only the leading batch axis added by return_tensors='pt';
        # a bare squeeze() would also collapse the token axis when max_length == 1.
        return encoding['input_ids'].squeeze(0), encoding['attention_mask'].squeeze(0)

    def __getitem__(self, idx):
        """Build the model inputs and the target price for row ``idx``.

        Returns:
            dict with ``title_input_ids``, ``title_attention_mask``,
            ``desc_input_ids``, ``desc_attention_mask`` (each of length
            ``max_length``), ``image`` and a scalar float ``price`` tensor.
        """
        # Look the row up once instead of once per column.
        row = self.df.iloc[idx]

        title_ids, title_mask = self._encode(str(row['title']))
        desc_ids, desc_mask = self._encode(str(row['description']))

        # NOTE(review): the whole cell is used as a single URL - confirm the
        # column never holds several URLs in one value.
        image_tensor = self.load_image_from_url(str(row['imageUrls']))

        price = float(row['currentPrice'])

        return {
            'title_input_ids': title_ids,
            'title_attention_mask': title_mask,
            'desc_input_ids': desc_ids,
            'desc_attention_mask': desc_mask,
            'image': image_tensor,
            'price': torch.tensor(price, dtype=torch.float)
        }

In [3]:
class PricePredictor(nn.Module):
    """Multimodal regressor combining title text, description text and an image.

    The same text encoder embeds the title and the description; their first-token
    vectors are concatenated and fused, joined with the flattened image features,
    and passed through an MLP that emits one non-negative value per sample.

    Args:
        transformer_model: Text encoder exposing ``config.hidden_size`` and
            returning an object with ``last_hidden_state`` when called with
            ``input_ids`` / ``attention_mask``.
        image_encoder: Module mapping an image batch to features that flatten
            to ``image_dim`` (2048) values per sample.
    """

    def __init__(self, transformer_model, image_encoder):
        super().__init__()

        # Text encoder (BERT)
        self.transformer = transformer_model
        self.transformer_dim = self.transformer.config.hidden_size

        # Image encoder (ResNet50)
        self.image_encoder = image_encoder
        self.image_dim = 2048  # ResNet50 output dimension

        # Text fusion layer: [title ; description] -> transformer_dim
        self.text_fusion = nn.Sequential(
            nn.Linear(self.transformer_dim * 2, self.transformer_dim),
            nn.LayerNorm(self.transformer_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        # Multimodal fusion layer: [text ; image] -> 512
        self.multimodal_fusion = nn.Sequential(
            nn.Linear(self.transformer_dim + self.image_dim, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        # Price prediction layers: 512 -> 256 -> 128 -> 1
        self.regressor = nn.Sequential(
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 1)
        )

        # Final activation for positive prices.
        # NOTE(review): a ReLU on the regression output passes no gradient while
        # the pre-activation is negative; consider Softplus if predictions
        # collapse to 0 during training.
        self.final_activation = nn.ReLU()

    def forward(self, title_input_ids, title_attention_mask,
                desc_input_ids, desc_attention_mask, images):
        """Predict one price per sample.

        Returns:
            Tensor of shape ``(batch,)``; the batch axis is kept even when the
            batch holds a single sample.
        """
        # Process text
        title_output = self.transformer(
            input_ids=title_input_ids,
            attention_mask=title_attention_mask
        )
        desc_output = self.transformer(
            input_ids=desc_input_ids,
            attention_mask=desc_attention_mask
        )

        # Get [CLS] token outputs (first position of each sequence)
        title_features = title_output.last_hidden_state[:, 0, :]
        desc_features = desc_output.last_hidden_state[:, 0, :]

        # Fuse text features
        text_combined = torch.cat([title_features, desc_features], dim=1)
        text_fused = self.text_fusion(text_combined)

        # Process image and flatten everything after the batch axis
        image_features = torch.flatten(self.image_encoder(images), start_dim=1)

        # Combine text and image features
        multimodal_features = torch.cat([text_fused, image_features], dim=1)
        fused_features = self.multimodal_fusion(multimodal_features)

        # Final prediction. squeeze(-1) removes only the trailing size-1 feature
        # axis; a bare squeeze() would also drop the batch axis for a batch of one
        # and hand the loss / evaluation code a 0-d tensor.
        output = self.regressor(fused_features)
        return self.final_activation(output).squeeze(-1)
        
def count_parameters_by_layer(model):
    """Print a parameter-count report for ``model``.

    The report lists the totals of ``model.transformer`` and
    ``model.image_encoder``, then one line per sub-layer of
    ``model.text_fusion``, ``model.multimodal_fusion`` and ``model.regressor``,
    and finally the total over the whole model. Nothing is returned.
    """
    def _numel(module):
        # Number of scalar parameters held by ``module`` (recursively).
        return sum(p.numel() for p in module.parameters())

    rule = "-" * 50
    print("\nParameters by Layer:")
    print(rule)

    # Pre-trained encoders are reported as a single total each.
    print(f"1. BERT Encoder: {_numel(model.transformer):,} parameters")
    print(f"2. ResNet Encoder: {_numel(model.image_encoder):,} parameters")

    # Newly added stacks are broken down layer by layer.
    itemized_sections = (
        ("\n3. Text Fusion Layer:", "text_fusion"),
        ("\n4. Multimodal Fusion Layer:", "multimodal_fusion"),
        ("\n5. Regressor Layers:", "regressor"),
    )
    for heading, attribute in itemized_sections:
        print(heading)
        for position, sublayer in enumerate(getattr(model, attribute), start=1):
            print(f"   {position}. {type(sublayer).__name__}: {_numel(sublayer):,} parameters")

    # Grand total across every parameter of the model.
    print("\n" + rule)
    print(f"Total Parameters: {_numel(model):,}")



In [4]:
# Where the CSV shards live and how they are named / counted.
directory = "/sise/eliorsu-group/yuvalgor/courses/Data-mining-in-Big-Data/datasets/"
base_filename = "goodwill_items_job_"
num_files = 30

# Combine the shards with the project helper, then run the project's
# cleaning / label-encoding step (it also returns the two fitted encoders).
combined_df = load_and_combine_csv_files(directory, base_filename, num_files)
cleaned_df, le_state, le_category = clean_and_label_data(combined_df)

# Positional hold-out split: the leading 400000 rows are used for training,
# every remaining row becomes the test set. Copies keep later column
# assignments from writing into ``cleaned_df``.
train_rows = 400000
train_val_df = cleaned_df.iloc[:train_rows].copy()
test_df = cleaned_df.iloc[train_rows:].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title'] = df['title'].apply(lambda x: str(x).lower().strip() if pd.notna(x) else '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description'] = df['description'].apply(lambda x: str(x).lower().strip() if pd.notna(x) else '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['state_encod

In [5]:
# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize text encoder (BERT) together with its matching tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
transformer_model = AutoModel.from_pretrained('bert-base-uncased')

# Initialize image encoder (ResNet50)
image_model = resnet50(weights=ResNet50_Weights.DEFAULT)
# Remove the final classification layer so the encoder emits pooled features
image_model = nn.Sequential(*list(image_model.children())[:-1])

# Create datasets
train_dataset = GoodwillDataset(train_val_df, tokenizer)
test_dataset = GoodwillDataset(test_df, tokenizer)

# Create dataloaders with multiple workers for faster loading
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,  # Adjust based on your CPU cores
    pin_memory=True  # Faster data transfer to GPU
)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,  # keep row order so predictions line up with test_df
    num_workers=4,
    pin_memory=True
)

# Initialize model with both encoders
model = PricePredictor(
    transformer_model=transformer_model,
    image_encoder=image_model
).to(device)

count_parameters_by_layer(model)

# Freeze the pre-trained models (optional): uncomment to train only the new layers
# for param in transformer_model.parameters():
#     param.requires_grad = False
# for param in image_model.parameters():
#     param.requires_grad = False

# Loss function
criterion = nn.MSELoss()

# Optimizer with different learning rates for pre-trained and new layers.
# Parameters are split by the attribute prefix they are registered under in
# PricePredictor; a prefix match cannot be confused by a sub-module that merely
# contains one of these words somewhere in its name.
pretrained_prefixes = ('transformer.', 'image_encoder.')
pretrained_params, new_layer_params = [], []
for param_name, param in model.named_parameters():
    if param_name.startswith(pretrained_prefixes):
        pretrained_params.append(param)
    else:
        new_layer_params.append(param)

optimizer_grouped_parameters = [
    {
        'params': new_layer_params,
        'lr': 1e-3  # Higher learning rate for new layers
    },
    {
        'params': pretrained_params,
        'lr': 1e-5  # Lower learning rate for pre-trained layers
    }
]

optimizer = torch.optim.AdamW(
    optimizer_grouped_parameters,
    weight_decay=0.01  # decoupled weight decay, applied to both groups
)

# Learning rate scheduler (optional). It only takes effect when
# ``scheduler.step(metric)`` is called once per epoch. The ``verbose`` flag is
# deprecated in recent PyTorch releases, so it is not passed; inspect
# ``optimizer.param_groups`` to see the current learning rates.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=2
)


Using device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Parameters by Layer:
--------------------------------------------------
1. BERT Encoder: 109,482,240 parameters
2. ResNet Encoder: 23,508,032 parameters

3. Text Fusion Layer:
   1. Linear: 1,180,416 parameters
   2. LayerNorm: 1,536 parameters
   3. ReLU: 0 parameters
   4. Dropout: 0 parameters

4. Multimodal Fusion Layer:
   1. Linear: 1,442,304 parameters
   2. LayerNorm: 1,024 parameters
   3. ReLU: 0 parameters
   4. Dropout: 0 parameters

5. Regressor Layers:
   1. Linear: 131,328 parameters
   2. LayerNorm: 512 parameters
   3. ReLU: 0 parameters
   4. Dropout: 0 parameters
   5. Linear: 32,896 parameters
   6. LayerNorm: 256 parameters
   7. ReLU: 0 parameters
   8. Dropout: 0 parameters
   9. Linear: 129 parameters

--------------------------------------------------
Total Parameters: 135,780,673




In [None]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called as ``model(title_input_ids, title_attention_mask,
            desc_input_ids, desc_attention_mask, images)``.
        train_loader: Iterable of batches; each batch is a mapping with the keys
            ``title_input_ids``, ``title_attention_mask``, ``desc_input_ids``,
            ``desc_attention_mask``, ``image`` and ``price``.
        criterion: Loss callable taking ``(prediction, target)``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses as a float, or ``nan`` when the loader
        yields no batches.
    """
    model.train()
    total_loss = 0.0
    batches_done = 0

    progress_bar = tqdm(train_loader)
    # Count batches ourselves: tqdm refreshes ``progress_bar.n`` only when it
    # redraws, so it is not a reliable denominator for the running average.
    for batches_done, batch in enumerate(progress_bar, start=1):
        # Move all inputs to device
        title_input_ids = batch['title_input_ids'].to(device)
        title_attention_mask = batch['title_attention_mask'].to(device)
        desc_input_ids = batch['desc_input_ids'].to(device)
        desc_attention_mask = batch['desc_attention_mask'].to(device)
        images = batch['image'].to(device)
        price = batch['price'].to(device)

        # Forward pass
        optimizer.zero_grad()
        output = model(title_input_ids, title_attention_mask,
                       desc_input_ids, desc_attention_mask,
                       images)

        # Calculate loss. Align the prediction with the target's shape so a
        # 0-d or (batch, 1) output is never silently broadcast by the loss.
        loss = criterion(output.reshape(price.shape), price)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Update total loss and progress bar
        total_loss += loss.item()
        progress_bar.set_description(f'Loss: {total_loss / batches_done:.4f}')

    if batches_done == 0:
        # Nothing was trained on; report "no value" instead of dividing by zero.
        return float('nan')
    return total_loss / batches_done

num_epochs = 2
# Training loop
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    avg_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    # Keep a permanent record of the epoch result; the progress bar line is
    # overwritten as training proceeds.
    print(f'Epoch {epoch + 1} average training loss: {avg_loss:.4f}')
    # ReduceLROnPlateau only lowers the learning rates when it is fed a metric.
    # NOTE(review): this monitors the training loss because no validation
    # split is evaluated in this loop.
    scheduler.step(avg_loss)



Epoch 1/2


Loss: 23584.2382:   1%|          | 226/26179 [36:19<61:56:37,  8.59s/it] 

In [None]:
def evaluate(model, test_loader, device, text_tokenizer=None):
    """Run ``model`` over ``test_loader`` without gradients and collect results.

    Args:
        model: Module called as ``model(title_input_ids, title_attention_mask,
            desc_input_ids, desc_attention_mask, images)``.
        test_loader: Iterable of batches with the keys ``title_input_ids``,
            ``title_attention_mask``, ``desc_input_ids``,
            ``desc_attention_mask``, ``image`` and ``price``.
        device: Device the model inputs are moved to.
        text_tokenizer: Object providing ``batch_decode``; when omitted, the
            notebook-level ``tokenizer`` is used.

    Returns:
        ``(predictions, actuals, titles, descriptions)``: two 1-D NumPy arrays
        followed by two lists of decoded strings, all in loader order.
    """
    decoder = tokenizer if text_tokenizer is None else text_tokenizer

    model.eval()
    predictions = []
    actuals = []
    titles = []
    descriptions = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            # Move batch to device (the target stays on the CPU; it is only recorded)
            title_input_ids = batch['title_input_ids'].to(device)
            title_attention_mask = batch['title_attention_mask'].to(device)
            desc_input_ids = batch['desc_input_ids'].to(device)
            desc_attention_mask = batch['desc_attention_mask'].to(device)
            images = batch['image'].to(device)
            price = batch['price']

            # Get predictions
            output = model(title_input_ids, title_attention_mask,
                           desc_input_ids, desc_attention_mask,
                           images)

            # Store predictions and actual values. reshape(-1) guarantees a 1-D
            # view, so a 0-d output from a single-sample batch can still be
            # appended element by element.
            predictions.extend(output.detach().cpu().reshape(-1).numpy())
            actuals.extend(price.reshape(-1).numpy())

            # Store the text as the model saw it: decoded from the (possibly
            # truncated) token ids, not the raw DataFrame strings.
            titles.extend(decoder.batch_decode(title_input_ids, skip_special_tokens=True))
            descriptions.extend(decoder.batch_decode(desc_input_ids, skip_special_tokens=True))

    return np.array(predictions), np.array(actuals), titles, descriptions

# Run inference over the held-out loader; the returned arrays and decoded
# texts are kept as notebook-level variables for the cells that follow.
eval_results = evaluate(model, test_loader, device)
predictions, actuals, titles, descriptions = eval_results



In [None]:

model_name = 'ImageTextFusion'
save_dir = '/sise/eliorsu-group/yuvalgor/courses/Data-mining-in-Big-Data/models'
# exist_ok=True creates the directory only when needed and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)

# Calculate metrics. They are cast to built-in floats so the checkpoint stores
# plain numbers rather than NumPy scalar objects.
mse = float(mean_squared_error(actuals, predictions))
rmse = float(np.sqrt(mse))
r2 = float(r2_score(actuals, predictions))

print("\nTest Set Metrics:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R2 Score: {r2:.4f}")

# Save model
model_path = os.path.join(save_dir, f'{model_name}.pth')
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'test_mse': mse,
    'test_rmse': rmse,
    'test_r2': r2,
    # NOTE(review): this pickles the whole tokenizer object, so loading the
    # file needs a compatible `transformers` install and full unpickling.
    # Kept for existing readers of the checkpoint; prefer 'tokenizer_name'.
    'tokenizer': tokenizer,
    # Identifier the tokenizer was loaded from, if the object exposes one.
    'tokenizer_name': getattr(tokenizer, 'name_or_path', None),
}, model_path)
print(f'\nModel saved at: {model_path}')

In [None]:
from datetime import datetime
# Add predictions and calculate metrics
test_df['predicted_price'] = predictions
test_df['actual_price'] = test_df['currentPrice']
test_df['price_difference'] = test_df['predicted_price'] - test_df['actual_price']
# Gap expressed as a percentage of the predicted price. The model's output
# passes through a ReLU, so a prediction can be exactly 0; such rows have no
# meaningful percentage and are stored as NaN instead of +/-inf.
nonzero_predicted = test_df['predicted_price'].where(test_df['predicted_price'] != 0)
test_df['price_difference_pct'] = test_df['price_difference'] / nonzero_predicted * 100

# Create analysis dataframe with all relevant fields
analysis_df = test_df[[
    'title',
    'actual_price',
    'predicted_price',
    'price_difference',
    'price_difference_pct',
    'mainCategory',
    'description',
    'pickupState',
    'imageUrls',
    'itemId'
]].copy()

# Round numeric columns
numeric_cols = ['actual_price', 'predicted_price', 'price_difference', 'price_difference_pct']
analysis_df[numeric_cols] = analysis_df[numeric_cols].round(2)

# Sort by price difference (descending order - largest gap first)
analysis_df = analysis_df.sort_values('price_difference', ascending=False)

# Save results under a timestamped name so earlier exports are not overwritten
timestamp = datetime.now().strftime('%Y%m%d_%H%M')
model_name = 'ImageTextFusion'
filename = f"{model_name}_predictions_{timestamp}.csv"
save_path = f"/sise/eliorsu-group/yuvalgor/courses/Data-mining-in-Big-Data/results/{filename}"

# Create directory if it doesn't exist
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Save analysis
analysis_df.to_csv(save_path, index=False)

print(f"\nAnalysis results saved at: {save_path}")
print(f"Total predictions: {len(analysis_df)}")

# Display sample of predictions
print('\nSample of predictions:')
sample_cols = ['title', 'actual_price', 'predicted_price', 'price_difference_pct']
print(analysis_df[sample_cols].head())