In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from scipy.sparse import hstack
from tqdm import tqdm
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from preprocessing import load_and_combine_csv_files,clean_and_label_data
import joblib
import os
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [14]:
class GoodwillDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        # Get text data
        title = str(self.df.iloc[idx]['title'])
        description = str(self.df.iloc[idx]['description'])
        
        # Tokenize title and description separately
        title_encoding = self.tokenizer(
            title,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        desc_encoding = self.tokenizer(
            description,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        # Get target price
        price = float(self.df.iloc[idx]['currentPrice'])
        
        return {
            'title_input_ids': title_encoding['input_ids'].squeeze(),
            'title_attention_mask': title_encoding['attention_mask'].squeeze(),
            'desc_input_ids': desc_encoding['input_ids'].squeeze(),
            'desc_attention_mask': desc_encoding['attention_mask'].squeeze(),
            'price': torch.tensor(price, dtype=torch.float)
        }

In [15]:
class PricePredictor(nn.Module):
    def __init__(self, transformer_model):
        super().__init__()
        
        # Transformer encoder
        self.transformer = transformer_model
        self.transformer_dim = self.transformer.config.hidden_size
        
        # Fusion layer
        self.fusion = nn.Sequential(
            nn.Linear(self.transformer_dim * 2, self.transformer_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Fully connected layers
        self.fc = nn.Sequential(
            nn.Linear(self.transformer_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1)
        )
        
        # ReLU for final output to ensure positive prices
        self.final_activation = nn.ReLU()
        
    def forward(self, title_input_ids, title_attention_mask, 
                desc_input_ids, desc_attention_mask):
        # Get transformer outputs for title and description
        title_output = self.transformer(
            input_ids=title_input_ids,
            attention_mask=title_attention_mask
        )
        
        desc_output = self.transformer(
            input_ids=desc_input_ids,
            attention_mask=desc_attention_mask
        )
        
        # Get [CLS] token outputs
        title_features = title_output.last_hidden_state[:, 0, :]
        desc_features = desc_output.last_hidden_state[:, 0, :]
        
        # Concatenate features
        combined = torch.cat([title_features, desc_features], dim=1)
        
        # Fuse features
        fused = self.fusion(combined)
        
        # Pass through fully connected layers
        output = self.fc(fused)
        
        # Ensure positive output
        return self.final_activation(output).squeeze()

In [16]:
def count_parameters_by_layer(model):
   print("\nParameters by Layer:")
   print("-" * 50)
   
   # BERT layers
   bert_params = sum(p.numel() for p in model.transformer.parameters())
   print(f"1. BERT Encoder: {bert_params:,} parameters")
   
   # Fusion layers 
   print("\n2. Fusion Layer:")
   for i, layer in enumerate(model.fusion):
       layer_params = sum(p.numel() for p in layer.parameters())
       print(f"   {i+1}. {layer.__class__.__name__}: {layer_params:,} parameters")
   
   # Fully Connected layers
   print("\n3. Fully Connected Layers:")
   for i, layer in enumerate(model.fc):
       layer_params = sum(p.numel() for p in layer.parameters())
       print(f"   {i+1}. {layer.__class__.__name__}: {layer_params:,} parameters")
   
   # Total parameters
   total_params = sum(p.numel() for p in model.parameters())
   print("\n" + "-" * 50)
   print(f"Total Parameters: {total_params:,}")


In [17]:
# Parameters for loading data
directory = "/sise/eliorsu-group/yuvalgor/courses/Data-mining-in-Big-Data/datasets/"
base_filename = "goodwill_items_job_"
num_files = 30

# Load and combine the CSV files
combined_df = load_and_combine_csv_files(directory, base_filename, num_files)

# Clean and label the data
cleaned_df, le_state, le_category = clean_and_label_data(combined_df)

# Split data using first 400000 rows for training
train_val_df = cleaned_df.iloc[:400000].copy()
test_df = cleaned_df.iloc[400000:].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title'] = df['title'].apply(lambda x: str(x).lower().strip() if pd.notna(x) else '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description'] = df['description'].apply(lambda x: str(x).lower().strip() if pd.notna(x) else '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['state_encod

In [18]:
# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize tokenizer and transformer model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
transformer_model = AutoModel.from_pretrained('bert-base-uncased')

# Create datasets
train_dataset = GoodwillDataset(train_val_df, tokenizer)
test_dataset = GoodwillDataset(test_df, tokenizer)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize model
model = PricePredictor(transformer_model).to(device)
count_parameters_by_layer(model)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Parameters by Layer:
--------------------------------------------------
1. BERT Encoder: 109,482,240 parameters

2. Fusion Layer:
   1. Linear: 1,180,416 parameters
   2. ReLU: 0 parameters
   3. Dropout: 0 parameters

3. Fully Connected Layers:
   1. Linear: 393,728 parameters
   2. ReLU: 0 parameters
   3. Dropout: 0 parameters
   4. Linear: 131,328 parameters
   5. ReLU: 0 parameters
   6. Dropout: 0 parameters
   7. Linear: 257 parameters

--------------------------------------------------
Total Parameters: 111,187,969


In [19]:
import time
def train_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    
    progress_bar = tqdm(train_loader)
    for batch in progress_bar:
        # Move batch to device
        title_input_ids = batch['title_input_ids'].to(device)
        title_attention_mask = batch['title_attention_mask'].to(device)
        desc_input_ids = batch['desc_input_ids'].to(device)
        desc_attention_mask = batch['desc_attention_mask'].to(device)
        price = batch['price'].to(device)
        
        # Forward pass
        optimizer.zero_grad()
        output = model(title_input_ids, title_attention_mask,
                      desc_input_ids, desc_attention_mask)
        
        # Calculate loss
        loss = criterion(output, price)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        # Update total loss and progress bar
        total_loss += loss.item()
        avg_loss = total_loss / (progress_bar.n + 1)  # Current average loss
        progress_bar.set_description(f'Loss: {avg_loss:.4f}')
    
    return total_loss / len(train_loader)

num_epochs = 3
save_dir = '/sise/eliorsu-group/yuvalgor/courses/Data-mining-in-Big-Data/models'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Training loop
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    avg_loss = train_epoch(model, train_loader, criterion, optimizer, device)


Epoch 1/3


Loss: 15645.8238: 100%|██████████| 12500/12500 [50:32<00:00,  4.12it/s]


Epoch 2/3


Loss: 15277.4210:  97%|█████████▋| 12173/12500 [49:21<01:19,  4.11it/s]


KeyboardInterrupt: 

In [None]:
def evaluate(model, test_loader, device):
   model.eval()
   predictions = []
   actuals = []
   
   with torch.no_grad():
       for batch in tqdm(test_loader, desc="Evaluating"):
           # Move batch to device
           title_input_ids = batch['title_input_ids'].to(device)
           title_attention_mask = batch['title_attention_mask'].to(device)
           desc_input_ids = batch['desc_input_ids'].to(device)
           desc_attention_mask = batch['desc_attention_mask'].to(device)
           price = batch['price']
           
           # Get predictions
           output = model(title_input_ids, title_attention_mask,
                        desc_input_ids, desc_attention_mask)
           
           predictions.extend(output.cpu().numpy())
           actuals.extend(price.numpy())
   
   return np.array(predictions), np.array(actuals)

# Create save directory if it doesn't exist
save_dir = '/sise/eliorsu-group/yuvalgor/courses/Data-mining-in-Big-Data/models'
if not os.path.exists(save_dir):
   os.makedirs(save_dir)

# Get predictions and calculate metrics
predictions, actuals = evaluate(model, test_loader, device)
mse = mean_squared_error(actuals, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(actuals, predictions)

print("\nTest Set Metrics:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}") 
print(f"R2 Score: {r2:.4f}")

# Save model with metrics
model_name = 'EmbedBERT'
model_path = os.path.join(save_dir, f'{model_name}.pth')
torch.save({
  'model_state_dict': model.state_dict(),
  'test_mse': mse,
  'test_rmse': rmse,
  'test_r2': r2,
  'tokenizer': tokenizer,
}, model_path)
print(f'\nModel saved at: {model_path}')

In [None]:
from datetime import datetime
timestamp = datetime.now().strftime('%Y%m%d_%H%M')
# Add predictions and calculate metrics
test_df['predicted_price'] = predictions
test_df['actual_price'] = test_df['currentPrice']
test_df['price_difference'] = test_df['predicted_price'] - test_df['actual_price']
test_df['price_difference_pct'] = ((test_df['actual_price'] - test_df['predicted_price']) / test_df['predicted_price']) * -100

# Create analysis dataframe
analysis_df = test_df[[
   'title',
   'actual_price',
   'predicted_price', 
   'price_difference',
   'price_difference_pct',
   'mainCategory',
   'description',
   'pickupState',
   'imageUrls',
   'itemId'
]].copy()

# Round numeric columns
numeric_cols = ['actual_price', 'predicted_price', 'price_difference', 'price_difference_pct']
analysis_df[numeric_cols] = analysis_df[numeric_cols].round(2)

# Sort by price difference
analysis_df = analysis_df.sort_values('price_difference', ascending=False)

# Save results
results_dir = '/sise/eliorsu-group/yuvalgor/courses/Data-mining-in-Big-Data/results'
if not os.path.exists(results_dir):
   os.makedirs(results_dir)
   
filename = f"{model_name}_analysis_{timestamp}.csv"
save_path = os.path.join(results_dir, filename)
analysis_df.to_csv(save_path, index=False)
print(f"Analysis results saved at {save_path}")