In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from scipy.sparse import hstack
from tqdm import tqdm
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from preprocessing import load_and_combine_csv_files,clean_and_label_data
import joblib
import os
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [11]:
# Function to process data with TF-IDF
def process_data_with_tfidf(df, title_vectorizer=None, desc_vectorizer=None, is_training=False):
    """
    Turn the 'title' and 'description' text columns into TF-IDF matrices.

    Parameters:
        df (pd.DataFrame): Input frame; must contain 'title' and 'description' columns.
        title_vectorizer (TfidfVectorizer, optional): Vectorizer for the 'title' column.
            In training mode a new ``TfidfVectorizer(max_features=5000)`` is created
            when this is omitted.
        desc_vectorizer (TfidfVectorizer, optional): Vectorizer for the 'description'
            column. Same default as ``title_vectorizer`` in training mode.
        is_training (bool): True fits the vectorizers on ``df`` and transforms it;
            False only transforms ``df`` with the vectorizers passed in.

    Returns:
        tuple: ``(title_tfidf, desc_tfidf, title_vectorizer, desc_vectorizer)``.
        The vectorizers are always returned (not only in training mode) so the
        caller can reuse them for later transform-only calls.

    Raises:
        ValueError: If ``is_training`` is False and either vectorizer is missing.
        KeyError: If ``df`` lacks a 'title' or 'description' column.
    """
    if not is_training and (title_vectorizer is None or desc_vectorizer is None):
        # Without this guard the call would die later with an opaque
        # "'NoneType' object has no attribute 'transform'".
        raise ValueError(
            "title_vectorizer and desc_vectorizer must be provided (already fitted) "
            "when is_training=False"
        )

    # Missing text becomes an empty document and every cell is coerced to str,
    # so a stray NaN/None or numeric cell cannot break vectorization.
    titles = df['title'].fillna('').astype(str)
    descriptions = df['description'].fillna('').astype(str)

    if is_training:
        if title_vectorizer is None:
            title_vectorizer = TfidfVectorizer(max_features=5000)
        if desc_vectorizer is None:
            desc_vectorizer = TfidfVectorizer(max_features=5000)

        # Fit and transform during training
        title_tfidf = title_vectorizer.fit_transform(titles)
        desc_tfidf = desc_vectorizer.fit_transform(descriptions)
    else:
        # Transform only during inference
        title_tfidf = title_vectorizer.transform(titles)
        desc_tfidf = desc_vectorizer.transform(descriptions)

    return title_tfidf, desc_tfidf, title_vectorizer, desc_vectorizer

# NOTE: no split helper is defined here; the train/validation/test split is done inline in the next cell.

In [12]:
# Parameters for loading data
directory = "/sise/eliorsu-group/yuvalgor/courses/Data-mining-in-Big-Data/datasets/"
base_filename = "goodwill_items_job_"
num_files = 30

# Positional split boundaries: rows [0, train_end) train, [train_end, val_end)
# validation, [val_end, end) test. Rows are taken in their existing order.
train_end = 100000
val_end = 150000


def _build_feature_matrix(title_features, desc_features, df):
    """Stack the two TF-IDF blocks with df's encoded state and category columns."""
    return hstack([
        title_features,
        desc_features,
        df['state_encoded'].to_numpy().reshape(-1, 1),
        df['category_encoded'].to_numpy().reshape(-1, 1),
    ])


# Load and combine the CSV files
combined_df = load_and_combine_csv_files(directory, base_filename, num_files)

# Clean and label the data
cleaned_df, le_state, le_category = clean_and_label_data(combined_df)

# Apply log transformation to the target variable
cleaned_df['currentPrice'] = np.log1p(cleaned_df['currentPrice'])

# Split the data into train, validation, and test sets.
# .copy() makes each split an independent frame, so adding columns to one of
# them later does not operate on a slice of cleaned_df.
train_df = cleaned_df.iloc[:train_end].copy()
val_df = cleaned_df.iloc[train_end:val_end].copy()
test_df = cleaned_df.iloc[val_end:].copy()

# Fail early with a clear message instead of deep inside training/evaluation.
if min(len(train_df), len(val_df), len(test_df)) == 0:
    raise ValueError(
        f"Not enough rows ({len(cleaned_df)}) for the positional split at "
        f"{train_end}/{val_end}: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}"
    )

# Fit the TF-IDF vectorizers on the training rows only. Fitting on
# train + validation would let validation text shape the vocabulary and IDF
# weights, leaking into the features the validation score is computed on.
title_features_train, desc_features_train, title_vectorizer, desc_vectorizer = process_data_with_tfidf(
    train_df, is_training=True
)

# Validation and test rows are only transformed with the fitted vectorizers
title_features_val, desc_features_val, _, _ = process_data_with_tfidf(
    val_df, title_vectorizer=title_vectorizer, desc_vectorizer=desc_vectorizer, is_training=False
)
title_test, desc_test, _, _ = process_data_with_tfidf(
    test_df, title_vectorizer=title_vectorizer, desc_vectorizer=desc_vectorizer, is_training=False
)

# Combine all features for each split
feature_matrix_train = _build_feature_matrix(title_features_train, desc_features_train, train_df)
feature_matrix_val = _build_feature_matrix(title_features_val, desc_features_val, val_df)
feature_matrix_test = _build_feature_matrix(title_test, desc_test, test_df)

print("\nAfter cleaning, splitting, and TF-IDF processing:")
print("Feature matrix (training) shape:", feature_matrix_train.shape)
print("Feature matrix (validation) shape:", feature_matrix_val.shape)
print("Feature matrix (test) shape:", feature_matrix_test.shape)


error: nothing to repeat at position 73

In [None]:
# Features: take each stacked feature matrix and convert it to CSR format
X_train = feature_matrix_train.tocsr()
X_val = feature_matrix_val.tocsr()
X_test = feature_matrix_test.tocsr()

# Targets: the 'currentPrice' column of each split as a NumPy array
y_train = train_df['currentPrice'].to_numpy()
y_val = val_df['currentPrice'].to_numpy()
y_test = test_df['currentPrice'].to_numpy()

# Report the shape of every prepared array, features first, then targets
shape_report = {
    "X_train shape:": X_train,
    "X_val shape:": X_val,
    "X_test shape:": X_test,
    "y_train shape:": y_train,
    "y_val shape:": y_val,
    "y_test shape:": y_test,
}
print("\nPrepared features and targets:")
for caption, values in shape_report.items():
    print(caption, values.shape)

In [None]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Wrap the sparse feature matrices and their targets in CatBoost Pools
train_pool = Pool(data=X_train, label=y_train)
val_pool = Pool(data=X_val, label=y_val)

# Initialize the CatBoost Regressor
model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    verbose=100
)

# Train with the validation pool as the evaluation set
model.fit(train_pool, eval_set=val_pool, use_best_model=True)

# The targets were log1p-transformed before training, so predictions come out
# on the log scale; expm1 maps both predictions and targets back to prices.
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test_actual = np.expm1(y_test)

# Evaluate on the original price scale
mse = mean_squared_error(y_test_actual, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_actual, y_pred)

# Print evaluation metrics
print("\nTest Set Metrics:")
print(f"RMSE: {rmse:.4f}")
print(f"R2 Score: {r2:.4f}")

# Save the trained model. exist_ok=True creates the directory if needed and
# avoids the check-then-create race of a separate os.path.exists() test.
save_dir = '/sise/eliorsu-group/yuvalgor/courses/Data-mining-in-Big-Data/models'
os.makedirs(save_dir, exist_ok=True)

model_name = 'CatBoostPricePrediction'
model_path = os.path.join(save_dir, f'{model_name}.cbm')
model.save_model(model_path)
print(f'\nModel saved at: {model_path}')


In [None]:
# Predict on the test set. The targets were log1p-transformed before training,
# so predictions are on the log scale; expm1 maps predictions and targets
# back to the original price scale.
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test_actual = np.expm1(y_test)

# Evaluate on the original price scale
mse = mean_squared_error(y_test_actual, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_actual, y_pred)

# Print evaluation metrics
print("\nFinal Test Set Metrics:")
print(f"RMSE: {rmse:.4f}")
print(f"R2 Score: {r2:.4f}")

# Save the trained model under its TF-IDF name. This always writes the file,
# overwriting any existing one at model_path. exist_ok=True creates the
# directory if needed without a separate os.path.exists() check.
save_dir = '/sise/eliorsu-group/yuvalgor/courses/Data-mining-in-Big-Data/models'
os.makedirs(save_dir, exist_ok=True)

model_name = 'CatBoostPricePrediction_TF_IDF'
model_path = os.path.join(save_dir, f'{model_name}.cbm')
model.save_model(model_path)
print(f'\nModel saved at: {model_path}')


In [None]:
from datetime import datetime
# Attach predictions and actual prices (both on the original price scale).
# assign() returns a new frame, so the columns below are not written onto a
# slice of the cleaned data (which would raise SettingWithCopyWarning).
test_df = test_df.assign(predicted_price=y_pred, actual_price=y_test_actual)
test_df['price_difference'] = test_df['predicted_price'] - test_df['actual_price']
# Gap as a percentage of the predicted price; positive when predicted > actual
test_df['price_difference_pct'] = ((test_df['actual_price'] - test_df['predicted_price']) / test_df['predicted_price']) * -100

# Create analysis dataframe with all relevant fields
analysis_df = test_df[[
   'title',
   'actual_price',
   'predicted_price',
   'price_difference',
   'price_difference_pct',
   'mainCategory',
   'description',
   'pickupState',
   'imageUrls',
   'itemId'
]].copy()

# Round numeric columns
numeric_cols = ['actual_price', 'predicted_price', 'price_difference', 'price_difference_pct']
analysis_df[numeric_cols] = analysis_df[numeric_cols].round(2)

# Sort by absolute price difference (predicted - actual), largest gap first
analysis_df = analysis_df.sort_values('price_difference', ascending=False)

# Save results with model name and timestamp
model_name = "CatBoostPricePrediction_TF_IDF"
timestamp = datetime.now().strftime('%Y%m%d_%H%M')
filename = f"{model_name}_undervalued_products_{timestamp}.csv"
results_dir = "/sise/eliorsu-group/yuvalgor/courses/Data-mining-in-Big-Data/results"
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(results_dir, exist_ok=True)
save_path = os.path.join(results_dir, filename)
analysis_df.to_csv(save_path, index=False)
print(f"Analysis results saved at {save_path}")
