In [1]:
import ast
import csv
from itertools import combinations

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Register tqdm's pandas hooks so that `progress_apply` becomes available
tqdm.pandas()

# Read the input dataset from its CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/Featured_Sports_and_Outdoors.csv')

In [None]:
# Assign relevance scores based on features
def relevance_score(row):
    """Blend three review signals into a single relevance value.

    The result is the weighted sum
    ``0.4 * rating + 0.3 * sentiment_score + 0.3 * normalized_helpful_votes``.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing the
            keys ``'rating'``, ``'sentiment_score'`` and
            ``'normalized_helpful_votes'``.

    Returns:
        The weighted sum of the three fields.
    """
    rating_part = row['rating'] * 0.4
    sentiment_part = row['sentiment_score'] * 0.3
    votes_part = row['normalized_helpful_votes'] * 0.3
    return rating_part + sentiment_part + votes_part

# relevance_score only performs column-wise arithmetic, so it can be evaluated
# on the whole DataFrame in a single vectorized pass instead of being invoked
# once per row through `progress_apply(axis=1)`.
df['relevance_score'] = relevance_score(df)

In [None]:
# Initialize CSV file
csv_filename = 'pair_labels.csv'

# Batch processing parameters
batch_size = 1000

# Read the index labels and scores out of the DataFrame once. Doing two
# `df.at` lookups per pair dominates the runtime, because every full batch
# produces batch_size * (batch_size - 1) / 2 pairs.
row_labels = df.index.tolist()
scores = df['relevance_score'].tolist()

# Keep a single file handle open for the whole run instead of reopening the
# file in append mode for every batch.
with open(csv_filename, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['pair_index1', 'pair_index2', 'label'])

    # Pairs are only formed between rows that fall into the same batch.
    for start in tqdm(range(0, len(row_labels), batch_size), desc="Processing Batches"):
        batch = list(zip(row_labels[start:start + batch_size],
                         scores[start:start + batch_size]))

        # Stream the pairs straight into the writer rather than materialising
        # the full combination list. The label is 1 when the first review has
        # the strictly higher relevance score and 0 otherwise (ties give 0).
        csv_writer.writerows(
            (i, j, 1 if score_i > score_j else 0)
            for (i, score_i), (j, score_j) in combinations(batch, 2)
        )

In [2]:
from concurrent.futures import ThreadPoolExecutor

# Read the pair labels back from the CSV file written by the pair-generation step
csv_filename = 'pair_labels.csv'
pair_labels_df = pd.read_csv(filepath_or_buffer=csv_filename)

# Prepare data for training
def create_pairs_dataframe(df, pair_labels_df, output_file='pairs_data.csv', batch_size=1000):
    """Expand labelled index pairs into full review records and write them to CSV.

    Each row of ``pair_labels_df`` is turned into one output row with the
    columns ``review_1`` and ``review_2`` (the complete ``df`` rows, as dicts,
    for ``pair_index1`` and ``pair_index2``) and ``label``.

    Batches are written sequentially from a single thread. Writing from a
    thread pool let several batches append to the same file concurrently, and
    allowed batch 0 (opened in ``'w'`` mode) to truncate rows that other
    batches had already appended.

    Args:
        df: DataFrame of reviews; looked up by index label via ``df.loc``.
        pair_labels_df: DataFrame with the columns ``pair_index1``,
            ``pair_index2`` and ``label``.
        output_file: Path of the CSV file to create (overwritten if present).
        batch_size: Number of pairs converted and flushed to disk at a time.

    Returns:
        None. The result is written to ``output_file``.
    """
    columns = ['review_1', 'review_2', 'label']

    # Write the header exactly once, before any data rows, so the file is
    # well-formed even when there are no pairs at all.
    pd.DataFrame(columns=columns).to_csv(output_file, index=False, mode='w', header=True)

    # The same review takes part in many pairs, so convert each row to a dict
    # only the first time it is needed.
    row_cache = {}

    def review_record(index_label):
        if index_label not in row_cache:
            row_cache[index_label] = df.loc[index_label].to_dict()
        return row_cache[index_label]

    # Stepping by batch_size avoids scheduling an empty trailing batch when
    # the number of pairs is an exact multiple of batch_size.
    for start_idx in tqdm(range(0, len(pair_labels_df), batch_size), desc="Creating Pairs DataFrame"):
        batch_pair_labels = pair_labels_df.iloc[start_idx:start_idx + batch_size]

        batch_pairs_data = [
            {'review_1': review_record(i), 'review_2': review_record(j), 'label': label}
            for i, j, label in zip(
                batch_pair_labels['pair_index1'].tolist(),
                batch_pair_labels['pair_index2'].tolist(),
                batch_pair_labels['label'].tolist(),
            )
        ]

        # Append this batch below the rows that are already in the file
        pd.DataFrame(batch_pairs_data, columns=columns).to_csv(
            output_file, index=False, mode='a', header=False
        )

# Build pairs_data.csv (the default output file) from the reviews and their pair labels
create_pairs_dataframe(df=df, pair_labels_df=pair_labels_df)

Creating Pairs DataFrame:   0%|          | 56/45485 [00:57<12:56:15,  1.03s/it]


KeyboardInterrupt: 

In [None]:
# Load the expanded pair records from disk
pairs_df = pd.read_csv(filepath_or_buffer='pairs_data.csv')

# Hold out 20% of the pairs for testing; the fixed seed keeps the split repeatable
train_df, test_df = train_test_split(pairs_df, random_state=42, test_size=0.2)

In [None]:
# Convert data for XGBoost
class _NonFiniteFloatNames(ast.NodeTransformer):
    """Rewrite the bare names ``nan`` and ``inf`` into float constants.

    ``str()`` of a dict renders missing values as the bare token ``nan``,
    which ``ast.literal_eval`` rejects because it is a name, not a literal.
    """

    def visit_Name(self, node):
        if node.id in ('nan', 'inf'):
            return ast.copy_location(ast.Constant(value=float(node.id)), node)
        return node


def _parse_review_cell(cell):
    """Decode one review dict that was serialised to text with ``str(dict)``."""
    tree = ast.parse(cell, mode='eval')
    # literal_eval only accepts literal nodes, so no code from the file is run
    return ast.literal_eval(_NonFiniteFloatNames().visit(tree))


def create_xgb_data(df):
    """Build the feature matrix and label vector for the pairwise model.

    Every row contributes one sample whose features are the differences
    ``review_1[feature] - review_2[feature]`` for a fixed list of features.

    ``review_1`` / ``review_2`` cells may be dicts, or the textual form such
    dicts take after a CSV round-trip. Indexing the text directly raised
    ``TypeError``, so serialised cells are decoded first.

    Args:
        df: DataFrame with the columns ``review_1``, ``review_2`` and
            ``label``.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: one row of feature differences per
        input row, and the matching labels.
    """
    feature_names = ['rating', 'word_count', 'avg_sentence_length', 'sentiment_score', 'review_length', 'normalized_helpful_votes']

    # The same serialised review shows up in many pairs; decode each distinct
    # string only once.
    parsed_cells = {}

    def as_review(cell):
        if not isinstance(cell, str):
            return cell
        if cell not in parsed_cells:
            parsed_cells[cell] = _parse_review_cell(cell)
        return parsed_cells[cell]

    X = []
    y = []
    for _, row in df.iterrows():
        review_1 = as_review(row['review_1'])
        review_2 = as_review(row['review_2'])
        X.append([review_1[name] - review_2[name] for name in feature_names])
        y.append(row['label'])
    return np.array(X), np.array(y)

# Turn both splits into (feature matrix, label vector) pairs
X_train, y_train = create_xgb_data(df=train_df)
X_test, y_test = create_xgb_data(df=test_df)

In [None]:
# Train XGBoost model with GPU
# Wrap both splits in XGBoost's DMatrix container
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# NOTE(review): 'gpu_hist' is reported as deprecated in XGBoost 2.0 in favour
# of tree_method='hist' with device='cuda' - confirm against the installed version.
# NOTE(review): no `evals` list is passed to xgb.train below, so the configured
# eval_metric is presumably never reported during training - confirm.
params = dict(
    objective='rank:pairwise',
    eta=0.1,
    gamma=1.0,
    min_child_weight=0.1,
    max_depth=6,
    tree_method='gpu_hist',
    eval_metric='ndcg',
)

# Fit the booster for 100 boosting rounds
bst = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)

In [None]:
# Predict and evaluate
# Score the test matrix with the trained booster
y_pred = bst.predict(dtest)

# ndcg_score expects 2-D input, so the whole test set is wrapped as a single ranking
ndcg = ndcg_score(y_true=[y_test], y_score=[y_pred])

In [None]:
# Report the NDCG value computed from the test labels and predictions
print(f'NDCG Score: {ndcg}')