In [1]:
import pandas as pd
from itertools import combinations
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import ndcg_score
import numpy as np
from tqdm import tqdm
import csv

# Integrate tqdm with Pandas
tqdm.pandas()

df = pd.read_csv('data/Featured_Sports_and_Outdoors.csv')
df = df.head(10000)

In [2]:
# Assign relevance scores based on features
def relevance_score(row):
    return row['rating'] * 0.4 + row['sentiment_score'] * 0.3 + row['normalized_helpful_votes'] * 0.3

df['relevance_score'] = df.progress_apply(relevance_score, axis=1)

100%|██████████| 10000/10000 [00:00<00:00, 26993.05it/s]


In [3]:
# Initialize CSV file
csv_filename = 'pair_labels_10000.csv'
with open(csv_filename, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['pair_index1', 'pair_index2', 'label'])

# Batch processing parameters
batch_size = 1000

# Process pairs in batches
for start in tqdm(range(0, len(df.index), batch_size), desc="Processing Batches"):
    batch_indices = df.index[start:start+batch_size]
    
    # Generate pairs within the batch
    batch_pairs = list(combinations(batch_indices, 2))
    
    # Open CSV file in append mode
    with open(csv_filename, 'a', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        
        # Process each pair in the batch
        for i, j in batch_pairs:
            if df.at[i, 'relevance_score'] > df.at[j, 'relevance_score']:
                label = 1
            else:
                label = 0
            
            # Write to CSV
            csv_writer.writerow([i, j, label])

Processing Batches: 100%|██████████| 10/10 [02:15<00:00, 13.52s/it]


In [4]:
from concurrent.futures import ThreadPoolExecutor

csv_filename = 'pair_labels_10000.csv'
pair_labels_df = pd.read_csv(csv_filename)

# Prepare data for training
def create_pairs_dataframe(df, pair_labels_df, output_file='pairs_data_10000.csv', batch_size=500):
    num_batches = (len(pair_labels_df) // batch_size) + 1
    
    def process_batch(batch_idx):
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, len(pair_labels_df))
        batch_pair_labels = pair_labels_df.iloc[start_idx:end_idx]
        
        batch_pairs_data = []
        for idx, row in batch_pair_labels.iterrows():
            i = row['pair_index1']
            j = row['pair_index2']
            label = row['label']
            
            review_1_data = df.loc[i].to_dict()
            review_2_data = df.loc[j].to_dict()
            
            batch_pairs_data.append({
                'review_1': review_1_data,
                'review_2': review_2_data,
                'label': label
            })
        
        # Convert batch data to DataFrame
        batch_df = pd.DataFrame(batch_pairs_data)
        
        # Append batch data to the CSV file
        if batch_idx == 0:
            batch_df.to_csv(output_file, index=False, mode='w', header=True)
        else:
            batch_df.to_csv(output_file, index=False, mode='a', header=False)
    
    # Using ThreadPoolExecutor to parallelize batch processing
    with ThreadPoolExecutor(max_workers=12) as executor:
        list(tqdm(executor.map(process_batch, range(num_batches)), total=num_batches, desc="Creating Pairs DataFrame"))

create_pairs_dataframe(df, pair_labels_df)

Creating Pairs DataFrame: 100%|██████████| 9991/9991 [1:00:10<00:00,  2.77it/s]


In [9]:
pairs_df = pd.read_csv('pairs_data_10000.csv')

# Split data into training and test sets
train_df, test_df = train_test_split(pairs_df, test_size=0.2, random_state=42)

In [6]:
with open('pairs_data_10000.csv', 'r', encoding='utf-8') as file:
    for idx, line in enumerate(file):
        if idx == 503:  
            print(line)
            break


"{'timestamp': '2021-02-10 14:02:26.894', 'rating': 5.0, 'helpful_vote': 0, 'title': 'Great stirrups with bar none grip', 'text': 'I have to say, the grip on these are pretty great. So great in fact that when my boy walked off the mounting block when I had one foot in the stirrup, I became hopelessly stuck and pulled the entire saddle askew in my attempt to free myself. So yeah. Buy these.', 'asin': 'B002HPNBMU', 'verified_purchase': True, 'user_id': 'AGGZ357AO26RQZVRLGU4D4N52DZQ', 'language': 'en', 'word_count': 58, 'avg_sentence_length': 14.5, 'sentiment_score': 0.8856, 'key_phrases': ""['I', 'the grip', 'these', 'fact', 'my boy', 'the mounting block', 'I', 'one foot', 'the stirrup', 'I', 'the entire saddle askew', 'my attempt', 'myself', 'these']"", 'review_length': 261, 'normalized_helpful_votes': 0.0, 'relevance_score': 2.26568}","{'timestamp': '2021-03-04 19:30:44.430', 'rating': 5.0, 'helpful_vote': 0, 'title': 'Nice bait', 'text': 'My grandson is always losing these lures But w

In [7]:
import csv

# Function to correct the CSV file by rewriting specific lines
def correct_csv_line(csv_file, line_number, corrected_line):
    # Read all lines from the CSV file
    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        lines = list(csv.reader(file))

    # Replace the specific line with the corrected content
    lines[line_number - 1] = corrected_line  # line_number - 1 because list is zero-indexed

    # Write the corrected lines back to the CSV file
    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(lines)

# Example usage to correct line 102 in pairs_data.csv
csv_file = 'pairs_data_10000.csv'
line_number = 502  # Adjust this as per the actual line number in your file

# Corrected line based on the provided example (adjust as needed)
corrected_line = [
    '{"timestamp": "2021-02-10 14:02:26.894", "rating": 5.0, "helpful_vote": 0, "title": "Great stirrups with bar none grip", "text": "I have to say, the grip on these are pretty great. So great in fact that when my boy walked off the mounting block when I had one foot in the stirrup, I became hopelessly stuck and pulled the entire saddle askew in my attempt to free myself. So yeah. Buy these.", "asin": "B002HPNBMU", "verified_purchase": true, "user_id": "AGGZ357AO26RQZVRLGU4D4N52DZQ", "language": "en", "word_count": 58, "avg_sentence_length": 14.5, "sentiment_score": 0.8856, "key_phrases": "[\'I\', \'the grip\', \'these\', \'fact\', \'my boy\', \'the mounting block\', \'I\', \'one foot\', \'the stirrup\', \'I\', \'the entire saddle askew\', \'my attempt\', \'myself\', \'these\']", "review_length": 261, "normalized_helpful_votes": 0.0, "relevance_score": 2.26568}',
    '{"timestamp": "2021-03-04 19:30:44.430", "rating": 5.0, "helpful_vote": 0, "title": "Nice bait", "text": "My grandson is always losing these lures But we talked about it...He said that was a good thing because that means the fish are biting on them a lot which makes sense LOL So I got another box. He loves them too", "asin": "B08SPY5HM5", "verified_purchase": false, "user_id": "AFFZVSTUS3U2ZD22A2NPZSKOCPGQ", "language": "en", "word_count": 45, "avg_sentence_length": 22.5, "sentiment_score": 0.9313, "key_phrases": "[\'My grandson\', \'these lures\', \'we\', \'it\', \'He\', \'that\', \'a good thing\', \'that\', \'the fish\', \'them\', \'which\', \'sense\', \'I\', \'another box\', \'He\', \'them\']", "review_length": 210, "normalized_helpful_votes": 0.0, "relevance_score": 2.27939}',
    "0"
]

# Correct the CSV line
correct_csv_line(csv_file, line_number, corrected_line)

print(f"Line {line_number} in {csv_file} has been corrected.")


Line 502 in pairs_data_10000.csv has been corrected.


In [None]:
import pandas as pd
import numpy as np

def create_xgb_data(df):
    X = []
    y = []
    
    for index, row in df.iterrows():
        features = []
        for feature in ['rating', 'word_count', 'avg_sentence_length', 'sentiment_score', 'review_length', 'normalized_helpful_votes']:
            try:
                review1_features = eval(row['review_1'])  
                review2_features = eval(row['review_2']) 
                feature_diff = review1_features.get(feature, 0) - review2_features.get(feature, 0)
            except Exception as e:
                print(f"Error processing features: {e}")
                feature_diff = 0  
            
            features.append(feature_diff)
        
        X.append(features)
        y.append(row['label'])
    
    return np.array(X), np.array(y)

X_train, y_train = create_xgb_data(train_df)
X_test, y_test = create_xgb_data(test_df)


In [11]:
# Train XGBoost model with GPU
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'rank:pairwise',
    'eta': 0.1,
    'gamma': 1.0,
    'min_child_weight': 0.1,
    'max_depth': 6,
    'tree_method': 'gpu_hist',
    'eval_metric': 'ndcg'
}

bst = xgb.train(params, dtrain, num_boost_round=100)


    E.g. tree_method = "hist", device = "cuda"



In [12]:
# Predict and evaluate
y_pred = bst.predict(dtest)
ndcg = ndcg_score([y_test], [y_pred])


    E.g. tree_method = "hist", device = "cuda"



In [13]:
print(f'NDCG Score: {ndcg}')

NDCG Score: 0.9436917984256741


In [14]:
import pickle

# Save the model to disk
with open('pairwise_ranking_model.pkl', 'wb') as f:
    pickle.dump(bst, f)

In [None]:
import pandas as pd

# Assuming the given DataFrame is named `df`
# Select relevant columns
columns = ['text', 'sentiment_score', 'word_count', 'avg_sentence_length', 'normalized_helpful_votes', 'relevance_score']
df_selected = df[columns]

# Save to a new CSV file
df_selected.to_csv('reviews_for_classification.csv', index=False)

print("CSV file 'reviews_for_classification.csv' created successfully.")
