In [14]:
import pandas as pd
import pickle
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load your existing DataFrame with relevant features
df = pd.read_csv('reviews_for_classification.csv')

# Load pairwise ranking model (assuming it's already trained and saved)
with open('pairwise_ranking_model.pkl', 'rb') as f:
    pairwise_ranking_model = pickle.load(f)

# Extract relevant features from DataFrame for classification
X_features = df[['sentiment_score', 'word_count', 'avg_sentence_length', 'normalized_helpful_votes']]

# Convert X_features to DMatrix
dfeatures = xgb.DMatrix(X_features)

# Assuming `ranking_scores` are extracted from pairwise ranking model
ranking_scores = pairwise_ranking_model.predict(dfeatures)

# Combine ranking scores with existing features
df['ranking_scores'] = ranking_scores

# Split data into features and target for classification
X = df[['sentiment_score', 'word_count', 'avg_sentence_length', 'normalized_helpful_votes', 'ranking_scores']]
y = df['relevance_score']  # Assuming 'relevance_score' is your target for classification

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert continuous relevance scores to binary labels
threshold = 0.5  # Example threshold for classification

y_train_class = np.where(y_train >= threshold, 1, 0)
y_test_class = np.where(y_test >= threshold, 1, 0)

# Example: Use Logistic Regression as a classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train_class)

# Predict on the test set
y_pred = classifier.predict(X_test)

# Evaluate classifier performance
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(y_test_class, y_pred))
print('Accuracy:', accuracy_score(y_test_class, y_pred))



              precision    recall  f1-score   support

           0       0.48      0.19      0.27        63
           1       0.97      0.99      0.98      1937

    accuracy                           0.97      2000
   macro avg       0.73      0.59      0.63      2000
weighted avg       0.96      0.97      0.96      2000

Accuracy: 0.968



    E.g. tree_method = "hist", device = "cuda"



In [15]:
# Assuming X_features contains extracted features for each review
# Step 2: Use pairwise ranking model
dfeatures = xgb.DMatrix(X_features)
ranking_scores = pairwise_ranking_model.predict(dfeatures)

# Step 3: Combine features for classification
X_combined = pd.concat([X_features, pd.DataFrame(ranking_scores, columns=['ranking_scores'])], axis=1)

# Step 4: Use classification model to predict relevance
y_pred = classifier.predict(X_combined)

# Step 5: Interpret results
relevant_reviews = df[y_pred == 1]
irrelevant_reviews = df[y_pred == 0]

# Optionally, print or analyze relevant_reviews and irrelevant_reviews
print("Relevant Reviews:")
print(relevant_reviews.head())
print()
print("Irrelevant Reviews:")
print(irrelevant_reviews.head())


Relevant Reviews:
                                                text  sentiment_score  \
0  Not gonna lie- they are not much to look at. L...           0.9982   
1                                 I love it. Pretty!           0.8268   
2  Huge fan of B Vertigo and this dressage pad do...           0.8197   
3  This was great for a slightly too-short girth!...           0.8715   
4  I have to say, the grip on these are pretty gr...           0.8856   

   word_count  avg_sentence_length  normalized_helpful_votes  relevance_score  \
0         393                15.72                       0.8          2.53946   
1           6                 3.00                       0.0          2.24804   
2          33                16.50                       0.0          2.24591   
3          18                 6.00                       0.0          2.26145   
4          58                14.50                       0.0          2.26568   

   ranking_scores  
0             0.0  
1             0.