In [7]:
import warnings
# NOTE(review): this silences *every* warning for the whole notebook, including
# pandas SettingWithCopyWarning / FutureWarning and scikit-learn convergence
# warnings raised by the cells below. Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [8]:
import pandas as pd
import numpy as np
from joblib import load, dump
from copy import deepcopy
from statistics import mean

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter

In [9]:
# Load the per-review feature table; the cells below rely on its `product`
# and `label` columns and on the positional order of the feature columns.
df = pd.read_csv('data/Features.csv')

In [10]:
df

Unnamed: 0,product,answer_option,label,review_len,Rn,Rp,Rs,Rc,Rd,Rsc
0,Accucheck,Fast and accurate delivery,0,4,0.232859,0.300000,0.616667,0.005420,1.0,0.0000
1,Accucheck,Expected a longer expire date. Your Product Li...,0,14,0.596318,-0.100000,0.400000,0.017615,1.0,0.0000
2,Accucheck,I liked the prompt service,0,5,0.319747,0.600000,0.800000,0.006775,1.0,0.4215
3,Accucheck,Good product,0,2,0.546925,0.700000,0.600000,0.002710,0.0,0.4404
4,Accucheck,I not needed,0,3,0.000000,0.000000,0.000000,0.004065,0.0,0.0000
...,...,...,...,...,...,...,...,...,...,...
1650,shampoo,Its not much effective as it has been stated i...,0,12,0.500000,-0.300000,0.800000,0.028640,0.0,-0.3724
1651,shampoo,Liked it very nicely working now my scalp is a...,1,11,0.166375,0.690000,0.900000,0.026253,0.0,0.5709
1652,shampoo,its my regular choice,0,4,0.500000,0.000000,0.076923,0.009547,0.0,0.0000
1653,shampoo,Good but not very effective,0,5,0.000000,0.234615,0.607692,0.011933,0.0,-0.4032


### Ranking is a canonical problem for humans. It is easy to classify whether a review is useful (informative) or not. However, ranking reviews on the basis of usefulness is a complex task. Our ranking methodology is based on this simple observation.

#### Pairwise ranking approach is applied to rank reviews in the semi-supervised learning method. The pairwise ranking approach looks at a pair of documents at a time in a loss function and predicts a relative ordering. The objective is not to determine the relevance score but to find which document is more relevant than others. This relevance is developed to judge the preference of one review over another.
#### In the semi-supervised learning method, mapping is constructed between input and output. This input-output pair in the training model is used to learn the system.
#### Review Segregation: We segregated two sets of reviews on which we train our model.
+ Set 0 represents reviews with label 0, i.e., ones that are not informative. These include reviews based on delivery, customer support, packaging, etc. These reviews do not describe the product.
+ Set 1 represents reviews with label 1, i.e., reviews that are informative and are better than all reviews of Set 0;
#### How we segregated and determined labels for reviews:
### `Our entire review ranking system is based on the idea that it is easier for humans to binary classify reviews which we call Set 0 and Set 1.`

For each product ('Accucheck', 'Becadexamin', 'Evion', 'Neurobion', 'SevenseascodLiverOil', 'Shelcal', 'Supradyn', 'shampoo'), we asked 10 different people to label each review as 1 (informative review) or 0 (not informative review). Different participants were asked to label the reviews in order to reduce individual bias in the labels the model learns from.

In [11]:
# Per-product count of reviews in each label class (rows: product, columns: label).
data_split = pd.crosstab(index=df['product'], columns=df['label'])
data_split

label,0,1
product,Unnamed: 1_level_1,Unnamed: 2_level_1
Accucheck,310,85
Becadexamin,53,27
Evion,89,33
Neurobion,280,136
SevenseascodLiverOil,59,22
Shelcal,259,124
Supradyn,50,23
shampoo,56,49


## Building the training set:
#### We pairwise compared each review of set1 with all reviews of set0 and vice-versa
+ (Rx, Ry,1) where x∈Set1 and y∈Set0 → Rx is better than Ry
+ (Ry, Rx, 0) where x∈Set1 and y∈Set0 → Ry is worse than Rx
<br>

#### This now becomes a classification problem.

<hr>

![PairwiseRanking](Photos/PairwiseRanking.png)

In [12]:
def building_training_data(df):
    """Build the pairwise training rows for one product.

    Every label-1 row of ``df`` is paired with every label-0 row, in both
    orders, by joining on a constant helper column named ``join``:

    * (label-1 row, label-0 row): ``label_x`` is 1, "x is better than y"
    * (label-0 row, label-1 row): ``label_x`` is 0, "x is worse than y"

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column holding 0/1. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns are the left row's columns suffixed ``_x``, then ``join``,
        then the right row's columns suffixed ``_y``. All (1, 0) pairs come
        first, followed by all (0, 1) pairs, on a fresh ``RangeIndex``.
        The frame is empty when either label class is absent.
    """
    # `.assign` returns new frames, so nothing is written through a filtered
    # slice of the caller's data (the source of SettingWithCopyWarning).
    informative = df[df['label'] == 1].assign(join='j')
    uninformative = df[df['label'] == 0].assign(join='j')

    # Joining on a constant key yields the full cross product. An inner join
    # avoids NaN-padded rows when one of the two classes has no reviews.
    better_pairs = pd.merge(informative, uninformative, how='inner', on='join')
    worse_pairs = pd.merge(uninformative, informative, how='inner', on='join')

    # The two halves can never share a row (label_x differs), so stacking them
    # is the intended union and keeps a deterministic row order.
    return pd.concat([better_pairs, worse_pairs], ignore_index=True)

In [13]:
# Build the pairwise training rows separately for each product so that
# reviews are only ever compared with reviews of the same product.
product_list = df['product'].unique()
data_stack = []
for product in product_list:
    # Drop the first two columns, keeping everything from the third column on
    # (`.copy()` gives an independent frame; deepcopy is not needed).
    temp = df[df['product'] == product].iloc[:, 2:].copy()
    build_data = building_training_data(temp)
    print(product, len(temp), len(build_data))
    # `join` is only the merge helper and `label_y` is implied by `label_x`.
    build_data = build_data.drop(columns=['join', 'label_y'])
    # After the drop the first column is `label_x`; move it to the end as
    # `target`. `assign` builds a new frame rather than writing into a slice.
    data = build_data.iloc[:, 1:].assign(target=build_data.iloc[:, 0])
    data_stack.append(data)

Accucheck 395 52700
Becadexamin 80 2862
Evion 122 5874
Neurobion 416 76160
SevenseascodLiverOil 81 2596
Shelcal 383 64232
Supradyn 73 2300
shampoo 105 5488


In [14]:
# Stack the per-product pair tables into one frame with a fresh 0..n-1 index.
train = pd.concat(data_stack, ignore_index=True)

In [15]:
train

Unnamed: 0,review_len_x,Rn_x,Rp_x,Rs_x,Rc_x,Rd_x,Rsc_x,review_len_y,Rn_y,Rp_y,Rs_y,Rc_y,Rd_y,Rsc_y,target
0,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,4,0.232859,0.300000,0.616667,0.005420,1.0,0.0000,1
1,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,14,0.596318,-0.100000,0.400000,0.017615,1.0,0.0000,1
2,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,5,0.319747,0.600000,0.800000,0.006775,1.0,0.4215,1
3,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,2,0.546925,0.700000,0.600000,0.002710,0.0,0.4404,1
4,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,2,0.546925,0.700000,0.600000,0.002710,0.0,0.4404,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212207,10,0.407447,0.50,0.500000,0.023866,0.0,-0.1263,15,0.479132,0.700000,0.600000,0.035800,0.0,0.1531,0
212208,10,0.407447,0.50,0.500000,0.023866,0.0,-0.1263,7,0.392391,0.316667,0.600000,0.016706,0.0,0.2382,0
212209,10,0.407447,0.50,0.500000,0.023866,0.0,-0.1263,25,0.417103,0.233333,0.255556,0.059666,0.0,0.5927,0
212210,10,0.407447,0.50,0.500000,0.023866,0.0,-0.1263,10,0.467972,0.500000,0.500000,0.023866,0.0,0.1779,0


In [16]:
# Every column except the last is a feature; the last column is `target`.
X = train.iloc[:,:-1].values
y = train.iloc[:,-1].values

from sklearn.model_selection import train_test_split
# Fixed seed so the split, and every score reported below, is reproducible
# under Restart & Run All.
# NOTE(review): each row is a *pair* of reviews and the same review appears in
# many pairs, so a random row-wise split places pairs that share reviews in
# both partitions. The test scores are therefore likely optimistic; a split
# grouped by review or by product would be a stricter check.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)
print("Test Len:",len(X_test)," ",len(y_test))

Test Len: 42443   42443


In [17]:
train

Unnamed: 0,review_len_x,Rn_x,Rp_x,Rs_x,Rc_x,Rd_x,Rsc_x,review_len_y,Rn_y,Rp_y,Rs_y,Rc_y,Rd_y,Rsc_y,target
0,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,4,0.232859,0.300000,0.616667,0.005420,1.0,0.0000,1
1,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,14,0.596318,-0.100000,0.400000,0.017615,1.0,0.0000,1
2,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,5,0.319747,0.600000,0.800000,0.006775,1.0,0.4215,1
3,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,2,0.546925,0.700000,0.600000,0.002710,0.0,0.4404,1
4,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,2,0.546925,0.700000,0.600000,0.002710,0.0,0.4404,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212207,10,0.407447,0.50,0.500000,0.023866,0.0,-0.1263,15,0.479132,0.700000,0.600000,0.035800,0.0,0.1531,0
212208,10,0.407447,0.50,0.500000,0.023866,0.0,-0.1263,7,0.392391,0.316667,0.600000,0.016706,0.0,0.2382,0
212209,10,0.407447,0.50,0.500000,0.023866,0.0,-0.1263,25,0.417103,0.233333,0.255556,0.059666,0.0,0.5927,0
212210,10,0.407447,0.50,0.500000,0.023866,0.0,-0.1263,10,0.467972,0.500000,0.500000,0.023866,0.0,0.1779,0


# Spot Checking-
+ Linear Model
+ Non-Linear Model
+ Ensemble Model

<hr>

## Linear Model: Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression

# Linear baseline with default hyper-parameters.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Predict once per partition and reuse the result for both reports.
train_pred = classifier.predict(X_train)
test_pred = classifier.predict(X_test)

print("Training Accuracy\n", accuracy_score(y_train, train_pred))
print("Test Accuracy\n", accuracy_score(y_test, test_pred))

print('CLASSIFICATION REPORT')
print("Training\n", classification_report(y_train, train_pred))
print("Test \n", classification_report(y_test, test_pred))

Training Accuracy
 0.8511624619335686
Test Accuracy
 0.8499163584101029
CLASSIFICATION REPORT
Training
               precision    recall  f1-score   support

           0       0.85      0.85      0.85     84885
           1       0.85      0.85      0.85     84884

    accuracy                           0.85    169769
   macro avg       0.85      0.85      0.85    169769
weighted avg       0.85      0.85      0.85    169769

Test 
               precision    recall  f1-score   support

           0       0.85      0.85      0.85     21221
           1       0.85      0.85      0.85     21222

    accuracy                           0.85     42443
   macro avg       0.85      0.85      0.85     42443
weighted avg       0.85      0.85      0.85     42443



### Accuracy: 85%
### F1-score: 85%

## Non-Linear Model: DecisionTree

In [19]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Seeded so the fitted tree, and the scores printed below, are the same on
# every run; an unseeded tree can differ between runs.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train,y_train)

# Predict once per partition and reuse the result for both reports.
train_pred = classifier.predict(X_train)
test_pred = classifier.predict(X_test)

print("Training Accuracy\n", accuracy_score(y_train, train_pred))
print("Test Accuracy\n", accuracy_score(y_test, test_pred))

print('CLASSIFICATION REPORT')
print("Training\n", classification_report(y_train, train_pred))
print("Test \n", classification_report(y_test, test_pred))

Training Accuracy
 0.9963185269395473
Test Accuracy
 0.9839785123577504
CLASSIFICATION REPORT
Training
               precision    recall  f1-score   support

           0       0.99      1.00      1.00     84885
           1       1.00      0.99      1.00     84884

    accuracy                           1.00    169769
   macro avg       1.00      1.00      1.00    169769
weighted avg       1.00      1.00      1.00    169769

Test 
               precision    recall  f1-score   support

           0       0.98      0.99      0.98     21221
           1       0.99      0.98      0.98     21222

    accuracy                           0.98     42443
   macro avg       0.98      0.98      0.98     42443
weighted avg       0.98      0.98      0.98     42443



## Ensemble Model: RandomForest

In [20]:
from sklearn.ensemble import RandomForestClassifier

# 50-tree forest using all cores; oob_score=True exposes `oob_score_` after fit.
classifier = RandomForestClassifier(
    n_estimators=50,
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)
classifier.fit(X_train, y_train)

# Predict once per partition and reuse the result for every report.
train_pred = classifier.predict(X_train)
test_pred = classifier.predict(X_test)

print("Training Accuracy\n", accuracy_score(y_train, train_pred))
print("Test Accuracy\n", accuracy_score(y_test, test_pred))

print('CLASSIFICATION REPORT')
print("Training\n", classification_report(y_train, train_pred))
print("Test \n", classification_report(y_test, test_pred))

print("Test\nConfusion Matrix: \n", confusion_matrix(y_test, test_pred))

Training Accuracy
 0.9963185269395473
Test Accuracy
 0.9879367622458356
CLASSIFICATION REPORT
Training
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     84885
           1       1.00      1.00      1.00     84884

    accuracy                           1.00    169769
   macro avg       1.00      1.00      1.00    169769
weighted avg       1.00      1.00      1.00    169769

Test 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     21221
           1       0.99      0.99      0.99     21222

    accuracy                           0.99     42443
   macro avg       0.99      0.99      0.99     42443
weighted avg       0.99      0.99      0.99     42443

Test
Confusion Matrix: 
 [[20978   243]
 [  269 20953]]


In [21]:
# Out-of-bag estimate of the score on the training data. The attribute exists
# only because the forest above was created with oob_score=True.
classifier.oob_score_

0.98775394801171

In [22]:
# One row per input feature, most important first. Feature names are the
# columns of `train` without the trailing target column.
feature_names = train.columns[:-1]
feature_importances = pd.DataFrame(
    {'importance': classifier.feature_importances_},
    index=feature_names,
).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
Rd_y,0.18947
Rd_x,0.180334
review_len_x,0.123746
review_len_y,0.100583
Rc_y,0.083603
Rc_x,0.064536
Rsc_x,0.044181
Rsc_y,0.042823
Rn_y,0.041362
Rn_x,0.038366


In [23]:
# Persist the fitted random forest (joblib compression level 2) so the ranking
# section below can reload it from disk.
dump(classifier, 'randomforest.joblib', compress = 2)

['randomforest.joblib']

## RandomForest Classifier Weights Saved. 
### Accuracy: 0.98
### oob_score: 0.98

+ Note: if the data in your use case is too small for a separate train/test split, you can train the model on the entire dataset and use the out-of-bag score as the performance estimate.

<hr>

## PART 2. Model Ranking Metric

### Accuracy of Ranking Methodology
+ After sorting the reviews by the review score, we wanted all reviews in Set 1 to be above all reviews of Set 0.
+ To test this hypothesis, we developed the following Ranking Metric
+ For a given product, let x be the number of reviews labelled 1 (Set 1) for that product.
### `Ranking Accuracy on Single Product = Number of 1s found in first x positions / x`

In [24]:
# Reload the random forest saved above. joblib files are pickle-based, so only
# load files you created or otherwise trust.
classifier = load('randomforest.joblib')

In [25]:
product_list = df['product'].unique()
# Result columns, filled in by the scoring loop below. They are appended as the
# last three columns, which the `3:-3` feature slice in that loop relies on.
df['win']=0
df['lose']=0
df['review_score'] = 0.0
# Make index labels equal to row positions; the loop below passes index labels
# to `df.iloc`, which is only correct under this condition.
df.reset_index(inplace = True, drop = True)


def score_giver(C,D):
    """Count predicted wins and losses of one review against its opponents.

    Parameters
    ----------
    C : pandas.DataFrame
        Single-row frame with the review's feature columns plus the constant
        helper column ``j``.
    D : pandas.DataFrame
        The opponent reviews, with the same columns as ``C``.

    Returns
    -------
    collections.Counter
        Prediction value mapped to the number of opponents with that
        prediction (key 1 counts wins, key 0 counts losses). Empty when ``D``
        has no rows.

    Notes
    -----
    Uses the module-level ``classifier`` for the predictions.
    """
    # With no opponents the outer join would still emit one row, padded with
    # NaN on the opponent side, and that row would be sent to the classifier.
    if len(D) == 0:
        return Counter()
    # Joining on the constant key pairs the review with every opponent; the
    # key is dropped so only the x/y feature columns reach the model.
    pairs = pd.merge(C, D, how='outer', on='j').drop(columns=['j'])
    return Counter(classifier.predict(pairs.values))

# Score every review by letting the classifier compare it with every other
# review of the same product.
# Feature columns sit between the first three columns and the three result
# columns (win, lose, review_score) appended at the end of `df`.
feature_cols = df.columns[3:-3]
for product in product_list:
    data = df[df['product']==product]
    # Add the constant join key once per product on a new frame, instead of
    # assigning it onto a slice for every review.
    features = data[feature_cols].assign(j='jn')
    for indx in data.index:
        # Label-based selection: `indx` is an index label, not a position.
        C = features.loc[[indx]]
        D = features.drop(index=indx)
        score = score_giver(C,D)
        wins = score.get(1, 0)
        df.at[indx, 'win'] = wins
        df.at[indx, 'lose'] = score.get(0, 0)
        # Normalised by the product's review count, so the maximum possible
        # score is (n - 1) / n.
        df.at[indx, 'review_score'] = wins / len(data)

df = df.sort_values(by = ['product','review_score'], ascending = False)

# Rank accuracy per product: of the x top-ranked reviews (x = number of
# label-1 reviews for that product), the fraction that really are label 1.
r_accuracy = []
for product in product_list:
    # Label-based lookup; chaining `[1][0]` relied on positional fallback on a
    # label-indexed Series, which pandas has deprecated.
    x = int(data_split.loc[product, 1])
    if x == 0:
        # The metric is undefined without any label-1 review.
        print("Product: {} | Rank Accuracy: undefined (no label-1 reviews)".format(product))
        continue
    top_x_labels = df.loc[df['product']==product, 'label'].iloc[:x]
    # Counting with a boolean sum gives 0 (not None) when no 1s are in the top x.
    number_of_1_in_x = int((top_x_labels == 1).sum())
    rank_accuracy = number_of_1_in_x / x
    print("Product: {} | Rank Accuracy: {}".format(product, rank_accuracy))
    r_accuracy.append(rank_accuracy)
print("Mean Rank Accuracy: {}".format(mean(r_accuracy)))

Product: Accucheck | Rank Accuracy: 0.9647058823529412
Product: Becadexamin | Rank Accuracy: 0.9629629629629629
Product: Evion | Rank Accuracy: 1.0
Product: Neurobion | Rank Accuracy: 0.9044117647058824
Product: SevenseascodLiverOil | Rank Accuracy: 1.0
Product: Shelcal | Rank Accuracy: 0.9435483870967742
Product: Supradyn | Rank Accuracy: 1.0
Product: shampoo | Rank Accuracy: 1.0
Mean Rank Accuracy: 0.97195362463982


In [26]:
df

Unnamed: 0,product,answer_option,label,review_len,Rn,Rp,Rs,Rc,Rd,Rsc,win,lose,review_score
1564,shampoo,Wash your head within 3 days for sometimes Or ...,1,39,1.199331,0.150000,0.300000,0.073986,1.0,-0.1823,104,0,0.990476
1643,shampoo,I've too much dandruff and acne on my scalp .....,1,25,0.417103,0.233333,0.255556,0.059666,0.0,0.5927,103,1,0.980952
1615,shampoo,I was diagnosed with Seborrheic Dermatitis a d...,1,79,0.935921,0.008750,0.510000,0.136038,0.0,0.7184,101,3,0.961905
1588,shampoo,it is a best recommended anti dandruff shampoo...,1,23,0.381944,0.850000,0.450000,0.047733,0.0,0.7814,99,5,0.942857
1602,shampoo,Excellent outstanding product,1,3,0.251014,0.750000,0.937500,0.007160,0.0,0.8271,99,5,0.942857
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,Accucheck,prompt delivery,0,2,0.364110,0.000000,0.000000,0.002710,1.0,0.0000,113,281,0.286076
191,Accucheck,timely delivery,0,2,0.452900,0.000000,0.000000,0.002710,1.0,0.0000,113,281,0.286076
18,Accucheck,Late delivery,0,2,0.344364,-0.300000,0.600000,0.002710,1.0,0.0000,29,365,0.073418
116,Accucheck,Late delivery,0,2,0.344364,-0.300000,0.600000,0.002710,1.0,0.0000,29,365,0.073418


In [27]:
# Save the first, second and last columns by position; in the frame displayed
# above these are product, answer_option and review_score.
df.iloc[:, [0,1,-1]].to_csv('data/train_ranked_output.csv',index = False)

In [29]:
from zipfile import ZipFile
import urllib.request
from io import BytesIO
# Download the data archive into memory. The `with` block closes the HTTP
# response once its bytes have been read, instead of leaving it open.
with urllib.request.urlopen('https://s3.amazonaws.com/projex.dezyre.com/ecommerce-product-reviews-ranking-sentiment-analysis/materials/data.zip') as folder:
    zipfile = ZipFile(BytesIO(folder.read()))
zipfile.namelist()

['Features.csv',
 'Preprocessed_Reviews.csv',
 'test.csv',
 'test_ranked_output.csv',
 'test_withoutlabel.csv',
 'train.csv',
 'train_ranked_output.csv']

In [31]:
# Read test.csv straight from the in-memory archive opened above.
t = pd.read_csv(zipfile.open('test.csv'))