## Data Loading

In [1]:
import gzip, json
def parse(path):
    """Yield one decoded JSON value per line of a gzip-compressed file.

    Args:
        path: Path of a gzip file holding one JSON document per line.

    Yields:
        The result of ``json.loads`` for each line, in file order.
    """
    # The context manager closes the handle as soon as the generator is
    # exhausted (or closed early); previously the file was never closed.
    with gzip.open(path, 'r') as g:
        for l in g:
            yield json.loads(l)

# Read every review record from the compressed dump into an in-memory list.
data = list(parse("Software.json.gz"))

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split row positions rather than the records themselves.
indices = np.arange(len(data))

# First hold out 10% of all rows for testing ...
indices_rest, indices_test = train_test_split(indices, test_size=0.1, random_state=42)

# ... then take 0.1/0.9 of the remainder (i.e. another 10% of the full set)
# for validation; what is left is the training split.
indices_train, indices_val = train_test_split(indices_rest, test_size=0.1/0.9, random_state=42)

In [3]:
# Wrap the records in a NumPy array so each split can be selected with its
# index array in a single indexing operation.
data_np = np.array(data)
data_train, data_val, data_test = (
    data_np[rows] for rows in (indices_train, indices_val, indices_test)
)

In [4]:
def _vote_label(d):
    """Return 1 when the record has a 'vote' entry that does not equal 0, else 0."""
    # NOTE(review): if 'vote' is stored as a string, the comparison with the
    # integer 0 can never match and presence alone decides the label — confirm.
    if 'vote' in d and not (d['vote'] == 0):
        return 1
    return 0


# Binary targets for the three splits, in the same order as the records.
y_train = [_vote_label(d) for d in data_train]
y_val = [_vote_label(d) for d in data_val]
y_test = [_vote_label(d) for d in data_test]

## Naive Bayes

## Non-text (review) features

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Fit the ID encoders on the full data set (train, validation and test rows
# together), so every reviewer/item ID present in `data` is known to them.

# Reviewer IDs as an (n, 1) column: OneHotEncoder is fitted on the 2-D array,
# LabelEncoder on its squeezed (1-D) form.
userID_str = np.array([[d['reviewerID']] for d in data])
userID_encoder = LabelEncoder()
userID_encoder.fit(userID_str.squeeze())
userID_one_hot_encoder = OneHotEncoder()
userID_one_hot_encoder.fit(userID_str)

# Same pair of encoders for the item IDs stored under 'asin'.
itemID_str = np.array([[d['asin']] for d in data])
itemID_encoder = LabelEncoder()
itemID_encoder.fit(itemID_str.squeeze())
itemID_one_hot_encoder = OneHotEncoder()
itemID_one_hot_encoder.fit(itemID_str)

In [None]:
import textstat
import os
import pickle 

# Mapping from review text to its Flesch reading-ease score, cached on disk
# because computing it for every review is slow.
if os.path.isfile('readability_scores.pickle'):
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that was produced locally by this notebook.
    with open('readability_scores.pickle', 'rb') as handle:
        readability_scores = pickle.load(handle)

else:
    # Might take 2-3 mins, please be patient!
    readability_scores = {}
    for d in data:
        # Identical texts always get the same score, so score each text once.
        if 'reviewText' in d and d['reviewText'] not in readability_scores:
            readability_scores[d['reviewText']] = textstat.flesch_reading_ease(d['reviewText'])

    # Persist the result so later runs take the fast path above; previously
    # the cache file was read but never written.
    with open('readability_scores.pickle', 'wb') as handle:
        pickle.dump(readability_scores, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def get_meta_features(d, length=False, rating=False, readability=False, verified=False, userID=False, itemID=False):
    """Build the hand-crafted (non-text) feature list for one review record.

    Each flag switches one feature group on; enabled groups are appended in
    the fixed order length, rating, readability, verified, userID, itemID.

    Args:
        d: Review record supporting ``in`` and ``[]`` lookups by field name.
        length: Append the character count of 'reviewText' (0 when absent).
        rating: Append ``d['overall']``.
        readability: Append the score stored in ``readability_scores`` for
            the review text (100 when the record has no 'reviewText').
        verified: Append 1 when 'verified' is present and truthy, else 0.
        userID: Append the one-hot row for ``d['reviewerID']``.
        itemID: Append the one-hot row for ``d['asin']``.

    Returns:
        A flat list holding the selected features.
    """
    has_text = 'reviewText' in d
    features = []

    if length:
        features.append(len(d['reviewText']) if has_text else 0)

    if rating:
        features.append(d['overall'])

    if readability:
        features.append(readability_scores[d['reviewText']] if has_text else 100)

    if verified:
        features.append(1 if ('verified' in d and d['verified']) else 0)

    if userID:
        user_row = userID_one_hot_encoder.transform([[d["reviewerID"]]]).toarray()[0]
        features.extend(list(user_row))

    if itemID:
        item_row = itemID_one_hot_encoder.transform([[d["asin"]]]).toarray()[0]
        features.extend(list(item_row))

    return features
        

In [8]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
def test_metrics(clf, X_t, y_t):
    y_pred = clf.predict(X_t)
    precision, recall, _, _ = precision_recall_fscore_support(y_t, y_pred, average='binary')
    accuracy = accuracy_score(y_t, y_pred)
    return accuracy, precision, recall

In [10]:
# One shared feature selection for all three splits: every hand-crafted
# feature except the one-hot user/item IDs.
meta_flags = dict(length=True, rating=True, readability=True, verified=True, userID=False, itemID=False)
X_train = [get_meta_features(d, **meta_flags) for d in data_train]
X_val = [get_meta_features(d, **meta_flags) for d in data_val]
X_test = [get_meta_features(d, **meta_flags) for d in data_test]
from sklearn.preprocessing import StandardScaler
# The scaler is fitted on the training rows only.
scaler = StandardScaler().fit(X_train)

In [20]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

# Class-balanced logistic regression on the scaled hand-crafted features.
clf = LogisticRegression(random_state=0, class_weight='balanced', C=1.0)
clf.fit(scaler.transform(X_train), y_train)
# (accuracy, precision, recall) on the scaled test split.
test_metrics(clf, scaler.transform(X_test), y_test)

(0.759098032387254, 0.5540712468193384, 0.6821741854636592)

In [11]:
# Ridge Classifier
from sklearn.linear_model import RidgeClassifier

# Class-balanced ridge classifier on the scaled hand-crafted features.
clf = RidgeClassifier(random_state=0, class_weight='balanced', alpha=1.0)
clf.fit(scaler.transform(X_train), y_train)
# (accuracy, precision, recall) on the scaled test split.
test_metrics(clf, scaler.transform(X_test), y_test)

(0.7476275465784433, 0.5358605930907979, 0.6864035087719298)

In [26]:
# RF
from sklearn.ensemble import RandomForestClassifier

# Class-balanced, depth-limited random forest on the scaled hand-crafted features.
clf = RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=5)
clf.fit(scaler.transform(X_train), y_train)
# (accuracy, precision, recall) on the scaled test split.
test_metrics(clf, scaler.transform(X_test), y_test)

(0.740531951941494, 0.5225301909879236, 0.769266917293233)

## Text Features

In [9]:
# One document per training review: its 'summary' followed by its
# 'reviewText', space-separated, skipping whichever field is absent.
train_documents = [
    " ".join(d[field] for field in ('summary', 'reviewText') if field in d)
    for d in data_train
]

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vocabulary capped at 300 features, with English stop words
# removed; fitted on the training documents only.
bow_vectorizer = CountVectorizer(stop_words='english', max_features=300)
bow_vectorizer.fit(train_documents)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF counterpart of the bag-of-words vectorizer: same 300-feature cap and
# English stop-word removal, fitted on the training documents only.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=300)
tfidf_vectorizer.fit(train_documents)

In [12]:
# Load the 'word2vec' array from the .npz file; the loop below pairs row
# `count` of this array with data[count].
# NOTE(review): assumes the rows are stored in the same order as `data` — confirm.
w2v_features = np.load('text-feature_word2vec.npz')['word2vec']
w2v_dict = {}

# Map each review's text to its feature row; records without 'reviewText'
# all share the '' key.
# NOTE(review): records with identical text (and all text-less records)
# overwrite each other, so the last matching row wins — confirm this is intended.
count = 0 
for d in data:
    if 'reviewText' in d:
        w2v_dict[d['reviewText']] = w2v_features[count, :]
    else:
        w2v_dict[''] =  w2v_features[count, :]
    count+=1

In [13]:
# One document per review in the full data set (same order as `data`): its
# 'summary' followed by its 'reviewText', space-separated, skipping whichever
# field is absent.
all_docs = [
    " ".join(d[field] for field in ('summary', 'reviewText') if field in d)
    for d in data
]
    

In [14]:
# Dense TF-IDF matrix with one row per entry of all_docs.
tfidf_vecs = tfidf_vectorizer.transform(all_docs).toarray()

# Key each row by the record's 'reviewText' ('' when the field is absent).
# NOTE(review): all_docs is built from summary + reviewText but the key is
# reviewText alone, so records sharing a text (or lacking one) overwrite each
# other and only the last row is kept — confirm this is intended.
tfidf_dict = {}
count = 0
for d in data:
    if 'reviewText' in d:
        tfidf_dict[d['reviewText']] = tfidf_vecs[count, :]
    else:
        tfidf_dict[''] =  tfidf_vecs[count, :]
    count+=1

In [15]:
# Dense bag-of-words count matrix with one row per entry of all_docs.
bow_vecs = bow_vectorizer.transform(all_docs).toarray()

# Key each row by the record's 'reviewText' ('' when the field is absent).
# NOTE(review): all_docs is built from summary + reviewText but the key is
# reviewText alone, so records sharing a text (or lacking one) overwrite each
# other and only the last row is kept — confirm this is intended.
bow_dict = {}
count = 0
for d in data:
    if 'reviewText' in d:
        bow_dict[d['reviewText']] = bow_vecs[count, :]
    else:
        bow_dict[''] =  bow_vecs[count, :]
    count+=1

In [27]:
def get_text_features(d, BoW=False, tfidf=False, w2v=False):
    """Look up the text feature vectors selected by the flags for one review.

    The vectors are fetched from the module-level ``bow_dict``, ``tfidf_dict``
    and ``w2v_dict`` tables, keyed by the record's 'reviewText' (the empty
    string when the record has no such field), and concatenated in the order
    BoW, tfidf, w2v.

    Args:
        d: Review record supporting ``in`` and ``[]`` lookups by field name.
        BoW: Include the bag-of-words vector.
        tfidf: Include the TF-IDF vector.
        w2v: Include the word2vec vector.

    Returns:
        A flat list of plain Python numbers.

    Raises:
        KeyError: If the review text is missing from a selected table.
    """
    feature_vec = []

    reviewText = d['reviewText'] if 'reviewText' in d else ''

    # Every branch converts with .tolist() so the result holds plain Python
    # numbers; the tfidf branch used .ravel() before and leaked NumPy scalars.
    if BoW:
        feature_vec.extend(bow_dict[reviewText].tolist())
    if tfidf:
        feature_vec.extend(tfidf_dict[reviewText].tolist())
    if w2v:
        feature_vec.extend(w2v_dict[reviewText].tolist())

    return feature_vec
       

In [35]:
# Bag-of-words text features only, built for each split.
text_flags = dict(BoW=True, tfidf=False, w2v=False)
X_train = [get_text_features(d, **text_flags) for d in data_train]
X_val = [get_text_features(d, **text_flags) for d in data_val]
X_test = [get_text_features(d, **text_flags) for d in data_test]

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier

# Fit each class-balanced model on the training split and print its
# (accuracy, precision, recall) on the test split, in the order
# logistic regression, ridge classifier, random forest.
for label, model in [
    ('Logistic ', LogisticRegression(random_state=0, class_weight='balanced', C=1.0)),
    ('Ridge ', RidgeClassifier(random_state=0, class_weight='balanced', alpha=1.0)),
    ('Random Forest ', RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=5)),
]:
    clf = model.fit(X_train, y_train)
    print(label, test_metrics(clf, X_test, y_test))

Logistic  (0.7470398746299843, 0.536599386816556, 0.6579730576441103)
Ridge  (0.7381377328922166, 0.523442967109867, 0.6444235588972431)
Random Forest  (0.7135425735678217, 0.4884905991917062, 0.6531954887218046)


In [None]:
# TF-IDF text features only, built for each split.
text_flags = dict(BoW=False, tfidf=True, w2v=False)
X_train = [get_text_features(d, **text_flags) for d in data_train]
X_val = [get_text_features(d, **text_flags) for d in data_val]
X_test = [get_text_features(d, **text_flags) for d in data_test]

In [42]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

# Class-balanced logistic regression on the current X_train / y_train.
clf = LogisticRegression(random_state=0, class_weight='balanced', C=1.0)
clf.fit(X_train, y_train)
print('Logistic ', test_metrics(clf, X_test, y_test))

Logistic  (0.7235112310639039, 0.5016963307061956, 0.7528195488721805)


In [None]:
# Ridge Classifier
from sklearn.linear_model import RidgeClassifier

# Class-balanced ridge classifier on the current X_train / y_train.
clf = RidgeClassifier(random_state=0, class_weight='balanced', alpha=1.0)
clf.fit(X_train, y_train)
print('Ridge ', test_metrics(clf, X_test, y_test))

In [None]:
# RF
from sklearn.ensemble import RandomForestClassifier

# Class-balanced, depth-limited random forest on the current X_train / y_train.
clf = RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=5)
clf.fit(X_train, y_train)
print('Random forest ', test_metrics(clf, X_test, y_test))

In [None]:
# TF-IDF text features only, built for each split.
# NOTE(review): this cell uses the same feature flags and model settings as
# the separate TF-IDF cells earlier in this section — it looks like a
# single-cell repeat of that experiment; confirm whether both are needed.
X_train = [get_text_features(d, BoW=False, tfidf=True, w2v=False) for d in data_train]
X_val = [get_text_features(d, BoW=False, tfidf=True, w2v=False) for d in data_val]
X_test = [get_text_features(d, BoW=False, tfidf=True, w2v=False) for d in data_test]

# Logistic regression
from sklearn.linear_model import LogisticRegression

# Each model is fitted on the training split; the printed tuple is
# (accuracy, precision, recall) on the test split.
clf = LogisticRegression(random_state=0, class_weight='balanced', C= 1.0).fit(X_train, y_train)
print('Logistic ',  test_metrics(clf, X_test, y_test))

# Ridge Classifier
from sklearn.linear_model import RidgeClassifier

clf = RidgeClassifier(random_state=0, class_weight='balanced',alpha= 1.0).fit(X_train, y_train)
print('Ridge ', test_metrics(clf, X_test, y_test))

# RF
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=5).fit(X_train, y_train)
print('Random forest ', test_metrics(clf, X_test, y_test))

In [None]:
# word2vec text features only, built for each split.
text_flags = dict(BoW=False, tfidf=False, w2v=True)
X_train = [get_text_features(d, **text_flags) for d in data_train]
X_val = [get_text_features(d, **text_flags) for d in data_val]
X_test = [get_text_features(d, **text_flags) for d in data_test]

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier

# Fit each class-balanced model on the training split and print its
# (accuracy, precision, recall) on the test split, in the order
# logistic regression, ridge classifier, random forest.
for label, model in [
    ('Logistic ', LogisticRegression(random_state=0, class_weight='balanced', C=1.0)),
    ('Ridge ', RidgeClassifier(random_state=0, class_weight='balanced', alpha=1.0)),
    ('Random forest ', RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=5)),
]:
    clf = model.fit(X_train, y_train)
    print(label, test_metrics(clf, X_test, y_test))

## Text + HandCrafted Features

In [None]:
def get_features(d, BoW=False, tfidf=False, w2v=False):
    """Concatenate the selected text features with the hand-crafted ones.

    Args:
        d: Review record passed through to both feature builders.
        BoW: Forwarded to ``get_text_features``.
        tfidf: Forwarded to ``get_text_features``.
        w2v: Forwarded to ``get_text_features``.

    Returns:
        A list holding the text features first, followed by the length,
        rating, readability and verified features (the one-hot user/item ID
        groups stay switched off).
    """
    text_part = get_text_features(d, BoW=BoW, tfidf=tfidf, w2v=w2v)
    meta_part = get_meta_features(d, length=True, rating=True, readability=True, verified=True)
    return text_part + meta_part

In [None]:
# Bag-of-words text features combined with the hand-crafted features, built
# for each split.
X_train = [get_features(d, BoW=True, tfidf=False, w2v=False) for d in data_train]
# Fixed: this line called the undefined name `gett_features`, which raised
# NameError and stopped the cell before any model was trained.
X_val = [get_features(d, BoW=True, tfidf=False, w2v=False) for d in data_val]
X_test = [get_features(d, BoW=True, tfidf=False, w2v=False) for d in data_test]

# Each model below is fitted on the training split; the printed tuple is
# (accuracy, precision, recall) on the test split.

# Logistic regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, class_weight='balanced', C= 1.0).fit(X_train, y_train)
print('Logistic ',test_metrics(clf, X_test, y_test))

# Ridge Classifier
from sklearn.linear_model import RidgeClassifier

clf = RidgeClassifier(random_state=0, class_weight='balanced',alpha= 1.0).fit(X_train, y_train)
print('Ridge ', test_metrics(clf, X_test, y_test))

# RF
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=5).fit(X_train, y_train)
print('Random Forest ', test_metrics(clf, X_test, y_test))

In [None]:
# TF-IDF text features combined with the hand-crafted features, built for
# each split.
combo_flags = dict(BoW=False, tfidf=True, w2v=False)
X_train = [get_features(d, **combo_flags) for d in data_train]
X_val = [get_features(d, **combo_flags) for d in data_val]
X_test = [get_features(d, **combo_flags) for d in data_test]

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier

# Fit each class-balanced model on the training split and print its
# (accuracy, precision, recall) on the test split, in the order
# logistic regression, ridge classifier, random forest.
for label, model in [
    ('Logistic ', LogisticRegression(random_state=0, class_weight='balanced', C=1.0)),
    ('Ridge ', RidgeClassifier(random_state=0, class_weight='balanced', alpha=1.0)),
    ('Random forest ', RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=5)),
]:
    clf = model.fit(X_train, y_train)
    print(label, test_metrics(clf, X_test, y_test))

In [None]:
# word2vec text features combined with the hand-crafted features, built for
# each split.
combo_flags = dict(BoW=False, tfidf=False, w2v=True)
X_train = [get_features(d, **combo_flags) for d in data_train]
X_val = [get_features(d, **combo_flags) for d in data_val]
X_test = [get_features(d, **combo_flags) for d in data_test]

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier

# Fit each class-balanced model on the training split and print its
# (accuracy, precision, recall) on the test split, in the order
# logistic regression, ridge classifier, random forest.
for label, model in [
    ('Logistic ', LogisticRegression(random_state=0, class_weight='balanced', C=1.0)),
    ('Ridge ', RidgeClassifier(random_state=0, class_weight='balanced', alpha=1.0)),
    ('Random forest ', RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=5)),
]:
    clf = model.fit(X_train, y_train)
    print(label, test_metrics(clf, X_test, y_test))