## Read Data

In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas as pd
import numpy as np
import nltk
import string
import xgboost as xgb
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from gensim.models import Word2Vec
import time

Using TensorFlow backend.


In [2]:
alexa_reviews = pd.read_csv('./amazon_alexa.tsv', sep="\t")

In [3]:
alexa_reviews.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [4]:
## Prototype func for classification
def train_model(classifier, feature_vector_train, label, feature_vector_valid, label_y, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return classifier, metrics.accuracy_score(predictions, label_y)

## Classification of reviews

In [5]:
#remove stopwords
stopwords = nltk.corpus.stopwords.words('english')
alexa_reviews['verified_reviews_corrected'] = alexa_reviews['verified_reviews'].apply(lambda x: " ".join([word for word in x.split() if word not in stopwords]))

# lemmarizer or stemmer
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()
# sentences = [' '.join([lemmatizer.lemmatize(word, 'v') for word in doc.split()]) for doc in sentences]
alexa_reviews['verified_reviews_corrected'] = alexa_reviews['verified_reviews_corrected'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# remove short words
alexa_reviews['verified_reviews_corrected'] = alexa_reviews['verified_reviews_corrected'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>=3]))

alexa_reviews['verified_reviews_corrected'] = alexa_reviews['verified_reviews_corrected'].apply(lambda x: " ".join(x.lower().translate(str.maketrans('','',string.punctuation)).split()))

In [6]:
# split dataset into training and test
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(alexa_reviews['verified_reviews_corrected'], alexa_reviews['rating'], test_size = 0.3, random_state = 42, stratify = alexa_reviews['rating'])

In [7]:
## build count vectorizer as features
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(alexa_reviews['verified_reviews_corrected'])
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)

In [8]:
# build tf-idf vectors as features
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(alexa_reviews['verified_reviews_corrected'])
xtrain_tfidf_unigram =  tfidf_vect.transform(train_x)
xvalid_tfidf_unigram =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(alexa_reviews['verified_reviews_corrected'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(alexa_reviews['verified_reviews_corrected'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

In [9]:
# train a word2vec and use a word2vec with a tf-idf or mean for classification
## find similar words of all the reviews
sentences = alexa_reviews['verified_reviews_corrected'].values
sentences = [x.split() for x in sentences]

print('Training word2vec...')
time_start = time.time()
word_model = Word2Vec(sentences, size=100, min_count=2, window=5, iter=1000, workers = -1)
print('Word2Vec done! Time elapsed: %.3f seconds' %(time.time()-time_start))
pretrained_weights = word_model.wv.vectors
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)

Training word2vec...
Word2Vec done! Time elapsed: 2.951 seconds
Result embedding shape: (2386, 100)


In [14]:
## use trained word2vec for modelling new model
xtrain_word2vec = np.array([np.mean([word_model[w] for w in words if w in word_model] or [np.zeros(pretrained_weights.shape)], axis=0) for words in train_x])
xvalid_word2vec = np.array([np.mean([word_model[w] for w in words if w in word_model] or [np.zeros(pretrained_weights.shape)], axis=0) for words in valid_x])

  
  
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# Naive Bayes on Count Vectors
count_model, accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count, valid_y)
print("NB, Count Vectors: %0.3f" %accuracy)

# Naive Bayes on Word Level TF IDF Vectors
tfidf_unigram_model, accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram, valid_y)
print("NB, WordLevel TF-IDF: %0.3f" %accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
tfidf_ngram_model, accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, valid_y)
print("NB, N-Gram Vectors: %0.3f" %accuracy)

# Naive Bayes on Character Level TF IDF Vectors
tfidf_ngram_chars_model, accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars, valid_y)
print("NB, CharLevel Vectors: %0.3f" %accuracy)

NB, Count Vectors: 0.786
NB, WordLevel TF-IDF: 0.728
NB, N-Gram Vectors: 0.726




NB, CharLevel Vectors: 0.735


In [28]:
# Naive Bayes on Character Level TF IDF Vectors
tfidf_ngram_chars_model, accuracy = train_model(linear_model.LogisticRegression(), xtrain_word2vec, train_y, xvalid_word2vec, valid_y)
print("NB, CharLevel Vectors: %0.3f" %accuracy)



ValueError: setting an array element with a sequence.