In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:

# Load the spaCy pipeline registered under the name 'en_core_web_sm'.
# NOTE(review): `nlp` is not referenced by any of the cells visible in this
# part of the notebook - confirm it is used further down before keeping it.
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
# Mount Google Drive into the Colab runtime at /content/gdrive; the dataset
# loaded in the next cell is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# Read the review dataset from the mounted Drive folder.
# on_bad_lines='skip' silently drops rows pandas cannot parse instead of raising.
data = pd.read_csv(
    "/content/gdrive/MyDrive/Google Colab/new_data_analyzed.csv",
    on_bad_lines='skip',
)

In [5]:
# The 'text' column holds the raw review text used as model input below.
documents = data['text']
# Display the number of reviews as the cell output.
documents.shape

(99999,)

In [6]:
# The 'stars' column is the label the classifier will predict.
target = data['stars']
# Display how many reviews fall under each star rating.
target.value_counts()

5    44391
4    25337
3    11362
1    10921
2     7988
Name: stars, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the reviews for testing; random_state pins the shuffle so
# the same split is produced on every run.
(documents_train, documents_test,
 target_train, target_test) = train_test_split(
    documents, target, test_size=0.3, random_state=1)

In [8]:
# Show the training review at positional index 1 to see what the raw text looks like.
documents_train.iloc[1]

"My first time to eat here and would think twice if i'll ever wanted to do it again. Waited for a house salad for 30 minutes and our meal for another 30 minutes which i think is totally unacceptable to wait for you food for an 1 hour."

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over word tokens: lowercase the text, drop scikit-learn's built-in
# English stop-word list, and cap the vocabulary at 5000 terms.
vectorizer = TfidfVectorizer(analyzer = 'word', stop_words = 'english',
                             lowercase = True, max_features = 5000
                            )
# Fit the vocabulary and IDF weights on the training reviews only, then
# convert the sparse result to a dense array.
# NOTE(review): the dense matrix is (n_train_docs x n_terms) floats and can be
# several GB; it is kept dense here because later cells may depend on an
# ndarray - confirm before switching to the sparse output.
documents_train_vec = vectorizer.fit_transform(documents_train).toarray()
# Get the vocabulary terms, indexed by feature (column) position.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# .tolist() keeps `words` a plain Python list, as the old method returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()
# Reuse the already-fitted vectorizer on the test reviews (no re-fitting, so
# no information from the test set leaks into the vocabulary or IDF weights).
documents_test_vec = vectorizer.transform(documents_test).toarray()



In [10]:
def get_top_values(lst, n, labels):
    """Return the labels that belong to the ``n`` largest entries of ``lst``.

    ``lst`` is argsorted in ascending order, that index order is reversed,
    and the first ``n`` indices are looked up in ``labels``. The result is
    therefore ordered from the largest value downwards and contains at most
    ``n`` labels.
    """
    ascending_order = np.argsort(lst)
    largest_first = ascending_order[::-1]
    picked = []
    for position in largest_first[:n]:
        picked.append(labels[position])
    return picked

def get_bottom_values(lst, n, labels):
    """Return the labels that belong to the ``n`` smallest entries of ``lst``.

    ``lst`` is argsorted in ascending order and the first ``n`` indices are
    looked up in ``labels``. The result is therefore ordered from the
    smallest value upwards and contains at most ``n`` labels.
    """
    ascending_order = np.argsort(lst)
    picked = []
    for position in ascending_order[:n]:
        picked.append(labels[position])
    return picked

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
# Draw one review from the test documents (not used to fit the vectorizer)
# to act as the search query.
# A dedicated, seeded RandomState makes the draw identical on every
# Restart & Run All without touching NumPy's global random state; change the
# seed to sample a different review.
query_rng = np.random.RandomState(1)
arbitrary_review = query_rng.choice(documents_test, 1)
# Display the drawn review text as the cell output.
arbitrary_review[0]

'I called the Drain Team on a Monday morning due to a stopped up sink pipe beyond the J turn.  They picked up the phone on the first try (what are you serious?  Yes this is an actual operation).  The operator apologized that she could not dispatch someone within the hour because of their morning workload (I had already told her this was not urgent).  Dan, the man, showed up in 2 hours.  He delivered a fair estimate, completed the work (it took longer than estimated and I WAS NOT CHARGED), suggested additional work for the future, and completely cleaned up any mess.\\n\\nI just moved to St Petersburg and am quickly discovering how many really sucky house vendors are floating around.  In a sea of suckitude, the Drain Team was a breath of fresh air.'

In [12]:
# Vectorize the drawn review with the fitted TF-IDF vectorizer.
query_text = arbitrary_review[0]
arbitrary_review_vec = vectorizer.transform([query_text]).toarray()
# Cosine similarity between the query vector and every training vector;
# row 0 of the result holds the scores for the single query.
similarity_score = cosine_similarity(arbitrary_review_vec, documents_train_vec)

# Number of most-similar training reviews to report.
n = 5
similar_reviews = get_top_values(similarity_score[0], n, list(documents_train))

print('My search query: \n%s\n' % (query_text))
print('Top %s similar reviews:' % n)
# Reviews come back ordered from most to least similar; number them from 1.
for i, review in enumerate(similar_reviews):
    print('No. %d review is %s.\n' % (i+1, review))

My search query: 
I called the Drain Team on a Monday morning due to a stopped up sink pipe beyond the J turn.  They picked up the phone on the first try (what are you serious?  Yes this is an actual operation).  The operator apologized that she could not dispatch someone within the hour because of their morning workload (I had already told her this was not urgent).  Dan, the man, showed up in 2 hours.  He delivered a fair estimate, completed the work (it took longer than estimated and I WAS NOT CHARGED), suggested additional work for the future, and completely cleaned up any mess.\n\nI just moved to St Petersburg and am quickly discovering how many really sucky house vendors are floating around.  In a sea of suckitude, the Drain Team was a breath of fresh air.

Top 5 similar reviews:
No. 1 review is Completed ahead of schedule (said it would take 3.5 hours, but took just 2.5) and done well - the house was cleaned better than the team at CheckMaid. I'm very happy and think I'll now be 

Build models to predict the star rating from the review text

1. Naive-Bayes Classifier

In [13]:
# Build a Naive-Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default hyperparameters.
clf_NB = MultinomialNB()
# Fit on the TF-IDF training matrix against the star ratings. fit() is the
# cell's last expression, so the estimator's repr is shown as the cell output.
clf_NB.fit(documents_train_vec, target_train)

MultinomialNB()

In [14]:
# Score the fitted Naive Bayes model on the training matrix first, then on
# the held-out test matrix, and report both numbers side by side.
train_accuracy = clf_NB.score(documents_train_vec, target_train)
test_accuracy = clf_NB.score(documents_test_vec, target_test)
print('The accuracy score for train data set is %f, for test data set is: %f'
      % (train_accuracy, test_accuracy))

The accuracy score for train data set is 0.601523, for test data set is: 0.577733
