In [1]:
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive inside the Colab runtime so files stored there can be
# read through the regular filesystem.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [4]:
# Load the review data set from the mounted Drive folder.
# on_bad_lines='skip' makes pandas drop malformed rows instead of raising.
reviews_csv_path = "/content/gdrive/MyDrive/Google colab/new_data_analyzed.csv"
data = pd.read_csv(reviews_csv_path, on_bad_lines='skip')

In [6]:
# The 'text' column holds the documents used as model input.
documents = data.loc[:, 'text']
# Display the number of documents.
documents.shape

(99999,)

In [8]:
# The 'stars' column is the label to predict.
target = data['stars']
# Display how many rows fall into each 'stars' value.
target.value_counts()

5    44391
4    25337
3    11362
1    10921
2     7988
Name: stars, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
split_parts = train_test_split(documents, target, test_size=0.3, random_state=1)
documents_train, documents_test, target_train, target_test = split_parts

In [10]:
# Show the second training document (position 1) as a sanity check.
documents_train.iat[1]

"My first time to eat here and would think twice if i'll ever wanted to do it again. Waited for a house salad for 30 minutes and our meal for another 30 minutes which i think is totally unacceptable to wait for you food for an 1 hour."

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF with lowercasing, the built-in English stop-word list and
# a vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',
                             lowercase=True, max_features=5000)

# Fit the vocabulary/IDF weights on the training documents only, then
# vectorise them.
# NOTE: .toarray() turns the sparse result into a dense array with one row per
# document and one column per vocabulary term, which can use a lot of memory.
# It is kept so that downstream cells keep receiving dense arrays.
documents_train_vec = vectorizer.fit_transform(documents_train).toarray()

# Vocabulary terms, in the same order as the columns of the matrices.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

# Vectorise the test documents with the vocabulary learned from training data.
documents_test_vec = vectorizer.transform(documents_test).toarray()



In [12]:
def get_top_values(lst, n, labels):
    """Return the labels that belong to the n highest entries of lst.

    lst    -- sequence of values to rank.
    n      -- how many labels to return.
    labels -- sequence indexed in parallel with lst.

    The result is ordered from the highest value downwards.
    """
    ascending_order = np.argsort(lst)
    # Reverse to get highest-first order, then keep the first n positions.
    top_positions = ascending_order[::-1][:n]
    picked = []
    for position in top_positions:
        picked.append(labels[position])
    return picked

def get_bottom_values(lst, n, labels):
    """Return the labels that belong to the n lowest entries of lst.

    lst    -- sequence of values to rank.
    n      -- how many labels to return.
    labels -- sequence indexed in parallel with lst.

    The result is ordered from the lowest value upwards.
    """
    ascending_order = np.argsort(lst)
    bottom_positions = ascending_order[:n]
    picked = []
    for position in bottom_positions:
        picked.append(labels[position])
    return picked

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Draw one review from the test documents (unseen while fitting the
# vectorizer) to use as the search query.
# A seeded generator is used so the same review is drawn on every run; the
# seed matches the random_state value used elsewhere in this notebook.
query_rng = np.random.default_rng(1)
arbitrary_review = query_rng.choice(documents_test.to_numpy(), size=1)
# Display the drawn review text.
arbitrary_review[0]

'Great fresh oysters! Ordered a half dozen and sat down enjoy them and people watch. Only thing that could have made them a bit better would to be a little more chilled. I love my oysters cold!'

In [14]:
# Vectorise the drawn review with the already-fitted TF-IDF vectorizer.
query_text = arbitrary_review[0]
arbitrary_review_vec = vectorizer.transform([query_text]).toarray()
# Cosine similarity between the query vector and every training vector;
# row 0 of the result belongs to the single query.
similarity_score = cosine_similarity(arbitrary_review_vec, documents_train_vec)

# Number of most similar training reviews to report.
n = 5
train_texts = list(documents_train)
similar_reviews = get_top_values(similarity_score[0], n, train_texts)

print('My search query: \n%s\n' % query_text)
print('Top %s similar reviews:' % n)
for rank in range(1, n + 1):
    print('No. %d review is %s.\n' % (rank, similar_reviews[rank - 1]))

My search query: 
Great fresh oysters! Ordered a half dozen and sat down enjoy them and people watch. Only thing that could have made them a bit better would to be a little more chilled. I love my oysters cold!

Top 5 similar reviews:
No. 1 review is Captain Just's is located directly across the street from Yuengling Brewery and with this place, you don't want to judge a book by it's cover. Make sure you check in on Foursquare so you can receive a free draft beer (I got a Yuengling of course). They have a daily happy hour from 11:00-8:00 and you can score draft beer for only $1.25. My two yuenglings cost less than Gabe's Coke. I love when that happens!\n\nCheck out the Oyster Menu! First the prices are amazing, only $8 for a dozen of classic oysters. Second, they have the largest selection of specialty Oysters I have ever seen. Cheeseburger Oysters? Philly Oysters? Dessert Oysters!?!? I had all intentions of ordering the Apple Crisp Oyster - I just couldn't bring myself to do so once I

In [15]:
# Train a random forest on the TF-IDF training vectors.
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters collected in one place; random_state pins the forest's
# randomness, and verbose=1 makes fitting/scoring print progress lines.
rf_params = dict(
    n_estimators=100,
    max_depth=25,
    min_samples_leaf=5,
    min_samples_split=5,
    random_state=1,
    n_jobs=-1,
    verbose=1,
)
clf_RF = RandomForestClassifier(**rf_params)
clf_RF.fit(documents_train_vec, target_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.2min finished


RandomForestClassifier(max_depth=25, min_samples_leaf=5, min_samples_split=5,
                       n_jobs=-1, random_state=1, verbose=1)

In [16]:
# Score the classifier on the training split first, then on the test split,
# and report both numbers on one line.
train_accuracy = clf_RF.score(documents_train_vec, target_train)
test_accuracy = clf_RF.score(documents_test_vec, target_test)
print('The accuracy score for train data set is %f, for test data set is: %f'
      % (train_accuracy, test_accuracy))

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    2.6s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    6.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.0s


The accuracy score for train data set is 0.539151, for test data set is: 0.502533


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.9s finished


In [17]:
# Number of most important vocabulary terms to list.
n = 20
# Rank vocabulary terms by the forest's feature importances, highest first.
top_words = get_top_values(clf_RF.feature_importances_, n, words)
# The count in the message comes from n, so the label stays correct if n is
# changed.
print('Top %d words by ranking are %s.' % (n, ", ".join(top_words)))

Top 20 words by ranking are amazing, best, great, good, worst, ok, rude, pretty, delicious, told, highly, terrible, love, bit, decent, recommend, minutes, asked, horrible, little.
