### Yelp Review Classification (Positive vs Negative)

Data Source: https://www.kaggle.com/c/yelp-recruiting

In [44]:
from __future__ import print_function
import pandas as pd
import numpy as np

In [20]:
# read data to dataframe
fname = 'yelp_training_set/yelp_training_set_review.json'
data = pd.read_json(fname, lines=True)
print(data.shape)
print(data.head())
print()

(229907, 8)
              business_id       date               review_id  stars  \
0  9yKzy9PApeiPPOUJEtnvkg 2011-01-26  fWKvX83p0-ka4JS3dc6E5A      5   
1  ZRJwVLyzEJq1VAihDhYiow 2011-07-27  IjZ33sJrzXqU-0X6U8NwyA      5   
2  6oRAC4uyJCsJl1X0WZpVSA 2012-06-14  IESLBzqUCLdSzSqm0eCSxQ      4   
3  _1QQZuf4zZOyFCvXc0o6Vg 2010-05-27  G-WvGaISbqqaMHlNnByodA      5   
4  6ozycU1RpktNG2-1BroVtw 2012-01-05  1uJFq2r5QfJG_6ExMRCaGw      5   

                                                text    type  \
0  My wife took me here on my birthday for breakf...  review   
1  I have no idea why some people give bad review...  review   
2  love the gyro plate. Rice is so good and I als...  review   
3  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...  review   
4  General Manager Scott Petello is a good egg!!!...  review   

                  user_id                                    votes  
0  rLtl8ZkDX5vH5nAx9C3q5Q  {u'funny': 0, u'useful': 5, u'cool': 2}  
1  0a2KyEL0d3Yb1V6aivbIuQ  {u'funny': 

In [15]:
# check information
# print('Type contains:')
# print(data['type'].unique())
# print('Null values:')
# print(data.isnull().any())
print(data['stars'].unique())
'''
results show:
every line is a review entry;
no Null values present
'''

[5 4 2 3 1]


'\nresults show:\nevery line is a review entry;\nno Null values present\n'

In [23]:
# drop features
# since we are only classifying positive and negative reviews, take only the useful features
useful_features = ['stars', 'text']
data_useful = data.loc[:, useful_features]
data_useful = data_useful.drop(data['stars'] == 3)
print(data_useful.head())
print()

# convert stars to binary values 0 and 1
data_useful['label'] = data_useful['stars'].apply(lambda x : 1 if x > 3 else 0)

# separte x and y data
X_data = data_useful['text']
Y_data = data_useful['label']
print('X shape:', X_data.shape)
print('Y shape:', Y_data.shape)
print()

# check number of positive and negative reviews
print(Y_data.value_counts())

   stars                                               text
2      4  love the gyro plate. Rice is so good and I als...
3      5  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4      5  General Manager Scott Petello is a good egg!!!...
5      4  Quiessence is, simply put, beautiful.  Full wi...
6      5  Drop what you're doing and drive here. After I...

X shape: (229905L,)
Y shape: (229905L,)

1    156069
0     73836
Name: label, dtype: int64


In [25]:
# partition train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, random_state=0)
print('Train shape:', X_train.shape)
print(sum(y_train == 1))
print('Test shape:', X_test.shape)

Train shape: (172428L,)
117082
Test shape: (57477L,)


### Bag of Words Method

Use frequency vector of word occurences. It does not keep the order information.
Tokenizing and stop-words filtering are done internally.

CountVectorizer simply computes the frequency of words. TfidfVectorizaer considers the relative frequency - it pays more attention to the words that have high frequency in certain documents and not all.

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [72]:
# TESTING
# text = pd.Series(['is are Main main, clothes, cloth, mice, mouse running, run, meets, met, meet'])
# print(text)
# vect = CountVectorizer()
# vect.fit(text)
# feat_names = vect.get_feature_names()
# print(feat_names)
'''the tokenization in sklearn simply extract words (ignoring letter case)'''

'the tokenization in sklearn simply extract words (ignoring letter case)'

#### use CountVectorizer

In [30]:
vect = CountVectorizer()

In [61]:
vect = CountVectorizer(min_df=3)

#### use TF-IDF

In [56]:
vect = TfidfVectorizer()

In [66]:
vect = TfidfVectorizer(min_df=3)

In [67]:
vect = TfidfVectorizer(ngram_range=(1,2))

### Transform data

In [68]:
vect.fit(X_train)
feat_names = vect.get_feature_names()
print('num_features:', len(feat_names))
# print(feat_names[::1000])

X_train_vect = vect.transform(X_train)
print(X_train_vect.shape)

X_test_vect = vect.transform(X_test)
print(X_test_vect.shape)

num_features: 2869511
(172428, 2869511)
(57477, 2869511)


### use model

In [69]:
# use logistic regression
clf = LogisticRegression()
clf.fit(X_train_vect, y_train)
preds = clf.predict(X_test_vect)

In [70]:
# compute accuracy
accuracy = accuracy_score(y_test, preds)
print(accuracy)

0.87803817179


In [71]:
feat_names = np.array(feat_names)
sorted_coef_index = clf.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feat_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feat_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
[u'ok' u'not' u'worst' u'bland' u'mediocre' u'nothing' u'overpriced'
 u'rude' u'average' u'horrible']

Largest Coefs: 
[u'great' u'delicious' u'amazing' u'awesome' u'love' u'excellent' u'best'
 u'perfect' u'fantastic' u'wonderful']


### References
- https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/
- http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
- http://scikit-learn.org/stable/modules/feature_extraction.html