In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer

%matplotlib inline

In [2]:
# Read yelp.csv into a DataFrame.
path = r'./data/googleplaystore.csv'
google = pd.read_csv(path)

# Create a new DataFrame that only contains the 5-star and 1-star reviews.
google_best_worst = google[(google.Rating == 4) | (google.Rating == 1)]

# Define X and y.
X = google_best_worst.App
y = google_best_worst.Rating

# Split the new DataFrame into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [33]:
print(len(X_train))
print(len(X_test)) 

438
146


Tokenization

In [34]:
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [35]:
# Rows are documents, columns are terms (aka "tokens" or "features", individual words in this situation).
X_train_dtm.shape

(438, 937)

In [36]:
# Last 50 features
print((vect.get_feature_names()[-50:]))

['watch', 'weather', 'web', 'webcam', 'wefi', 'western', 'what', 'wheels', 'whitening', 'wi', 'widget', 'wifi', 'wilson', 'with', 'witness', 'wizard', 'wkbw', 'woodman', 'words', 'workout', 'world', 'writing', 'xcom', 'xv', 'yandex', 'yaoi', 'yojna', 'you', 'your', 'ziprealty', 'zombie', 'zombies', 'zone', 'zoom', 'zoomzoomnation', 'zooper', 'zoosk', 'シュフー', '初一', '十五', '骰寶', '롯데', '멤버십', '모바일', '사용', '엘포인트', '적립', '카드', '쿠폰', '포인트']


In [37]:
# Don't convert to lowercase.
vect = CountVectorizer(lowercase=False)
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape
vect.get_feature_names()[-10:]

['骰寶', '롯데', '멤버십', '모바일', '사용', '엘포인트', '적립', '카드', '쿠폰', '포인트']

In [38]:
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [39]:
vect = CountVectorizer(lowercase=False)
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape

(438, 996)

In [40]:
vect = CountVectorizer(ngram_range=(1, 2))
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape

(438, 1904)

In [41]:
print(vect.get_feature_names()[-50:])

['xv', 'xv new', 'yandex', 'yandex transport', 'yaoi', 'yaoi ooku', 'yojna', 'yojna jansampark', 'you', 'your', 'your be', 'your ex', 'your freedom', 'your prank', 'ziprealty', 'ziprealty real', 'zombie', 'zombies', 'zombies run', 'zone', 'zone result', 'zoom', 'zoom fx', 'zoomzoomnation', 'zooper', 'zooper widget', 'zoosk', 'zoosk dating', 'シュフー', '初一', '初一 十五', '十五', '骰寶', '롯데', '멤버십', '멤버십 적립', '모바일', '모바일 카드', '사용', '사용 모바일', '엘포인트', '엘포인트 포인트', '적립', '적립 사용', '카드', '카드 쿠폰', '쿠폰', '쿠폰 롯데', '포인트', '포인트 멤버십']


Predict the star rating with the new features from CountVectorizer.

In [42]:
vect = CountVectorizer()

X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

print(metrics.accuracy_score(y_test, y_pred_class))

0.9726027397260274


In [43]:
# Calculate null accuracy. **Interesting never know that syntax                                                                            
y_test_binary = np.where(y_test==5, 1, 0) # five stars become 1, one stars become 0
print('Percent 5 Stars:', y_test_binary.mean())
print('Percent 1 Stars:', 1 - y_test_binary.mean())

Percent 5 Stars: 0.0
Percent 1 Stars: 1.0


In [49]:
def tokenize_test(vect):
    X_train_dtm = vect.fit_transform(X_train)
    print('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))

In [51]:
# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)

Features:  1904
Accuracy:  0.9726027397260274


Stopword Removal

In [52]:
tokenize_test(vect)

Features:  1904
Accuracy:  0.9726027397260274


In [53]:
print(vect.get_stop_words())

None


Change the minimum document frequency for terms and test the model's performance

In [56]:
X_train_dtm

<438x937 sparse matrix of type '<class 'numpy.int64'>'
	with 1514 stored elements in Compressed Sparse Row format>

In [57]:
vect.vocabulary_

{'anatomy': 77,
 'extreme': 668,
 'motorbike': 1177,
 'jump': 964,
 '3d': 14,
 'extreme motorbike': 669,
 'motorbike jump': 1178,
 'jump 3d': 965,
 'foodpanda': 746,
 'local': 1049,
 'food': 744,
 'delivery': 488,
 'foodpanda local': 747,
 'local food': 1051,
 'food delivery': 745,
 'hulu': 907,
 'stream': 1614,
 'tv': 1741,
 'movies': 1179,
 'more': 1173,
 'hulu stream': 908,
 'stream tv': 1615,
 'tv movies': 1743,
 'movies more': 1180,
 'differential': 501,
 'dx': 580,
 'differential dx': 502,
 '24': 12,
 'megapixel': 1125,
 'hd': 864,
 'camera': 295,
 '24 megapixel': 13,
 'megapixel hd': 1126,
 'hd camera': 865,
 'quiz': 1414,
 'learn': 1007,
 'python': 1405,
 'quiz learn': 1415,
 'learn python': 1011,
 'vz': 1795,
 'navigator': 1205,
 'vz navigator': 1796,
 'browser': 258,
 'pop': 1366,
 'music': 1187,
 'pop music': 1367,
 'lovoo': 1071,
 'superhero': 1635,
 'doctor': 529,
 'er': 643,
 'surgery': 1639,
 'superhero doctor': 1636,
 'doctor er': 530,
 'er surgery': 646,
 'recycling': 

In [58]:
# Finally, let's convert the sparse matrix to a typical ndarray using .toarray()
#   - Remember, this takes up a lot more memory than the sparse matrix! However, this conversion is sometimes necessary.

X_test_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [59]:
# We will use this function below for simplicity.

# Define a function that accepts a vectorizer and calculates the accuracy.
def tokenize_test(vect):
    X_train_dtm = vect.fit_transform(X_train)
    print(('Features: ', X_train_dtm.shape[1]))
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print(('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)))

In [60]:
# min_df ignores words that occur less than twice ('df' means "document frequency").
vect = CountVectorizer(min_df=2, max_features=10000)
tokenize_test(vect)

('Features: ', 251)
('Accuracy: ', 0.9726027397260274)
