In [41]:
import pandas as pd
import numpy as np
import nltk
import json
import time
import random

from stop_words import get_stop_words
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [42]:
data = pd.read_json('data.json')
X = data['text']
y = data['rating'].astype(np.float64)

In [43]:
'''

Data Preprocessing

'''
en_stop = get_stop_words('en')

def filter_sentence(el):
    tokens = word_tokenize(el)
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = stem_words(tokens)
    tokens = lemma_words(tokens)
    
    #tokens = [el for el in tokens if random.random() >= 0.75]
    ret_str = " ".join(tokens) 
    
    return ret_str 


#Credit to https://www.kdnuggets.com/2018/03/text-data-preprocessing-walkthrough-python.html
#for stem_words and lemma_words
def stem_words(words):
    stemmer = PorterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemma_words(words):
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

start_time = time.time()
X = [filter_sentence(el) for el in X]
print("--- %s seconds ---" % (time.time() - start_time))

y_new = []
for el in y:
    ret = 0
    if el <= 5:
        ret = 0
    else:
        ret = 1
    y_new.append(ret)
y = y_new

--- 415.33158898353577 seconds ---


In [44]:
with open('data_clean_X.json', 'w') as outfile:
    json.dump(X, outfile)
with open('data_clean_y.json', 'w') as outfile:
    json.dump(y, outfile)

In [45]:
with open('data_clean_X.json') as json_data:
    X = json.load(json_data)
    json_data.close()
with open('data_clean_y.json') as json_data:
    y = json.load(json_data)
    json_data.close()

In [46]:
# vectorizer = TfidfVectorizer()
# start_time = time.time()
# X = vectorizer.fit_transform(X)
# print("--- %s seconds ---" % (time.time() - start_time))
# print('Shape of Sparse Matrix: ', X.shape)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=92)

In [48]:
def train_drop(el):
    tokens = word_tokenize(el)
    tokens = [el for el in tokens if random.random() >= 0.75]
    ret_str = " ".join(tokens) 
    return ret_str 
#X_train = [train_drop(el) for el in X_train]
vectorizer = TfidfVectorizer()
start_time = time.time()
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

print("--- %s seconds ---" % (time.time() - start_time))
print('Shape of Sparse Train Matrix: ', X_train.shape)

--- 14.93538522720337 seconds ---
Shape of Sparse Train Matrix:  (40000, 60975)


In [49]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=92)

start_time = time.time()
clf = RandomForestClassifier(n_estimators=50).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("--- %s seconds ---" % (time.time() - start_time))

print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

--- 76.06661200523376 seconds ---
[[4254  770]
 [ 853 4123]]


             precision    recall  f1-score   support

          0       0.83      0.85      0.84      5024
          1       0.84      0.83      0.84      4976

avg / total       0.84      0.84      0.84     10000

