In [57]:
import pandas as pd
import numpy as np
import nltk
import json

from stop_words import get_stop_words
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [58]:
data = pd.read_json('data.json')
X = data['text']
y = data['rating'].astype(np.float64)

In [59]:
'''

Data Preprocessing

'''
en_stop = get_stop_words('en')

def filter_sentence(el):
    tokens = word_tokenize(el)
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in en_stop]
    stems = stem_words(tokens)
    lemmas = lemma_words(stems)
    
    ret_str = " ".join(lemmas) 
    
    return ret_str 


#Credit to https://www.kdnuggets.com/2018/03/text-data-preprocessing-walkthrough-python.html
#for stem_words and lemma_words
def stem_words(words):
    stemmer = PorterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemma_words(words):
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

X = [filter_sentence(el) for el in X]

y_new = []
for el in y:
    ret = 0
    if el <= 5:
        ret = 0
    else:
        ret = 1
    y_new.append(ret)
y = y_new

In [60]:
with open('data_clean_X.json', 'w') as outfile:
    json.dump(X, outfile)
with open('data_clean_y.json', 'w') as outfile:
    json.dump(y, outfile)

In [61]:
with open('data_clean_X.json') as json_data:
    X = json.load(json_data)
    json_data.close()
with open('data_clean_y.json') as json_data:
    y = json.load(json_data)
    json_data.close()

In [62]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)
print('Shape of Sparse Matrix: ', X.shape)

Shape of Sparse Matrix:  (50000, 66911)


In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y_new, test_size=0.20, random_state=92)

clf = RandomForestClassifier(n_estimators=50).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

[[4222  802]
 [ 865 4111]]


             precision    recall  f1-score   support

          0       0.83      0.84      0.84      5024
          1       0.84      0.83      0.83      4976

avg / total       0.83      0.83      0.83     10000

