In [10]:
import pandas as pd
import numpy as np
import nltk
import re
import datetime
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import words
from nltk.corpus import stopwords
from contractions import CONTRACTION_MAP
# from stopwords import stop_words
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
def expand_contractions(word):
    """
    This function expands contractions such as I'll to I will.

    Every whitespace-delimited token that is a key of CONTRACTION_MAP is
    replaced by its mapped value; all other tokens, and all of the original
    whitespace, are kept exactly as they were.
    :param word: a single review
    :returns: the review text with the known contractions expanded
    """
    def _expand(match):
        token = match.group(0)
        return CONTRACTION_MAP.get(token, token)

    # Substitute over runs of non-whitespace instead of splitting on " " only:
    # a contraction that sits next to a newline or a tab (for example the last
    # word of a line in multi-line input) is then looked up on its own rather
    # than with the whitespace character glued to it.
    expanded = re.sub(r'\S+', _expand, word)
    return expanded

def get_wordnet_pos(word):
    """
    Look up the WordNet part-of-speech constant for a single word.
    :param word: word in each review texts
    :returns: wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV depending on
              the first letter of the tag nltk.pos_tag assigns; wordnet.NOUN
              when that letter is none of J, N, V, R
    """
    # pos_tag is given a one-element list, so the only (token, tag) pair is at index 0.
    _, full_tag = nltk.pos_tag([word])[0]
    initial = full_tag[0].upper()

    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return pos_by_initial.get(initial, wordnet.NOUN)
    
def preprocessing_text(text):
    """
    Clean one review and reduce it to a list of lemmas.

    Steps, in order: expand contractions, strip digits, strip punctuation,
    lowercase, tokenize, lemmatize each token with its WordNet part of speech,
    and finally drop every lemma that is an English stop word.
    :param text: a single text review
    :returns: a list of preprocessed words
    """
    # Character-level cleanup, applied as successive rewrites of the same string.
    cleaned = expand_contractions(text)
    cleaned = re.sub(r'\d+', '', cleaned)        # digits
    cleaned = re.sub(r'[^\w\s]', '', cleaned)    # anything that is not a word char or whitespace

    tokens = nltk.word_tokenize(cleaned.lower())

    english_stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # The stop-word test is applied to the lemma, not to the raw token.
    lemmas = (lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens)
    return [lemma for lemma in lemmas if lemma not in english_stops]

def calculate_tfidf(text):
    """
    This function converts already-tokenized reviews into a TF-IDF matrix
    whose columns follow the vocabulary stored in "feature.pkl".
    :param text: an iterable of documents, each document being a list of tokens
                 (the tokenizer and preprocessor are identity functions, so the
                 documents must not be raw strings)
    :returns: the matrix returned by TfidfVectorizer.fit_transform, one row per
              document
    """
    def dummy_func(docs):
        # Identity: the documents arrive pre-tokenized, so the vectorizer must
        # neither preprocess nor tokenize them again.
        return docs

    # NOTE: pickle.load can execute arbitrary code; only load a trusted file.
    # The context manager closes the file handle, which was previously leaked.
    with open("feature.pkl", "rb") as vocab_file:
        vocabulary = pickle.load(vocab_file)

    loaded_vec = TfidfVectorizer(analyzer='word',
                                 tokenizer=dummy_func,
                                 preprocessor=dummy_func,
                                 token_pattern=None,
                                 vocabulary=vocabulary)
    # NOTE(review): fit_transform learns the IDF weights from `text` itself, so
    # only the column layout comes from the pickle. If the model was trained on
    # features with corpus-level IDF weights, the fitted vectorizer should be
    # persisted and .transform() used here instead - TODO confirm.
    tfidf = loaded_vec.fit_transform(text)

    return tfidf



In [20]:
'''
This is the basic user interface used to predict rating from a text review using Tkinter.
'''

from tkinter import *
from tkinter import font
from tkinter.scrolledtext import ScrolledText
# import train_model

        
def reset():
    '''
    Clear both the review input box and the predicted-rating field.
    '''
    # Wipe the whole content of the review text widget.
    textbox.delete(1.0, END)

    # The rating field is kept disabled so the user cannot type into it;
    # it has to be enabled for the deletion and is locked again right after.
    result.configure(state='normal')
    result.delete(0, END)
    result.configure(state='disabled')

    
def predict():
    '''
    This function predicts the rating of the review typed by the user and
    displays it in the rating field.

    The review is read from the text box, preprocessed into tokens, turned
    into a TF-IDF matrix and passed to the model unpickled from
    'finalized_model.sav'. Any exception raised along the way propagates to
    the caller unchanged; the rating field is re-locked in every case.
    '''
    # The rating field is normally disabled (read-only); unlock it to edit.
    result.config(state='normal')
    try:
        result.delete(0,END)

        user_review = textbox.get(1.0,END) #gets the review text inputted by user
        tokens = preprocessing_text(user_review)
        # calculate_tfidf expects a collection of documents, hence the one-item list.
        tfidf = calculate_tfidf([tokens])

        # NOTE: pickle.load can execute arbitrary code; only load a trusted file.
        # The context manager closes the handle, which was previously leaked on
        # every click.
        with open('finalized_model.sav', 'rb') as model_file:
            loaded_model = pickle.load(model_file)
        prediction = loaded_model.predict(tfidf)

        # Insert the bare predicted value(s) rather than the container object,
        # whose string form would show up with brackets in the entry.
        # np.ravel copes with a scalar as well as with an array of any shape.
        shown = ', '.join(str(value) for value in np.ravel(prediction))
        result.insert(END, shown)
    finally:
        # Re-lock the field even when a step above failed, so it never stays editable.
        result.config(state='disabled')


# ---- Main window -----------------------------------------------------------
root = Tk()
root.geometry("800x400")                # window size
root.configure(background='#DEFFF0')    # window background colour
root.title('Ratings Prediction')        # caption shown in the title bar

# ---- Heading ---------------------------------------------------------------
main_title = Label(
    root,
    text="Ratings Prediction",
    font=("Elephant",30),
    bg='#DEFFF0',
)
main_title.pack(fill=BOTH, expand=0)
# Copy the heading's font into a Font object so the underline can be switched on.
f = font.Font(main_title, main_title.cget("font"))
f.configure(underline=True)
main_title.configure(font=f)

# ---- Review input ----------------------------------------------------------
# Prompt above the input box.
text = Label(
    root,
    text="Enter a review text: ",
    font=("Times New Roman",15),
    bg='#DEFFF0',
)
text.pack(fill=BOTH, expand=0)

# Scrollable box the user types the review into.
textbox = ScrolledText(
    root,
    height=5,
    width=40,
    font=("Times New Roman",13),
)
textbox.pack(fill=Y, expand=0)

# ---- Button row ------------------------------------------------------------
# Container that holds the three buttons side by side.
frame = Frame(
    root,
    height="200",
    width="200",
    bg="#DEFFF0",
    borderwidth=13,
)
frame.pack()

# "Predict": runs the prediction and shows the rating.
pred_button = Button(
    frame,
    text="Predict",
    font=("Times New Roman",11),
    command=predict,
    bg='black',
    fg='white',
    height=2,
    width=10,
)
pred_button.pack(padx=50, side=LEFT)

# "Clear Review Text": empties the input box only; the shown rating stays.
clear_button = Button(
    frame,
    text="Clear Review Text",
    font=("Times New Roman",11),
    bg='black',
    fg='white',
    command=lambda: textbox.delete(1.0,END),
    height=2,
    width=20,
)
clear_button.pack(padx=50, side=LEFT)

# "Reset": empties the input box and the shown rating.
reset_button = Button(
    frame,
    text="Reset",
    font=("Times New Roman",11),
    command=reset,
    bg='black',
    fg='white',
    height=2,
    width=10,
)
reset_button.pack(padx=50, side=LEFT)

# ---- Result ----------------------------------------------------------------
# Caption for the rating field.
text2 = Label(
    root,
    text="The rating for this review is: ",
    font=("Times New Roman",15),
    bg='#DEFFF0',
)
text2.pack(padx=5, expand=0)

# Field the rating is written into; created disabled so the user cannot edit it.
result = Entry(
    root,
    font=("Times New Roman",13),
    bg='white',
)
result.configure(
    state='disabled',
    disabledbackground='white',
    disabledforeground='black',
    justify=CENTER,
)
result.pack(fill=Y, expand=0)

# Hand control to Tk; this call blocks until the window is closed.
root.mainloop()


(1, 28914)
  (0, 25273)	0.5773502691896258
  (0, 8757)	0.5773502691896258
  (0, 4128)	0.5773502691896258
(1, 28914)
  (0, 27060)	0.4082482904638631
  (0, 12781)	0.4082482904638631
  (0, 12337)	0.4082482904638631
  (0, 10544)	0.4082482904638631
  (0, 5845)	0.4082482904638631
  (0, 4587)	0.4082482904638631
(1, 28914)
  (0, 27060)	0.4082482904638631
  (0, 12781)	0.4082482904638631
  (0, 12337)	0.4082482904638631
  (0, 10544)	0.4082482904638631
  (0, 5845)	0.4082482904638631
  (0, 4587)	0.4082482904638631
(1, 28914)
  (0, 26376)	0.21320071635561041
  (0, 23787)	0.42640143271122083
  (0, 22832)	0.42640143271122083
  (0, 21163)	0.21320071635561041
  (0, 21129)	0.21320071635561041
  (0, 16479)	0.21320071635561041
  (0, 14534)	0.21320071635561041
  (0, 13084)	0.42640143271122083
  (0, 10463)	0.21320071635561041
  (0, 6638)	0.21320071635561041
  (0, 6529)	0.21320071635561041
  (0, 5495)	0.21320071635561041
  (0, 4988)	0.21320071635561041
(1, 28914)
  (0, 23038)	0.4472135954999579
  (0, 22832)	0