In [8]:
import pandas as pd                            # to analyse data that are stored in a csv file
import numpy as np                             # to provide a large set of numeric datatypes that can be used to construct arrays
import nltk                                    # a platform for building Python programs to work with human language data
import re                                      # regex model
import pickle                                  # used to load classifier and tfidf vocabulary
import datetime                                # to take note of the start and end time of model training
from nltk.corpus import stopwords              # to remove stopwords
from nltk.stem import WordNetLemmatizer        # to lemmatize
from nltk.corpus import wordnet                # used to check whether the word is an adjective, noun, verb or adverb
from sklearn.feature_extraction.text import TfidfTransformer         # to run tfidf transformer on the given data
from sklearn.feature_extraction.text import TfidfVectorizer          # to run tfidf vectorizer on the given data

In [9]:
def get_wordnet_pos(word):
    """
    This function gets the wordnet postag of each words.
    :param word: word in each review texts
    :returns: the postag of each word 
    Retrieved from: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N":wordnet.NOUN, "V":wordnet.VERB, "R":wordnet.ADV}
    
    return tag_dict.get(tag,wordnet.NOUN)

def preprocess(review):
    """
    This function takes in a list and preprocess accordingly. 
    :param review: list as input
    :returns: preprocessed words
    Retrieved from https://pythonspot.com/nltk-stop-words/
    """
    stop_words = set(stopwords.words('english'))
    result = re.sub(r'\d+','', review)      # Remove numbers/ digits
    result = re.sub(r'[^\w\s]','',result)   # Remove puntuations
    val = result.lower()                    # Convert all the reviews to lowercase
    new_list = []
    for word in val.split():                # tokenize and stop words removal
        if word not in stop_words:
            new_list.append(word)
            
    return new_list
    
    
def lemmatize_it(series_list):
    """
    This function is to carry out lemmatization on the
    tokenized review
    :series_list: list object that contains token to be lemmatized
    :returns: lemmatized word
    Retrieved from: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    """
    lemmatizer = WordNetLemmatizer()
    stem_it = []
    for i in series_list:
        lem = lemmatizer.lemmatize(i, get_wordnet_pos(i))
        stem_it.append(lem)
    
    return stem_it
    
def preprocessing_text(text):
    """
    This function preprocesses the review texts by calling the respective function
    :param text: a single text review
    :returns: a list of preprocessed words
    """
    proc = preprocess(text)     # Vectorize method
    val = lemmatize_it(proc)
    
    return val

def calculate_tfidf(text):
    """
    This function calculates the tf-idf values for the review text that user inputted.
    :params text: review text that user inputted
    :returns: the tfidf values calculated
    # Retrieved from https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    """
    def dummy_func(docs):
        """
        This is just a dummy function.
        """
        return docs
    
    # Vectorize the tokens with saved tfidf vocabulary
    transformer = TfidfTransformer()
    loaded_vec = TfidfVectorizer(analyzer='word',tokenizer=dummy_func, preprocessor=dummy_func, token_pattern=None,vocabulary = pickle.load(open("feature.pkl", "rb")))
    tfidf = transformer.fit_transform(loaded_vec.fit_transform(text))
       
    return tfidf

In [10]:
'''
This is the basic user interface used to predict rating from a text review using Tkinter.
Retrieved from https://docs.python.org/2/library/tkinter.html
'''

from tkinter import *
from tkinter import font    
from tkinter.scrolledtext import ScrolledText

import string

def check_punc_num(user_input):
    '''
    This function checks whether the user input consists of only numbers, punctuations and/or whitespaces.
    :params user_input: the text review that user inputted
    '''
    count=0
    for i in user_input:               #loops through all the characters of the user input
        if i in string.punctuation:    #if it is a punctuation
            count += 1
            
        elif i.isdigit():              #if it is a number
            count += 1
            
        elif i in string.whitespace:   #if it is a whitespace
            count += 1
    
    if count == len(user_input):  #check whether the number of punctuations,numbers and whitespaces are the same length as input
        return True
    
    else:
        return False
    
def print_rating(user_review):
    if len(textbox.get("1.0", "end-1c")) == 0:  #checks whether the textbox is empty
        messagebox.showwarning("Warning","Textbox is empty!")
        raise Exception('Textbox is empty!')
        
    elif (check_punc_num(user_review) == True):  #checks whether user input contains numbers, punctuations and/or whitespaces ONLY
        messagebox.showwarning("Warning","Review should not only contain numbers, symbols and/or whitespaces.")
        raise Exception('Review should not only contain numbers, symbols and/or whitespaces')
        
    
    else:      #predict and output the rating
        rev_list = []
        proc = preprocessing_text(user_review)   # preprocesses the review text inputted by user
        rev_list.append(proc)     # appends the preprocessed review text into the list. This is to ensure that all the words are
                                  # considered as a review for tf-idf calculation

        tfidf = calculate_tfidf(rev_list)        # calculates the tf-idf values

        loaded_model = pickle.load(open('finalized_model.sav', 'rb'))   # loads the trained model
        prediction = loaded_model.predict(tfidf)                        # predicts the rating

        for rating in prediction:
            result.insert(END,rating)     # outputs the predicted rating
        
        return True
    
    return False
    
        
def reset():
    '''
    This function resets the review textbox and the predicted ratings textbox.
    '''
    textbox.delete(1.0,END)        # deletes the text inside the textbox (user input)
    result.config(state='normal')  # allows the output textbox to be modified
    result.delete(0,END)           # deletes the text inside the output textbox
    result.config(state='disabled')  # disable the output textbox to be modified

    
def predict():
    '''
    This function displays the predicted rating.
    '''
    result.config(state='normal')      # allows the output textbox to be modified
    result.delete(0,END)               # deletes the text inside the output textbox
    user_review = textbox.get(1.0,END) # gets the review text inputted by user
    te = print_rating(user_review)          #calls this function to predict rating    
    print(te)
    result.config(state='disabled')    # disable the output textbox to be modified


root = Tk()
root.geometry("800x400")               # Sets the size of window
root.configure(background='#DEFFF0')   # Sets the background colour

root.title('Ratings Prediction')       # Sets the title of the root

# Displays the title
main_title = Label(root,text="Ratings Prediction",font=("Times New Roman",30),bg='#DEFFF0')
main_title.pack(fill=BOTH, expand=0)
# To underline the main_title
f = font.Font(main_title, main_title.cget("font"))
f.configure(underline=True)
main_title.configure(font=f)

# Displays text asking user to enter a review text
text = Label(root, text="Enter a review text: ",font=("Times New Roman",15),bg='#DEFFF0')
text.pack(fill=BOTH, expand=0)

# Displays the textbox where user will input the review text
textbox=ScrolledText(root,height=5,width=40,font=("Times New Roman",13))
textbox.pack(fill=Y,expand=0)

# Create a frame to put the buttons in
frame = Frame(root, height="200", width="200", bg="#DEFFF0",borderwidth = 13)
frame.pack()

# Create a button named "Predict" to predict the ratings
pred_button = Button(frame,text="Predict",font=("Times New Roman",11),command = predict,bg='black',fg='white',height=2,width=10)
pred_button.pack(padx=50, side=LEFT)

# Create a button to clear the review text that user inputted
clear_button = Button(frame,text="Clear Review Text",font=("Times New Roman",11),bg='black',fg='white',command=lambda: textbox.delete(1.0,END),height=2,width=20)
clear_button.pack(padx=50,side=LEFT)

# Create a button to reset the review text that user inputted as well as the displayed rating
reset_button = Button(frame,text="Reset",font=("Times New Roman",11),command=reset,bg='black',fg='white',height=2,width=10)
reset_button.pack(padx=50, side=LEFT)

# Displays the text 
text2 = Label(root,text="The rating for this review is: ",font=("Times New Roman",15),bg='#DEFFF0')
text2.pack(padx=5,expand=0)
  
# Displays the predicted rating
result = Entry(root,font=("Times New Roman",13),bg='white')
result.config(state='disabled',disabledbackground='white',disabledforeground='black',justify=CENTER)
result.pack(fill=Y,expand=0)


root.mainloop() 



True
