In [1]:
import pandas as pd                            # to analyse data that are stored in a csv file
import numpy as np                             # to provide a large set of numeric datatypes that can be used to construct arrays
import nltk                                    # a platform for building Python programs to work with human language data
import re                                      # regex model
import datetime
from nltk.corpus import stopwords              # to remove stopwords
from nltk.stem import WordNetLemmatizer        # to lemmatize
from nltk.corpus import wordnet                # used to check whether the word is an adjective, noun, verb or adverb
from sklearn.feature_extraction.text import TfidfTransformer         # to run tfidf transformer on the given data
from sklearn.feature_extraction.text import TfidfVectorizer          # to run tfidf vectorizer on the given data

# Read the csv file
dat = pd.read_csv('preprocessed_review.csv')
dat.head(5)

Unnamed: 0.1,Unnamed: 0,title,rating,date,author,review,product
0,0,Worth Buying For The Pictures Alone (As Ever),4,6 April 2014,By\n\n \n\n Copnovelist\n\n \n\n on 6 ...,Part of the magic for me growing up as a boy ...,Hornby 2014 Catalogue
1,1,Amazing detail fabulous photography.,5,11 April 2015,By\n\n \n\n richard\n\n \n\n on 11 Apr...,"Amazing detail, every credit to the photograp...",Hornby 2014 Catalogue
2,2,'Great Purchase',5,23 April 2014,By\n\n \n\n Pinkhandbag\n\n \n\n on 23...,This was purchased on behalf of my Dad. He is...,Hornby 2014 Catalogue
3,3,Great Catalogue,5,11 Jun. 2014,By\n\n \n\n Gary John Mapson\n\n \n\n ...,Everything I really needed to see what was on...,Hornby 2014 Catalogue
4,4,I collect them all as the glossy pictures are...,5,7 Dec. 2014,By\n\n \n\n David Baker\n\n \n\n on 7 ...,I collect them all as the glossy pictures are...,Hornby 2014 Catalogue


In [2]:
def get_wordnet_pos(word):
    """
    This function gets the wordnet postag of each words.
    :param word: word in each review texts
    :returns: the postag of each word 
    Retrieved from: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N":wordnet.NOUN, "V":wordnet.VERB, "R":wordnet.ADV}
    
    return tag_dict.get(tag,wordnet.NOUN)
    
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess(review):
    """
    This function takes in a list and preprocess accordingly. 
    :param review: list as input
    :returns: preprocessed words
    Retrieved from https://pythonspot.com/nltk-stop-words/
    """
    result = re.sub(r'\d+','', review)      # Remove numbers/ digits
    result = re.sub(r'[^\w\s]','',result)   # Remove puntuations
    val = result.lower()                    # Convert all the reviews to lowercase
    new_list = []
    for word in val.split():                # tokenize and stop words removal
        if word not in stop_words:
            new_list.append(word)
    
    return new_list
    
    
def lemmatize_it(series_list):
    """
    This function is to carry out lemmatization on the
    tokenized review
    :series_list: a list that contains token to be lemmatized
    :returns: lemmatized word
    Retrieved from: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    """
    stem_it = []
    for i in series_list:
        lem = lemmatizer.lemmatize(i, get_wordnet_pos(i))
        stem_it.append(lem)
        
    return stem_it

In [3]:
dat['processed'] = np.nan
dat['processed'] = dat['review'].apply(preprocess)
dat['processed'] = dat['processed'].apply(lemmatize_it)

In [4]:
# concatenating the processed review with ratings column
result = pd.concat([dat['processed'],dat['rating']], axis=1)
result.head(10)

Unnamed: 0,processed,rating
0,"[part, magic, grow, boy, buy, give, new, hornb...",4
1,"[amaze, detail, every, credit, photographer, b...",5
2,"[purchase, behalf, dad, always, ask, look, gau...",5
3,"[everything, really, need, see, offer, hornby,...",5
4,"[collect, glossy, picture, great, nice, still,...",5
5,"[great, book, extremely, useful, insight, futu...",5
6,"[useful, info, someonelike, start, back, hobby...",5
7,"[well, produce, good, quality, cataloguesuper,...",5
8,"[happy, communication, funkybuys]",4
9,"[great, buy]",5


In [5]:
# Calculating the tf-idf values
# Retrieved from https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

X_train_1 = result['processed'].values
Y = result['rating'].values

def identity_tokenizer(text):
    """
    Just a dummy function.
    """
    return text

vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer, analyzer='word',preprocessor = identity_tokenizer,lowercase=True)    
X = vectorizer.fit_transform(X_train_1)

print("Shape of the vectorizer: ",X.shape)


Shape of the vectorizer:  (28212, 28732)


In [None]:
"""
Train model
LinearSVC retrieved from https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
ADASYN retrieved from https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.ADASYN.html
make_pipeline retrieved from https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.pipeline.make_pipeline.html#imblearn.pipeline.make_pipeline
"""
from imblearn.over_sampling import ADASYN                   # to do oversampling
from sklearn.svm import LinearSVC                           # to train support vector machine models
from imblearn.pipeline import make_pipeline                 # construct a pipeline from given estimators. Automates a machine learning workflow

begin = datetime.datetime.now()

pipeline = make_pipeline(ADASYN(),LinearSVC())

pipeline.fit(X, Y)

print("Duration for training: ",datetime.datetime.now() - begin)

In [11]:
# Save the tf-idf model to disk for future use
# Retrieved from https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
import pickle

filename = 'finalized_model.sav'
pickle.dump(pipeline, open(filename, 'wb'))
pickle.dump(vectorizer.vocabulary_,open("feature.pkl","wb"))