In [16]:
import pandas as pd
import numpy as np
import nltk
import re
import datetime
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import words
from nltk.corpus import stopwords
from contractions import CONTRACTION_MAP
# from stopwords import stop_words
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

#Read the csv file
dat = pd.read_csv('review_ver2.csv', encoding = "ISO-8859-1")

# Change the display size
pd.set_option('display.max_columns',20)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

print(dat.head(5))

   Unnamed: 0                                              title  rating             date                                             author                                             review                product
0           0     Worth Buying For The Pictures Alone (As Ever)        4    6 April 2014    By\n\n    \n\n    Copnovelist\n\n  \n\n on 6 ...   Part of the magic for me growing up as a boy ...  Hornby 2014 Catalogue
1           1              Amazing detail fabulous photography.        5   11 April 2015    By\n\n    \n\n    richard\n\n  \n\n on 11 Apr...   Amazing detail, every credit to the photograp...  Hornby 2014 Catalogue
2           2                                  'Great Purchase'        5   23 April 2014    By\n\n    \n\n    Pinkhandbag\n\n  \n\n on 23...   This was purchased on behalf of my Dad. He is...  Hornby 2014 Catalogue
3           3                                   Great Catalogue        5    11 Jun. 2014    By\n\n    \n\n    Gary John Mapson\n\n  \n\n ...

In [17]:
def expand_contractions(word):
    """
    This function expands words such as I'll to I will.
    :param word: a single review
    :returns: the expanded words
    """
    expanded = ' '.join([CONTRACTION_MAP[t] if t in CONTRACTION_MAP else t for t in word.split(" ") ])
    return expanded

def get_wordnet_pos(word):
    """
    This function gets the wordnet postag of each words.
    :param word: word in each review texts
    :returns: the postag of each word 
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N":wordnet.NOUN, "V":wordnet.VERB, "R":wordnet.ADV}
    
    return tag_dict.get(tag,wordnet.NOUN)
    
def preprocessing_text(text):
    """
    This function preprocesses the review texts by performing contractions, removing numbers and
    punctuations, make all the characters into lowercase, tokenization, lemmatization as well as removing stopwords.
    :param text: a single text review
    :returns: a list of preprocessed words
    """
    #contractions
    expanded_text=expand_contractions(text)
    #remove numbers
    numbers_removed = re.sub(r'\d+','',expanded_text)
    #remove punctuation
    punct_removed = re.sub(r'[^\w\s]','',numbers_removed)
    #tokenization
    tokens = nltk.word_tokenize(punct_removed.lower())
    
    #remove stop words and lemmatization
    lem_words = []
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    for word in tokens:
        temp_word = lemmatizer.lemmatize(word,get_wordnet_pos(word))
        if  temp_word not in stop_words:
            lem_words.append(temp_word)

    return lem_words


In [30]:
# def get_bigram(lem_words):
#     """
#     This function gets the bigram of the review texts.
#     :param lem_words: a list of preprocessed words
#     :returns: a list of bigram words or just a single word (if unable to perform bigram)
#     """
    
#     if len(lem_words) <= 1: #the review contains a single word only, hence unable to perform bigram
#         return lem_words
    
#     else:
#         #gets the bigram in the form of [('wordA','wordB'),('wordB,'wordC'),...]
#         bigrm = list(nltk.bigrams(lem_words))

#         #make the bigram in this format ['wordA wordB','wordB wordC',...]
#         bigrm_list = []
#         separator = ' '
#         for i in range(len(bigrm)):
#             bigrm_list.append(separator.join(bigrm[i]))   
#         return bigrm_list

In [32]:
lem_tokens = dat['review'].apply(preprocessing_text)

# bigram_list = lem_tokens.apply(get_bigram)

# print(lem_tokens[0])

In [37]:

df_unigram = pd.DataFrame({'review':lem_tokens})
# df_rate = pd.DataFrame({'rating':new_rating})
#concatenating the new data frame with ratings column
result = pd.concat([df_unigram,dat['rating']],axis=1)
print(result.head(10))

                                              review  rating
0  [part, magic, grow, boy, buy, give, new, hornb...       4
1  [amaze, detail, every, credit, photographer, b...       5
2  [purchase, behalf, dad, always, ask, look, gau...       5
3  [everything, really, need, see, offer, hornby,...       5
4  [collect, glossy, picture, great, nice, still,...       5
5  [great, book, extremely, useful, insight, futu...       5
6  [useful, info, someonelike, start, back, hobby...       5
7  [well, produce, good, quality, cataloguesuper,...       5
8                  [happy, communication, funkybuys]       4
9                                       [great, buy]       5


In [34]:
#Calculating the tf-idf values

X_train_1 = result['review'].values
Y = result['rating'].values

def identity_tokenizer(text):
    """
    Just a dummy function.
    """
    return text

vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer, analyzer='word',preprocessor = identity_tokenizer,lowercase=True)    
X = vectorizer.fit_transform(X_train_1)

print("shape of the vectorizer: ",X.shape)


shape of the vectorizer:  (28212, 288098)


In [35]:
"""
Train model
"""
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import ADASYN
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report,confusion_matrix

from sklearn.svm import LinearSVC

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from imblearn.pipeline import make_pipeline


begin = datetime.datetime.now()

pipeline = make_pipeline(ADASYN(),LinearSVC())

pipeline.fit(X, Y)


print("duration for training: ",datetime.datetime.now() - begin)

(28212, 288098) (28212,)
begin 2019-05-23 14:33:21.820161
0.921522756273926
final end 2019-05-23 14:34:17.519915
0:00:55.699754


In [36]:
# Save the tf-idf model to disk
import pickle


filename = 'finalized_model.sav'
pickle.dump(pipeline, open(filename, 'wb'))
pickle.dump(vectorizer.vocabulary_,open("feature.pkl","wb"))