In [1]:

import nltk
import string
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from collections import Counter


%matplotlib inline



### Clean-up & Features Generation

In [3]:
# Load the raw reviews, de-duplicate them and derive the per-review,
# per-reviewer and per-product features used by the rest of the notebook.

# Seed for the random down-sampling below, so that "Restart & Run All"
# selects the same rows (and therefore the same final dataset) every time.
DOWNSAMPLE_SEED = 42

# Load data
reviews = pd.read_csv('data/Reviews.csv')

check_columns = reviews.columns[1:] # Remove index column from the subset
print("Duplicated Raws: ", reviews.duplicated(check_columns).sum())

# Delete duplicates:
reviews = reviews.drop_duplicates(subset=check_columns, keep='first')
print("Deleting Duplicates ---> Updated size: ", reviews.shape)


# Add Words count feature (number of spaces + 1):
print("Adding Words Count Feature")
reviews['WordsCount'] = reviews['Text'].str.count(' ') + 1

# Calculate the relative Helpfullness according to the ratings of the reviews:
# the numerator/denominator ratio is rounded to one decimal and scaled by 10.
print("Calculating the Helpfulness Rank")
helpfulness_ratio = reviews['HelpfulnessNumerator'] / reviews['HelpfulnessDenominator']
reviews['HelpfullnessRank'] = helpfulness_ratio.round(1) * 10


# Check the Origin of the NaN values:
# select_1 counts 0/0 rows (NaN ratio); select_2 counts x/0 rows with x != 0.
select_1 = ((reviews.HelpfulnessNumerator == 0) & (reviews.HelpfulnessDenominator == 0))
select_2 = ((reviews.HelpfulnessNumerator != 0) & (reviews.HelpfulnessDenominator == 0))
print("Select 1: {}, Select 2: {}".format(select_1.sum(), select_2.sum()))

# Drop the Nan values - No indication for the helpfullness.
# NOTE: dropna() removes rows with a NaN in *any* column, not only in
# HelpfullnessRank. This is kept on purpose: later cells apply string
# functions to Text and Summary and need those columns to be non-null.
print("Deleting NaN Values")
reviews = reviews.dropna()

# Down sample (x 1/4 randomly) the records with max helpfulness score to reduce the unbalanced distribution:
temp_max_score = reviews[reviews['HelpfullnessRank'] == 10]
temp_not_max = reviews[reviews['HelpfullnessRank'] != 10]
max_random_samples = temp_max_score.sample(frac=0.25, random_state=DOWNSAMPLE_SEED) # seeded random sample

reviews = pd.concat([temp_not_max, max_random_samples])
print("Downsampling max score records ---> {}".format(reviews.shape))

print("Adding the reviewer's popularity (number of reviews written)")
users_rating = pd.read_csv('data/users_rating') # A query which consist of the user popularity information
users_popularity = users_rating[['UserId', 'Total_Reviews']]
# The join key is spelled out; UserId is the only column shared by both frames.
reviews = reviews.merge(users_popularity, on='UserId')
reviews = reviews.rename(columns={'Total_Reviews': 'Total_Reviews_by_Reviewer'})

# Get product frequency among the reviews:
# groupby().size() produces the same (ProductId, freq) table as
# value_counts() but does not depend on how reset_index() names the columns,
# which differs between pandas releases.
print("Adding the frequency of product")
product_occurace = reviews.groupby('ProductId').size().rename('freq').reset_index()
reviews = reviews.merge(product_occurace, on='ProductId')
reviews = reviews.rename(columns={'freq': 'ProductFrequency'})


print("Clear products with less than 10 reviews")
reviews = reviews[reviews.ProductFrequency >=10] # take only products with at least 10 reviews
print("Final shape of dataset: ", reviews.shape)

# Add the variance of Helpfullness score:
# The column is selected before aggregating so the variance is computed only
# for HelpfullnessRank and never attempted on the text columns.
print("Adding the helpfullness score variace product vise")
helpfulness_var = reviews.groupby('ProductId')['HelpfullnessRank'].var()
helpfulness_score_var = pd.DataFrame({'ProductId': np.array(helpfulness_var.index), 'HelpfullnessVar': helpfulness_var.values})
reviews = reviews.merge(helpfulness_score_var, on='ProductId')


Duplicated Raws:  281
Deleting Duplicates ---> Updated size:  (568173, 10)
Adding Words Count Feature
Calculating the Helpfulness Rank
Select 1: 269850, Select 2: 0
Deleting NaN Values
Downsampling max score records ---> (159547, 12)
Adding the reviewer's popularity (number of reviews written)
Adding the frequency of product
Clear products with less than 10 reviews
Final shape of dataset:  (89135, 14)
Adding the helpfullness score variace product vise


#### Clean Text

In [5]:
# Set-up for the text-cleaning step: load the token-replacement dictionary and
# build the spaCy objects that clean() relies on.

# APPO maps a token to its replacement text; clean() looks every token up in it.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# pickles that were produced by this project.
with open('data/appo_dict.pkl', 'rb') as f:
    APPO = pickle.load(f)
    
    
# NOTE(review): the `spacy.en` module path and the "en" model shortcut look like
# the layout of older spaCy releases - confirm against the pinned spaCy version.
from spacy.en import English
import spacy
import regex as re
# `nlp` is not referenced by the cells visible here; clean() uses `parser`.
nlp = spacy.load("en")
parser = English()
# Un-flag the negations as stop words so that clean(), which drops every token
# with is_stop set, keeps "not" and "cannot" in the cleaned text.
parser.vocab["not"].is_stop = False
parser.vocab["cannot"].is_stop = False


from nltk.tokenize import TweetTokenizer
from IPython.display import clear_output


C = 0  # number of comments passed to clean() so far; drives its progress display

# Built once rather than once per comment: clean() is called for every review
# and only ever uses the tokenizer's tokenize() method.
_CLEAN_TOKENIZER = TweetTokenizer()

# Lemmas that are discarded from the cleaned text.
_CLEAN_DROPPED_TOKENS = frozenset(['"', 's', '.', ';', ','])


def clean(comment):
    """
    Normalise one review text and return it as a single cleaned string.

    Steps, in order:
      1. lower-case the text and replace newlines with spaces;
      2. tokenize with NLTK's TweetTokenizer;
      3. replace every token that is a key of the module-level ``APPO``
         dictionary with its mapped value;
      4. parse the re-joined text with the module-level spaCy ``parser`` and
         keep the lemma of every token that is neither a stop word nor
         whitespace;
      5. drop the lemmas listed in ``_CLEAN_DROPPED_TOKENS``.

    Side effects: increments the global counter ``C`` and, on every 100th
    call, prints it and clears the previous cell output, so the notebook
    shows a single updating progress number.

    Parameters
    ----------
    comment : str
        Raw review text.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces.
    """
    global C
    C += 1
    if C % 100 == 0:
        print(C)
        clear_output(wait=True)

    # Lower-case so that "Hi" and "hi" are treated as the same token.
    comment = comment.lower()
    # Replace newlines with spaces.
    comment = re.sub("\\n", " ", comment)

    tokens = _CLEAN_TOKENIZER.tokenize(comment)

    # Dictionary replacement of known tokens; the text is joined back into one
    # string because the spaCy parser below takes text, not a token list.
    expanded_text = " ".join(APPO.get(token, token) for token in tokens)

    lemmas = [
        token.lemma_
        for token in parser(expanded_text)
        if not token.is_stop and not token.is_space
    ]
    kept = [lemma for lemma in lemmas if lemma not in _CLEAN_DROPPED_TOKENS]

    return " ".join(kept)

In [6]:
# Clean every review text. Series.apply is the element-wise mapping API;
# Series.agg is meant for reductions and only happened to work here through a
# fallback path (which would call clean() on the whole Series if any single
# element raised a TypeError/ValueError/AttributeError).
reviews['Text_clean'] = reviews['Text'].apply(clean)

# Persist the cleaned frame so the slow cleaning step need not be repeated.
with open('data/reviews_post_processing.pkl', 'wb') as f:
    pickle.dump(reviews, f)

89100


In [2]:
# Reload the cleaned reviews written by the text-cleaning step.
with open('data/reviews_post_processing.pkl', mode='rb') as processed_file:
    reviews = pickle.load(processed_file)

#### Feature Eng.

In [3]:
def count_uppercase(text):
    """Return the number of characters in ``text`` for which ``str.isupper`` is true."""
    return len([letter for letter in text if letter.isupper()])

def count_lowercase(text):
    """Return the number of characters in ``text`` for which ``str.islower`` is true."""
    lowercase_total = 0
    for letter in text:
        if letter.islower():
            lowercase_total += 1
    return lowercase_total

def count_punctuation(text):
    """Return how many characters of ``text`` appear in ``string.punctuation``."""
    punctuation_marks = string.punctuation
    return len([symbol for symbol in text if symbol in punctuation_marks])

def count_dots(text):
    """Return the number of '.' characters in ``text``."""
    dot_total = 0
    for symbol in text:
        if symbol == '.':
            dot_total += 1
    return dot_total

def count_exclamation_marks(text):
    """Return the number of '!' characters in ``text``."""
    return len([symbol for symbol in text if symbol == '!'])

def count_question_marks(text):
    """Return the number of '?' characters in ``text``."""
    return sum(1 for symbol in text if symbol == '?')

def count_digits(text):
    """Return the number of characters in ``text`` for which ``str.isdigit`` is true."""
    digit_total = 0
    for symbol in text:
        if symbol.isdigit():
            digit_total += 1
    return digit_total

def count_stop_words(text):
    """
    Return the number of tokens in ``text`` that are English stop words.

    The text is tokenized with NLTK's ``word_tokenize`` and each token is
    compared, lower-cased, against NLTK's English stop-word list. The list is
    all lower-case, so without the case-folding capitalised stop words such as
    a sentence-initial "The" would never be counted.

    The stop-word set is loaded from the corpus on the first call only and
    cached on the function object, because this function is applied to every
    review.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    int
        Number of stop-word tokens.
    """
    stop_words = getattr(count_stop_words, '_english_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        count_stop_words._english_stop_words = stop_words

    return sum(1 for token in word_tokenize(text) if token.lower() in stop_words)

def count_sentiment_words(text, threshold=0.5):
    """
    Return the shares of positive, negative and neutral tokens in ``text``.

    Each token produced by ``word_tokenize`` is scored on its own with VADER.
    A token is positive when its compound score is >= ``threshold``, negative
    when it is <= ``-threshold`` and neutral otherwise.

    Parameters
    ----------
    text : str
        Raw text to analyse.
    threshold : float, default 0.5
        Absolute compound score from which a token counts as polar.

    Returns
    -------
    tuple of float
        ``(positive_share, negative_share, neutral_share)``. For a text with
        no tokens all three shares are 0.0 (previously this raised
        ZeroDivisionError).
    """
    word_tokens = word_tokenize(text)
    sentence_length = len(word_tokens)
    if sentence_length == 0:
        return 0.0, 0.0, 0.0

    # Building the analyzer is the expensive part, so it is created on the
    # first call and reused afterwards.
    sid = getattr(count_sentiment_words, '_analyzer', None)
    if sid is None:
        sid = SentimentIntensityAnalyzer()
        count_sentiment_words._analyzer = sid

    positive_total = 0
    negative_total = 0
    for word in word_tokens:
        # Score each token once and reuse the value for both comparisons.
        compound = sid.polarity_scores(word)['compound']
        if compound >= threshold:
            positive_total += 1
        elif compound <= -threshold:
            negative_total += 1
    neutral_total = sentence_length - positive_total - negative_total

    return (positive_total / sentence_length,
            negative_total / sentence_length,
            neutral_total / sentence_length)

def sentiment_analysis(text):
    """
    Return the VADER polarity scores of ``text`` as a 4-tuple.

    The scores are read by key, so the tuple is always ordered
    ``(neg, neu, pos, compound)`` - the order in which the feature cell
    assigns it to columns - instead of depending on the iteration order of
    the dictionary returned by ``polarity_scores``.

    The analyzer is created on the first call and cached on the function
    object, because this function is applied to every review.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    tuple of float
        ``(neg, neu, pos, compound)``.
    """
    sid = getattr(sentiment_analysis, '_analyzer', None)
    if sid is None:
        sid = SentimentIntensityAnalyzer()
        sentiment_analysis._analyzer = sid

    scores = sid.polarity_scores(text)
    return tuple(scores[key] for key in ('neg', 'neu', 'pos', 'compound'))


def calculate_lexical_diversity(text):
    """
    Return the ratio of distinct tokens to total tokens in ``text``.

    Tokens come from NLTK's ``word_tokenize`` and are compared
    case-sensitively.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    float
        ``len(set(tokens)) / len(tokens)``, or 0.0 when the text yields no
        tokens (previously this raised ZeroDivisionError).
    """
    word_tokens = word_tokenize(text)
    if not word_tokens:
        return 0.0
    return len(set(word_tokens)) / len(word_tokens)

def most_common_pos(text):
    """Return the three most frequent part-of-speech tags in ``text`` as (tag, count) pairs."""
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    tag_counts = Counter(tag for _, tag in tagged_tokens)
    return tag_counts.most_common(3)

def count_words(text):
    """Return the number of ``\\w+`` matches (runs of word characters) in ``text``."""
    word_tokenizer = RegexpTokenizer(r'\w+')
    words = word_tokenizer.tokenize(text)
    return len(words)

In [4]:
# Derive the hand-crafted features from the review text, summary and metadata.
#
# Element-wise functions are mapped with Series.apply. Series.agg is meant for
# reductions; with an element-wise callable it is deprecated in recent pandas
# and only worked through a fallback that calls the function on the whole
# Series if any element raises.

eps = np.finfo(float).eps  # tiny offset so the ratio features never divide by exactly zero

# (progress label, feature function, column for Text, column for Summary or None)
text_feature_specs = [
    ('Word count', count_words, 'WordCount', 'WordCountSummary'),
    ('Stop words', count_stop_words, 'StopWords', None),
    ('Uppercase', count_uppercase, 'UpperCount', 'UpperCountSummary'),
    ('Lowercase', count_lowercase, 'LowerCount', 'LowerCountSummary'),
    ('Dots', count_dots, 'DotCount', 'DotCountSummary'),
    ('Exclamation', count_exclamation_marks, 'Exclamation', 'ExclamationSummary'),
    ('Question', count_question_marks, 'Question', 'QuestionSummary'),
    ('Punctuation', count_punctuation, 'CountPunctuation', 'CountPunctuationSummary'),
    ('Digits', count_digits, 'CountDigits', 'CountDigitsSummary'),
    ('Lexical', calculate_lexical_diversity, 'Lexical', 'LexicalSummary'),
]

# Columns are created in the same order as the table above, Text before Summary.
for label, feature_fn, text_column, summary_column in text_feature_specs:
    print(label)
    reviews[text_column] = reviews['Text'].apply(feature_fn)
    if summary_column is not None:
        reviews[summary_column] = reviews['Summary'].apply(feature_fn)

print('Upper/Lower')
reviews['UpperLowerR'] = reviews['UpperCount'] / (reviews['LowerCount'] + eps)
reviews['UpperLowerSumR'] = reviews['UpperCountSummary'] / (reviews['LowerCountSummary'] + eps)

print('Capital/dots')
reviews['DotCapitalR'] = reviews['UpperCount'] / (reviews['DotCount'] + eps)
reviews['DotCapitalSumR'] = reviews['UpperCountSummary'] / (reviews['DotCountSummary'] + eps)

print('CapitalRatio')
reviews['CapitalsRatio'] = reviews['UpperCount'] / (reviews['UpperCountSummary'] + eps)

print('Get Reviews sentiment')
# sentiment_analysis returns one 4-tuple per review; expanding the tuples in
# a single DataFrame constructor avoids building one pd.Series per row.
sentiment_scores = reviews['Text'].apply(sentiment_analysis)
reviews[['neg', 'neu', 'pos', 'compound']] = pd.DataFrame(
    sentiment_scores.tolist(), index=reviews.index)

print('Get log Features')
reviews['ProductFreqlog'] = np.log2(reviews['ProductFrequency'])
reviews['ReviewsbyReviewerlog'] = np.log2(reviews['Total_Reviews_by_Reviewer'])
# NOTE(review): a text with no \w+ match has WordCount 0 and yields -inf here.
reviews['WordCountlog'] = np.log2(reviews['WordCount'])

print('Timestamps')
# Time is interpreted as seconds since the Unix epoch.
reviews['TimeStamp'] = pd.to_datetime(reviews.Time, unit='s')
reviews['Year'] = reviews.TimeStamp.dt.year
reviews['Month'] = reviews.TimeStamp.dt.month
reviews['Day'] = reviews.TimeStamp.dt.day

Word count
Stop words
Uppercase
Lowercase
Dots
Exclamation
Question
Punctuation
Digits
Lexical
Upper/Lower
Capital/dots
CapitalRatio
Get Reviews sentiment
Get log Features
Timestamps


In [5]:
# Persist the frame with all engineered features for the following steps.
with open('data/reviews_post_feature_eng.pkl', mode='wb') as features_file:
    pickle.dump(reviews, features_file)