#### Load Post-Processed Data:


In [2]:
import pickle

# Load the post-processed reviews object from its pickle file.
# NOTE: pickle can execute arbitrary code on load; only open files produced by
# this project.
with open('data/reviews_post_processing.pkl', 'rb') as pickle_file:
    reviews = pickle.load(pickle_file)

In [None]:
def count_uppercase(text):
    """Return the number of uppercase characters found in ``text``."""
    total = 0
    for symbol in text:
        if symbol.isupper():
            total += 1
    return total

def count_lowercase(text):
    """Return the number of lowercase characters found in ``text``."""
    lowercase_chars = [symbol for symbol in text if symbol.islower()]
    return len(lowercase_chars)

def count_punctuation(text):
    """Return how many characters of ``text`` appear in ``string.punctuation``."""
    punctuation_marks = string.punctuation
    # Each membership test is a bool; summing them counts the matches.
    return sum(symbol in punctuation_marks for symbol in text)

def count_dots(text):
    """Return the number of '.' characters in ``text``."""
    dots = 0
    for symbol in text:
        if symbol == '.':
            dots += 1
    return dots

def count_exclamation_marks(text):
    """Return the number of '!' characters in ``text``."""
    # Comparisons yield bools, which sum() adds up as 0/1.
    return sum(symbol == '!' for symbol in text)

def count_question_marks(text):
    """Return the number of '?' characters in ``text``."""
    question_marks = [symbol for symbol in text if symbol == '?']
    return len(question_marks)

def count_digits(text):
    """Return the number of characters in ``text`` for which ``isdigit()`` is true."""
    return sum(1 for symbol in text if symbol.isdigit())

# Lazily-built cache of the English stop-word set. The set is identical for
# every call, so it is built once instead of once per review.
_ENGLISH_STOP_WORDS = None


def count_stop_words(text):
    """Return the number of tokens in ``text`` that are English stop words.

    ``text`` is tokenized with ``word_tokenize`` and every token is compared
    as-is (no case folding) against ``stopwords.words('english')``.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    stop_words = _ENGLISH_STOP_WORDS
    return sum(1 for token in word_tokenize(text) if token in stop_words)

def count_sentiment_words(text, threshold=0.5):
    """Return the fractions of positive, negative and neutral tokens in ``text``.

    Each token produced by ``word_tokenize`` is scored on its own with
    ``SentimentIntensityAnalyzer.polarity_scores``. A token counts as positive
    when its ``'compound'`` score is ``>= threshold``, negative when it is
    ``<= -threshold`` and neutral otherwise.

    Args:
        text: The string to analyse.
        threshold: Absolute compound-score cut-off for a non-neutral token.

    Returns:
        A ``(positive, negative, neutral)`` tuple of fractions of the token
        count. When ``text`` yields no tokens the result is
        ``(0.0, 0.0, 0.0)`` instead of a ZeroDivisionError.
    """
    word_tokens = word_tokenize(text)
    sentence_length = len(word_tokens)
    if sentence_length == 0:
        # Nothing to classify; avoid dividing by zero below.
        return 0.0, 0.0, 0.0

    sid = SentimentIntensityAnalyzer()
    positive_count = 0
    negative_count = 0
    for word in word_tokens:
        # Score each word once and reuse the value for both comparisons.
        compound = sid.polarity_scores(word)['compound']
        if compound >= threshold:
            positive_count += 1
        elif compound <= -threshold:
            negative_count += 1
    # Every token that is neither positive nor negative is neutral.
    neutral_count = sentence_length - positive_count - negative_count

    return (
        positive_count / sentence_length,
        negative_count / sentence_length,
        neutral_count / sentence_length,
    )

def sentiment_analysis(text):
    """Return the sentiment scores of ``text`` as ``(neg, neu, pos, compound)``.

    The scores come from ``SentimentIntensityAnalyzer.polarity_scores``. The
    keys are selected explicitly so the tuple order does not depend on the
    insertion order of the returned dict; the feature-engineering code assigns
    the tuple positionally to the columns ``neg``, ``neu``, ``pos`` and
    ``compound``.
    """
    sid = SentimentIntensityAnalyzer()
    scores = sid.polarity_scores(text)
    return tuple(scores[key] for key in ('neg', 'neu', 'pos', 'compound'))


def calculate_lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    Tokens come from ``word_tokenize`` and are compared exactly as produced.
    Returns ``0.0`` when ``text`` yields no tokens, rather than raising
    ZeroDivisionError.
    """
    word_tokens = word_tokenize(text)
    if not word_tokens:
        return 0.0
    return len(set(word_tokens)) / len(word_tokens)

def most_common_pos(text):
    """Return the three most frequent part-of-speech tags in ``text``.

    The result is a list of ``(tag, count)`` pairs as produced by
    ``Counter.most_common(3)``.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    tag_frequencies = Counter([tagged[1] for tagged in tagged_tokens])
    return tag_frequencies.most_common(3)

def count_words(text):
    """Return the number of ``\\w+`` matches (runs of word characters) in ``text``."""
    words = RegexpTokenizer(r'\w+').tokenize(text)
    return len(words)

In [None]:
# Build the engineered feature columns on the ``reviews`` frame.
#
# Series.apply is used for every per-row feature. Series.agg with an
# element-wise callable is deprecated in recent pandas, and on a
# TypeError/ValueError/AttributeError it silently retries the function on the
# whole Series, which would yield one scalar instead of one value per review.

# Machine epsilon, added to denominators so a zero count does not divide by zero.
eps = np.finfo(float).eps

print('Word count')
reviews['WordCount'] = reviews['Text'].apply(count_words)
reviews['WordCountSummary'] = reviews['Summary'].apply(count_words)

print('Stop words')
reviews['StopWords'] = reviews['Text'].apply(count_stop_words)

print('Uppercase')
reviews['UpperCount'] = reviews['Text'].apply(count_uppercase)
reviews['UpperCountSummary'] = reviews['Summary'].apply(count_uppercase)

print('Lowercase')
reviews['LowerCount'] = reviews['Text'].apply(count_lowercase)
reviews['LowerCountSummary'] = reviews['Summary'].apply(count_lowercase)

print('Dots')
reviews['DotCount'] = reviews['Text'].apply(count_dots)
reviews['DotCountSummary'] = reviews['Summary'].apply(count_dots)

print('Exclamation')
reviews['Exclamation'] = reviews['Text'].apply(count_exclamation_marks)
reviews['ExclamationSummary'] = reviews['Summary'].apply(count_exclamation_marks)

print('Question')
reviews['Question'] = reviews['Text'].apply(count_question_marks)
reviews['QuestionSummary'] = reviews['Summary'].apply(count_question_marks)

print('Punctuation')
reviews['CountPunctuation'] = reviews['Text'].apply(count_punctuation)
reviews['CountPunctuationSummary'] = reviews['Summary'].apply(count_punctuation)

print('Digits')
reviews['CountDigits'] = reviews['Text'].apply(count_digits)
reviews['CountDigitsSummary'] = reviews['Summary'].apply(count_digits)

print('Lexical')
reviews['Lexical'] = reviews['Text'].apply(calculate_lexical_diversity)
reviews['LexicalSummary'] = reviews['Summary'].apply(calculate_lexical_diversity)

# Ratio features derived from the counts computed above.
print('Upper/Lower')
reviews['UpperLowerR'] = reviews['UpperCount'] / (reviews['LowerCount'] + eps)
reviews['UpperLowerSumR'] = reviews['UpperCountSummary'] / (reviews['LowerCountSummary'] + eps)

print('Capital/dots')
reviews['DotCapitalR'] = reviews['UpperCount'] / (reviews['DotCount'] + eps)
reviews['DotCapitalSumR'] = reviews['UpperCountSummary'] / (reviews['DotCountSummary'] + eps)

print('CapitalRatio')
reviews['CapitalsRatio'] = reviews['UpperCount'] / (reviews['UpperCountSummary'] + eps)

print('Get Reviews sentiment')
# sentiment_analysis returns one tuple per review; expanding it with pd.Series
# gives four columns that are assigned positionally to the names on the left.
reviews[['neg', 'neu', 'pos', 'compound']] = reviews['Text'].apply(sentiment_analysis).apply(pd.Series)

print('Get log Features')
reviews['ProductFreqlog'] = np.log2(reviews['ProductFrequency'])
reviews['ReviewsbyReviewerlog'] = np.log2(reviews['Total_Reviews_by_Reviewer'])
# A review with no word tokens has WordCount == 0 and log2(0) is -inf.
# Clipping at 1 maps those rows to 0 and leaves every other value unchanged.
reviews['WordCountlog'] = np.log2(reviews['WordCount'].clip(lower=1))

print('Timestamps')
# ``Time`` is interpreted as seconds since the Unix epoch (unit='s').
reviews['TimeStamp'] = pd.to_datetime(reviews.Time, unit='s')
reviews['Year'] = reviews.TimeStamp.dt.year
reviews['Month'] = reviews.TimeStamp.dt.month
reviews['Day'] = reviews.TimeStamp.dt.day

In [None]:
# Persist the reviews frame, including the engineered feature columns, as a pickle.
with open('data/reviews_post_feature_eng.pkl', 'wb') as pickle_file:
    pickle.dump(reviews, pickle_file)

In [25]:
import pickle

from sklearn.model_selection import train_test_split

# Load the feature-engineered reviews written by the previous step.
with open('data/reviews_post_feature_eng.pkl', 'rb') as f:
    reviews = pickle.load(f)

# Keep only the rows whose helpfulness rank is at most 10.
reviews = reviews[reviews['HelpfullnessRank'] <= 10]

# Separate the target column from the feature columns.
label = 'HelpfullnessRank'
X = reviews.drop(label, axis=1)
y = reviews[label]

# Remember the feature names; they are saved next to the arrays later on.
df_columns = X.columns

# Fixed seed so that re-running the notebook reproduces the same partitions.
# Without it, every run writes different train/validation/test files to disk.
RANDOM_STATE = 42

# Split to train, test and validation datasets:
# 60% goes to train; the remaining 40% is split 51% test / 49% validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.49, random_state=RANDOM_STATE
)

# Bare expression so the notebook displays the three shapes.
X_train.shape, X_test.shape, X_val.shape


((53480, 50), (18183, 50), (17471, 50))

In [26]:
import numpy as np
# Write every split, plus the feature column names, to its own file under
# data/ (np.save appends the ``.npy`` extension to each path).
arrays_to_save = (
    ('data/X_train', X_train),
    ('data/X_val', X_val),
    ('data/X_test', X_test),
    ('data/y_train', y_train),
    ('data/y_val', y_val),
    ('data/y_test', y_test),
    ('data/df_cols', df_columns),
)
for target_path, values in arrays_to_save:
    np.save(target_path, values)