# Notebook for calculating sentiment, readability and length

In [1]:
import pandas as pd
import numpy as np

In [2]:
import gzip
import csv
import string
import math
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from datetime import datetime
%matplotlib inline

In [3]:
#read csv

# fields = ['Product_Id','Gender','Helpfulness','Review','Overall_Rating','Timestamp']
fields = ['business_id','gender','useful','text', 'stars', 'timestamp']
disclosed_flag = True
if disclosed_flag:
    undisclosed_df = pd.read_csv('disclosed_dataset.csv', usecols=fields)
    mapping = {'female' : 1, 'male' : 0}
    undisclosed_df.replace({'gender': mapping}, inplace=True)
else:
    undisclosed_df = pd.read_csv('undisclosed_predicted_dataset.csv', usecols=fields)
undisclosed_df.head()

Unnamed: 0,business_id,stars,useful,text,timestamp,gender
0,ujmEBvifdJM6h6RLv4wQIg,1.0,3.0,"Went in for a broken finger, was asked if I wa...",1525800000.0,1
1,I5TnTKHzJuLA0YLZPGCNwQ,5.0,0.0,Absolutely love this place! Great food for the...,1472864000.0,1
2,3dC3opMY67zrquz_yYc-tw,1.0,0.0,"Frequent this place a lot, well today there wa...",1493149000.0,1
3,_GqNHoWtBOksNcfVjnl0YQ,5.0,0.0,"Phil Brown is a very knowledgeable person, he ...",1491937000.0,1
4,WSGHEQdcdbBWXDpna99EiQ,5.0,0.0,I absolutely love this place. It's delicious a...,1480817000.0,1


In [4]:
df_dict = {}
if disclosed_flag:
    for i, g in undisclosed_df.groupby('gender'):
        df_dict[i] = g
        g.to_csv('disclosed_{}_.csv'.format(i), index=False)
else:
    for i, g in undisclosed_df.groupby('predicted_gender'):
        df_dict[i] = g
        g.to_csv('undisclosed_{}_.csv'.format(i), index=False)


In [None]:
df_dict

In [5]:
df_male = df_dict[0]
df_female = df_dict[1]
df_male.head()

Unnamed: 0,business_id,stars,useful,text,timestamp,gender
33,ujmEBvifdJM6h6RLv4wQIg,1.0,2.0,Interesting... Lisa P. has provided a canned r...,1485610000.0,0
34,ujmEBvifdJM6h6RLv4wQIg,2.0,1.0,I was experiencing some urological issues and ...,1481383000.0,0
35,nrJ_q34hhsOSyUv9nx3sqw,4.0,0.0,"At first glance, this place appears to be an a...",1423157000.0,0
36,iIaC8f8QgaWTYoFSZYasrw,5.0,0.0,"James was on time, friendly and professional. ...",1509405000.0,0
37,nmTOGH5cunMweQ0uNaG52A,5.0,5.0,We had 125 sq yds of carpet installed on 4/15/...,1430822000.0,0


In [6]:
not_punctuation = lambda w: not (len(w)==1 and (not w.isalpha()))
get_sent_count = lambda text: len(sent_tokenize(text))

In [7]:
TOKENIZER = RegexpTokenizer('(?u)\W+|\$[\d\.]+|\S+')
SPECIAL_CHARS = ['.', ',', '!', '?']

def get_words(text=''):
    words = []
    words = TOKENIZER.tokenize(text)
    filtered_words = []
    for word in words:
        if word in SPECIAL_CHARS or word == " ":
            pass
        else:
            new_word = word.replace(",","").replace(".","")
            new_word = new_word.replace("!","").replace("?","")
            filtered_words.append(new_word)
    return filtered_words

In [8]:
# nltk.download('cmudict')
prondict = cmudict.dict()

In [9]:
def numsyllables(word):
    try:
        return [len(list(y for y in x if (y[-1]).isdigit())) for x in prondict[word.lower()]]
    except KeyError:
        return [0]

In [10]:
def text_statistics(text):
    word_count = len(get_words(text))
    sent_count = get_sent_count(text)
    #if more than one pronunciation, take the largest no. of syllables
    syllable_count = sum(map(lambda w: max(numsyllables(w)), word_tokenize(text)))
    
    analyzedVars = {
        'word_cnt': float(word_count),
        'sentence_cnt': float(sent_count),
        'syllable_cnt': float(syllable_count),
    }
    
    return analyzedVars['word_cnt'],analyzedVars['sentence_cnt'], analyzedVars['syllable_cnt']

In [11]:
#Flesch Kincaid measure of readability

#readability ease
flesch_formula = lambda word_count, sent_count, syllable_count : 206.835 - 1.015*word_count/sent_count - 84.6*syllable_count/word_count

def flesch(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    #print(word_count,sent_count,syllable_count)
    score = 0.0
    if word_count > 0.0:
        score = round(flesch_formula(word_count, sent_count, syllable_count))
    return score

#grade level
fk_formula = lambda word_count, sent_count, syllable_count : 0.39 * word_count / sent_count + 11.8 * syllable_count / word_count - 15.59

def flesch_kincaid(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    score = 0.0
    if word_count > 0.0:
        score = round(fk_formula(word_count, sent_count, syllable_count))
    return score

In [12]:
def length(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    return word_count

In [None]:
# import nltk
# nltk.download('punkt')

In [None]:
# pd.set_option('display.max_colwidth', -1)

In [None]:
# nltk.download('stopwords')

In [13]:
words = stopwords.words("english")
#remove punctuation for each word
#maketrans() method returns a translation table that maps each character in the 
#intab string into the character at the same position in the outtab string
table = str.maketrans('','', string.punctuation)

In [14]:
from  nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# nltk.download('vader_lexicon')

sia = SIA()

def sentiment(text):
    cleaned_text = " ".join([i.translate(table) for i in text.split() if i.isalpha() if i not in words]).lower()
    return sia.polarity_scores(cleaned_text)

In [15]:
df_male = df_male.dropna()

In [17]:
df_male['Sentiment'] = df_male['text'].apply(lambda x: sentiment(x))

In [18]:
df_male['Grade_level'] = df_male['text'].apply(lambda x: flesch_kincaid(x))

In [19]:
df_male['length'] = df_male['text'].apply(lambda x: length(x))

In [20]:
df_male.to_csv('disclosed_male_l_s_r.csv',sep='|')

In [21]:
df_male.head()

Unnamed: 0,business_id,stars,useful,text,timestamp,gender,Sentiment,Grade_level,length
33,ujmEBvifdJM6h6RLv4wQIg,1.0,2.0,Interesting... Lisa P. has provided a canned r...,1485610000.0,0,"{'neg': 0.096, 'neu': 0.658, 'pos': 0.246, 'co...",9.0,173.0
34,ujmEBvifdJM6h6RLv4wQIg,2.0,1.0,I was experiencing some urological issues and ...,1481383000.0,0,"{'neg': 0.178, 'neu': 0.778, 'pos': 0.044, 'co...",7.0,149.0
35,nrJ_q34hhsOSyUv9nx3sqw,4.0,0.0,"At first glance, this place appears to be an a...",1423157000.0,0,"{'neg': 0.041, 'neu': 0.68, 'pos': 0.279, 'com...",7.0,183.0
36,iIaC8f8QgaWTYoFSZYasrw,5.0,0.0,"James was on time, friendly and professional. ...",1509405000.0,0,"{'neg': 0.0, 'neu': 0.68, 'pos': 0.32, 'compou...",8.0,40.0
37,nmTOGH5cunMweQ0uNaG52A,5.0,5.0,We had 125 sq yds of carpet installed on 4/15/...,1430822000.0,0,"{'neg': 0.055, 'neu': 0.745, 'pos': 0.2, 'comp...",8.0,282.0


In [22]:
import gc
del df_male
gc.collect()


100

In [23]:
# df_female = pd.read_csv('/media/backup/Data/Amazon/amazon_female.csv', sep='|',encoding='utf8',quoting=csv.QUOTE_NONE,usecols=fields)
df_female = df_female.dropna()
print('loaded dataset to memory')
df_female['Grade_level'] = df_female['text'].apply(lambda x: flesch_kincaid(x))
print('calculated grade-level')
df_female['length'] = df_female['text'].apply(lambda x: length(x))
print('calculated length')
df_female['Sentiment'] = df_female['text'].apply(lambda x: sentiment(x))
print('calculated sentiment')

loaded dataset to memory
calculated grade-level
calculated length
calculated sentiment


In [24]:
df_female = df_female.dropna()

In [25]:
if disclosed_flag:
    df_female.to_csv('disclosed_female_l_s_r.csv',sep='|')

In [26]:
df_female.count()

business_id    2413002
stars          2413002
useful         2413002
text           2413002
timestamp      2413002
gender         2413002
Grade_level    2413002
length         2413002
Sentiment      2413002
dtype: int64