# Notebook for calculating sentiment, readability and length

In [1]:
import pandas as pd
import numpy as np
import gzip
import csv
import string
import math
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from datetime import datetime
%matplotlib inline

In [2]:
#read csv
disclosed_flag = False
if disclosed_flag:
    fields = ['business_id','gender','useful','text', 'stars', 'timestamp', 'categories']
    raw_df = pd.read_csv('disclosed_dataset.csv', usecols=fields)
    mapping = {'female' : 1, 'male' : 0}
    raw_df.replace({'gender': mapping}, inplace=True)
else:
    fields = ['business_id','predicted_gender','useful','text', 'stars', 'timestamp', 'categories']
    raw_df = pd.read_csv('undisclosed_predicted_dataset.csv', usecols=fields)
    raw_df.rename(columns={'predicted_gender': 'gender'}, inplace=True)
raw_df.head()

Unnamed: 0,gender,business_id,stars,useful,text,timestamp,categories
0,1.0,Hqs4YNST_ZHbshwyi4bnsQ,5.0,0.0,The customer service of the owner made me give...,1539293000.0,"Restaurants, Pizza, American (Traditional), It..."
1,1.0,Hqs4YNST_ZHbshwyi4bnsQ,1.0,0.0,I used to drive from summerlin to go here beca...,1538919000.0,"Restaurants, Pizza, American (Traditional), It..."
2,1.0,bPcqucuuClxYrIM8xWoArg,5.0,4.0,Did anyone mention bacon chocolate ice cream. ...,1317320000.0,"Nightlife, Bars, Italian, Pizza, Wine Bars, Re..."
3,1.0,hubbaEcYPYEZu5Ziz6i0lw,4.0,1.0,I like the Lamb Tandoori dish they do a lot. I...,1304258000.0,"Restaurants, Pakistani, Indian"
4,1.0,5aeR9KcboZmhDZlFscnYRA,5.0,1.0,So Fresh Mama let us host a Homeschool Board G...,1358296000.0,"Food, Restaurants, Internet Cafes, Juice Bars ..."


In [3]:
import os.path

df_dict = {}
if disclosed_flag:
    if not os.path.exists('disclosed_0.csv') or not os.path.exists('disclosed_1.csv'):
        for i, g in raw_df.groupby('gender'):
            df_dict[i] = g
            g.to_csv('disclosed_{}.csv'.format(i), index=False)
    else:
        df_male = pd.read_csv('disclosed_0.csv')
        df_female = pd.read_csv('disclosed_1.csv')
else:
    if not os.path.exists('undisclosed_0.csv') or not os.path.exists('undisclosed_1.csv'):
        print('undisclosed 0 or 1 csv not exist')
        for i, g in raw_df.groupby('gender'):
            df_dict[i] = g
            g.to_csv('undisclosed_{}.csv'.format(i), index=False)
    else:
        df_male = pd.read_csv('undisclosed_0.csv')
        df_female = pd.read_csv('undisclosed_1.csv')

undisclosed 0 or 1 csv not exist


In [4]:
df_male = df_dict[0]
df_female = df_dict[1]
df_male.head()

Unnamed: 0,gender,business_id,stars,useful,text,timestamp,categories
130,0.0,JJxI7OA8wgr8ZMuwaKborQ,4.0,0.0,"Tough to find places around here to eat, so my...",1375885000.0,"Peruvian, Restaurants"
134,0.0,IB8zLlGraOg9LU7qQVLPyg,4.0,129.0,Nach unserem enttäuschenden Besuch im Outlet C...,1443752000.0,"Restaurants, Fast Food, Shopping Centers, Shop..."
135,0.0,9edPSkfXKsJmkZYIaOmA7Q,4.0,204.0,Nachdem wir die Las Vegas North Premium Outlet...,1509952000.0,"Outlet Stores, Restaurants, Shopping Centers, ..."
136,0.0,GHS1rVjO-RMcRB6WJLpCDQ,3.0,113.0,In Las Vegas kann man zwischen zwei verschiede...,1442370000.0,"Outlet Stores, Shopping Centers, Restaurants, ..."
137,0.0,SMPbvZLSMMb7KU76YNYMGg,4.0,201.0,Nach einem Spaziergang durch das exklusive Cry...,1515274000.0,"Hotels & Travel, Arts & Entertainment, Hotels,..."


In [5]:
not_punctuation = lambda w: not (len(w)==1 and (not w.isalpha()))
get_sent_count = lambda text: len(sent_tokenize(text))

In [6]:
TOKENIZER = RegexpTokenizer('(?u)\W+|\$[\d\.]+|\S+')
SPECIAL_CHARS = ['.', ',', '!', '?']

def get_words(text=''):
    words = []
    words = TOKENIZER.tokenize(text)
    filtered_words = []
    for word in words:
        if word in SPECIAL_CHARS or word == " ":
            pass
        else:
            new_word = word.replace(",","").replace(".","")
            new_word = new_word.replace("!","").replace("?","")
            filtered_words.append(new_word)
    return filtered_words

In [7]:
# nltk.download('cmudict')
prondict = cmudict.dict()

In [8]:
def numsyllables(word):
    try:
        return [len(list(y for y in x if (y[-1]).isdigit())) for x in prondict[word.lower()]]
    except KeyError:
        return [0]

In [9]:
def text_statistics(text):
    word_count = len(get_words(text))
    sent_count = get_sent_count(text)
    #if more than one pronunciation, take the largest no. of syllables
    syllable_count = sum(map(lambda w: max(numsyllables(w)), word_tokenize(text)))
    
    analyzedVars = {
        'word_cnt': float(word_count),
        'sentence_cnt': float(sent_count),
        'syllable_cnt': float(syllable_count),
    }
    
    return analyzedVars['word_cnt'],analyzedVars['sentence_cnt'], analyzedVars['syllable_cnt']

In [10]:
#Flesch Kincaid measure of readability

#readability ease
flesch_formula = lambda word_count, sent_count, syllable_count: 206.835 - 1.015*word_count/sent_count - 84.6*syllable_count/word_count

def flesch(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    #print(word_count,sent_count,syllable_count)
    score = 0.0
    if word_count > 0.0:
        score = round(flesch_formula(word_count, sent_count, syllable_count))
    return score

#grade level
fk_formula = lambda word_count, sent_count, syllable_count : 0.39 * word_count / sent_count + 11.8 * syllable_count / word_count - 15.59

def flesch_kincaid(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    score = 0.0
    if word_count > 0.0:
        score = round(fk_formula(word_count, sent_count, syllable_count))
    return score

In [11]:
def length(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    return word_count

In [15]:
# nltk.download('stopwords')
# nltk 'punkt'
words = stopwords.words("english")

#remove punctuation for each word
#maketrans() method returns a translation table that maps each character in the 
#intab string into the character at the same position in the outtab string
table = str.maketrans('', '', string.punctuation)

In [16]:
from  nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# nltk.download('vader_lexicon')
sia = SIA()

def sentiment(text):
    cleaned_text = " ".join([i.translate(table) for i in text.split() if i.isalpha() if i not in words]).lower()
    return sia.polarity_scores(cleaned_text)

In [17]:
df_male = df_male.dropna()

In [18]:
df_male['Sentiment'] = df_male['text'].apply(lambda x: sentiment(x))

In [19]:
df_male['Grade_level'] = df_male['text'].apply(lambda x: flesch_kincaid(x))

In [20]:
df_male['length'] = df_male['text'].apply(lambda x: length(x))

In [21]:
if disclosed_flag:
    df_male.to_csv('disclosed_male_l_s_r.csv', sep='|')
else:
    df_male.to_csv('undisclosed_male_l_s_r.csv',sep='|')

In [22]:
df_male.head()

Unnamed: 0,gender,business_id,stars,useful,text,timestamp,categories,Sentiment,Grade_level,length
130,0.0,JJxI7OA8wgr8ZMuwaKborQ,4.0,0.0,"Tough to find places around here to eat, so my...",1375885000.0,"Peruvian, Restaurants","{'neg': 0.094, 'neu': 0.729, 'pos': 0.176, 'co...",9.0,53.0
134,0.0,IB8zLlGraOg9LU7qQVLPyg,4.0,129.0,Nach unserem enttäuschenden Besuch im Outlet C...,1443752000.0,"Restaurants, Fast Food, Shopping Centers, Shop...","{'neg': 0.132, 'neu': 0.868, 'pos': 0.0, 'comp...",-1.0,230.0
135,0.0,9edPSkfXKsJmkZYIaOmA7Q,4.0,204.0,Nachdem wir die Las Vegas North Premium Outlet...,1509952000.0,"Outlet Stores, Restaurants, Shopping Centers, ...","{'neg': 0.094, 'neu': 0.906, 'pos': 0.0, 'comp...",-1.0,203.0
136,0.0,GHS1rVjO-RMcRB6WJLpCDQ,3.0,113.0,In Las Vegas kann man zwischen zwei verschiede...,1442370000.0,"Outlet Stores, Shopping Centers, Restaurants, ...","{'neg': 0.169, 'neu': 0.822, 'pos': 0.01, 'com...",1.0,266.0
137,0.0,SMPbvZLSMMb7KU76YNYMGg,4.0,201.0,Nach einem Spaziergang durch das exklusive Cry...,1515274000.0,"Hotels & Travel, Arts & Entertainment, Hotels,...","{'neg': 0.056, 'neu': 0.944, 'pos': 0.0, 'comp...",0.0,249.0


In [23]:
import gc
del df_male
gc.collect()

100

In [24]:
df_female = df_female.dropna()
df_female['Grade_level'] = df_female['text'].apply(lambda x: flesch_kincaid(x))
df_female['length'] = df_female['text'].apply(lambda x: length(x))
df_female['Sentiment'] = df_female['text'].apply(lambda x: sentiment(x))

In [25]:
if disclosed_flag:
    df_female.to_csv('disclosed_female_l_s_r.csv', sep='|')
else:
    df_female.to_csv('undisclosed_female_l_s_r.csv', sep='|')