# Notebook for calculating sentiment, readability and length for the entire dataset covering each category

In [None]:
import pandas as pd
import numpy as np
import gzip
import csv
import string
import math
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from datetime import datetime
%matplotlib inline

# Process the disclosed and undisclosed datasets separately: run the code below twice, once for each dataset.

In [None]:
# Select the dataset handled by this run: True for the disclosed dataset,
# False for the undisclosed dataset whose gender label was predicted by the
# trained model.
disclosed_flag = False

if not disclosed_flag:
    fields = ['business_id','predicted_gender','useful','text', 'stars', 'timestamp', 'categories']
    raw_df = pd.read_csv('undisclosed_predicted_dataset.csv', usecols=fields)
    # Expose the predicted label under the same column name the disclosed
    # dataset uses, so the grouping on 'gender' works for both.
    raw_df = raw_df.rename(columns={'predicted_gender': 'gender'})
else:
    fields = ['business_id','gender','useful','text', 'stars', 'timestamp', 'categories']
    raw_df = pd.read_csv('disclosed_dataset.csv', usecols=fields)
    # Encode the textual gender labels as integers.
    mapping = {'female' : 1, 'male' : 0}
    raw_df = raw_df.replace({'gender': mapping})
raw_df.head()

In [None]:
import os.path

# Per-gender frames keyed by the gender label: 0 is male and 1 is female,
# matching the mapping applied to the disclosed dataset and the *_0 / *_1
# file names below.
df_dict = {}

# Both datasets use the same caching scheme; only the file prefix differs.
split_prefix = 'disclosed' if disclosed_flag else 'undisclosed'
male_csv = '{}_0.csv'.format(split_prefix)
female_csv = '{}_1.csv'.format(split_prefix)

if os.path.exists(male_csv) and os.path.exists(female_csv):
    # Reuse the split written by a previous run. df_dict is filled too,
    # because the following cell reads df_dict[0] and df_dict[1]; leaving it
    # empty made that cell fail with KeyError whenever the cache existed.
    df_male = pd.read_csv(male_csv)
    df_female = pd.read_csv(female_csv)
    df_dict[0] = df_male
    df_dict[1] = df_female
else:
    if not disclosed_flag:
        print('undisclosed 0 or 1 csv not exist')
    # Split the raw data by gender label and cache each part on disk.
    for i, g in raw_df.groupby('gender'):
        df_dict[i] = g
        g.to_csv('{}_{}.csv'.format(split_prefix, i), index=False)

In [None]:
# Pull the per-gender frames out of the split: key 0 is male, key 1 is female.
df_male, df_female = df_dict[0], df_dict[1]
df_male.head()

In [None]:
def not_punctuation(w):
    """Return False only when ``w`` is a single non-alphabetic character."""
    return len(w) != 1 or w.isalpha()


def get_sent_count(text):
    """Return the number of sentences ``sent_tokenize`` finds in ``text``."""
    sentences = sent_tokenize(text)
    return len(sentences)

In [None]:
# Raw string so the regex escapes (\W, \$, \d, \S) are not parsed as invalid
# string escapes; the pattern itself is unchanged.
TOKENIZER = RegexpTokenizer(r'(?u)\W+|\$[\d\.]+|\S+')
SPECIAL_CHARS = ['.', ',', '!', '?']
# Translation table that deletes every character listed in SPECIAL_CHARS.
_SPECIAL_CHARS_TABLE = str.maketrans('', '', ''.join(SPECIAL_CHARS))


def get_words(text=''):
    """Split ``text`` into word tokens with sentence punctuation removed.

    The text is tokenized with ``TOKENIZER`` and the characters in
    ``SPECIAL_CHARS`` are deleted from every token. Tokens that are empty or
    whitespace-only after that cleanup are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the cleaned, non-blank tokens in their original order.
    """
    filtered_words = []
    for token in TOKENIZER.tokenize(text):
        cleaned = token.translate(_SPECIAL_CHARS_TABLE)
        # The \W+ alternative also matches separators such as double spaces,
        # newlines or "...". Only the exact tokens " ", ".", ",", "!" and "?"
        # used to be skipped, so every other separator was counted as a word.
        if cleaned.strip():
            filtered_words.append(cleaned)
    return filtered_words

# Set the download flag to True on the first run to download the dictionaries, stop words, etc.

In [None]:
# Set to True on the first run to download the NLTK resources used below
# (cmudict, stopwords, vader_lexicon); leave it False once they are installed.
need_download = False

In [None]:
# Fetch the 'cmudict' corpus when a download was requested.
if need_download:
    nltk.download('cmudict')
# Pronunciation dictionary; numsyllables() looks words up in it by their
# lowercase form.
prondict = cmudict.dict()

In [None]:
def numsyllables(word):
    """Return one syllable count per pronunciation of ``word``.

    The word is looked up in ``prondict`` by its lowercase form. For each
    pronunciation, the count is the number of entries whose last character
    is a digit (presumably the stress-marked vowel phonemes of CMUdict).

    Returns:
        A list with one count per pronunciation, or ``[0]`` when the word is
        not in the dictionary.
    """
    try:
        pronunciations = prondict[word.lower()]
    except KeyError:
        return [0]
    return [
        sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
        for pronunciation in pronunciations
    ]

In [None]:
def text_statistics(text):
    """Return the word, sentence and syllable counts of ``text``.

    Words come from ``get_words``, sentences from ``get_sent_count`` and
    syllables from ``numsyllables`` applied to every ``word_tokenize`` token.

    Returns:
        A ``(word_cnt, sentence_cnt, syllable_cnt)`` tuple of floats.
    """
    n_words = len(get_words(text))
    n_sentences = get_sent_count(text)

    # A word can have several pronunciations; use the one with the most
    # syllables.
    n_syllables = 0
    for token in word_tokenize(text):
        n_syllables += max(numsyllables(token))

    return float(n_words), float(n_sentences), float(n_syllables)

In [None]:
# Flesch Kincaid measure of readability

# readability ease
def flesch_formula(word_count, sent_count, syllable_count):
    """Return the Flesch reading-ease score for the given counts.

    Both ``word_count`` and ``sent_count`` are divisors and must be non-zero.
    """
    return 206.835 - 1.015*word_count/sent_count - 84.6*syllable_count/word_count


def flesch(text):
    """Return the rounded Flesch reading-ease score of ``text``.

    Returns:
        The score rounded to the nearest integer, or ``0.0`` when the text
        has no words or no sentences.
    """
    word_count, sent_count, syllable_count = text_statistics(text)
    # sent_count is a divisor as well as word_count; checking only the word
    # count would raise ZeroDivisionError if no sentence is detected.
    if word_count > 0.0 and sent_count > 0.0:
        return round(flesch_formula(word_count, sent_count, syllable_count))
    return 0.0

# grade level
def fk_formula(word_count, sent_count, syllable_count):
    """Return the Flesch-Kincaid grade level for the given counts.

    Both ``word_count`` and ``sent_count`` are divisors and must be non-zero.
    """
    return 0.39 * word_count / sent_count + 11.8 * syllable_count / word_count - 15.59


def flesch_kincaid(text):
    """Return the rounded Flesch-Kincaid grade level of ``text``.

    Returns:
        The grade level rounded to the nearest integer, or ``0.0`` when the
        text has no words or no sentences.
    """
    word_count, sent_count, syllable_count = text_statistics(text)
    # sent_count is a divisor as well as word_count; checking only the word
    # count would raise ZeroDivisionError if no sentence is detected.
    if word_count > 0.0 and sent_count > 0.0:
        return round(fk_formula(word_count, sent_count, syllable_count))
    return 0.0

In [None]:
def length(text):
    """Return the number of words in ``text`` as a float.

    The value equals the word count reported by ``text_statistics``.
    """
    # Only the word count is needed, so skip the sentence tokenization and
    # the per-token syllable lookup that text_statistics() also performs.
    return float(len(get_words(text)))

In [None]:
# Fetch the 'stopwords' corpus when a download was requested.
if need_download:
    nltk.download('stopwords')
# NOTE(review): sent_tokenize/word_tokenize presumably also need the NLTK
# 'punkt' resource, which is not downloaded here -- confirm.
# English stop words, used by sentiment() to filter tokens.
words = stopwords.words("english")

# Translation table for removing punctuation from each word.
# With two empty strings and a third argument, str.maketrans() maps every
# character of the third argument to None, so str.translate() deletes all
# characters in string.punctuation.
table = str.maketrans('', '', string.punctuation)

In [None]:
# VADER sentiment analyzer from NLTK.
from  nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# Fetch the 'vader_lexicon' resource when a download was requested.
if need_download:
    nltk.download('vader_lexicon')
# Shared analyzer instance used by sentiment().
sia = SIA()

# Stop words as a set, so each membership test is constant-time instead of a
# scan over the whole list.
_STOPWORD_SET = frozenset(words)


def sentiment(text):
    """Return the VADER polarity scores of the cleaned ``text``.

    Cleaning splits the text on whitespace, lowercases every token, deletes
    the punctuation covered by ``table``, and keeps only purely alphabetic
    tokens that are not stop words.

    Args:
        text: The review text to score.

    Returns:
        The dict returned by ``sia.polarity_scores`` for the cleaned text.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        # Strip punctuation before the alphabetic test. Testing the raw
        # token first discarded every word with attached punctuation
        # (e.g. "great!") and made the stripping a no-op.
        cleaned = lowered.translate(table)
        if not cleaned.isalpha():
            continue
        # Compare lowercase forms so capitalized stop words ("The") are
        # removed. The raw form is checked too, for contractions that the
        # stop-word list spells with an apostrophe.
        if lowered in _STOPWORD_SET or cleaned in _STOPWORD_SET:
            continue
        kept.append(cleaned)
    return sia.polarity_scores(" ".join(kept))

In [None]:
# Drop every row that has a missing value in any column before the features
# are computed.
df_male = df_male.dropna()

In [None]:
# Word count of every review text.
df_male['length'] = df_male['text'].apply(length)

In [None]:
# Sentiment scores (as returned by sentiment()) of every review text.
df_male['Sentiment'] = df_male['text'].apply(sentiment)

In [None]:
# Flesch-Kincaid grade level of every review text.
df_male['Grade_level'] = df_male['text'].apply(flesch_kincaid)

In [None]:
# Save the male reviews with their length, sentiment and readability columns;
# the file name depends on which dataset this run processed.
df_male.to_csv(
    'disclosed_male_l_s_r.csv' if disclosed_flag else 'undisclosed_male_l_s_r.csv',
    sep='|',
)

In [None]:
# Preview the first rows of the male frame, including the new feature columns.
df_male.head()

In [None]:
# Release the male frame before the female frame is processed.
import gc
del df_male
# Run a garbage collection pass right away.
gc.collect()

In [None]:
# Same feature pipeline as for the male frame: drop rows with missing values,
# then add the word count, sentiment scores and Flesch-Kincaid grade level.
df_female = df_female.dropna()
df_female['length'] = df_female['text'].apply(length)
df_female['Sentiment'] = df_female['text'].apply(sentiment)
df_female['Grade_level'] = df_female['text'].apply(flesch_kincaid)

In [None]:
# Save the female reviews with their length, sentiment and readability
# columns; the file name depends on which dataset this run processed.
df_female.to_csv(
    'disclosed_female_l_s_r.csv' if disclosed_flag else 'undisclosed_female_l_s_r.csv',
    sep='|',
)