# Notebook for calculating sentiment, readability and length for the entire dataset covering each category

In [None]:
import pandas as pd
import numpy as np
import gzip
import csv
import string
import math
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from datetime import datetime
from pathlib import Path
%matplotlib inline

# The disclosed and undisclosed datasets are processed separately, so run the code below twice, starting the second run after the first dataset has finished

# YELP

In [None]:
# Load the two Yelp inputs: reviews with a disclosed gender, and reviews whose
# gender label was predicted by the trained model.
dataset_dir = Path.cwd() / 'datasets/yelp'
disclosed_flag = False
mapping = {'female' : 1, 'male' : 0}

# Disclosed reviews keep the label in the lowercase 'gender' column; the
# 'female'/'male' values are recoded to 1/0.
fields = ['business_id','gender','useful','text', 'stars', 'timestamp', 'categories']
dis_raw_df = pd.read_csv(dataset_dir / 'disclosed_dataset.csv', usecols=fields)
dis_raw_df = dis_raw_df.replace({'gender': mapping})

# Predicted-gender reviews already expose the label under 'Gender'.
fields = ['business_id','Gender','useful','text', 'stars', 'timestamp', 'categories']
undis_raw_df = pd.read_csv(dataset_dir / 'undisclosed_predicted_dataset.csv', usecols=fields)
# undis_raw_df.rename(columns={'predicted_gender': 'Gender'}, inplace=True)
undis_raw_df.head()

# StackExchange

In [None]:
# Load the two StackExchange inputs: posts with a disclosed gender, and posts
# whose gender label was predicted by the trained model.
dataset_dir = Path.cwd() /  'datasets/stackexchange'
mapping = {'female' : 1, 'male' : 0}

# Disclosed posts: recode 'female'/'male' to 1/0 and expose the creation date
# under the 'Timestamp' column name.
fields = ['Gender', 'Score', 'Text', 'CreationDate', 'Reputation']
dis_raw_df = pd.read_csv(dataset_dir / 'disclosed_dataset.csv', usecols=fields)
dis_raw_df = dis_raw_df.replace({'Gender': mapping})
# dis_raw_df.rename(columns={ 'AnswerText': 'Text', 'CreationDate': 'Timestamp'}, inplace=True)
dis_raw_df = dis_raw_df.rename(columns={'CreationDate': 'Timestamp'})

# Predicted-gender posts: same columns, same 'Timestamp' rename.
# fields = ['PredictedGender', 'Score', 'AnswerText', 'CreationDate', 'Reputation']
fields = ['Gender', 'Score', 'Text', 'CreationDate', 'Reputation']
undis_raw_df = pd.read_csv(dataset_dir / 'undisclosed_predicted_dataset.csv', usecols=fields)
undis_raw_df = undis_raw_df.rename(columns={'CreationDate': 'Timestamp'})
undis_raw_df.head()

# Reddit

In [None]:
# Load the two Reddit inputs: comments with a disclosed gender, and comments
# whose gender label was predicted by the trained model.
dataset_dir = Path.cwd() / 'datasets/reddit'
mapping = {'female' : 1, 'male' : 0}

# CSV header: UserName,Text,Timestamp,Score,Categories,Gender
# Disclosed comments: recode 'female'/'male' to 1/0.
fields = ['Gender', 'Score', 'Text', 'Timestamp', 'Categories']
dis_raw_df = pd.read_csv(dataset_dir / 'disclosed_dataset.csv', usecols=fields)
dis_raw_df = dis_raw_df.replace({'Gender': mapping})

# Predicted-gender comments use the same column subset.
fields = ['Gender', 'Score', 'Text', 'Timestamp', 'Categories']
undis_raw_df = pd.read_csv(dataset_dir / 'undisclosed_predicted_dataset.csv', usecols=fields)

# Separate disclosed/undisclosed males and females

In [None]:
# Split each raw frame into one frame per gender code (0 = male, 1 = female).
# The per-gender frames are cached as '<kind>_0.csv' / '<kind>_1.csv' inside
# dataset_dir; when both cache files are present they are read back instead of
# regrouping the raw frame.

# --- disclosed ---
df_dict = {}
dis_male_path = dataset_dir / 'disclosed_0.csv'
dis_female_path = dataset_dir / 'disclosed_1.csv'
# Regenerate when EITHER cache file is missing (same condition as the
# undisclosed half below); otherwise the else-branch would try to read a file
# that does not exist.
if not dis_male_path.exists() or not dis_female_path.exists():
    print('disclosed 0 or 1 csv not exist')
    # Some loaders read the label column as lowercase 'gender'; normalise the
    # name so the groupby below works for every dataset.
    dis_raw_df.rename(columns={'gender': 'Gender'}, inplace=True)
    for i, g in dis_raw_df.groupby('Gender'):
        i = int(i)
        df_dict[i] = g
        g.to_csv(dataset_dir / 'disclosed_{}.csv'.format(i), index=False)
    dis_male_df = df_dict[0]
    dis_female_df = df_dict[1]
else:
    dis_male_df = pd.read_csv(dis_male_path)
    dis_female_df = pd.read_csv(dis_female_path)

# --- undisclosed (predicted gender) ---
# Start from an empty dict so that a gender missing from the undisclosed data
# raises KeyError instead of silently reusing the disclosed frame stored above.
df_dict = {}
undis_male_path = dataset_dir / 'undisclosed_0.csv'
undis_female_path = dataset_dir / 'undisclosed_1.csv'
if not undis_male_path.exists() or not undis_female_path.exists():
    print('undisclosed 0 or 1 csv not exist')
    for i, g in undis_raw_df.groupby('Gender'):
        i = int(i)
        df_dict[i] = g
        g.to_csv(dataset_dir / 'undisclosed_{}.csv'.format(i), index=False)
    undis_male_df = df_dict[0]
    undis_female_df = df_dict[1]
else:
    undis_male_df = pd.read_csv(undis_male_path)
    undis_female_df = pd.read_csv(undis_female_path)

In [None]:
# Preview the first rows of the undisclosed (predicted) female frame.
undis_female_df.head()

In [None]:
import datetime
import time


def date_to_timestamp(date_str):
    """Convert a '%Y-%m-%dT%H:%M:%S.%f' date string to a POSIX timestamp.

    The parsed date is passed through ``time.mktime``, so it is interpreted
    in the machine's local time zone and the fractional seconds are dropped.

    Args:
        date_str: value to convert. Only plain ``str`` objects are parsed.

    Returns:
        The timestamp as a float, or ``np.nan`` when ``date_str`` is not a
        plain ``str``.

    Raises:
        ValueError: if the string does not match the expected format.
    """
    # Anything that is not exactly a str (e.g. a missing value) maps to NaN.
    if type(date_str) is not str:
        return np.nan
    parsed = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%f')
    return time.mktime(parsed.timetuple())

In [None]:
# Convert the 'Timestamp' column of every per-gender frame to a numeric
# timestamp. Values that date_to_timestamp cannot handle become NaN, and
# dropna() then removes every row that contains a NaN in any column.
for _frame in (undis_female_df, undis_male_df, dis_male_df, dis_female_df):
    _frame['Timestamp'] = _frame['Timestamp'].apply(date_to_timestamp)
    _frame.dropna(inplace=True)

In [None]:
def not_punctuation(w):
    """Return False only for a single non-alphabetic character, else True."""
    return len(w) != 1 or w.isalpha()


def get_sent_count(text):
    """Return the number of sentences NLTK's sent_tokenize finds in text."""
    sentences = sent_tokenize(text)
    return len(sentences)

In [None]:
# Token pattern, alternatives tried in order: a run of non-word characters,
# a dollar amount, or a run of non-whitespace characters. Written as a raw
# string so the regex escapes are not treated as (invalid) string escapes.
TOKENIZER = RegexpTokenizer(r'(?u)\W+|\$[\d\.]+|\S+')
SPECIAL_CHARS = ['.', ',', '!', '?']
# Translation table that deletes every character listed in SPECIAL_CHARS.
_SPECIAL_CHARS_TABLE = str.maketrans('', '', ''.join(SPECIAL_CHARS))


def get_words(text=''):
    """Split text into word tokens with '.', ',', '!' and '?' removed.

    Because the tokenizer pattern also matches runs of non-word characters,
    the raw token list contains whitespace and punctuation runs. Any token
    that is empty or whitespace-only once the special characters are removed
    is dropped, so separators such as ' ', '  ', '\\n', '!!' or ' ... ' are
    never counted as words.

    Args:
        text: the string to tokenize.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    stripped_tokens = (
        token.translate(_SPECIAL_CHARS_TABLE)
        for token in TOKENIZER.tokenize(text)
    )
    # str.strip() is falsy for empty and whitespace-only tokens alike.
    return [token for token in stripped_tokens if token.strip()]

# Set the download flag to download the dictionaries, stopwords, etc. once, on the first run

In [None]:
# First-run switch: while True, the cells below call nltk.download() for the
# resources they use (cmudict, stopwords, vader_lexicon).
# Set it to False once they have been downloaded.
need_download = True

In [None]:
# Fetch the CMU Pronouncing Dictionary corpus when the first-run flag is set.
if need_download:
    nltk.download('cmudict')
# Pronunciation lookup used by numsyllables(), which indexes it by lowercased
# word and iterates over the pronunciations stored for that word.
prondict = cmudict.dict()

In [None]:
def numsyllables(word):
    """Return the syllable count of each known pronunciation of word.

    A syllable is counted for every phoneme whose last character is a digit
    (the stress marker carried by vowel phonemes in the CMU dictionary).

    Args:
        word: the word to look up; matching is done on its lowercase form.

    Returns:
        A list with one count per pronunciation in ``prondict``, or ``[0]``
        when the word is not in the dictionary.
    """
    key = word.lower()
    if key not in prondict:
        return [0]
    return [
        sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
        for pronunciation in prondict[key]
    ]

In [None]:
def text_statistics(text):
    """Return the word, sentence and syllable counts of text as floats.

    Words are counted with get_words(), sentences with get_sent_count(), and
    syllables by summing numsyllables() over the tokens from word_tokenize().
    Note that words and syllables are therefore based on two different
    tokenizations.

    Returns:
        A ``(word_cnt, sentence_cnt, syllable_cnt)`` tuple of floats.
    """
    n_words = len(get_words(text))
    n_sentences = get_sent_count(text)

    n_syllables = 0
    for token in word_tokenize(text):
        # A word may have several pronunciations; use the longest one.
        n_syllables += max(numsyllables(token))

    return float(n_words), float(n_sentences), float(n_syllables)

In [None]:
# Flesch Kincaid measure of readability

# readability ease
def flesch_formula(word_count, sent_count, syllable_count):
    """Return the Flesch reading-ease score for the given counts.

    Both ``sent_count`` and ``word_count`` are divisors and must be non-zero.
    """
    return 206.835 - 1.015*word_count/sent_count - 84.6*syllable_count/word_count


def flesch(text):
    """Return the rounded Flesch reading-ease score of text.

    Args:
        text: the string to score.

    Returns:
        The score rounded to the nearest integer, or ``0.0`` when the text
        has no words or no sentences.
    """
    word_count, sent_count, syllable_count = text_statistics(text)
    #print(word_count,sent_count,syllable_count)
    # Both counts are divisors in the formula. Guard them together, exactly as
    # flesch_kincaid() does, so a text with words but no detected sentence
    # yields 0.0 instead of raising ZeroDivisionError.
    if word_count > 0.0 and sent_count > 0.0:
        return round(flesch_formula(word_count, sent_count, syllable_count))
    return 0.0

# grade level
def fk_formula(word_count, sent_count, syllable_count):
    """Return the Flesch-Kincaid grade level for the given counts.

    Both ``sent_count`` and ``word_count`` are divisors and must be non-zero.
    """
    sentence_term = 0.39 * word_count / sent_count
    syllable_term = 11.8 * syllable_count / word_count
    return sentence_term + syllable_term - 15.59


def flesch_kincaid(text):
    """Return the rounded Flesch-Kincaid grade level of text.

    Returns:
        The grade level rounded to the nearest integer, or ``0.0`` when the
        text has no words or no sentences.
    """
    word_count, sent_count, syllable_count = text_statistics(text)
    # Nothing to score without at least one word and one sentence.
    if not (word_count > 0.0 and sent_count > 0.0):
        return 0.0
    return round(fk_formula(word_count, sent_count, syllable_count))

In [None]:
def length(text):
    """Return the number of words in text as a float.

    This is the same value text_statistics() reports as its word count, but
    it is computed directly from get_words() so that no sentence tokenisation
    or per-token syllable lookup is performed just to be thrown away.
    """
    return float(len(get_words(text)))

In [None]:
# Fetch the NLTK stop-word corpus when the first-run flag is set.
if need_download:
    nltk.download('stopwords')
# NOTE(review): sent_tokenize/word_tokenize presumably need NLTK's 'punkt'
# resource, which is not downloaded anywhere in this notebook -- confirm it is
# installed before the first run.
# English stop-word list consulted by sentiment().
words = stopwords.words("english")

# Translation table for str.translate(): with two empty mapping strings and a
# third argument, maketrans() maps every character of that third argument
# (here all of string.punctuation) to None, i.e. translate() deletes it.
table = str.maketrans('', '', string.punctuation)

In [None]:
from  nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# Fetch the VADER lexicon when the first-run flag is set.
if need_download:
    nltk.download('vader_lexicon')
sia = SIA()

# Frozen copy of the stop-word list: constant-time membership tests instead of
# scanning the list once per token.
_STOPWORD_SET = frozenset(words)


def sentiment(text):
    """Return the VADER polarity scores of the cleaned text.

    Cleaning, applied per whitespace-separated token:
      1. delete punctuation characters (via ``table``);
      2. keep the token only if what remains is purely alphabetic;
      3. drop it if its lowercase form is a stop word;
      4. lowercase it.

    Punctuation is stripped BEFORE the alphabetic test. Testing first would
    discard every token that merely touches punctuation (e.g. "great!" or
    "bad.") and turn the stripping step into a no-op. Stop words are matched
    case-insensitively so that capitalised ones ("The", "I") are removed too.

    Args:
        text: the string to score.

    Returns:
        The dict returned by ``sia.polarity_scores`` for the cleaned text.
    """
    kept_tokens = []
    for raw_token in text.split():
        token = raw_token.translate(table)
        lowered = token.lower()
        if token.isalpha() and lowered not in _STOPWORD_SET:
            kept_tokens.append(lowered)
    return sia.polarity_scores(" ".join(kept_tokens))

In [None]:
# Discard every disclosed row that contains a missing value in any column.
dis_male_df, dis_female_df = (
    _frame.dropna() for _frame in (dis_male_df, dis_female_df)
)

In [None]:
# Preview the first rows of the disclosed male frame.
dis_male_df.head()

In [None]:
# Datasets that store the review body under lowercase 'text' are normalised
# to 'Text'; the male frame decides whether both frames are renamed.
if 'Text' not in dis_male_df.columns:
    for _frame in (dis_male_df, dis_female_df):
        _frame.rename(columns={'text': 'Text'}, inplace=True)

# Word count of every disclosed text.
for _frame in (dis_male_df, dis_female_df):
    _frame['Length'] = _frame['Text'].apply(length)

In [None]:
# Sentiment scores (the dict returned by sentiment()) of every disclosed text.
for _frame in (dis_male_df, dis_female_df):
    _frame['Sentiment'] = _frame['Text'].apply(sentiment)

In [None]:
# Flesch-Kincaid grade level of every disclosed text.
for _frame in (dis_male_df, dis_female_df):
    _frame['GradeLevel'] = _frame['Text'].apply(flesch_kincaid)

In [None]:
# Persist the enriched disclosed frames as '|'-separated files
# (l_s_r = length, sentiment, readability).
for _frame, _file_name in (
    (dis_male_df, 'disclosed_male_l_s_r.csv'),
    (dis_female_df, 'disclosed_female_l_s_r.csv'),
):
    _frame.to_csv(dataset_dir / _file_name, sep='|')

In [None]:
# The disclosed frames have been written to disk; drop the references and run
# a garbage-collection pass before the undisclosed frames are processed.
import gc

del dis_male_df, dis_female_df
gc.collect()

In [None]:
# Discard every undisclosed row that contains a missing value in any column,
# then preview the male frame.
undis_male_df, undis_female_df = (
    _frame.dropna() for _frame in (undis_male_df, undis_female_df)
)
undis_male_df.head()

In [None]:
# Normalise a lowercase 'text' column to 'Text' (a no-op when the column is
# already named 'Text').
for _frame in (undis_male_df, undis_female_df):
    _frame.rename(columns={'text': 'Text'}, inplace=True)

# Word count of every undisclosed text.
for _frame in (undis_male_df, undis_female_df):
    _frame['Length'] = _frame['Text'].apply(length)

# Sentiment scores (the dict returned by sentiment()) of every undisclosed text.
for _frame in (undis_male_df, undis_female_df):
    _frame['Sentiment'] = _frame['Text'].apply(sentiment)


In [None]:
# Preview the undisclosed male frame with the new Length/Sentiment columns.
undis_male_df.head()

In [None]:
# Flesch-Kincaid grade level of every undisclosed text.
for _frame in (undis_male_df, undis_female_df):
    _frame['GradeLevel'] = _frame['Text'].apply(flesch_kincaid)

In [None]:
# Persist the enriched undisclosed frames as '|'-separated files
# (l_s_r = length, sentiment, readability).
for _frame, _file_name in (
    (undis_male_df, 'undisclosed_male_l_s_r.csv'),
    (undis_female_df, 'undisclosed_female_l_s_r.csv'),
):
    _frame.to_csv(dataset_dir / _file_name, sep='|')