# Notebook for calculating sentiment, readability and length for the entire dataset covering each category

In [1]:
import pandas as pd
import numpy as np
import gzip
import csv
import string
import math
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from datetime import datetime
from pathlib import Path
%matplotlib inline

# Processing disclosed and undisclosed dataset seperately, so run code below twice after one dataset finished

# YELP

In [33]:
# set flag to handle either disclosed or undisclosed that has gender label predicted by trained model
dataset_dir = Path.cwd() / 'datasets/yelp'
disclosed_flag = False
fields = ['business_id','gender','useful','text', 'stars', 'timestamp', 'categories']
dis_raw_df = pd.read_csv(dataset_dir / 'disclosed_dataset.csv', usecols=fields)
mapping = {'female' : 1, 'male' : 0}
dis_raw_df.replace({'gender': mapping}, inplace=True)

fields = ['business_id','Gender','useful','text', 'stars', 'timestamp', 'categories']
undis_raw_df = pd.read_csv(dataset_dir / 'undisclosed_predicted_dataset.csv', usecols=fields)
# undis_raw_df.rename(columns={'predicted_gender': 'Gender'}, inplace=True)
undis_raw_df.head()

Unnamed: 0,Gender,text,stars,useful,business_id,categories,timestamp
0,1.0,So it is 5 stars when Josh(Im pretty sure it i...,5.0,4.0,qmymSqVwHYRqdwfcBatzpQ,"American (New), Wine Bars, Sandwiches, Cocktai...",1290536000.0
1,1.0,I came to see Tron in IMAX because it was cent...,3.0,0.0,jobP3ywRd3QNZ_GCoPG2DQ,"Cinema, Arts & Entertainment",1294364000.0
2,1.0,Lets start out by saying that I am not a Beatl...,5.0,1.0,mz9ltimeAIy2c2qf5ctljw,"Arts & Entertainment, Music Venues, Nightlife,...",1304265000.0
3,1.0,So if you hate pumping your own gas they have ...,5.0,1.0,HZdtHOEaKUL2SlWj5owgCA,"Auto Detailing, Oil Change Stations, Automotiv...",1290833000.0
4,1.0,Why the hell does anyone go to Kona Grill? Un...,5.0,3.0,CWNMLT-ppaUjLMmrnYDPVg,"Asian Fusion, Restaurants, Hawaiian",1288994000.0


# StackExchange

In [2]:
# set flag to handle either disclosed or undisclosed that has gender label predicted by trained model
dataset_dir = Path.cwd() /  'datasets/stackexchange'
fields = ['Gender', 'Score', 'Text', 'CreationDate', 'Reputation']
dis_raw_df = pd.read_csv(dataset_dir / 'disclosed_dataset.csv', usecols=fields)
mapping = {'female' : 1, 'male' : 0}
dis_raw_df.replace({'Gender': mapping}, inplace=True)
# dis_raw_df.rename(columns={ 'AnswerText': 'Text', 'CreationDate': 'Timestamp'}, inplace=True)
dis_raw_df.rename(columns={'CreationDate': 'Timestamp'}, inplace=True)
    
# fields = ['PredictedGender', 'Score', 'AnswerText', 'CreationDate', 'Reputation']
fields = ['Gender', 'Score', 'Text', 'CreationDate', 'Reputation']
undis_raw_df = pd.read_csv(dataset_dir / 'undisclosed_predicted_dataset.csv', usecols=fields)
undis_raw_df.rename(columns={'CreationDate': 'Timestamp'}, inplace=True)
undis_raw_df.head()

Unnamed: 0,Gender,Text,Timestamp,Score,Reputation
0,0.0,You realise that `nth-element` is a built-in f...,2013-06-26T12:51:07.277,0.0,187952.0
1,0.0,"Is this just a learning experience, or do peop...",2010-02-11T03:47:21.490,0.0,187952.0
2,0.0,That's the runtime-type-checking option. For a...,2008-12-29T11:01:43.270,0.0,187952.0
3,0.0,@seh: What the OP wants is to be able to creat...,2009-12-17T15:40:12.390,1.0,187952.0
4,0.0,"The reason is that if you use ordinals, then w...",2013-08-20T15:07:14.957,2.0,187952.0


# Reddit

In [3]:
# set flag to handle either disclosed or undisclosed that has gender label predicted by trained model
dataset_dir = Path.cwd() / 'datasets/reddit'
# UserName,Text,Timestamp,Score,Categories,Gender
fields = ['Gender', 'Score', 'Text', 'Timestamp', 'Categories']
dis_raw_df = pd.read_csv(dataset_dir / 'disclosed_dataset.csv', usecols=fields)
mapping = {'female' : 1, 'male' : 0}
dis_raw_df.replace({'Gender': mapping}, inplace=True)
    
fields = ['Gender', 'Score', 'Text', 'Timestamp', 'Categories']
undis_raw_df = pd.read_csv(dataset_dir / 'undisclosed_predicted_dataset.csv', usecols=fields)

# seperate disclosed/undisclosed males and females

In [20]:
df_dict = {}
if (dataset_dir / 'disclosed_0.csv').exists() or not (dataset_dir / 'disclosed_1.csv').exists():
    print('disclosed 0 or 1 csv not exist')
    dis_raw_df.rename(columns={'gender': 'Gender'}, inplace=True)
    for i, g in dis_raw_df.groupby('Gender'):
        i = int(i)
        df_dict[i] = g
        g.to_csv(dataset_dir / 'disclosed_{}.csv'.format(i), index=False)
    dis_male_df = df_dict[0]
    dis_female_df = df_dict[1]
else:
    dis_male_df = pd.read_csv(dataset_dir / 'disclosed_0.csv')
    dis_female_df = pd.read_csv(dataset_dir / 'disclosed_1.csv')
if not (dataset_dir / 'undisclosed_0.csv').exists() or not (dataset_dir / 'undisclosed_1.csv').exists():
    print('undisclosed 0 or 1 csv not exist')
    for i, g in undis_raw_df.groupby('Gender'):
        i = int(i)
        df_dict[i] = g
        g.to_csv(dataset_dir / 'undisclosed_{}.csv'.format(i), index=False)
    undis_male_df = df_dict[0]
    undis_female_df = df_dict[1]
else:
    undis_male_df = pd.read_csv(dataset_dir / 'undisclosed_0.csv')
    undis_female_df = pd.read_csv(dataset_dir / 'undisclosed_1.csv')

disclosed 0 or 1 csv not exist
undisclosed 0 or 1 csv not exist


In [21]:
undis_female_df.head()

Unnamed: 0,Text,Timestamp,Score,Reputation,Gender
13,need to see your code.,2012-09-29T17:19:40.433,0,331,1
45,Great answer. I moved ``wn.ensure_loaded`` out...,2014-12-12T23:39:21.487,0,2514,1
73,I had the same problem with `knit`ing to Word....,2014-11-12T23:08:54.263,0,1121,1
101,Record for other order,2015-06-01T17:38:49.103,0,115,1
107,Thank u for support! It works for me.,2017-01-04T08:10:15.330,1,49,1


In [18]:
import datetime
import time
def date_to_timestamp(date_str):
    if type(date_str) is str:
        return time.mktime(datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%f').timetuple())
    else:
        return np.nan

In [19]:
undis_female_df['Timestamp'] = undis_female_df['Timestamp'].apply(lambda x: date_to_timestamp(x))
undis_female_df.dropna(inplace=True)
undis_male_df['Timestamp'] = undis_male_df['Timestamp'].apply(lambda x: date_to_timestamp(x))
undis_male_df.dropna(inplace=True)
dis_male_df['Timestamp'] = dis_male_df['Timestamp'].apply(lambda x: date_to_timestamp(x))
dis_male_df.dropna(inplace=True)
dis_female_df['Timestamp'] = dis_female_df['Timestamp'].apply(lambda x: date_to_timestamp(x))
dis_female_df.dropna(inplace=True)

TypeError: strptime() argument 1 must be str, not float

In [5]:
not_punctuation = lambda w: not (len(w) == 1 and (not w.isalpha()))
get_sent_count = lambda text: len(sent_tokenize(text))

In [6]:
TOKENIZER = RegexpTokenizer('(?u)\W+|\$[\d\.]+|\S+')
SPECIAL_CHARS = ['.', ',', '!', '?']

def get_words(text=''):
    words = []
    words = TOKENIZER.tokenize(text)
    filtered_words = []
    for word in words:
        if word in SPECIAL_CHARS or word == " ":
            pass
        else:
            new_word = word.replace(",","").replace(".","")
            new_word = new_word.replace("!","").replace("?","")
            filtered_words.append(new_word)
    return filtered_words

# set download flag to download dicts, stopwords etc. once while run at first time

In [8]:
# download dicts, stopwords etc. once while run at first time
need_download = True

In [9]:
if need_download:
    nltk.download('cmudict')
prondict = cmudict.dict()

[nltk_data] Downloading package cmudict to /home/yafei/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [10]:
def numsyllables(word):
    try:
        return [len(list(y for y in x if (y[-1]).isdigit())) for x in prondict[word.lower()]]
    except KeyError:
        return [0]

In [11]:
def text_statistics(text):
    word_count = len(get_words(text))
    sent_count = get_sent_count(text)
    # if more than one pronunciation, take the largest no. of syllables
    syllable_count = sum(map(lambda w: max(numsyllables(w)), word_tokenize(text)))
    
    analyzedVars = {
        'word_cnt': float(word_count),
        'sentence_cnt': float(sent_count),
        'syllable_cnt': float(syllable_count),
    }
    
    return analyzedVars['word_cnt'],analyzedVars['sentence_cnt'], analyzedVars['syllable_cnt']

In [12]:
# Flesch Kincaid measure of readability

# readability ease
flesch_formula = lambda word_count, sent_count, syllable_count: 206.835 - 1.015*word_count/sent_count - 84.6*syllable_count/word_count

def flesch(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    #print(word_count,sent_count,syllable_count)
    score = 0.0
    if word_count > 0.0:
        score = round(flesch_formula(word_count, sent_count, syllable_count))
    return score

# grade level
fk_formula = lambda word_count, sent_count, syllable_count : 0.39 * word_count / sent_count + 11.8 * syllable_count / word_count - 15.59

def flesch_kincaid(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    score = 0.0
    if word_count > 0.0 and sent_count > 0.0:
        score = round(fk_formula(word_count, sent_count, syllable_count))
    return score

In [13]:
def length(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    return word_count

In [14]:
if need_download:
    nltk.download('stopwords')
# nltk 'punkt'
words = stopwords.words("english")

# remove punctuation for each word
# maketrans() method returns a translation table that maps each character in the 
# intab string into the character at the same position in the outtab string
table = str.maketrans('', '', string.punctuation)

[nltk_data] Downloading package stopwords to /home/yafei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
from  nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

if need_download:
    nltk.download('vader_lexicon')
sia = SIA()

def sentiment(text):
    cleaned_text = " ".join([i.translate(table) for i in text.split() if i.isalpha() if i not in words]).lower()
    return sia.polarity_scores(cleaned_text)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/yafei/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [16]:
dis_male_df = dis_male_df.dropna()
dis_female_df = dis_female_df.dropna()

In [17]:
dis_male_df.head()

Unnamed: 0,Text,Timestamp,Score,Categories,Gender
2,just keep playing you'll be fine.,1451606402,1,fo4,0
3,this sideline shit makes nonsense,1451606403,5,CFB,0
4,dude just accept clemson is nasty,1451606409,0,CFB,0
5,kosh has been as good the last few years as sm...,1451606410,0,soccer,0
8,see i guess maybe i'm more forgiving because i...,1451606411,1,AskReddit,0


In [18]:
if 'Text' not in dis_male_df.columns:
    dis_male_df.rename(columns={'text': 'Text'}, inplace=True)
    dis_female_df.rename(columns={'text': 'Text'}, inplace=True)

dis_male_df['Length'] = dis_male_df['Text'].apply(lambda x: length(x))
dis_female_df['Length'] = dis_female_df['Text'].apply(lambda x: length(x))

In [19]:
dis_male_df['Sentiment'] = dis_male_df['Text'].apply(lambda x: sentiment(x))
dis_female_df['Sentiment'] = dis_female_df['Text'].apply(lambda x: sentiment(x))

In [20]:
dis_male_df['GradeLevel'] = dis_male_df['Text'].apply(lambda x: flesch_kincaid(x))
dis_female_df['GradeLevel'] = dis_female_df['Text'].apply(lambda x: flesch_kincaid(x))

In [21]:
dis_male_df.to_csv(dataset_dir / 'disclosed_male_l_s_r.csv', sep='|')
dis_female_df.to_csv(dataset_dir / 'disclosed_female_l_s_r.csv', sep='|')

In [22]:
# free memory
import gc
del dis_male_df
del dis_female_df
gc.collect()

68

In [23]:
undis_male_df = undis_male_df.dropna()
undis_female_df = undis_female_df.dropna()
undis_male_df.head()

Unnamed: 0,Gender,Text,Timestamp,Score,Categories
0,0.0,where's the banana for scale?!?!?!,1459303830,2,pics
1,0.0,lmao average bernie supporter,1457395012,1,trees
2,0.0,i hope they dont make him a pussy,1457504648,7,television
3,0.0,can anyone tell me what the temperature actual...,1457585810,1,survivor
4,0.0,"it is suddenly, when she was released devourer...",1457862203,1,leagueoflegends


In [24]:
undis_male_df.rename(columns={'text': 'Text'}, inplace=True)
undis_female_df.rename(columns={'text': 'Text'}, inplace=True)
undis_male_df['Length'] = undis_male_df['Text'].apply(lambda x: length(x))
undis_female_df['Length'] = undis_female_df['Text'].apply(lambda x: length(x))
undis_male_df['Sentiment'] = undis_male_df['Text'].apply(lambda x: sentiment(x))
undis_female_df['Sentiment'] = undis_female_df['Text'].apply(lambda x: sentiment(x))


In [25]:
undis_male_df.head()

Unnamed: 0,Gender,Text,Timestamp,Score,Categories,Length,Sentiment
0,0.0,where's the banana for scale?!?!?!,1459303830,2,pics,5.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,0.0,lmao average bernie supporter,1457395012,1,trees,5.0,"{'neg': 0.0, 'neu': 0.25, 'pos': 0.75, 'compou..."
2,0.0,i hope they dont make him a pussy,1457504648,7,television,8.0,"{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'comp..."
3,0.0,can anyone tell me what the temperature actual...,1457585810,1,survivor,9.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,0.0,"it is suddenly, when she was released devourer...",1457862203,1,leagueoflegends,37.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [26]:
undis_male_df['GradeLevel'] = undis_male_df['Text'].apply(lambda x: flesch_kincaid(x))
undis_female_df['GradeLevel'] = undis_female_df['Text'].apply(lambda x: flesch_kincaid(x))

In [27]:
undis_male_df.to_csv(dataset_dir / 'undisclosed_male_l_s_r.csv', sep='|')
undis_female_df.to_csv(dataset_dir / 'undisclosed_female_l_s_r.csv', sep='|')