In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
# Base directory for every dataset file read or written in this notebook,
# resolved relative to the current working directory.
reddit_dir = Path.cwd() / 'datasets/reddit/'

# Load several monthly dumps to collect sufficient training data, since a single month is not enough

In [None]:
df1 = pd.read_json(reddit_dir / 'RC_2016-01', lines=True)
tuned_df1 = df1[['author', 'body', 'created_utc', 'score',  'subreddit']]

In [None]:
df2 = pd.read_json(reddit_dir / 'RC_2016-02', lines=True)
tuned_df2 = df2[['author', 'body', 'created_utc', 'score',  'subreddit']]

In [None]:
df3 = pd.read_json(reddit_dir / 'RC_2016-03', lines=True)
tuned_df3 = df3[['author', 'body', 'created_utc', 'score',  'subreddit']]

In [None]:
# Clean the first monthly subset as one non-in-place chain.
# The previous form called `.replace(..., inplace=True)` on `tuned_df1['body']`, i.e. on a
# temporary column selection; pandas warns about that pattern and does not guarantee that
# the parent frame is updated (with Copy-on-Write it is not).
tuned_df1 = (
    tuned_df1
    .dropna(axis=0)  # drop rows where NAN exists
    # First pattern removes the escape sequences spelled out as text (backslash + t/n/r),
    # second pattern removes the real tab / newline / carriage-return characters.
    .assign(body=lambda frame: frame['body'].replace(
        to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["", ""], regex=True))
    # Keys that are not present as columns (e.g. 'ups') are simply ignored by rename.
    .rename(columns={'author': 'UserName', 'body': 'Text', 'created_utc': 'Timestamp',
                     'score': 'Score', 'ups': 'UpScore', 'subreddit': 'Categories'})
)

In [None]:
# Clean the second monthly subset as one non-in-place chain.
# The previous form called `.replace(..., inplace=True)` on `tuned_df2['body']`, i.e. on a
# temporary column selection; pandas warns about that pattern and does not guarantee that
# the parent frame is updated (with Copy-on-Write it is not).
tuned_df2 = (
    tuned_df2
    .dropna(axis=0)  # drop rows where NAN exists
    # First pattern removes the escape sequences spelled out as text (backslash + t/n/r),
    # second pattern removes the real tab / newline / carriage-return characters.
    .assign(body=lambda frame: frame['body'].replace(
        to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["", ""], regex=True))
    # Keys that are not present as columns (e.g. 'ups') are simply ignored by rename.
    .rename(columns={'author': 'UserName', 'body': 'Text', 'created_utc': 'Timestamp',
                     'score': 'Score', 'ups': 'UpScore', 'subreddit': 'Categories'})
)

In [None]:
# Clean the third monthly subset as one non-in-place chain.
# The previous form called `.replace(..., inplace=True)` on `tuned_df3['body']`, i.e. on a
# temporary column selection; pandas warns about that pattern and does not guarantee that
# the parent frame is updated (with Copy-on-Write it is not).
tuned_df3 = (
    tuned_df3
    .dropna(axis=0)  # drop rows where NAN exists
    # First pattern removes the escape sequences spelled out as text (backslash + t/n/r),
    # second pattern removes the real tab / newline / carriage-return characters.
    .assign(body=lambda frame: frame['body'].replace(
        to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["", ""], regex=True))
    # Keys that are not present as columns (e.g. 'ups') are simply ignored by rename.
    .rename(columns={'author': 'UserName', 'body': 'Text', 'created_utc': 'Timestamp',
                     'score': 'Score', 'ups': 'UpScore', 'subreddit': 'Categories'})
)

# If the full dataset already exists, read it directly

In [None]:
tuned_df = pd.read_csv(reddit_dir / 'html_parsed_full_dataset.csv', sep='|')
tuned_df.rename(columns={'author': 'UserName', 'body': 'Text', 'created_utc': 'Timestamp', 'score': 'Score', 'ups': 'UpScore', 'subreddit': 'Categories'}, 
                        inplace=True)
tuned_df.head()

In [None]:
tuned_df = tuned_df.dropna()

In [None]:
tmp_df = tuned_df['UserName']
tmp_df.head()

In [None]:
disclosed_df = pd.read_csv(reddit_dir / 'disclosed_dataset.csv', usecols=['Categories'])
undisclosed_df = pd.read_csv(reddit_dir / 'undisclosed_dataset.csv', usecols=['Categories'])

In [None]:
concat_df = pd.concat([disclosed_df, undisclosed_df], axis=0, ignore_index=True)

In [None]:
concat_df = concat_df.dropna()
categories_column = concat_df.Categories

In [None]:
category_freq = {}
for cate in categories_column:
    for sub_cate in cate.lower().split(','):
        sub_cate = sub_cate.strip()
        category_freq[sub_cate] = category_freq.get(sub_cate, 0) + 1

In [None]:
sorted_category_freq = sorted(category_freq.items(), key=lambda kv: kv[1], reverse=True)
with open(reddit_dir / 'top_10_categories.json', 'w') as f:
    json.dump(sorted_category_freq[:10], f)
len(sorted_category_freq)

In [None]:
sorted_category_freq

# html parser for body text

In [None]:
import warnings
from bs4 import BeautifulSoup

In [None]:
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
tuned_df1['Text'] = tuned_df1['Text'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text().lower())
tuned_df1.head()

In [None]:
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
tuned_df2['Text'] = tuned_df2['Text'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text().lower())
tuned_df2.head()

In [None]:
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
tuned_df3['Text'] = tuned_df3['Text'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text().lower())
tuned_df3.head()

In [None]:
# tuned_df.to_csv(reddit_dir / 'html_parsed_full_dataset.csv', index=False, sep='|')

In [None]:
# tuned_df = pd.read_csv(reddit_dir / 'html_parsed_full_dataset.csv', sep='|')
# tuned_df.head()

In [None]:
import gender_guesser.detector as gender

# Single detector instance shared by get_gender().
# NOTE(review): case_sensitive=False presumably makes name lookups ignore letter case — confirm in the gender_guesser docs.
d = gender.Detector(case_sensitive=False)

import string
# Translation table for str.translate() that deletes every character in string.punctuation.
translator = str.maketrans('', '', string.punctuation)

def get_gender(name):
    """Guess a gender label for a user name.

    The lower-cased name is first checked for keyword hints: any of
    'mom', 'girl', 'angel', 'mum', 'mother', 'woman' yields 'female';
    otherwise 'boy' or 'dude' yields 'male'. Without a hint, punctuation is
    removed, the first whitespace-separated token is taken, its digits are
    dropped, and the result is passed to the shared detector
    (``d.get_gender(first_name, 'usa')``).

    Args:
        name: The user name. Values that are not text (e.g. None or NaN)
            are tolerated and reported as 'unknown'.

    Returns:
        'female' or 'male' for keyword hits, 'unknown' when the name is not
        text or contains no token after punctuation removal, otherwise
        whatever label the detector returns.
    """
    try:
        lowered = name.lower()
        if any(hint in lowered for hint in ('mom', 'girl', 'angel', 'mum', 'mother', 'woman')):
            return 'female'
        if any(hint in lowered for hint in ('boy', 'dude')):
            return 'male'
        tokens = name.translate(translator).split()
    except (AttributeError, TypeError):
        # Only "this is not a str" failures are treated as unknown. A bare
        # `except:` would also hide KeyboardInterrupt and genuine bugs such
        # as a missing `translator`.
        return 'unknown'

    if not tokens:
        return 'unknown'

    # Digits are stripped because the detector is queried with a plain first name.
    first_name = ''.join(ch for ch in tokens[0] if not ch.isdigit())
    return d.get_gender(first_name, 'usa')

In [None]:
tuned_df1['Gender'] = tuned_df1['UserName'].apply(lambda x:get_gender(x))
tuned_df1.head()

In [None]:
import gc
del tuned_df1
gc.collect()
tuned_df2['Gender'] = tuned_df2['UserName'].apply(lambda x:get_gender(x))
tuned_df2.head()

In [None]:
import gc
del tuned_df2
gc.collect()
tuned_df3['Gender'] = tuned_df3['UserName'].apply(lambda x:get_gender(x))
tuned_df3.head()

In [None]:
disclosed_flags1 = tuned_df1['Gender'].isin(['female', 'male'])
undisclosed_flags1 = ~disclosed_flags1

In [None]:
disclosed_flags2 = tuned_df2['Gender'].isin(['female', 'male'])
undisclosed_flags2 = ~disclosed_flags2

In [None]:
disclosed_flags3 = tuned_df3['Gender'].isin(['female', 'male'])
undisclosed_flags3 = ~disclosed_flags3

In [None]:
disclosed_gender_df1 = tuned_df1[disclosed_flags1]
disclosed_gender_df1.head()

In [None]:
disclosed_gender_df1.shape

In [None]:
disclosed_gender_df1.to_csv(reddit_dir / 'disclosed_dataset1.csv', index=False)

In [None]:
disclosed_gender_df2 = tuned_df2[disclosed_flags2]
disclosed_gender_df2.head()

In [None]:
disclosed_gender_df2.shape

In [None]:
disclosed_gender_df2.to_csv(reddit_dir / 'disclosed_dataset2.csv', index=False)

In [None]:
disclosed_gender_df3 = tuned_df3[disclosed_flags3]
disclosed_gender_df3.head()

In [None]:
disclosed_gender_df3.shape

In [None]:
disclosed_gender_df3.to_csv(reddit_dir / 'disclosed_dataset3.csv', index=False)

In [None]:
disclosed_gender_df1 = pd.read_csv(reddit_dir / 'disclosed_dataset1.csv')
disclosed_gender_df2 = pd.read_csv(reddit_dir / 'disclosed_dataset2.csv')
disclosed_gender_df3 = pd.read_csv(reddit_dir / 'disclosed_dataset3.csv')

In [None]:
disclosed_df = pd.concat([disclosed_gender_df1, disclosed_gender_df2, disclosed_gender_df3], axis=0, ignore_index=True)

In [None]:
disclosed_df.shape # at least (2117828, 7)

In [None]:
disclosed_df.to_csv(reddit_dir / 'disclosed_dataset.csv', index=False)

In [None]:
# undisclosed data is quite enough, so using one of them
# The mask was computed from tuned_df1, so it has to be applied to tuned_df1
# (it was previously applied to the unrelated full-dataset frame `tuned_df`).
undisclosed_gender_df1 = tuned_df1[undisclosed_flags1]
undisclosed_gender_df1.head()

In [None]:
undisclosed_gender_df3 = tuned_df3[undisclosed_flags3]
undisclosed_gender_df3.head()

In [None]:
undisclosed_gender_df3.shape

In [None]:
undisclosed_gender_df3.to_csv(reddit_dir / 'undisclosed_dataset.csv', index=False)

In [None]:
undisclosed_gender_df = undisclosed_gender_df3

# write or read csv file

In [None]:
import os.path
import pandas as pd

disclosed_csv = reddit_dir / 'disclosed_dataset.csv'
undisclosed_csv = reddit_dir / 'undisclosed_dataset.csv'

# Write each CSV only if it is missing, then always load it back. This way
# `disclosed_gender_df` / `undisclosed_gender_df` are bound on both paths
# (previously they were only bound when the file already existed).
if not os.path.exists(disclosed_csv):
    print('saving disclosed dataset to csv')
    disclosed_df.to_csv(disclosed_csv, index=False)
disclosed_gender_df = pd.read_csv(disclosed_csv)

if not os.path.exists(undisclosed_csv):
    print('saving undisclosed dataset to csv')
    # Save the frame chosen as the undisclosed dataset (`undisclosed_gender_df`),
    # matching the other cells that write this file.
    undisclosed_gender_df.to_csv(undisclosed_csv, index=False)
undisclosed_gender_df = pd.read_csv(undisclosed_csv)

In [None]:
# Number of distinct user names (as a 1-tuple) next to the full frame shape.
print(disclosed_gender_df['UserName'].unique().shape, disclosed_gender_df.shape)

In [None]:
# Encode the gender labels numerically: female -> 1, male -> 0.
mapping = {'female' : 1, 'male' : 0}
# Select and map in one expression so a new frame is produced; calling
# replace(..., inplace=True) on a column slice of disclosed_df triggers
# SettingWithCopyWarning and is not guaranteed to behave as intended.
disclosed_dataset_df = disclosed_df[['UserName', 'Text', 'Gender']].replace({'Gender': mapping})
disclosed_dataset_df.head()

# plot data distribution

In [None]:
mapping = {'female' : 1, 'male' : 0}
disclosed_gender_df.replace({'Gender': mapping}, inplace=True)
disclosed_gender_df.head()

In [None]:
# disclosed_gender_df = disclosed_dataset_df
DF_shape = disclosed_gender_df.loc[disclosed_gender_df['Gender'] == 1].shape
DM_shape = disclosed_gender_df.loc[disclosed_gender_df['Gender'] == 0].shape
UNDIS_shape = undisclosed_gender_df.shape

In [None]:
total_size = disclosed_gender_df.shape[0] + UNDIS_shape[0]
print(total_size, disclosed_gender_df.shape[0], UNDIS_shape[0], DM_shape[0], DF_shape[0])

In [None]:
DM_shape

In [None]:
import matplotlib.pyplot as plt

# Pie chart of the share of male rows, female rows and undisclosed rows.
labels = ('SM', 'SW', 'Performing')
sizes = [row_count / total_size for row_count in (DM_shape[0], DF_shape[0], UNDIS_shape[0])]
explode = (0, 0, 0)  # all offsets are zero, so no slice is pulled out of the pie

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=10,
)
# Equal aspect ratio ensures that the pie is drawn as a circle.
ax1.axis('equal')

plt.savefig("reddit_data_distribution.png", dpi=400)

# Split disclosed dataset into train, test and validation

In [None]:
# Split disclosed dataset into train, test and validation
# train: text, gender
# test: name, text, gender
# validation: text, gender

In [None]:
from sklearn.model_selection import train_test_split
from pathlib import Path
reddit_dir = Path.cwd() / 'datasets/reddit/'

# Fixed seed so the train / validation / test membership is the same on every run.
SPLIT_SEED = 42

# Reuse the in-memory frame when an earlier cell built it; otherwise load the saved CSV.
if 'disclosed_dataset_df' in locals():
    print('existed.')
else:
    disclosed_dataset_df = pd.read_csv(reddit_dir / 'disclosed_dataset.csv')

# 80/20 train/test
train_df, test_df = train_test_split(disclosed_dataset_df, test_size=0.2, random_state=SPLIT_SEED)
# 80/20 train/validation, taken from the remaining training portion
train_df, validation_df = train_test_split(train_df, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
# extracting related attributes for training, validation and test

# gender_map = {'male': 0, 'female': 1}
train_gender_text_df = train_df[['Gender', 'Text']]
# train_gender_text_df.replace({'Gender': gender_map}, inplace=True)
validation_gender_text_df = validation_df[['Gender', 'Text']]
# validation_gender_text_df.replace({'Gender': gender_map}, inplace=True)
test_name_text_gender_df = test_df[['UserName', 'Text', 'Gender']]
# test_name_text_gender_df.replace({'Gender': gender_map}, inplace=True)

# train_gender_text_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
train_gender_text_df.to_csv(reddit_dir / 'training_gender_text.csv', index=False, header=False)

# validation_gender_text_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
validation_gender_text_df.to_csv(reddit_dir / 'validation_gender_text.csv', index=False, header=False)

# test_name_text_gender_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
test_name_text_gender_df.to_csv(reddit_dir / 'test_name_text_gender.csv', index=False, header=False)

In [None]:
# undisclosed_gender_df.rename(columns={'body': 'Text'}, inplace=True)
# undisclosed_gender_df = pd.read_csv(reddit_dir / 'undisclosed_dataset.csv')
undisclosed_gender_df = undisclosed_gender_df[['UserName', 'Text', 'Gender']]
# undisclosed_gender_df.replace({'Gender': gender_map}, inplace=True)
# undisclosed_gender_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
undisclosed_gender_df.to_csv(reddit_dir / 'undisclosed_id_text_gender.csv', index=False, header=False)

# Only needed if the gender labels were not mapped to the numeric values 0 and 1 earlier

In [None]:
from pathlib import Path
import pandas as pd

reddit_dir = Path.cwd() / 'datasets/reddit/'
gender_map = {'male': 0, 'female': 1}

# Reload the header-less split files and convert the textual gender labels to 0/1.
train_gender_text_df = pd.read_csv(
    reddit_dir / 'training_gender_text.csv', names=['Gender', 'Text']
).replace({'Gender': gender_map})
validation_gender_text_df = pd.read_csv(
    reddit_dir / 'validation_gender_text.csv', names=['Gender', 'Text']
).replace({'Gender': gender_map})

# Store the mapped versions next to the originals, again without header or index.
for mapped_frame, csv_name in (
    (train_gender_text_df, 'training_gender_text_mapped.csv'),
    (validation_gender_text_df, 'validation_gender_text_mapped.csv'),
):
    mapped_frame.to_csv(reddit_dir / csv_name, index=False, header=False)


In [None]:
from pathlib import Path
import pandas as pd

reddit_dir = Path.cwd() / 'datasets/reddit/'
# The file has no header row, so the column names are supplied here.
undisclosed_id_gender_text_df = pd.read_csv(reddit_dir / 'undisclosed_id_text_gender.csv', names=['UserName', 'Text', 'Gender'])

gender_map = {'male': 0, 'female': 1}
# Map the frame that was just loaded. The mapping was previously applied to
# `train_gender_text_df`, which left this frame untouched and failed with
# NameError when the training frame was not in memory.
undisclosed_id_gender_text_df = undisclosed_id_gender_text_df.replace({'Gender': gender_map})

In [None]:
# accuracy: 0.7296723086106249 at epoch: 9

# reprocess undisclosed dataset

In [None]:
if not Path(reddit_dir / 'undisclosed_dataset.csv').exists():
    print('saving undisclosed dataset to csv')
    undisclosed_gender_df.to_csv(reddit_dir / 'undisclosed_dataset.csv', index=False)
else:
    undisclosed_gender_df = pd.read_csv(reddit_dir / 'undisclosed_dataset.csv')
    
undisclosed_gender_df.rename(columns={'body': 'Text'}, inplace=True)


In [None]:
undisclosed_gender_df = undisclosed_gender_df[['UserName', 'Text', 'Gender']]
undisclosed_gender_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
undisclosed_gender_df.to_csv(reddit_dir / 'undisclosed_id_text_gender.csv', index=False, header=False)