# A user can have multiple reviews, and each review may be labeled female or male, so a single user can appear as both genders. We must decide which gender to assign to each user.

# Yelp

In [1]:
import pandas as pd
from pathlib import Path

# Yelp: dataset_dir and inference_result_df are the names the later cells
# read, so running this cell selects Yelp as the active dataset.
dataset_dir = Path.cwd().joinpath('datasets/yelp')
inference_result_df = pd.read_csv(dataset_dir.joinpath('inference_result_RNN_vanilla_2.csv'))
inference_result_df.head()

Unnamed: 0,True_label,Predicted_label,ReviewerID
0,1,0,56iEnLi8jR--2ranjPSQ4w
1,1,1,56iEnLi8jR--2ranjPSQ4w
2,1,1,56iEnLi8jR--2ranjPSQ4w
3,1,1,56iEnLi8jR--2ranjPSQ4w
4,1,1,56iEnLi8jR--2ranjPSQ4w


# StackExchange

In [2]:
import pandas as pd
from pathlib import Path

# StackExchange: rebinds dataset_dir and inference_result_df, replacing
# whatever dataset a previously executed load cell had selected.
dataset_dir = Path.cwd().joinpath('datasets/stackexchange')
inference_result_df = pd.read_csv(dataset_dir.joinpath('inference_result_RNN_vanilla_2.csv'))
inference_result_df.head()

Unnamed: 0,True_label,Predicted_label,ReviewerID
0,1,0,305644
1,1,0,3385216
2,1,0,960757
3,1,0,11449
4,1,0,1113420


# Reddit

In [1]:
# Letters, numbers, dashes, and underscores only. Please try again without symbols
import pandas as pd
from pathlib import Path

# accuracy: 0.688802607108919 at epoch: 9


# Reddit: rebinds dataset_dir and inference_result_df for the cells below.
dataset_dir = Path.cwd().joinpath('datasets/reddit')
inference_result_df = pd.read_csv(dataset_dir.joinpath('inference_result_RNN_vanilla_2.csv'))
# Debug helper: uncomment to list the rows that contain any missing value.
# inference_result_df[inference_result_df.isna().any(axis=1)]

In [7]:
# Show every prediction stored for one reviewer to see how mixed the
# per-review labels can be for a single account.
inference_result_df.loc[
    inference_result_df['ReviewerID'] == 'cah_white_bot',
    ['Predicted_label', 'ReviewerID']
]

Unnamed: 0,Predicted_label,ReviewerID
6,0,cah_white_bot
64768,0,cah_white_bot
163846,0,cah_white_bot
1465610,0,cah_white_bot
1487624,1,cah_white_bot
1746690,1,cah_white_bot
1839368,0,cah_white_bot
1908224,1,cah_white_bot
1954049,0,cah_white_bot
2155016,1,cah_white_bot


# Group by ReviewerID, aggregating with pandas sum and count

In [3]:
def majority_gender(x):
    """Collapse a share of female-labelled reviews into one gender label.

    Args:
        x: Fraction of a user's reviews labelled 1 (female).

    Returns:
        1 when ``x`` is strictly greater than 0.5, otherwise 0. An exact
        tie (0.5) and any value that does not compare greater than 0.5
        (for example NaN) therefore yield 0.
    """
    return 1 if x > 0.5 else 0

In [4]:
# One row per reviewer. Mind the resulting column names:
#   total_count = sum of Predicted_label, i.e. how many rows were predicted 1
#   true_count  = count of True_label, i.e. how many rows the reviewer has
inference_result_grouped_df = (
    inference_result_df
    .groupby('ReviewerID')
    .agg({'Predicted_label': 'sum', 'True_label': 'count'})
    .reset_index()
    .rename(columns={'Predicted_label': 'total_count', 'True_label': 'true_count'})
)
inference_result_grouped_df.head()

Unnamed: 0,ReviewerID,total_count,true_count
0,13,0,307
1,24,0,3
2,25,0,3
3,26,0,22
4,36,0,1


In [5]:
# Labels are 0 = male, 1 = female, so summing them (total_count) counts the
# female predictions. Dividing by the reviewer's row count (true_count) gives
# the female share, which majority_gender turns into a single 0/1 label.
inference_result_grouped_df['Gender'] = (
    inference_result_grouped_df['total_count']
    .div(inference_result_grouped_df['true_count'])
    .apply(majority_gender)
)
inference_result_grouped_df.head()

Unnamed: 0,ReviewerID,total_count,true_count,Gender
0,13,0,307,0
1,24,0,3,0
2,25,0,3,0
3,26,0,22,0
4,36,0,1,0


In [6]:
# Keep only the reviewer id and the voted label; the helper count columns
# are no longer needed once the vote has been taken.
inference_result_grouped_df = inference_result_grouped_df[['ReviewerID', 'Gender']]
inference_result_grouped_df.head()

Unnamed: 0,ReviewerID,Gender
0,13,0
1,24,0
2,25,0
3,26,0
4,36,0


In [7]:
# Drop any row that has a missing value in either remaining column.
# NOTE(review): the recorded preview is identical before and after this step;
# presumably it is only a safeguard - confirm whether NaN can occur here.
inference_result_grouped_df = inference_result_grouped_df.dropna()
inference_result_grouped_df.head()

Unnamed: 0,ReviewerID,Gender
0,13,0
1,24,0
2,25,0
3,26,0
4,36,0


In [8]:
# Preview the voted frame. The recorded output shows a 'UserId' column, so it
# was captured after the ReviewerID -> UserId rename in the merge cell had run.
inference_result_grouped_df.head()

Unnamed: 0,UserId,Gender
0,-------------------c,0
1,------------------__,0
2,------------------oh,0
3,----------_----,0
4,----------rocketman,0


In [8]:
import pandas as pd
# dataset_dir = Path.cwd() / 'datasets/reddit'

# so far only undisclosed needs to be voted
disclosed_flag = False
if disclosed_flag:
    # Disclosed users carry a textual label; replace it with the numeric
    # encoding used by the predictions (female = 1, male = 0).
    disclosed_dataset_df = pd.read_csv(dataset_dir / 'disclosed_dataset.csv')
    mapping = {'female' : 1, 'male' : 0}
    disclosed_dataset_df.replace({'Gender': mapping}, inplace=True)
    test_df = disclosed_dataset_df.head()
else:
    print('undisclosed')
    # The voted frame is keyed by 'ReviewerID'; the merge below joins on 'UserId'.
    inference_result_grouped_df.rename(columns={'ReviewerID': 'UserId'},  inplace=True)
    # NOTE(review): the recorded output of this cell ends with a truncated
    # warning. If it is a pandas DtypeWarning raised by this read, consider
    # passing an explicit dtype= (or low_memory=False) - confirm.
    undisclosed_csv_df = pd.read_csv(dataset_dir / 'undisclosed_dataset.csv', header=0)
    # Dataset-specific column fixes, kept disabled; enable the ones that
    # match the dataset currently selected by dataset_dir.
    # Stackexchange unused
    # undisclosed_csv_df.rename(columns={'UserName': 'UserId'}, inplace=True)
    # Yelp only, make sure join with same column name
    # undisclosed_csv_df.rename(columns={'gender': 'Gender', 'user_id': 'UserId'}, inplace=True)
    # drop unlabeled Gender column
    # undisclosed_csv_df.drop(columns=['Gender'], inplace=True)
    # Outer join: rows present on only one side are kept, with NaN filling the
    # columns of the other side. When both frames have a 'Gender' column,
    # pandas suffixes them as Gender_x (left) and Gender_y (right).
    merged_df = pd.merge(inference_result_grouped_df, undisclosed_csv_df, on='UserId', how='outer')
    # A bare `merged_df.head()` inside this branch is evaluated and discarded
    # (the notebook only echoes a trailing top-level expression), so print the
    # preview explicitly.
    print(merged_df.head())


undisclosed


  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# Keep only fully populated rows: any row with a missing value in any column
# is removed (this includes rows the outer merge matched on one side only).
merged_df = merged_df.dropna(axis=0, how='any')
merged_df.count()

UserId          4665083
Gender_x        4665083
index           4665083
Text            4665083
CreationDate    4665083
Score           4665083
UserName        4665083
Reputation      4665083
Gender_y        4665083
dtype: int64

In [10]:
# Preview the merged frame after incomplete rows were dropped.
merged_df.head()

Unnamed: 0,UserId,Gender_x,index,Text,CreationDate,Score,UserName,Reputation,Gender_y
0,13,0.0,18754000,You realise that `nth-element` is a built-in f...,2013-06-26T12:51:07.277,0.0,Chris Jester-Young,187952.0,mostly_male
1,13,0.0,1511088,"Is this just a learning experience, or do peop...",2010-02-11T03:47:21.490,0.0,Chris Jester-Young,187952.0,mostly_male
2,13,0.0,124092,That's the runtime-type-checking option. For a...,2008-12-29T11:01:43.270,0.0,Chris Jester-Young,187952.0,mostly_male
3,13,0.0,1236489,@seh: What the OP wants is to be able to creat...,2009-12-17T15:40:12.390,1.0,Chris Jester-Young,187952.0,mostly_male
4,13,0.0,20068351,"The reason is that if you use ordinals, then w...",2013-08-20T15:07:14.957,2.0,Chris Jester-Young,187952.0,mostly_male


In [11]:
# Resolve the suffixed duplicates created by the merge: discard Gender_y (the
# column that came from the right-hand frame) and let Gender_x (the voted
# label from the left-hand frame) become the single 'Gender' column.
del merged_df['Gender_y']
merged_df.rename(columns={'Gender_x': 'Gender'}, inplace=True)
merged_df.head()

Unnamed: 0,UserId,Gender,index,Text,CreationDate,Score,UserName,Reputation
0,13,0.0,18754000,You realise that `nth-element` is a built-in f...,2013-06-26T12:51:07.277,0.0,Chris Jester-Young,187952.0
1,13,0.0,1511088,"Is this just a learning experience, or do peop...",2010-02-11T03:47:21.490,0.0,Chris Jester-Young,187952.0
2,13,0.0,124092,That's the runtime-type-checking option. For a...,2008-12-29T11:01:43.270,0.0,Chris Jester-Young,187952.0
3,13,0.0,1236489,@seh: What the OP wants is to be able to creat...,2009-12-17T15:40:12.390,1.0,Chris Jester-Young,187952.0
4,13,0.0,20068351,"The reason is that if you use ordinals, then w...",2013-08-20T15:07:14.957,2.0,Chris Jester-Young,187952.0


In [12]:
# Remove the 'index' column, in place, so it is not written to the output file.
del merged_df['index']
merged_df.head()

Unnamed: 0,UserId,Gender,Text,CreationDate,Score,UserName,Reputation
0,13,0.0,You realise that `nth-element` is a built-in f...,2013-06-26T12:51:07.277,0.0,Chris Jester-Young,187952.0
1,13,0.0,"Is this just a learning experience, or do peop...",2010-02-11T03:47:21.490,0.0,Chris Jester-Young,187952.0
2,13,0.0,That's the runtime-type-checking option. For a...,2008-12-29T11:01:43.270,0.0,Chris Jester-Young,187952.0
3,13,0.0,@seh: What the OP wants is to be able to creat...,2009-12-17T15:40:12.390,1.0,Chris Jester-Young,187952.0
4,13,0.0,"The reason is that if you use ordinals, then w...",2013-08-20T15:07:14.957,2.0,Chris Jester-Young,187952.0


In [13]:
# Persist the merged frame next to the input files; index=False leaves the
# pandas row index out of the CSV.
merged_df.to_csv(dataset_dir.joinpath('undisclosed_predicted_dataset.csv'), index=False)

#  Read extra fields in the whole undisclosed  dataset

In [None]:
import pandas as pd

# so far only undisclosed needs to be voted
disclosed_flag = False
if not disclosed_flag:
    # Undisclosed path: load the dataset whose users still need a voted label.
    print('undisclosed')
    undisclosed_csv_df = pd.read_csv(dataset_dir.joinpath('undisclosed_dataset.csv'), header=0)
    print(undisclosed_csv_df.head())
else:
    # Disclosed path: replace the textual label with the numeric encoding
    # (female = 1, male = 0).
    disclosed_dataset_df = pd.read_csv(dataset_dir.joinpath('disclosed_dataset.csv'))
    mapping = {'female' : 1, 'male' : 0}
    disclosed_dataset_df.replace({'Gender': mapping}, inplace=True)
    test_df = disclosed_dataset_df.head()


# Use only when the username is not the unique id
# Merge the voting results back into the dataset whose gender you want to vote on

In [None]:
# Outer-join the per-user voting result with the undisclosed dataset on 'UserId'.
# NOTE(review): majority_voting_result_df is not assigned in any visible cell;
# confirm where it is defined (or whether inference_result_grouped_df was
# intended) before running this.
merged_df = pd.merge(majority_voting_result_df, undisclosed_csv_df, on='UserId', how='outer')

In [None]:
# majority_voting_result_df.count
# Number of rows each UserId has in the undisclosed dataset.
undisclosed_csv_df.UserId.value_counts()

In [None]:
# Non-null count per column of the undisclosed dataset (sanity check before merging).
undisclosed_csv_df.count()

In [None]:
# Non-null count per column of the voting result.
# NOTE(review): majority_voting_result_df is not assigned in any visible cell - confirm.
majority_voting_result_df.count()

In [None]:
# Non-null count per column of the per-reviewer voted frame.
inference_result_grouped_df.count()

In [None]:
# Non-null count per column after the merge; differing counts across columns
# reveal rows that matched on one side only.
merged_df.count()

In [None]:
# Keep only rows without any missing value, under a new name so that
# merged_df itself stays untouched.
merged_dropna_df = merged_df.dropna(axis=0, how='any')
merged_dropna_df.count()

In [None]:
# Remove the 'Gender' column from the cleaned frame, in place.
# NOTE(review): presumably this discards the dataset's own (unlabeled) gender
# so only the predicted one remains - confirm. If both merged frames carried a
# 'Gender' column, pandas will have renamed them Gender_x / Gender_y and this
# drop raises KeyError.
merged_dropna_df.drop(columns=['Gender'], inplace=True)
merged_dropna_df.head()

In [None]:
# Optional column selection / renaming, currently disabled:
# fields = ['business_id','predicted_gender','useful','text', 'stars', 'timestamp']
# merged_dropna_df = merged_dropna_df[fields]
# merged_dropna_df.rename(columns={'predicted_gender': 'gender'})
# Writes to the same file name as the earlier export cell, so with an
# unchanged dataset_dir this overwrites that file.
merged_dropna_df.to_csv(dataset_dir.joinpath('undisclosed_predicted_dataset.csv'), index=False)