# A user can have multiple reivews, and each review could be labeled as female or male so that a user can act as bi-gender. We must decide which gender is taken

# Yelp

In [None]:
import pandas as pd
from pathlib import Path

dataset_dir = Path.cwd() / 'csv_files/yelp'
inference_result_df = pd.read_csv(dataset_dir / 'inference_result_RNN_vanilla_2.csv')
inference_result_df.head()

# StackExchange

In [None]:
import pandas as pd
from pathlib import Path

dataset_dir = Path.cwd() / 'csv_files/stackexchange'
inference_result_df = pd.read_csv(dataset_dir / 'inference_result_RNN_vanilla_2.csv')
inference_result_df.head()

# Reddit

In [44]:
# Letters, numbers, dashes, and underscores only. Please try again without symbols
import pandas as pd
from pathlib import Path

dataset_dir = Path.home() / 'my_reddit'
inference_result_df = pd.read_csv(dataset_dir / 'inference_result_RNN_vanilla_2.csv')
inference_result_df.head()

Unnamed: 0,True_label,Predicted_label,ReviewerID
0,1,1,Mugiwaras
1,1,0,billbourret
2,1,1,2_Ducks_in_a_Handbag
3,1,1,AnnaLemma
4,1,0,AutoModerator


# Group by ReiewerID, counting value counts of predicted gender label

In [81]:
def majority_gender(x):
    if x > 0.5: # female is majority. 1 is dominant
        return 1
    else:
        return 0

In [88]:
inference_result_grouped_df = inference_result_df.groupby('ReviewerID').agg({'Predicted_label':'sum','True_label':'count'}). \
reset_index().rename(columns={'Predicted_label':'total_count','True_label':'true_count'})
inference_result_grouped_df.head()

Unnamed: 0,ReviewerID,total_count,true_count
0,####&amp;#009;,0,70
1,##&amp;#009;,0,35
2,#&amp;#009;,0,245
3,#ABANDON THREAD!,0,1
4,#DeadMan'sFloat,0,1


In [89]:
# total_count: 0: male, 1: female; plus 0 does not give any gain, so total_count infers the number of 1s
inference_result_grouped_df['Gender'] = inference_result_grouped_df['total_count'] / inference_result_grouped_df['true_count']
inference_result_grouped_df['Gender'].apply(lambda x: majority_gender(x))
inference_result_grouped_df.head()

Unnamed: 0,ReviewerID,total_count,true_count,Gender
0,####&amp;#009;,0,70,0.0
1,##&amp;#009;,0,35,0.0
2,#&amp;#009;,0,245,0.0
3,#ABANDON THREAD!,0,1,0.0
4,#DeadMan'sFloat,0,1,0.0


In [90]:
inference_result_grouped_df = inference_result_grouped_df[['ReviewerID', 'Gender']]
inference_result_grouped_df.head()

Unnamed: 0,ReviewerID,Gender
0,####&amp;#009;,0.0
1,##&amp;#009;,0.0
2,#&amp;#009;,0.0
3,#ABANDON THREAD!,0.0
4,#DeadMan'sFloat,0.0


In [91]:
inference_result_grouped_df = inference_result_grouped_df.dropna()
inference_result_grouped_df.head()

Unnamed: 0,ReviewerID,Gender
0,####&amp;#009;,0.0
1,##&amp;#009;,0.0
2,#&amp;#009;,0.0
3,#ABANDON THREAD!,0.0
4,#DeadMan'sFloat,0.0


In [92]:
inference_result_grouped_df.to_csv(dataset_dir / 'undisclosed_predicted_dataset.csv', index=False)

# Applying value_counts to vote (not working so far)

In [56]:
# value_counts 
# inference_result_grouped_df = inference_result_df.groupby('ReviewerID').agg({'Predicted_label': 'value_counts'})
# inference_result_grouped_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted_label
ReviewerID,Predicted_label,Unnamed: 2_level_1
####&amp;#009;,0,70
##&amp;#009;,0,35
#&amp;#009;,0,245
#ABANDON THREAD!,0,1
#DeadMan'sFloat,0,1


In [None]:
# from bs4 import BeautifulSoup
# tmp_df = inference_result_df['ReviewerID'].apply(lambda x: BeautifulSoup(str(x), 'html.parser').get_text())
# tmp_df.head() 

In [62]:
majority_voting_result_ser = inference_result_df.groupby('ReviewerID').max()
majority_voting_result_ser.head()

Unnamed: 0_level_0,True_label,Predicted_label
ReviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
####&amp;#009;,1,0
##&amp;#009;,1,0
#&amp;#009;,1,0
#ABANDON THREAD!,1,0
#DeadMan'sFloat,1,0


In [65]:
majority_voting_result_df = pd.DataFrame(majority_voting_result_ser.Predicted_label).reset_index()

In [71]:
majority_voting_result_df.columns = ['UserId', 'PredictedGender']
majority_voting_result_df.head()

Unnamed: 0,UserId,PredictedGender
0,####&amp;#009;,0
1,##&amp;#009;,0
2,#&amp;#009;,0
3,#ABANDON THREAD!,0
4,#DeadMan'sFloat,0


In [None]:
import pandas as pd

# so far only undisclosed needs to be voted
disclosed_flag = False
if disclosed_flag:
    disclosed_dataset_df = pd.read_csv(dataset_dir / 'disclosed_dataset.csv')
    mapping = {'female' : 1, 'male' : 0}
    disclosed_dataset_df.replace({'Gender': mapping}, inplace=True)
    test_df = disclosed_dataset_df.head()
else:
    print('undisclosed')
    undisclosed_csv_df = pd.read_csv(dataset_dir / 'undisclosed_dataset.csv', header=0)
    print(undisclosed_csv_df.head())


# only for test

In [None]:
test_df = disclosed_dataset_df.iloc[1:100]

In [None]:
test_tmp = test_df.groupby('UserId').apply(lambda g: g.gender.mode())
test_tmp.head()

In [None]:
ser = test_tmp.gender.groupby('UserId').max(level=0)
ser

In [None]:
#

# Using only when the username is not the unique id
# Merge the voting results back to the dataset you wanna vote the gender

In [None]:
merged_df = pd.merge(majority_voting_result_df, undisclosed_csv_df, on='UserId', how='outer')

In [None]:
# majority_voting_result_df.count
undisclosed_csv_df.UserId.value_counts()

In [None]:
undisclosed_csv_df.count()

In [None]:
majority_voting_result_df.count()

In [None]:
inference_result_grouped_df.count()

In [None]:
merged_df.count()

In [None]:
merged_dropna_df = merged_df.dropna()
merged_dropna_df.count()

In [None]:
merged_dropna_df.drop(columns=['Gender'], inplace=True)
merged_dropna_df.head()

In [None]:
# fields = ['business_id','predicted_gender','useful','text', 'stars', 'timestamp']
# merged_dropna_df = merged_dropna_df[fields]
# merged_dropna_df.rename(columns={'predicted_gender': 'gender'})
merged_dropna_df.to_csv(dataset_dir/ 'undisclosed_predicted_dataset.csv', index=False)