# A user can have multiple reivews, and each review could be labeled as female or male so that a user can act as bi-gender. We must decide which gender is taken

# Yelp

In [None]:
import pandas as pd
from pathlib import Path

dataset_dir = Path.cwd() / 'csv_files/yelp'
inference_result_df = pd.read_csv(dataset_dir / 'inference_result_RNN_vanilla_2.csv', usecols=['Predicted_label', 'ReviewerID'])
inference_result_df.head()

# StackExchange

In [1]:
import pandas as pd
from pathlib import Path

dataset_dir = Path.cwd() / 'csv_files/stackexchange'
inference_result_df = pd.read_csv(dataset_dir / 'inference_result_RNN_vanilla_2.csv', usecols=['Predicted_label', 'ReviewerID'])
inference_result_df.head()

Unnamed: 0,Predicted_label,ReviewerID
0,0,169
1,0,157
2,0,157
3,0,51
4,0,603


In [2]:
inference_result_grouped_df = inference_result_df.groupby('ReviewerID').agg({'Predicted_label': 'value_counts'})
inference_result_grouped_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted_label
ReviewerID,Predicted_label,Unnamed: 2_level_1
8,0,3
15,0,14
17,0,2
21,0,7
22,0,4


In [3]:
majority_voting_result_ser = inference_result_df.groupby('ReviewerID').max()
majority_voting_result_ser.head()

Unnamed: 0_level_0,Predicted_label
ReviewerID,Unnamed: 1_level_1
8,0
15,0
17,0
21,0
22,0


In [4]:
majority_voting_result_df = pd.DataFrame(majority_voting_result_ser.Predicted_label).reset_index()

In [5]:
majority_voting_result_df.columns = ['UserId', 'PredictedGender']
majority_voting_result_df.head()

Unnamed: 0,UserId,PredictedGender
0,8,0
1,15,0
2,17,0
3,21,0
4,22,0


In [6]:
majority_voting_result_df.PredictedGender.value_counts()

0    4326
1       3
Name: PredictedGender, dtype: int64

In [7]:
import pandas as pd

# so far only undisclosed needs to be voted
disclosed_flag = False
if disclosed_flag:
    disclosed_dataset_df = pd.read_csv(dataset_dir / 'disclosed_dataset.csv')
    mapping = {'female' : 1, 'male' : 0}
    disclosed_dataset_df.replace({'Gender': mapping}, inplace=True)
    test_df = disclosed_dataset_df.head()
else:
    print('undisclosed')
    undisclosed_csv_df = pd.read_csv(dataset_dir / 'undisclosed_dataset.csv', header=0)
    print(undisclosed_csv_df.head())


undisclosed
   UserId                                         AnswerText  PostTypeId  \
0      41  in comparison to other comparison-based sortin...           2   
1      44  assuming you are storing a reference, not the ...           2   
2      29  i think one of the main reasons why quicksort ...           2   
3      22  even though quick-sort has a worst case run ti...           2   
4      59  in the cooperative scheduling (preferably coop...           2   

              CreationDate  Score          Name  Reputation   Gender  
0  2012-03-06T19:19:20.237     20         Kaveh       18213     andy  
1  2012-03-06T19:44:17.600      8  Rafe Kettler         181  unknown  
2  2012-03-06T19:48:00.343     34         svick        1726  unknown  
3  2012-03-06T20:18:27.510     16           0x0         565  unknown  
4  2012-03-06T20:19:10.463      4         Ankit         627     andy  


# only for test

In [None]:
test_df = disclosed_dataset_df.iloc[1:100]

In [None]:
test_tmp = test_df.groupby('UserId').apply(lambda g: g.gender.mode())
test_tmp.head()

In [None]:
ser = test_tmp.gender.groupby('UserId').max(level=0)
ser

In [None]:
#

# Merge

In [8]:
merged_df = pd.merge(majority_voting_result_df, undisclosed_csv_df, on='UserId', how='outer')

In [10]:
# majority_voting_result_df.count
undisclosed_csv_df.UserId.value_counts()

683       5288
755       2969
17408      783
91753      672
4287       548
8321       409
39         376
699        359
6447       313
472        312
43599      307
157        304
1329       301
2253       281
72943      269
7459       263
35842      259
6890       245
6553       191
2205       182
4911       176
83244      172
22714      170
140        167
86844      154
41         152
4416       148
4249       145
70382      142
45662      124
          ... 
15472        1
23660        1
97384        1
49429        1
54352        1
15440        1
60650        1
58603        1
54525        1
9491         1
101782       1
84164        1
93578        1
67788        1
47322        1
44402        1
48496        1
21869        1
95593        1
40292        1
23900        1
11586        1
48448        1
61667        1
74465        1
77106        1
70949        1
45307        1
62745        1
2047         1
Name: UserId, Length: 4329, dtype: int64

In [11]:
undisclosed_csv_df.count()

UserId          28889
AnswerText      28888
PostTypeId      28889
CreationDate    28889
Score           28889
Name            28889
Reputation      28889
Gender          28889
dtype: int64

In [12]:
majority_voting_result_df.count()

UserId             4329
PredictedGender    4329
dtype: int64

In [13]:
inference_result_grouped_df.count()

Predicted_label    4332
dtype: int64

In [14]:
merged_df.count()

UserId             28889
PredictedGender    28889
AnswerText         28888
PostTypeId         28889
CreationDate       28889
Score              28889
Name               28889
Reputation         28889
Gender             28889
dtype: int64

In [15]:
merged_dropna_df = merged_df.dropna()
merged_dropna_df.count()

UserId             28888
PredictedGender    28888
AnswerText         28888
PostTypeId         28888
CreationDate       28888
Score              28888
Name               28888
Reputation         28888
Gender             28888
dtype: int64

In [19]:
merged_dropna_df.drop(columns=['Gender'], inplace=True)
merged_dropna_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,UserId,PredictedGender,AnswerText,PostTypeId,CreationDate,Score,Name,Reputation
0,8,0,this is a problem of selecting the $k$th small...,2,2012-04-12T17:41:18.730,16,Opt,1146
1,8,0,i'd say the most well known barriers to solvin...,2,2012-05-17T02:27:37.430,76,Opt,1146
2,8,0,while the deductions made by $m$ can have equi...,2,2012-06-05T15:31:21.090,5,Opt,1146
3,15,0,this answer is about generic memory management...,2,2012-03-06T22:40:01.673,4,rgrig,1031
4,15,0,you get fewer special cases. in many situation...,2,2012-03-12T00:06:02.723,2,rgrig,1031


In [20]:
# fields = ['business_id','predicted_gender','useful','text', 'stars', 'timestamp']
# merged_dropna_df = merged_dropna_df[fields]
# merged_dropna_df.rename(columns={'predicted_gender': 'gender'})
merged_dropna_df.to_csv(dataset_dir/ 'undisclosed_predicted_dataset.csv', index=False)