In [1]:
import functools
from ast import literal_eval

import pandas as pd
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model.label_model import LabelModel

In [2]:
# Original dataset: only the entry id and its gender annotation are loaded.
df_train_orig = pd.read_csv(
    "../data/train_v0.7.1.csv",
    usecols=["entry_id", "gender"],
)
# Per-entry `gender_kw_pred` values (presumably keyword-derived labels -- confirm
# against whatever produced this file).
df_train_kw_lbl = pd.read_csv(
    "train_0.7.1_kw_gender.csv",
    usecols=["entry_id", "gender_kw_pred"],
)
# Model predictions on the training set; all columns are kept. The later cells
# expect `entry_id`, `gender_model_pred` and `gender_model_orig_pred` to come
# from these two files (which file carries which column is not visible here).
df_train_model_kw = pd.read_csv("kw_model_gender_preds_on_train.csv")
df_train_model_orig = pd.read_csv("orig_model_gender_preds_on_train.csv")

In [3]:
# Chain inner joins on `entry_id` across every label source, left to right.
dfs = [df_train_orig, df_train_kw_lbl, df_train_model_kw, df_train_model_orig]
df_final = functools.reduce(
    lambda merged, incoming: merged.merge(incoming, on="entry_id"),
    dfs,
)
print(len(df_final))
# Drop rows that are exact duplicates across all columns; the original index
# labels of the surviving rows are kept (no reset).
df_final = df_final.drop_duplicates()
print(len(df_final))

126659
126299


In [4]:
# Parse each cell of the label columns from its textual form back into the
# Python literal it spells out. NOTE: literal_eval only accepts strings (or AST
# nodes), so re-running this cell on already-parsed values will raise.
serialized_label_cols = [
    "gender",
    "gender_kw_pred",
    "gender_model_pred",
    "gender_model_orig_pred",
]
for col in serialized_label_cols:
    df_final[col] = df_final[col].map(literal_eval)

In [5]:
# One 0/1 indicator column per label source, built separately for each gender:
# `male_<i>` / `female_<i>` is 1 when the i-th source's value contains the label.
# `in` is a membership test on list-like cells; on a plain string it would be a
# substring test (e.g. "Male" in "Female" is True) -- assumes cells are lists,
# TODO confirm against the input files.
label_source_cols = [
    "gender",
    "gender_kw_pred",
    "gender_model_pred",
    "gender_model_orig_pred",
]

df_final_male = df_final.copy()
df_final_female = df_final.copy()
for source_idx, source_col in enumerate(label_source_cols, start=1):
    df_final_male[f"male_{source_idx}"] = df_final_male[source_col].apply(
        lambda labels: int("Male" in labels)
    )
    df_final_female[f"female_{source_idx}"] = df_final_female[source_col].apply(
        lambda labels: int("Female" in labels)
    )

In [6]:
# Label matrix for "Male": one row per entry, one column per label source.
male_vote_cols = ["male_1", "male_2", "male_3", "male_4"]
male_votes = df_final_male[male_vote_cols].to_numpy()
# Display the per-source summary table as the cell output.
LFAnalysis(L=male_votes).lf_summary()

Unnamed: 0,Polarity,Coverage,Overlaps,Conflicts
0,"[0, 1]",1.0,1.0,0.13898
1,"[0, 1]",1.0,1.0,0.13898
2,"[0, 1]",1.0,1.0,0.13898
3,"[0, 1]",1.0,1.0,0.13898


In [7]:
# Label matrix for "Female": one row per entry, one column per label source.
female_vote_cols = ["female_1", "female_2", "female_3", "female_4"]
female_votes = df_final_female[female_vote_cols].to_numpy()
# Display the per-source summary table as the cell output.
LFAnalysis(L=female_votes).lf_summary()

Unnamed: 0,Polarity,Coverage,Overlaps,Conflicts
0,"[0, 1]",1.0,1.0,0.137222
1,"[0, 1]",1.0,1.0,0.137222
2,"[0, 1]",1.0,1.0,0.137222
3,"[0, 1]",1.0,1.0,0.137222


In [8]:
# Fit one independent label model per gender on that gender's four indicator
# columns, then predict on the same matrix it was trained on. `predict` is
# called with the library's default settings (no tie-break policy is passed).
L_male = df_final_male[["male_1", "male_2", "male_3", "male_4"]].to_numpy()
label_model_male = LabelModel()
label_model_male.fit(L_train=L_male, n_epochs=200, seed=100)
df_final["preds_labeling_model_male"] = label_model_male.predict(L=L_male)

# Same procedure for the "Female" indicators.
L_female = df_final_female[["female_1", "female_2", "female_3", "female_4"]].to_numpy()
label_model_female = LabelModel()
label_model_female.fit(L_train=L_female, n_epochs=200, seed=100)
df_final["preds_labeling_model_female"] = label_model_female.predict(L=L_female)

In [9]:
def agg_lbls(row):
    """Collect the gender labels predicted by the two label models for one row.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (e.g. a row passed by
        ``DataFrame.apply(..., axis=1)``). Must provide the keys
        ``"preds_labeling_model_female"`` and ``"preds_labeling_model_male"``.

    Returns
    -------
    list of str
        ``"Female"`` and/or ``"Male"`` (always in that order) for every
        prediction equal to 1; an empty list when neither prediction is 1.
    """
    # Compare against 1 explicitly instead of relying on truthiness:
    # LabelModel.predict returns -1 (abstain) on ties under its default
    # tie-break policy, and -1 is truthy, so `if pred:` would turn an
    # abstain into a positive label.
    positive = 1
    lbls = []
    if row["preds_labeling_model_female"] == positive:
        lbls.append("Female")
    if row["preds_labeling_model_male"] == positive:
        lbls.append("Male")
    return lbls
        
# Row-wise pass: agg_lbls receives each row and returns that row's label list.
snorkel_label_lists = df_final.apply(agg_lbls, axis=1)
df_final["gender_snorkel"] = snorkel_label_lists

In [10]:
# Persist only the entry id and the aggregated label list. `index` is documented
# as a bool, so pass False explicitly rather than relying on None being falsy;
# the row index is left out of the file either way.
df_final[["entry_id", "gender_snorkel"]].to_csv("train_v0.7.1_gender_snorkel.csv", index=False)