In [3]:
#Calculating confusion matrices for teenagers' names using Chinese
import pandas as pd
import numpy as np

file_path = '/.../teenagers.csv'

# The file is read as tab-separated even though it carries a .csv extension.
df = pd.read_csv(file_path, low_memory=False, sep='\t')

# Convert the score/count columns to numbers; values that cannot be parsed
# become NaN instead of raising.
for column in (
    'genderize_predict_male',
    '汉字男性概率',
    '拼音总数量',
    '汉字总数量',
    '拼音男性概率',
    'Gender Count',
):
    df[column] = pd.to_numeric(df[column], errors='coerce')

def calculate_error(filtered_df, threshold, *, prob_column='汉字男性概率', gender_column='性别'):
    """Tally confusion-matrix cells for a male-probability predictor.

    A row is called male when its probability is >= ``threshold``, female when
    it is <= ``1 - threshold``, and unclassified when the probability is NaN or
    lies strictly between the two bounds.  Rows sitting exactly on a bound are
    counted as classified, so the six cells cover every '男'/'女' row once.

    Args:
        filtered_df: DataFrame holding ``gender_column`` and ``prob_column``.
        threshold: Cut-off for a male call; ``1 - threshold`` is the cut-off
            for a female call.
        prob_column: Column with the predicted male probability.
        gender_column: Column with the recorded gender ('男' or '女').

    Returns:
        Tuple ``(m_m, f_f, m_f, f_m, m_u, f_u)`` of ints; the first letter is
        the recorded gender, the second the prediction (``u`` = unclassified).
    """
    upper_bound = threshold
    lower_bound = 1 - threshold

    prob = filtered_df[prob_column]
    is_male = filtered_df[gender_column] == '男'
    is_female = filtered_df[gender_column] == '女'

    # NaN compares False against both bounds, so missing scores never count
    # as a male or female call and are only picked up through isna().
    predicted_male = prob >= upper_bound
    predicted_female = prob <= lower_bound
    unknown = prob.isna() | ((prob > lower_bound) & (prob < upper_bound))

    m_m = int((is_male & predicted_male).sum())
    f_f = int((is_female & predicted_female).sum())
    # With threshold <= 0.5 the two call regions overlap (e.g. p == 0.5 at
    # threshold 0.5).  A row in the overlap is counted once, as correct for
    # its recorded gender, which is what the strict comparisons did before.
    # NOTE(review): this tie rule favours the predictor - confirm it is intended.
    m_f = int((is_male & predicted_female & ~predicted_male).sum())
    f_m = int((is_female & predicted_male & ~predicted_female).sum())
    m_u = int((is_male & unknown).sum())
    f_u = int((is_female & unknown).sum())

    return m_m, f_f, m_f, f_m, m_u, f_u

# Lower limit applied to the 汉字总数量 column, and the threshold handed to
# calculate_error.
frequency = 0
threshold = 0.5

# Keep rows that have no Hanzi probability at all, plus rows whose
# 汉字总数量 reaches the lower limit.
filtered_df = df.loc[df['汉字男性概率'].isna() | (df['汉字总数量'] >= frequency)]

m_m, f_f, m_f, f_m, m_u, f_u = calculate_error(filtered_df, threshold)

# One report line per confusion-matrix cell.
print(
    f"男-男数量 (m_m): {m_m}",
    f"女-女数量 (f_f): {f_f}",
    f"男-女数量 (m_f): {m_f}",
    f"女-男数量 (f_m): {f_m}",
    f"未分类男数量 (m_u): {m_u}",
    f"未分类女数量 (f_u): {f_u}",
    sep="\n",
)


男-男数量 (m_m): 4420
女-女数量 (f_f): 2987
男-女数量 (m_f): 204
女-男数量 (f_m): 1627
未分类男数量 (m_u): 276
未分类女数量 (f_u): 286


In [5]:
#Calculating confusion matrices for teenagers' names using Pinyin

file_path = '/.../teenagers.csv'

# The file is read as tab-separated even though it carries a .csv extension.
df = pd.read_csv(file_path, low_memory=False, sep='\t')

# Convert the score/count columns to numbers; values that cannot be parsed
# become NaN instead of raising.
for column in (
    'genderize_predict_male',
    '汉字男性概率',
    '拼音总数量',
    '汉字总数量',
    '拼音男性概率',
    'Gender Count',
):
    df[column] = pd.to_numeric(df[column], errors='coerce')

def calculate_error(filtered_df, threshold, *, prob_column='拼音男性概率', gender_column='性别'):
    """Tally confusion-matrix cells for a male-probability predictor.

    A row is called male when its probability is >= ``threshold``, female when
    it is <= ``1 - threshold``, and unclassified when the probability is NaN or
    lies strictly between the two bounds.  Rows sitting exactly on a bound are
    counted as classified, so the six cells cover every '男'/'女' row once.

    This is the single definition for the cell: an earlier, shadowed variant
    that counted NaN rows twice has been folded into it.

    Args:
        filtered_df: DataFrame holding ``gender_column`` and ``prob_column``.
        threshold: Cut-off for a male call; ``1 - threshold`` is the cut-off
            for a female call.
        prob_column: Column with the predicted male probability.
        gender_column: Column with the recorded gender ('男' or '女').

    Returns:
        Tuple ``(m_m, f_f, m_f, f_m, m_u, f_u)`` of ints; the first letter is
        the recorded gender, the second the prediction (``u`` = unclassified).
    """
    upper_bound = threshold
    lower_bound = 1 - threshold

    prob = filtered_df[prob_column]
    is_male = filtered_df[gender_column] == '男'
    is_female = filtered_df[gender_column] == '女'

    # NaN compares False against both bounds, so missing scores never count
    # as a male or female call and are only picked up through isna().
    predicted_male = prob >= upper_bound
    predicted_female = prob <= lower_bound
    unknown = prob.isna() | ((prob > lower_bound) & (prob < upper_bound))

    m_m = int((is_male & predicted_male).sum())
    f_f = int((is_female & predicted_female).sum())
    # With threshold <= 0.5 the two call regions overlap (e.g. p == 0.5 at
    # threshold 0.5).  A row in the overlap is counted once, as correct for
    # its recorded gender, which is what the strict comparisons did before.
    # NOTE(review): this tie rule favours the predictor - confirm it is intended.
    m_f = int((is_male & predicted_female & ~predicted_male).sum())
    f_m = int((is_female & predicted_male & ~predicted_female).sum())
    m_u = int((is_male & unknown).sum())
    f_u = int((is_female & unknown).sum())

    return m_m, f_f, m_f, f_m, m_u, f_u

# Lower limit applied to the 拼音总数量 column, and the threshold handed to
# calculate_error.
frequency = 0
threshold = 0.5

# Keep rows that have no Pinyin probability at all, plus rows whose
# 拼音总数量 reaches the lower limit.
filtered_df = df.loc[df['拼音男性概率'].isna() | (df['拼音总数量'] >= frequency)]

m_m, f_f, m_f, f_m, m_u, f_u = calculate_error(filtered_df, threshold)

# One report line per confusion-matrix cell.
print(
    f"男-男数量 (m_m): {m_m}",
    f"女-女数量 (f_f): {f_f}",
    f"男-女数量 (m_f): {m_f}",
    f"女-男数量 (f_m): {f_m}",
    f"未分类男数量 (m_u): {m_u}",
    f"未分类女数量 (f_u): {f_u}",
    sep="\n",
)


男-男数量 (m_m): 4547
女-女数量 (f_f): 1845
男-女数量 (m_f): 330
女-男数量 (f_m): 3025
未分类男数量 (m_u): 23
未分类女数量 (f_u): 30


In [7]:
#Calculating confusion matrices for teenagers' names using Genderize.io

file_path = '/.../teenagers.csv'

# The file is read as tab-separated even though it carries a .csv extension.
df = pd.read_csv(file_path, low_memory=False, sep='\t')

# Convert the score/count columns to numbers; values that cannot be parsed
# become NaN instead of raising.
for column in (
    'genderize_predict_male',
    '汉字男性概率',
    '拼音总数量',
    '汉字总数量',
    '拼音男性概率',
    'Gender Count',
):
    df[column] = pd.to_numeric(df[column], errors='coerce')

def calculate_error(filtered_df, threshold, *, prob_column='genderize_predict_male', gender_column='性别'):
    """Tally confusion-matrix cells for a male-probability predictor.

    A row is called male when its probability is >= ``threshold``, female when
    it is <= ``1 - threshold``, and unclassified when the probability is NaN or
    lies strictly between the two bounds.  Rows sitting exactly on a bound are
    counted as classified, so the six cells cover every '男'/'女' row once.

    Args:
        filtered_df: DataFrame holding ``gender_column`` and ``prob_column``.
        threshold: Cut-off for a male call; ``1 - threshold`` is the cut-off
            for a female call.
        prob_column: Column with the predicted male probability.
        gender_column: Column with the recorded gender ('男' or '女').

    Returns:
        Tuple ``(m_m, f_f, m_f, f_m, m_u, f_u)`` of ints; the first letter is
        the recorded gender, the second the prediction (``u`` = unclassified).
    """
    upper_bound = threshold
    lower_bound = 1 - threshold

    prob = filtered_df[prob_column]
    is_male = filtered_df[gender_column] == '男'
    is_female = filtered_df[gender_column] == '女'

    # NaN compares False against both bounds, so missing scores never count
    # as a male or female call and are only picked up through isna().
    predicted_male = prob >= upper_bound
    predicted_female = prob <= lower_bound
    unknown = prob.isna() | ((prob > lower_bound) & (prob < upper_bound))

    m_m = int((is_male & predicted_male).sum())
    f_f = int((is_female & predicted_female).sum())
    # With threshold <= 0.5 the two call regions overlap (e.g. p == 0.5 at
    # threshold 0.5).  A row in the overlap is counted once, as correct for
    # its recorded gender, which is what the strict comparisons did before.
    # NOTE(review): this tie rule favours the predictor - confirm it is intended.
    m_f = int((is_male & predicted_female & ~predicted_male).sum())
    f_m = int((is_female & predicted_male & ~predicted_female).sum())
    m_u = int((is_male & unknown).sum())
    f_u = int((is_female & unknown).sum())

    return m_m, f_f, m_f, f_m, m_u, f_u

# Lower limit applied to the 'Gender Count' column, and the threshold handed
# to calculate_error.
frequency = 0
threshold = 0.5

# Keep rows that have no Genderize probability at all, plus rows whose
# 'Gender Count' reaches the lower limit.
filtered_df = df.loc[df['genderize_predict_male'].isna() | (df['Gender Count'] >= frequency)]

m_m, f_f, m_f, f_m, m_u, f_u = calculate_error(filtered_df, threshold)

# One report line per confusion-matrix cell.
print(
    f"男-男数量 (m_m): {m_m}",
    f"女-女数量 (f_f): {f_f}",
    f"男-女数量 (m_f): {m_f}",
    f"女-男数量 (f_m): {f_m}",
    f"未分类男数量 (m_u): {m_u}",
    f"未分类女数量 (f_u): {f_u}",
    sep="\n",
)


男-男数量 (m_m): 3743
女-女数量 (f_f): 2990
男-女数量 (m_f): 838
女-男数量 (f_m): 1706
未分类男数量 (m_u): 319
未分类女数量 (f_u): 204


In [13]:
#Calculating confusion matrices for grantees' names using Genderize.io

file_path = '/.../grantees.csv'

# The file is read as tab-separated even though it carries a .csv extension.
df = pd.read_csv(file_path, low_memory=False, sep='\t')

# Convert the score/count columns to numbers; values that cannot be parsed
# become NaN instead of raising.
for column in (
    'genderize_predict_male',
    '汉字男性概率',
    '拼音总数量',
    '汉字总数量',
    '拼音男性概率',
    'count',
):
    df[column] = pd.to_numeric(df[column], errors='coerce')

def calculate_error(filtered_df, threshold, *, prob_column='genderize_predict_male', gender_column='gender'):
    """Tally confusion-matrix cells for a male-probability predictor.

    A row is called male when its probability is >= ``threshold``, female when
    it is <= ``1 - threshold``, and unclassified when the probability is NaN or
    lies strictly between the two bounds.  Rows sitting exactly on a bound are
    counted as classified, so the six cells cover every '男'/'女' row once.

    Args:
        filtered_df: DataFrame holding ``gender_column`` and ``prob_column``.
        threshold: Cut-off for a male call; ``1 - threshold`` is the cut-off
            for a female call.
        prob_column: Column with the predicted male probability.
        gender_column: Column with the recorded gender ('男' or '女').

    Returns:
        Tuple ``(m_m, f_f, m_f, f_m, m_u, f_u)`` of ints; the first letter is
        the recorded gender, the second the prediction (``u`` = unclassified).
    """
    upper_bound = threshold
    lower_bound = 1 - threshold

    prob = filtered_df[prob_column]
    is_male = filtered_df[gender_column] == '男'
    is_female = filtered_df[gender_column] == '女'

    # NaN compares False against both bounds, so missing scores never count
    # as a male or female call and are only picked up through isna().
    predicted_male = prob >= upper_bound
    predicted_female = prob <= lower_bound
    unknown = prob.isna() | ((prob > lower_bound) & (prob < upper_bound))

    m_m = int((is_male & predicted_male).sum())
    f_f = int((is_female & predicted_female).sum())
    # With threshold <= 0.5 the two call regions overlap (e.g. p == 0.5 at
    # threshold 0.5).  A row in the overlap is counted once, as correct for
    # its recorded gender, which is what the strict comparisons did before.
    # NOTE(review): this tie rule favours the predictor - confirm it is intended.
    m_f = int((is_male & predicted_female & ~predicted_male).sum())
    f_m = int((is_female & predicted_male & ~predicted_female).sum())
    m_u = int((is_male & unknown).sum())
    f_u = int((is_female & unknown).sum())

    return m_m, f_f, m_f, f_m, m_u, f_u

# Lower limit applied to the 'count' column, and the threshold handed to
# calculate_error.
frequency = 1
threshold = 0.5

# Keep rows that have no Genderize probability at all, plus rows whose
# 'count' reaches the lower limit.
filtered_df = df.loc[df['genderize_predict_male'].isna() | (df['count'] >= frequency)]

m_m, f_f, m_f, f_m, m_u, f_u = calculate_error(filtered_df, threshold)

# One report line per confusion-matrix cell.
print(
    f"男-男数量 (m_m): {m_m}",
    f"女-女数量 (f_f): {f_f}",
    f"男-女数量 (m_f): {m_f}",
    f"女-男数量 (f_m): {f_m}",
    f"未分类男数量 (m_u): {m_u}",
    f"未分类女数量 (f_u): {f_u}",
    sep="\n",
)


男-男数量 (m_m): 58386
女-女数量 (f_f): 12073
男-女数量 (m_f): 10070
女-男数量 (f_m): 7361
未分类男数量 (m_u): 10312
未分类女数量 (f_u): 1525


In [11]:
#Calculating confusion matrices for grantees' names using Pinyin

file_path = '/.../grantees.csv'

# The file is read as tab-separated even though it carries a .csv extension.
df = pd.read_csv(file_path, low_memory=False, sep='\t')

# Convert the score/count columns to numbers; values that cannot be parsed
# become NaN instead of raising.
for column in (
    'genderize_predict_male',
    '汉字男性概率',
    '拼音总数量',
    '汉字总数量',
    '拼音男性概率',
    'count',
):
    df[column] = pd.to_numeric(df[column], errors='coerce')

def calculate_error(filtered_df, threshold, *, prob_column='拼音男性概率', gender_column='gender'):
    """Tally confusion-matrix cells for a male-probability predictor.

    A row is called male when its probability is >= ``threshold``, female when
    it is <= ``1 - threshold``, and unclassified when the probability is NaN or
    lies strictly between the two bounds.  Rows sitting exactly on a bound are
    counted as classified, so the six cells cover every '男'/'女' row once.

    Args:
        filtered_df: DataFrame holding ``gender_column`` and ``prob_column``.
        threshold: Cut-off for a male call; ``1 - threshold`` is the cut-off
            for a female call.
        prob_column: Column with the predicted male probability.
        gender_column: Column with the recorded gender ('男' or '女').

    Returns:
        Tuple ``(m_m, f_f, m_f, f_m, m_u, f_u)`` of ints; the first letter is
        the recorded gender, the second the prediction (``u`` = unclassified).
    """
    upper_bound = threshold
    lower_bound = 1 - threshold

    prob = filtered_df[prob_column]
    is_male = filtered_df[gender_column] == '男'
    is_female = filtered_df[gender_column] == '女'

    # NaN compares False against both bounds, so missing scores never count
    # as a male or female call and are only picked up through isna().
    predicted_male = prob >= upper_bound
    predicted_female = prob <= lower_bound
    unknown = prob.isna() | ((prob > lower_bound) & (prob < upper_bound))

    m_m = int((is_male & predicted_male).sum())
    f_f = int((is_female & predicted_female).sum())
    # With threshold <= 0.5 the two call regions overlap (e.g. p == 0.5 at
    # threshold 0.5).  A row in the overlap is counted once, as correct for
    # its recorded gender, which is what the strict comparisons did before.
    # NOTE(review): this tie rule favours the predictor - confirm it is intended.
    m_f = int((is_male & predicted_female & ~predicted_male).sum())
    f_m = int((is_female & predicted_male & ~predicted_female).sum())
    m_u = int((is_male & unknown).sum())
    f_u = int((is_female & unknown).sum())

    return m_m, f_f, m_f, f_m, m_u, f_u

# Lower limit applied to the 拼音总数量 column, and the threshold handed to
# calculate_error.
frequency = 0
threshold = 0.5

# Keep rows that have no Pinyin probability at all, plus rows whose
# 拼音总数量 reaches the lower limit.
filtered_df = df.loc[df['拼音男性概率'].isna() | (df['拼音总数量'] >= frequency)]

m_m, f_f, m_f, f_m, m_u, f_u = calculate_error(filtered_df, threshold)

# One report line per confusion-matrix cell.
print(
    f"男-男数量 (m_m): {m_m}",
    f"女-女数量 (f_f): {f_f}",
    f"男-女数量 (m_f): {m_f}",
    f"女-男数量 (f_m): {f_m}",
    f"未分类男数量 (m_u): {m_u}",
    f"未分类女数量 (f_u): {f_u}",
    sep="\n",
)

男-男数量 (m_m): 71957
女-女数量 (f_f): 11821
男-女数量 (m_f): 6768
女-男数量 (f_m): 9135
未分类男数量 (m_u): 43
未分类女数量 (f_u): 3


In [9]:
#Calculating confusion matrices for grantees' names using Chinese

file_path = '/.../grantees.csv'

# The file is read as tab-separated even though it carries a .csv extension.
df = pd.read_csv(file_path, low_memory=False, sep='\t')

# Convert the score/count columns to numbers; values that cannot be parsed
# become NaN instead of raising.
for column in (
    'genderize_predict_male',
    '汉字男性概率',
    '拼音总数量',
    '汉字总数量',
    '拼音男性概率',
    'count',
):
    df[column] = pd.to_numeric(df[column], errors='coerce')

def calculate_error(filtered_df, threshold, *, prob_column='汉字男性概率', gender_column='gender'):
    """Tally confusion-matrix cells for a male-probability predictor.

    A row is called male when its probability is >= ``threshold``, female when
    it is <= ``1 - threshold``, and unclassified when the probability is NaN or
    lies strictly between the two bounds.  Rows sitting exactly on a bound are
    counted as classified, so the six cells cover every '男'/'女' row once.

    Args:
        filtered_df: DataFrame holding ``gender_column`` and ``prob_column``.
        threshold: Cut-off for a male call; ``1 - threshold`` is the cut-off
            for a female call.
        prob_column: Column with the predicted male probability.
        gender_column: Column with the recorded gender ('男' or '女').

    Returns:
        Tuple ``(m_m, f_f, m_f, f_m, m_u, f_u)`` of ints; the first letter is
        the recorded gender, the second the prediction (``u`` = unclassified).
    """
    upper_bound = threshold
    lower_bound = 1 - threshold

    prob = filtered_df[prob_column]
    is_male = filtered_df[gender_column] == '男'
    is_female = filtered_df[gender_column] == '女'

    # NaN compares False against both bounds, so missing scores never count
    # as a male or female call and are only picked up through isna().
    predicted_male = prob >= upper_bound
    predicted_female = prob <= lower_bound
    unknown = prob.isna() | ((prob > lower_bound) & (prob < upper_bound))

    m_m = int((is_male & predicted_male).sum())
    f_f = int((is_female & predicted_female).sum())
    # With threshold <= 0.5 the two call regions overlap (e.g. p == 0.5 at
    # threshold 0.5).  A row in the overlap is counted once, as correct for
    # its recorded gender, which is what the strict comparisons did before.
    # NOTE(review): this tie rule favours the predictor - confirm it is intended.
    m_f = int((is_male & predicted_female & ~predicted_male).sum())
    f_m = int((is_female & predicted_male & ~predicted_female).sum())
    m_u = int((is_male & unknown).sum())
    f_u = int((is_female & unknown).sum())

    return m_m, f_f, m_f, f_m, m_u, f_u


# Lower limit applied to the 汉字总数量 column, and the threshold handed to
# calculate_error.
frequency = 0
threshold = 0.5

# Keep rows that have no Hanzi probability at all, plus rows whose
# 汉字总数量 reaches the lower limit.
filtered_df = df.loc[df['汉字男性概率'].isna() | (df['汉字总数量'] >= frequency)]

m_m, f_f, m_f, f_m, m_u, f_u = calculate_error(filtered_df, threshold)

# One report line per confusion-matrix cell.
print(
    f"男-男数量 (m_m): {m_m}",
    f"女-女数量 (f_f): {f_f}",
    f"男-女数量 (m_f): {m_f}",
    f"女-男数量 (f_m): {f_m}",
    f"未分类男数量 (m_u): {m_u}",
    f"未分类女数量 (f_u): {f_u}",
    sep="\n",
)

男-男数量 (m_m): 73323
女-女数量 (f_f): 13426
男-女数量 (m_f): 3613
女-男数量 (f_m): 7071
未分类男数量 (m_u): 1832
未分类女数量 (f_u): 462
