In [2]:
#check raw dataset of 9800 teenagers
import pandas as pd
# Load the raw workbook and print its first ten rows as a quick sanity check.
filepath = '/.../9800ChineseNamesnamegender.xlsx'
data = pd.read_excel(io=filepath)
preview_rows = data.head(n=10)
print(preview_rows)

             来源   姓名 性别  Unnamed: 3  Unnamed: 4       男       女       Σ
0   14-FDU-ZZZS   艾超  男         NaN         NaN  4900.0  4900.0  9800.0
1   14-ZJU-ZZZS  艾方洲  男         NaN         NaN     NaN     NaN     NaN
2   14-FDU-ZZZS  艾临风  男         NaN         NaN     NaN     NaN     NaN
3  201211172043   艾艺  女         NaN         NaN     NaN     NaN     NaN
4  14-QHDX-ZZZS  艾章习  女         NaN         NaN     NaN     NaN     NaN
5      14-Sport  安婧雯  女         NaN         NaN     NaN     NaN     NaN
6  201211121927   安岚  女         NaN         NaN     NaN     NaN     NaN
7  14-BJDX-ZZZS   安曼  女         NaN         NaN     NaN     NaN     NaN
8  201311081186  安孟瑶  女         NaN         NaN     NaN     NaN     NaN
9  201311232051   安乔  女         NaN         NaN     NaN     NaN     NaN


In [34]:
# Data preprocessing and conversion of names from Chinese characters to Pinyin.
# Delete rows whose names contain letters, digits, or punctuation.
# Limit the name length to 4 characters.

import re
from pypinyin import pinyin, Style

file_path = '/.../9800ChineseNamesnamegender.xlsx'
data = pd.read_excel(file_path)

# Apply the clean-up described in this cell's header: keep only rows whose
# name is a string of 1-4 Chinese characters. This drops missing names and
# names containing letters, digits, punctuation or whitespace, which the
# character-by-character pinyin conversion further down cannot handle.
_VALID_NAME_PATTERN = re.compile(r'[\u3400-\u4dbf\u4e00-\u9fff]{1,4}')
is_valid_name = data['姓名'].map(
    lambda name: isinstance(name, str) and _VALID_NAME_PATTERN.fullmatch(name) is not None
).astype(bool)  # astype(bool) keeps the mask usable even when the sheet is empty
data = data[is_valid_name].reset_index(drop=True)

# Two-character (compound) surnames that must be kept together when a full
# name is split into surname and given name.
# NOTE(review): entry 82 ("吴铭") does not look like a compound surname, and it
# makes names such as "吴铭X" split as "吴铭" + "X" - confirm it is intended.
compound_surnames_dict = {
    1: "欧阳", 2: "太史", 3: "端木", 4: "上官", 5: "司马", 6: "东方", 7: "独孤", 8: "南宫", 9: "万俟", 10: "闻人",
    11: "夏侯", 12: "诸葛", 13: "尉迟", 14: "公羊", 15: "赫连", 16: "澹台", 17: "皇甫", 18: "宗政", 19: "濮阳",
    20: "公冶", 21: "太叔", 22: "申屠", 23: "公孙", 24: "慕容", 25: "仲孙", 26: "钟离", 27: "长孙", 28: "宇文",
    29: "司徒", 30: "鲜于", 31: "司空", 32: "闾丘", 33: "子车", 34: "亓官", 35: "司寇", 36: "巫马", 37: "公西",
    38: "颛孙", 39: "壤驷", 40: "公良", 41: "漆雕", 42: "乐正", 43: "宰父", 44: "谷梁", 45: "拓跋", 46: "夹谷",
    47: "轩辕", 48: "令狐", 49: "段干", 50: "百里", 51: "呼延", 52: "东郭", 53: "南门", 54: "羊舌", 55: "微生",
    56: "公户", 57: "公玉", 58: "公仪", 59: "梁丘", 60: "公仲", 61: "公上", 62: "公门", 63: "公山", 64: "公坚",
    65: "左丘", 66: "公伯", 67: "西门", 68: "公祖", 69: "第五", 70: "公乘", 71: "贯丘", 72: "公皙", 73: "南荣",
    74: "东里", 75: "东宫", 76: "仲长", 77: "子书", 78: "子桑", 79: "即墨", 80: "达奚", 81: "褚师", 82: "吴铭"
}

surnames = list(compound_surnames_dict.values())

def split_name(full_name):
    """Split a full Chinese name into ``(surname, given_name)``.

    A compound surname from ``surnames`` is used only when at least one
    character remains for the given name; otherwise the first character is
    the surname and the rest is the given name.

    Args:
        full_name: The full name, surname first.

    Returns:
        A ``(surname, given_name)`` tuple of strings. ``('', '')`` is returned
        for an empty string or a non-string value (e.g. NaN from a blank
        spreadsheet cell) instead of raising.
    """
    if not isinstance(full_name, str) or not full_name:
        return '', ''
    for surname in surnames:
        # Requiring a remainder stops a two-character name such as "司马" from
        # being read as a compound surname with an empty given name.
        if len(full_name) > len(surname) and full_name.startswith(surname):
            return surname, full_name[len(surname):]
    return full_name[0], full_name[1:]

# Split every full name once and attach both parts as new columns. Building a
# single DataFrame from the list of (surname, given_name) tuples avoids
# constructing one pd.Series per row, which is very slow on large frames.
name_parts = pd.DataFrame(
    [split_name(full_name) for full_name in data['姓名']],
    index=data.index,
    columns=['surname', 'given_name'],
)
data[['surname', 'given_name']] = name_parts

# Per-character overrides: when a character listed here appears in a name, this
# reading replaces whatever pypinyin returned for it.
corrections = {
    '思': 'si',
    '育': 'yu',
    '若': 'ruo',
    '敦': 'dun',
    '朴': 'pu',
    '露': 'lu',
    '陆': 'lu',
    '男': 'nan',
    '南': 'nan',
    '楠': 'nan',
    '钠': 'na',
    '娜': 'na',
    '拓': 'tuo',
    '觉': 'jue',
}

def convert_to_pinyin(name):
    """Return the toneless pinyin of ``name`` as one concatenated string.

    Each character is converted with pypinyin (``Style.NORMAL``) and then
    replaced by its entry in ``corrections`` when one exists.

    Args:
        name: The text to romanise (here, a given name).

    Returns:
        The syllables joined without separators, e.g. ``'fangzhou'``.
    """
    syllables = [item[0] for item in pinyin(name, style=Style.NORMAL)]
    if len(syllables) != len(name):
        # pypinyin returns a run of non-Chinese characters as a single item, so
        # the result is no longer one-to-one with the characters of ``name``.
        # Convert character by character to restore the alignment that the
        # corrections lookup below relies on.
        syllables = [
            (pinyin(char, style=Style.NORMAL) or [[char]])[0][0] for char in name
        ]
    return ''.join(
        corrections.get(char, syllable) for char, syllable in zip(name, syllables)
    )

# Romanise every given name.
data['given_name_pinyin'] = data['given_name'].apply(convert_to_pinyin)


def _encode_gender(label):
    """Map '男' to 1 and '女' to 0; any other value is returned unchanged."""
    if label == '男':
        return 1
    if label == '女':
        return 0
    return label


data['gender_numeric'] = data['性别'].apply(_encode_gender)

# Keep only the columns needed downstream and tag every row with the country code.
export_columns = ['姓名', '性别', 'surname', 'given_name', 'given_name_pinyin', 'gender_numeric']
filtered_data = data[export_columns].assign(country='CN')

output_path = '/.../teenagers_rawdata.csv'
filtered_data.to_csv(output_path, index=False)

print("新的文件已保存到:", output_path)


新的文件已保存到: /Users/tongtong/Documents/python/teenagers_rawdata.csv


In [None]:
# After getting the results from Genderize.io, we have the file:
# teenagers_genderize.csv

In [36]:
#Merge the raw factual data and Genderize.io results.

# The Genderize.io export is tab-separated; the raw data file is comma-separated.
df_genderize = pd.read_csv("/.../teenagers_genderize.csv", sep="\t")
df_data = pd.read_csv("/.../teenagers_rawdata.csv", sep=",")

# Keep one Genderize.io row per name so the left join below cannot add rows
# to the raw data.
df_genderize_unique = df_genderize.drop_duplicates(subset='name', keep='first')
df_merged = pd.merge(
    df_data,
    df_genderize_unique,
    how='left',
    left_on='given_name_pinyin',
    right_on='name',
)

df_merged.to_csv("/.../teenagers_merged0.csv", sep="\t", index=False)

print("合并完成，最终行数：", df_merged.shape[0])


合并完成，最终行数： 9800


In [58]:
#Merge with ChineseGender dataset 
import numpy as np
from tqdm import tqdm

# Input and output locations for this step.
merged_truth_file = '/Users/tongtong/Documents/python/teenagers_merged0.csv'
chinese_gender_file = '/Users/tongtong/Documents/python/ChineseGender_cleaned1.txt'
output_file = '/Users/tongtong/Documents/python/teenagers.csv'

# on_bad_lines='skip' silently drops malformed lines from the reference file.
chinese_gender_file_df = pd.read_csv(chinese_gender_file, sep='\t', on_bad_lines='skip')
print("成功读取 ChineseGender 数据，列名如下：", chinese_gender_file_df.columns)

if 'fname' not in chinese_gender_file_df.columns:
    # Fall back to a column that looks like the romanised given name. 'en' is
    # matched as a whole '_'-separated token: a plain substring test would also
    # hit unrelated columns such as 'gender'. Only the first candidate is
    # renamed, so the frame never ends up with several 'fname' columns.
    # NOTE(review): the intended fallback column name is a guess - confirm.
    fname_candidates = [
        col for col in chinese_gender_file_df.columns
        if 'given_name' in str(col).lower()
        or 'en' in str(col).lower().replace('-', '_').replace(' ', '_').split('_')
    ]
    if fname_candidates:
        chinese_gender_file_df = chinese_gender_file_df.rename(
            columns={fname_candidates[0]: 'fname'}
        )

# Selecting the needed columns up front raises KeyError early if any is missing.
chinese_gender_file_df_df = chinese_gender_file_df[['名', '汉字总数量', '汉字男性概率', 'fname', '拼音总数量', '拼音男性概率']]

# One row per key, first occurrence wins. Rows with a missing key are dropped
# so that NaN never becomes a dictionary key: a NaN key could otherwise be hit
# by a NaN given name during the lookups and attach unrelated statistics.
chinese_gender_file_df_hanzi = (
    chinese_gender_file_df.dropna(subset=['名']).drop_duplicates(subset='名', keep='first')
)
chinese_gender_file_df_pinyin = (
    chinese_gender_file_df.dropna(subset=['fname']).drop_duplicates(subset='fname', keep='first')
)

# {key: {statistic column: value}} lookup tables, by given name in Chinese
# characters and by given name in pinyin.
hanzi_dict = chinese_gender_file_df_hanzi.set_index('名')[['汉字总数量', '汉字男性概率']].to_dict(orient='index')
pinyin_dict = chinese_gender_file_df_pinyin.set_index('fname')[['拼音总数量', '拼音男性概率']].to_dict(orient='index')

# Read the merged ground-truth file once to check its schema and count its rows.
merged_truth_df = pd.read_csv(merged_truth_file, sep='\t')
print("成功读取真值数据，列名如下：", merged_truth_df.columns)

# Both lookup keys must be present; stop at the first one that is missing.
for required_column in ('given_name', 'given_name_pinyin'):
    if required_column in merged_truth_df.columns:
        continue
    print(f"错误：未找到 '{required_column}' 列，检查列名是否匹配")
    raise ValueError(f"'{required_column}' 列不存在，请检查真值文件的内容！")

# Row count, used as the progress-bar total.
total_rows = merged_truth_df.shape[0]

def process_merged_truth_file(chunk_size=10000):
    """Yield the merged truth file chunk by chunk with ChineseGender statistics added.

    Each chunk gains four columns: '汉字总数量' and '汉字男性概率', looked up in
    ``hanzi_dict`` by 'given_name', and '拼音总数量' and '拼音男性概率', looked
    up in ``pinyin_dict`` by 'given_name_pinyin'. A key missing from the lookup
    table yields ``None`` for that cell.

    Args:
        chunk_size: Number of rows pandas reads per chunk.

    Yields:
        The chunk DataFrame with the four columns added.
    """
    # (key column, lookup table, statistics to copy), in output column order.
    lookup_plan = (
        ('given_name', hanzi_dict, ('汉字总数量', '汉字男性概率')),
        ('given_name_pinyin', pinyin_dict, ('拼音总数量', '拼音男性概率')),
    )
    reader = pd.read_csv(merged_truth_file, sep='\t', chunksize=chunk_size)
    for chunk in reader:
        for key_column, table, stat_columns in lookup_plan:
            for stat in stat_columns:
                # Default arguments bind this iteration's table and statistic.
                chunk[stat] = chunk[key_column].map(
                    lambda key, table=table, stat=stat: table.get(key, {}).get(stat)
                )
        yield chunk

# Collect every processed chunk, advancing the progress bar by the chunk's row
# count, then stack the chunks into a single frame with a fresh 0..n-1 index.
all_chunks = []
with tqdm(total=total_rows, desc="Processing", unit="rows") as pbar:
    for result_chunk in process_merged_truth_file():
        rows_in_chunk = result_chunk.shape[0]
        all_chunks.append(result_chunk)
        pbar.update(rows_in_chunk)
result_df = pd.concat(objs=all_chunks, axis=0, ignore_index=True)
# Convert the Genderize.io result into a male probability:
#   Gender == 'male'   -> the reported probability
#   Gender == 'female' -> 1 - the reported probability
#   anything else      -> NaN
# np.nan (rather than None) is used for the last case so the column is not
# forced to object dtype by a None fill value.
result_df['genderize_predict_male'] = np.where(
    result_df['Gender'] == 'male',
    result_df['Gender Probability'],
    np.where(result_df['Gender'] == 'female', 1 - result_df['Gender Probability'], np.nan)
)

result_df.to_csv(output_file, sep='\t', index=False)
print("数据处理完成，结果已写入：", output_file)
print("处理后的前 10 行：")
print(result_df.head(10))


  gudong_after_df = pd.read_csv(gudong_after_file, sep='\t', on_bad_lines='skip')


成功读取股东数据，列名如下： Index(['id', 'orig_name', 'gender', 'b_year', '姓', '名', 'lname', 'fname',
       '拼音总数量', '拼音男性数量', '拼音男性概率', '汉字总数量', '汉字男性数量', '汉字男性概率'],
      dtype='object')
成功读取真值数据，列名如下： Index(['姓名', '性别', 'surname', 'given_name', 'given_name_pinyin',
       'gender_numeric', 'country_x', 'name', 'country_y', 'Gender',
       'Gender Probability', 'Gender Count'],
      dtype='object')


Processing: 100%|█████████████████████| 9800/9800 [00:00<00:00, 442280.03rows/s]

数据处理完成，结果已写入： /Users/tongtong/Documents/python/teenagers.csv
处理后的前 10 行：
    姓名 性别 surname given_name given_name_pinyin  gender_numeric country_x  \
0   艾超  男       艾          超              chao               1        CN   
1  艾方洲  男       艾         方洲          fangzhou               1        CN   
2  艾临风  男       艾         临风           linfeng               1        CN   
3   艾艺  女       艾          艺                yi               0        CN   
4  艾章习  女       艾         章习           zhangxi               0        CN   
5  安婧雯  女       安         婧雯           jingwen               0        CN   
6   安岚  女       安          岚               lan               0        CN   
7   安曼  女       安          曼               man               0        CN   
8  安孟瑶  女       安         孟瑶           mengyao               0        CN   
9   安乔  女       安          乔              qiao               0        CN   

       name country_y  Gender  Gender Probability  Gender Count    汉字总数量  \
0      chao   


