# 建立模型解决姓名性别配对问题
- 组别：第  组
- 组长：卢宇翔（软件Q211班 202109601005）
### 源代码：
- 包导入：

In [None]:
import pandas as pd
from collections import defaultdict
import math  #数学库
import ipywidgets as widgets	# 控件库
from IPython.display import display	# 显示控件的方法

- 读取 roster.csv文件

In [None]:
# The roster is GBK-encoded; passing the encoding explicitly lets pandas
# decode the Chinese text correctly.
train = pd.read_csv('roster.CSV', encoding='gbk')
# Encode the gender column numerically: female ('女') -> 0, male ('男') -> 1.
# The mapped column is assigned back instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place
# form can act on a temporary object and leave `train` unchanged (pandas
# warns about it, and it has no effect under Copy-on-Write).
# Any value other than '女' / '男' becomes NaN.
train['性别'] = train['性别'].map({'女': 0, '男': 1})
train  # last expression of the cell: display the data

- 数据处理

In [None]:
# Split the roster by the numeric gender code: 0 selects the female rows,
# 1 selects the male rows.
namesFemale = train.loc[train['性别'] == 0]
namesMale = train.loc[train['性别'] == 1]

# Row count of each subset, keyed 'f' (female) and 'm' (male).
totals = dict(f=namesFemale.shape[0], m=namesMale.shape[0])

namesFemale  # last expression of the cell: display the female rows

In [None]:
namesMale  # show the male rows (displayed as the cell's output)

- 数据分析

In [None]:
def _char_frequencies(names, total):
    """Return how often each character occurs in `names`, relative to `total`.

    Every occurrence is counted, so a character repeated inside one name
    counts each time. Occurrences are tallied as integers and divided by
    `total` once per character, rather than adding ``1 / total`` per
    occurrence, so rounding error does not build up in the stored values.

    Args:
        names: Iterable of name strings.
        total: Divisor applied to every character count.

    Returns:
        A ``defaultdict(int)`` mapping character -> count / total, in
        first-seen order. Looking up an unseen character yields 0.
    """
    counts = defaultdict(int)
    for name in names:
        for char in name:
            counts[char] += 1
    # When `names` is empty the comprehension is empty too, so a zero
    # `total` never reaches the division.
    return defaultdict(
        int, {char: count / total for char, count in counts.items()}
    )


# Frequency of each character among all female (resp. male) names.
frequency_list_f = _char_frequencies(namesFemale['姓名'], totals['f'])
frequency_list_m = _char_frequencies(namesMale['姓名'], totals['m'])

frequency_list_f  # last expression of the cell: display the female table

In [None]:
frequency_list_m  # show the male character frequencies (displayed as the cell's output)

以下代码块为借鉴他人的实现：建立朴素贝叶斯模型，并进行拉普拉斯平滑处理。

In [None]:
def LaplaceSmooth(char, frequency_list, total, alpha=1.0):
    count = frequency_list[char] * total
    distinct_chars = len(frequency_list)
    freq_smooth = (count + alpha ) / (total + distinct_chars * alpha)
    return freq_smooth
# Name-independent part of each gender's log-probability:
#   log P(gender) + sum over every known character of log(1 - p(char)).
# GetLogProb later adds log(p) - log(1 - p) for each character of a name,
# with p taken from LaplaceSmooth. The same smoothed p is therefore used
# here, so that the log(1 - p) terms of the characters present in a name
# cancel; using the raw frequency instead would leave a mismatch and would
# make math.log raise ValueError whenever a raw frequency reaches 1.
# The gender column is coded 0/1, so its mean is the share of rows coded 1
# (male). math.log raises ValueError if that share is exactly 0 or 1.
male_share = train['性别'].mean()

base_f = math.log(1 - male_share)
base_f += sum(
    math.log(1 - LaplaceSmooth(char, frequency_list_f, totals['f']))
    for char in frequency_list_f
)

base_m = math.log(male_share)
base_m += sum(
    math.log(1 - LaplaceSmooth(char, frequency_list_m, totals['m']))
    for char in frequency_list_m
)

# Base log-probabilities keyed like `totals`: 'f' (female) and 'm' (male).
bases = {'f': base_f, 'm': base_m}
def GetLogProb(char, frequency_list, total):
    """Return ``log(p) - log(1 - p)`` for `char`.

    ``p`` is the value ``LaplaceSmooth(char, frequency_list, total)``
    returns with its default smoothing constant.
    """
    p = LaplaceSmooth(char, frequency_list, total)
    log_present = math.log(p)
    log_absent = math.log(1 - p)
    return log_present - log_absent
def ComputeLogProb(name, bases, totals, frequency_list_m, frequency_list_f):
    """Score `name` under the male and the female model.

    Each score starts from the matching entry of `bases` ('m' / 'f') and
    adds ``GetLogProb`` of every character in `name`, evaluated against
    that gender's frequency table and total.

    Returns:
        A dict ``{'male': ..., 'female': ...}`` holding the two scores.
    """
    scores = {'male': bases['m'], 'female': bases['f']}
    for ch in name:
        scores['male'] += GetLogProb(ch, frequency_list_m, totals['m'])
        scores['female'] += GetLogProb(ch, frequency_list_f, totals['f'])
    return scores

def GetGender(LogProbs):
    """Return whether the 'male' score is strictly greater than 'female'.

    `LogProbs` is a mapping with the keys 'male' and 'female'. A tie
    yields a false result.
    """
    male_score = LogProbs['male']
    female_score = LogProbs['female']
    return male_score > female_score

- 模型训练结束，开始进行预测

In [None]:
# An ipywidgets text box is used for input to keep the demo convenient.
def output(x):
    """Print the predicted gender for the current value of widget `x`.

    Prints '男' when GetGender reports male for the entered name and '女'
    otherwise.
    """
    LogProbs = ComputeLogProb(
        x.value, bases, totals, frequency_list_m, frequency_list_f
    )
    print('男' if GetGender(LogProbs) == 1 else '女')


text = widgets.Text()  # the input box
display(text)  # render the input box
# Call `output` when Enter is pressed in the box.
# NOTE(review): on_submit is reported as deprecated in ipywidgets 8 --
# confirm against the installed version.
text.on_submit(output)