In [None]:
# 根据姓氏 分类国籍
from argparse import Namespace

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

In [None]:
# 构建分类器
class SurnameMLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear, with optional softmax."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the two linear layers.

        Args:
            input_dim: Size of each input vector.
            hidden_dim: Number of units produced by the first linear layer.
            output_dim: Number of values produced per input by the second layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x_in, softmax=False):
        """Run the network on a batch of inputs.

        Args:
            x_in: Input tensor fed to the first linear layer.
            softmax: When true, the output is normalised with a softmax over
                dim 1; otherwise the raw output of the second layer is returned.

        Returns:
            The output of the second linear layer, softmax-normalised if requested.
        """
        hidden = F.relu(self.fc1(x_in))
        scores = self.fc2(hidden)
        return F.softmax(scores, dim=1) if softmax else scores

In [None]:
# 数据集设置 继承 原始类 Dataset
class SurnameDataSet(Dataset):
    """Map-style dataset yielding a vectorized surname and its nationality index.

    Only ``__getitem__`` is defined in this cell. It reads ``self._target_df``
    and ``self.vectorizer``, which this cell never assigns, so they must be set
    up elsewhere. Other members this notebook calls on the class
    (``load_dataset_and_make_vectorizer``, ``get_vectorizer``,
    ``class_weights``) are likewise not defined here.
    """

    def __getitem__(self, index):
        """Return the features and label for one row.

        Args:
            index: Positional row index into ``self._target_df``.

        Returns:
            dict with two entries:
                'x_surname': result of ``self.vectorizer.vectorize`` on the
                    row's ``surname``.
                'y_nationality': result of
                    ``self.vectorizer.nationality_vocab.lookup_token`` on the
                    row's ``nationality``.
        """
        row = self._target_df.iloc[index]
        # Vector representation of the surname.
        surname_vector = self.vectorizer.vectorize(row.surname)
        # Integer index of the nationality label.
        nationality_index = self.vectorizer.nationality_vocab.lookup_token(
            row.nationality
        )
        return {'x_surname': surname_vector,
                'y_nationality': nationality_index}

In [None]:
# 数据预处理 向量化
class SurnameVectorizer(object):
    """Turns surnames into fixed-length vectors using two vocabularies."""

    def __init__(self, surname_vocab, nationality_vocab):
        """Store the vocabularies.

        Args:
            surname_vocab: Vocabulary over surname characters; must support
                ``len()`` and ``lookup_token(token)``.
            nationality_vocab: Vocabulary over nationality labels.
        """
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab

    def vectorize(self, surname):
        """Encode a surname as a collapsed one-hot vector.

        Each character's vocabulary index is set to 1, so repeated characters
        and character order are not represented.

        Args:
            surname: Iterable of tokens (a string is iterated per character).

        Returns:
            1-D ``np.float32`` array of length ``len(self.surname_vocab)``.
        """
        vocab = self.surname_vocab
        one_hot = np.zeros(len(vocab), dtype=np.float32)
        for token in surname:
            one_hot[vocab.lookup_token(token)] = 1
        return one_hot

    @classmethod
    def from_dataframe(cls, surname_df):
        """Build a vectorizer whose vocabularies cover a DataFrame.

        Args:
            surname_df: DataFrame whose rows expose ``surname`` and
                ``nationality`` attributes.

        Returns:
            A new instance of ``cls`` holding the populated vocabularies.
        """
        # NOTE(review): Vocabulary is not defined in this notebook; it must be
        # provided elsewhere with add_token() and these constructor keywords.
        surname_vocab = Vocabulary(unk_token="@")
        nationality_vocab = Vocabulary(add_unk=False)
        for _, row in surname_df.iterrows():
            for letter in row.surname:
                surname_vocab.add_token(letter)
            nationality_vocab.add_token(row.nationality)
        return cls(surname_vocab, nationality_vocab)

In [None]:
# 用一个字典来存放所有需要的 数据信息
args = Namespace(
    # Data and path settings
    surname_csv="surnames_with_splits.csv",
    # NOTE(review): the key is spelled "vecrotizer_file"; kept as-is so any
    # code reading args.vecrotizer_file keeps working.
    vecrotizer_file="vectorizer.json",
    model_state_file="model.path",
    save_dir="ch4/surname_mlp",
    # Model hyperparameters
    hidden_dim=300,
    # Training hyperparameters
    seed=1337,
    num_epochs=100,
    early_stopping_criteria=5,
    learning_rate=0.001,
    batch_size=64,
    # Runtime settings: the model is moved to args.device later in the
    # notebook, so it has to be defined here. Uses the GPU when available.
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

In [None]:
# 实例化 各个部分  数据集， 向量化， 分类器，定义损失函数，设置优化器
# Load the data and fetch the vectorizer from the dataset.
dataset = SurnameDataSet.load_dataset_and_make_vectorizer(args.surname_csv)
vectorizer = dataset.get_vectorizer()

# The layer sizes follow the vocabularies: one input per surname token,
# one output per nationality. The model is placed on args.device.
mlp = SurnameMLP(
    input_dim=len(vectorizer.surname_vocab),
    hidden_dim=args.hidden_dim,
    output_dim=len(vectorizer.nationality_vocab),
).to(args.device)

# NOTE(review): dataset.class_weights is passed as the per-class loss weight;
# presumably a tensor that must be on the same device as the model — confirm.
loss_func = nn.CrossEntropyLoss(weight=dataset.class_weights)
optimizer = optim.Adam(params=mlp.parameters(), lr=args.learning_rate)

In [None]:
# 训练
# One optimisation step. NOTE(review): `batich_dict`, `running_loss` and
# `batch_index` are not defined in this cell; they must come from the
# surrounding training loop (the spelling `batich_dict` has to match it).

# Clear the gradients left over from the previous step.
optimizer.zero_grad()
# Forward pass.
y_pred = mlp(batich_dict['x_surname'])
# Loss between the predictions and the target labels.
loss = loss_func(input=y_pred, target=batich_dict['y_nationality'])
# .item() yields a plain Python number; fold it into the running mean.
loss_batch = loss.item()
running_loss += (loss_batch - running_loss) / (batch_index + 1)
# Backward pass: compute gradients from the loss.
loss.backward()
# Update the parameters with the computed gradients.
optimizer.step()

In [None]:
# 预测
def pre_nation(name, mlp, vectorizer):
    """Predict the most likely nationality for a single surname.

    Args:
        name: Surname to classify.
        mlp: Callable model invoked as ``mlp(batch, softmax=True)`` and
            returning one row of class probabilities.
        vectorizer: Object providing ``vectorize(name)`` and
            ``nationality_vocab.lookup_index(index)``.

    Returns:
        dict with 'nationality' (the label looked up for the highest-scoring
        class) and 'probability' (that class's score as a Python float).
    """
    vectorized_name = vectorizer.vectorize(name)
    # Reshape to a batch containing a single row.
    vectorized_name = torch.tensor(vectorized_name).view(1, -1)

    # torch.tensor() creates the input on the CPU; move it to wherever the
    # model's parameters live so a GPU-resident model does not fail.
    first_param = next(mlp.parameters(), None) if hasattr(mlp, "parameters") else None
    if first_param is not None:
        vectorized_name = vectorized_name.to(first_param.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = mlp(vectorized_name, softmax=True)

    probability_values, indices = result.max(dim=1)
    index = indices.item()
    predicted_nationality = vectorizer.nationality_vocab.lookup_index(index)
    probability_value = probability_values.item()
    return {'nationality': predicted_nationality,
            'probability': probability_value}