In [1]:
from torch import nn
import torch
import torch.nn as nn  
import torch.nn.functional as F  
import torch.utils.data as tud  
from torch.nn.parameter import Parameter  #参数更新和优化函数
from collections import Counter 
import numpy as np 
import random
import math 
import pandas as pd
from collections import defaultdict
import scipy  #
import sklearn
from sklearn.metrics.pairwise import cosine_similarity #余弦相似度函数

# 负例采样就是Skip-Gram模型的输出不是周围词的概率了，是正例和负例的概率
USE_CUDA = torch.cuda.is_available()

random.seed(53113)
np.random.seed(53113)
torch.manual_seed(53113)
if USE_CUDA:
    torch.cuda.manual_seed(53113)
 
K = 3   # 负样本随机采样数量
C = 1    # 周围单词的数量
NUM_EPOCHS = 20
VOCAB_SIZE = 56000
BATCH_SIZE = 128
LEARNING_RATE = 0.2 
EMBEDDING_SIZE = 100
#对应的维度

LOG_FILE = "word-embedding.log"

In [2]:
import  re
file = open("train.txt")
line = file.readline()
punctuation = '！，；：？“”\'、；《》（）。—'
results = []
text = []
word_to_sentence = {}
word_to_sentence = defaultdict(lambda: set([]))
#!!!上面这种操作可以将word_to_dicts的相应内容全部定义为[]

while  line:
    line = re.sub('['+punctuation+']','',line)
    currents = line.split()
    #print(currents)
    text.extend(currents)
    results.append(currents)
    for  i  in  range(len(currents)):
        word_to_sentence[currents[i]].add(len(results)-1)
    line = file.readline()

In [3]:
len(word_to_sentence)

55107

In [4]:
#词语对应的编号
vocab = dict(Counter(text).most_common(VOCAB_SIZE-1))
#词语出现的次数
vocab["<unk>"] = len(text) - np.sum(list(vocab.values()))
#目前这里面的["<unk>"]对应的内容是必为0的内容
idx_to_word = [word for word in vocab.keys()] 
word_to_idx = {word:i for i, word in enumerate(idx_to_word)}
#idx_to_word为所有切出来的单词构成的list
#word_to_idx为将单词进行相应的编号
word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
#word_counts为所有对应的单词构成的相应的矩阵
#这里word_counts共有对应的483个单词
print(len(word_counts))
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3./4.)
#计算词频，按照原文转换为3/4次方
word_freqs = word_freqs / np.sum(word_freqs)  # 用来做 negative sampling
#词频为词频/词频的总和
#将所有的单词个数转化为一个483长度的词频矩阵

55108


In [5]:
len(word_to_idx)

55108

In [6]:
# 实现Dataloader
class Dataset(tud.Dataset): # 继承tud.Dataset父类
    
    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts):    
        super(Dataset, self).__init__() 
        print('Dataset __init__')
        self.text_encoded = [word_to_idx.get(t, VOCAB_SIZE-1) for t in text]
        #依次对text当中的单词进行相应的查找
        # get()返回指定键的值，没有则返回默认值
        # 这里面的key -- 字典中要查找的键，default -- 如果指定键的值不存在时，返回该默认值。
        self.text_encoded = torch.Tensor(self.text_encoded).long()
        #变成tensor类型，这里变成longtensor，也可以torch.LongTensor
        self.word_to_idx = word_to_idx 
        self.idx_to_word = idx_to_word  
        self.word_freqs = torch.Tensor(word_freqs) 
        self.word_counts = torch.Tensor(word_counts)
        
    def __len__(self): 
        return len(self.text_encoded) #所有单词的总数
        
    def __getitem__(self, idx):
        ''' 这个function返回以下数据用于训练
            - 中心词
            - 这个单词附近的(positive)单词
            - 随机采样的K个单词作为negative sample
        '''
        center_word = self.text_encoded[idx] 
        pos_indices = list(range(idx-C, idx)) + list(range(idx+1, idx+C+1))
        pos_indices = [i%len(self.text_encoded) for i in pos_indices]
        pos_words = self.text_encoded[pos_indices]
        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], replacement=True)
        return center_word, pos_words, neg_words 


dataset = Dataset(text, word_to_idx, idx_to_word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)  
#BATCH_SIZE = 128,每一个批次随机取出128个对应的数值，这里打包成DataLoader的主要原因是
#便于每次取出一个相应的批次操作

#!!!dataloader之中对应的数据本身没有重复，但是text之中的内容有重复，所以dataloader
#之中的单词编号也会相应的出现重复

Dataset __init__


In [7]:
# 定义PyTorch模型
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
    #放入的vocab_size=30000,embed_size=100
        super(EmbeddingModel, self).__init__()
        print('EmbeddingModel __init__')
        self.vocab_size = vocab_size  #30000
        self.embed_size = embed_size  #100
              
        # 模型输入，输出是两个一样的矩阵参数nn.Embedding(30000, 100)
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
         # 权重初始化的一种方法
        initrange = 0.5 / self.embed_size
        self.in_embed.weight.data.uniform_(-initrange, initrange)
        self.out_embed.weight.data.uniform_(-initrange, initrange)
        
    def forward(self, input_labels, pos_labels, neg_labels):
        '''
        input_labels: 中心词, [batch_size]
        pos_labels: 中心词周围出现过的单词 [batch_size * (c * 2)],左边找出c个词组，右边找出c个词组
        neg_labelss: 中心词周围没有出现过的单词，从 negative sampling 得到 [batch_size, (c * 2 * K)]
        return: loss, [batch_size]
        '''
        #print('EmbeddingModel forward')
        batch_size = input_labels.size(0) 
       
        input_embedding = self.in_embed(input_labels) # B * embed_size
        pos_embedding = self.out_embed(pos_labels) # B * (2C) * embed_size 
        neg_embedding = self.out_embed(neg_labels) # B * (2*C*K) * embed_size

        log_pos = torch.bmm(pos_embedding, input_embedding.unsqueeze(2)) # B * (2*C)
        log_pos = log_pos.squeeze()
        log_neg = torch.bmm(neg_embedding, -input_embedding.unsqueeze(2)).squeeze() # B * (2*C*K)

        log_pos = F.logsigmoid(log_pos).sum(1)
        log_neg = F.logsigmoid(log_neg).sum(1)
        loss = log_pos + log_neg  # 正样本损失和负样本损失和尽量最大
        #如果为负数的时候就是损失和尽量最小
        #对应的大小为[batch_size]
        #因为需要提取出来的这128个维度的单词集体操作
        return -loss 
        #注意这里return的是-loss，最终的optimizer.step中还带有一个减号
        #所以这里如果是当前选中的这个单词的周边单词的话的128个维度单词的梯度被减去，而周边单词的梯度被加上，
        #而如果是这128个单词负采样的话这128个维度的单词
    
    # 模型训练有两个矩阵，self.in_embed和self.out_embed两个, 作者认为输入矩阵比较好，舍弃了输出矩阵
    # 取出输入矩阵参数，self.in_embed的矩阵为正采样的相应的矩阵，self.out_embed为负采样的相应矩阵
    def input_embeddings(self):   
        return self.in_embed.weight.data.cpu().numpy() 
    def output_embeddings(self):
        return self.out_embed.weight.data.cpu().numpy()


In [8]:
model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
#VOCAB_SIZE = 30000,EMBEDDING_SIZE = 100
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
#model = model.cuda(),这里面的LEARING_RATE=0.2

EmbeddingModel __init__


In [9]:
for e in range(NUM_EPOCHS):
    totalloss = 0
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        if  len(input_labels) == 1:
            break
        input_labels = input_labels.long()
        pos_labels = pos_labels.long()
        neg_labels = neg_labels.long()
        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean() 
        loss.backward()
        optimizer.step()
        totalloss = totalloss+loss
        if i % 100 == 0:
            with open(LOG_FILE, "a") as fout: 
                fout.write("epoch: {}, iter: {}, loss: {}\n".format(e, i, loss.item()))
                print("epoch: {}, iter: {}, loss: {}".format(e, i, loss.item()))
    print('epoch = %d'%e)
    print('totalloss = ')
    print(totalloss)
    embedding_weights = model.input_embeddings() # 调用最终训练好的embeding词向量
    np.save("embedding-{}".format(EMBEDDING_SIZE), embedding_weights) # 保存参数
    torch.save(model.state_dict(), "embedding-{}.th".format(EMBEDDING_SIZE)) # 保存参数

epoch: 0, iter: 0, loss: 5.545188903808594
epoch: 0, iter: 100, loss: 5.545193672180176
epoch: 0, iter: 200, loss: 5.545172691345215
epoch: 0, iter: 300, loss: 5.545185565948486
epoch: 0, iter: 400, loss: 5.545162200927734
epoch: 0, iter: 500, loss: 5.545172214508057
epoch: 0, iter: 600, loss: 5.5451765060424805
epoch: 0, iter: 700, loss: 5.545167922973633
epoch: 0, iter: 800, loss: 5.5451741218566895
epoch: 0, iter: 900, loss: 5.545161724090576
epoch: 0, iter: 1000, loss: 5.545164585113525
epoch: 0, iter: 1100, loss: 5.545170783996582
epoch: 0, iter: 1200, loss: 5.545156955718994
epoch: 0, iter: 1300, loss: 5.545164108276367
epoch: 0, iter: 1400, loss: 5.545173645019531
epoch: 0, iter: 1500, loss: 5.545166969299316
epoch: 0, iter: 1600, loss: 5.545170783996582
epoch: 0, iter: 1700, loss: 5.545166015625
epoch: 0, iter: 1800, loss: 5.545152187347412
epoch: 0, iter: 1900, loss: 5.545126914978027
epoch: 0, iter: 2000, loss: 5.545165061950684
epoch: 0, iter: 2100, loss: 5.545143127441406
e

epoch: 2, iter: 4100, loss: 5.418550491333008
epoch: 2, iter: 4200, loss: 5.354263782501221
epoch: 2, iter: 4300, loss: 5.487360000610352
epoch: 2, iter: 4400, loss: 5.435061454772949
epoch: 2, iter: 4500, loss: 5.455600261688232
epoch: 2, iter: 4600, loss: 5.388360977172852
epoch: 2, iter: 4700, loss: 5.450019836425781
epoch: 2, iter: 4800, loss: 5.457646369934082
epoch: 2, iter: 4900, loss: 5.442681312561035
epoch: 2, iter: 5000, loss: 5.443376541137695
epoch: 2, iter: 5100, loss: 5.48482084274292
epoch: 2, iter: 5200, loss: 5.469601631164551
epoch: 2, iter: 5300, loss: 5.445549964904785
epoch: 2, iter: 5400, loss: 5.416411399841309
epoch: 2, iter: 5500, loss: 5.446374893188477
epoch: 2, iter: 5600, loss: 5.410077095031738
epoch: 2, iter: 5700, loss: 5.468776226043701
epoch: 2, iter: 5800, loss: 5.4098968505859375
epoch: 2, iter: 5900, loss: 5.397675037384033
epoch: 2, iter: 6000, loss: 5.441617012023926
epoch: 2, iter: 6100, loss: 5.435159206390381
epoch: 2, iter: 6200, loss: 5.4634

epoch: 4, iter: 6900, loss: 5.2797698974609375
epoch: 4, iter: 7000, loss: 5.320988655090332
epoch: 4, iter: 7100, loss: 5.282883644104004
epoch: 4, iter: 7200, loss: 5.2900519371032715
epoch: 4, iter: 7300, loss: 5.3194756507873535
epoch = 4
totalloss = 
tensor(39376.1562, grad_fn=<AddBackward0>)
epoch: 5, iter: 0, loss: 5.2775187492370605
epoch: 5, iter: 100, loss: 5.260690689086914
epoch: 5, iter: 200, loss: 5.32249641418457
epoch: 5, iter: 300, loss: 5.35313081741333
epoch: 5, iter: 400, loss: 5.364988803863525
epoch: 5, iter: 500, loss: 5.221414566040039
epoch: 5, iter: 600, loss: 5.2652435302734375
epoch: 5, iter: 700, loss: 5.296438217163086
epoch: 5, iter: 800, loss: 5.176678657531738
epoch: 5, iter: 900, loss: 5.214909076690674
epoch: 5, iter: 1000, loss: 5.274603843688965
epoch: 5, iter: 1100, loss: 5.29140043258667
epoch: 5, iter: 1200, loss: 5.247220039367676
epoch: 5, iter: 1300, loss: 5.335747241973877
epoch: 5, iter: 1400, loss: 5.2345871925354
epoch: 5, iter: 1500, loss

epoch: 7, iter: 2200, loss: 5.108485698699951
epoch: 7, iter: 2300, loss: 5.050796031951904
epoch: 7, iter: 2400, loss: 5.085929870605469
epoch: 7, iter: 2500, loss: 4.964363098144531
epoch: 7, iter: 2600, loss: 5.1462202072143555
epoch: 7, iter: 2700, loss: 5.079452991485596
epoch: 7, iter: 2800, loss: 5.027925491333008
epoch: 7, iter: 2900, loss: 5.024785995483398
epoch: 7, iter: 3000, loss: 5.0356059074401855
epoch: 7, iter: 3100, loss: 5.017446517944336
epoch: 7, iter: 3200, loss: 4.93335485458374
epoch: 7, iter: 3300, loss: 5.0746750831604
epoch: 7, iter: 3400, loss: 4.934443950653076
epoch: 7, iter: 3500, loss: 5.086825847625732
epoch: 7, iter: 3600, loss: 4.893255710601807
epoch: 7, iter: 3700, loss: 4.8710432052612305
epoch: 7, iter: 3800, loss: 5.0140886306762695
epoch: 7, iter: 3900, loss: 4.9357709884643555
epoch: 7, iter: 4000, loss: 5.073157787322998
epoch: 7, iter: 4100, loss: 5.018987655639648
epoch: 7, iter: 4200, loss: 5.010644912719727
epoch: 7, iter: 4300, loss: 5.05

epoch: 9, iter: 5000, loss: 4.797247409820557
epoch: 9, iter: 5100, loss: 4.818394660949707
epoch: 9, iter: 5200, loss: 4.730887413024902
epoch: 9, iter: 5300, loss: 4.905886173248291
epoch: 9, iter: 5400, loss: 4.95833158493042
epoch: 9, iter: 5500, loss: 4.897167682647705
epoch: 9, iter: 5600, loss: 4.812552452087402
epoch: 9, iter: 5700, loss: 4.822450160980225
epoch: 9, iter: 5800, loss: 4.797471046447754
epoch: 9, iter: 5900, loss: 4.7191362380981445
epoch: 9, iter: 6000, loss: 4.831075668334961
epoch: 9, iter: 6100, loss: 4.742160320281982
epoch: 9, iter: 6200, loss: 4.8374738693237305
epoch: 9, iter: 6300, loss: 4.737945079803467
epoch: 9, iter: 6400, loss: 4.839054107666016
epoch: 9, iter: 6500, loss: 4.8536553382873535
epoch: 9, iter: 6600, loss: 4.779714584350586
epoch: 9, iter: 6700, loss: 4.931675910949707
epoch: 9, iter: 6800, loss: 4.813510894775391
epoch: 9, iter: 6900, loss: 4.768136978149414
epoch: 9, iter: 7000, loss: 4.870599746704102
epoch: 9, iter: 7100, loss: 4.83

epoch: 12, iter: 0, loss: 4.642845630645752
epoch: 12, iter: 100, loss: 4.584932327270508
epoch: 12, iter: 200, loss: 4.466526985168457
epoch: 12, iter: 300, loss: 4.717166423797607
epoch: 12, iter: 400, loss: 4.574451923370361
epoch: 12, iter: 500, loss: 4.503769397735596
epoch: 12, iter: 600, loss: 4.692620277404785
epoch: 12, iter: 700, loss: 4.593075275421143
epoch: 12, iter: 800, loss: 4.677763938903809
epoch: 12, iter: 900, loss: 4.764152526855469
epoch: 12, iter: 1000, loss: 4.4911789894104
epoch: 12, iter: 1100, loss: 4.611827850341797
epoch: 12, iter: 1200, loss: 4.670037746429443
epoch: 12, iter: 1300, loss: 4.641091346740723
epoch: 12, iter: 1400, loss: 4.526908874511719
epoch: 12, iter: 1500, loss: 4.554900646209717
epoch: 12, iter: 1600, loss: 4.540161609649658
epoch: 12, iter: 1700, loss: 4.6461591720581055
epoch: 12, iter: 1800, loss: 4.602420806884766
epoch: 12, iter: 1900, loss: 4.43310546875
epoch: 12, iter: 2000, loss: 4.6275553703308105
epoch: 12, iter: 2100, loss: 

epoch: 14, iter: 2500, loss: 4.58150577545166
epoch: 14, iter: 2600, loss: 4.303423881530762
epoch: 14, iter: 2700, loss: 4.525180816650391
epoch: 14, iter: 2800, loss: 4.405501365661621
epoch: 14, iter: 2900, loss: 4.593067646026611
epoch: 14, iter: 3000, loss: 4.478621959686279
epoch: 14, iter: 3100, loss: 4.53743314743042
epoch: 14, iter: 3200, loss: 4.5542426109313965
epoch: 14, iter: 3300, loss: 4.462948322296143
epoch: 14, iter: 3400, loss: 4.446147918701172
epoch: 14, iter: 3500, loss: 4.410549163818359
epoch: 14, iter: 3600, loss: 4.377466201782227
epoch: 14, iter: 3700, loss: 4.644991874694824
epoch: 14, iter: 3800, loss: 4.381047248840332
epoch: 14, iter: 3900, loss: 4.348536491394043
epoch: 14, iter: 4000, loss: 4.388583660125732
epoch: 14, iter: 4100, loss: 4.519956588745117
epoch: 14, iter: 4200, loss: 4.618917465209961
epoch: 14, iter: 4300, loss: 4.49232292175293
epoch: 14, iter: 4400, loss: 4.414427757263184
epoch: 14, iter: 4500, loss: 4.430473804473877
epoch: 14, iter

epoch: 16, iter: 5000, loss: 4.376509666442871
epoch: 16, iter: 5100, loss: 4.510918617248535
epoch: 16, iter: 5200, loss: 4.246950149536133
epoch: 16, iter: 5300, loss: 4.318685531616211
epoch: 16, iter: 5400, loss: 4.451239585876465
epoch: 16, iter: 5500, loss: 4.379627227783203
epoch: 16, iter: 5600, loss: 4.242471694946289
epoch: 16, iter: 5700, loss: 4.368413925170898
epoch: 16, iter: 5800, loss: 4.316682815551758
epoch: 16, iter: 5900, loss: 4.127659797668457
epoch: 16, iter: 6000, loss: 4.359257698059082
epoch: 16, iter: 6100, loss: 4.483580589294434
epoch: 16, iter: 6200, loss: 4.2714948654174805
epoch: 16, iter: 6300, loss: 4.225829601287842
epoch: 16, iter: 6400, loss: 4.333707809448242
epoch: 16, iter: 6500, loss: 4.493443489074707
epoch: 16, iter: 6600, loss: 4.471436023712158
epoch: 16, iter: 6700, loss: 4.4066596031188965
epoch: 16, iter: 6800, loss: 4.335264682769775
epoch: 16, iter: 6900, loss: 4.193861484527588
epoch: 16, iter: 7000, loss: 4.389949798583984
epoch: 16, 

epoch = 18
totalloss = 
tensor(31330.9316, grad_fn=<AddBackward0>)
epoch: 19, iter: 0, loss: 4.231967926025391
epoch: 19, iter: 100, loss: 4.125585079193115
epoch: 19, iter: 200, loss: 4.241580963134766
epoch: 19, iter: 300, loss: 4.218646049499512
epoch: 19, iter: 400, loss: 4.19606351852417
epoch: 19, iter: 500, loss: 4.195826053619385
epoch: 19, iter: 600, loss: 4.358565330505371
epoch: 19, iter: 700, loss: 4.118013858795166
epoch: 19, iter: 800, loss: 4.197022438049316
epoch: 19, iter: 900, loss: 4.226846694946289
epoch: 19, iter: 1000, loss: 4.282125949859619
epoch: 19, iter: 1100, loss: 4.248346328735352
epoch: 19, iter: 1200, loss: 4.002085208892822
epoch: 19, iter: 1300, loss: 4.126471519470215
epoch: 19, iter: 1400, loss: 4.2166523933410645
epoch: 19, iter: 1500, loss: 4.01594877243042
epoch: 19, iter: 1600, loss: 4.335887432098389
epoch: 19, iter: 1700, loss: 4.189471244812012
epoch: 19, iter: 1800, loss: 4.254294395446777
epoch: 19, iter: 1900, loss: 4.166607856750488
epoch:

In [10]:
print(embedding_weights.shape)
#小数据范围进行操作的时候100个单词，5个数组

(56000, 100)


In [11]:
def  dists(word1,word2):
    #print('word1 = '+str(word1))
    #print('word2 = '+str(word2))
    data1 = word_to_idx[word1]
    data2 = word_to_idx[word2]
    totaldis = 0
    weight1 = embedding_weights[data1]
    weight2 = embedding_weights[data2]
    totaldis = 0.0
    for  i  in  range(len(weight1)):
        totaldis = totaldis + (weight1[i]-weight2[i])**2
    #print('totaldis = '+str(totaldis))
    #print(type(totaldis))
    totaldis = float(totaldis)
    return  totaldis

In [12]:
file = open('pku_sim_test.txt')
line = file.readline()
resultdata = []
maxdata = -float('inf')
mindata = float('inf')
word_text = []
while  line:
    currents = line.split()
    flag = False
    word_text.append(currents)
    if  currents[0]  not  in  vocab  or  currents[1]  not  in  vocab:
        print('situation1')
        resultdata.append(1)
        line = file.readline()
        continue
    else:
        dij = dists(currents[0],currents[1])
        resultdata.append(dij)
        maxdata = max(maxdata,dij)
        mindata = min(mindata,dij)
    line = file.readline()

situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1
situation1

In [13]:
print(resultdata)

[1, 2.609468066068871, 1, 0.23469488037391367, 6.795081356006434, 7.62839317046698, 1, 0.1792622899531066, 0.3939660994643835, 1, 0.11211042426600694, 1, 1.0250852894912623, 1, 1, 0.00932326752781728, 1, 1, 1, 0.200750405395679, 0.4222526225172305, 4.491168386149611, 1, 1.8555508986523117, 1, 3.3875406217516635, 17.965278541979643, 2.682956947395615, 1, 2.1288477887292396, 1, 0.7238994279253748, 1, 0.5746111002219398, 1, 6.129965165086044, 1, 0.15434323129832977, 0.14621075269419148, 1, 3.3060810272489194, 12.643069605200328, 1, 1, 0.00857738861604728, 9.05218738091135, 1, 2.0546670449216538, 1, 0.2643773551007114, 1, 0.2226575756924161, 1, 0.1580793205799436, 1, 1, 18.333750869988776, 1, 0.76045406847337, 1, 1, 0.5991878867234561, 0.6696929959193889, 1, 0.0355792987510922, 0.07926148554443214, 1, 1, 6.803184177955227, 1, 0.020196280774598292, 0.046281690346223356, 0.1749503404057901, 0.028705608818952778, 3.2725556404701983, 4.706949969008707, 0.19187798973728581, 3.5051193580247544, 

In [14]:
with  open("2020140873.txt","w",encoding='utf-8') as f:
    for  i  in  range(len(word_text)):
        f.write(word_text[i][0]+'\t'+word_text[i][1])
        if(resultdata[i] != 1):
            resultdata[i] = (10*maxdata-mindata-9*resultdata[i])/(maxdata-mindata)
        f.write('\t'+str(resultdata[i]))
        f.write('\n')

In [15]:
print(maxdata)
print(mindata)

31.301017074135313
0.0019401898524078357


In [16]:
print('maxdata = ')
print(maxdata)
print('mindata = ')
print(mindata)

maxdata = 
31.301017074135313
mindata = 
0.0019401898524078357


In [17]:
print(embedding_weights)

[[-4.83021080e-01 -4.42046583e-01 -8.63905787e-01 ...  1.01411331e+00
  -5.49720228e-01 -1.61939752e+00]
 [-7.13023767e-02 -3.10909569e-01  4.45584923e-01 ...  2.34064624e-01
  -3.40479732e-01 -6.11896873e-01]
 [-3.16732973e-01  2.34555617e-01  2.22618490e-01 ...  7.67184258e-01
   1.20999865e-01 -1.31640446e+00]
 ...
 [ 4.79613803e-03 -2.18170695e-03 -3.99669446e-03 ... -4.54436289e-03
   4.11063991e-03 -1.03086838e-03]
 [-2.47491105e-03 -1.86349987e-03 -4.59686387e-04 ... -4.95863846e-03
  -2.98599899e-03 -5.36078587e-04]
 [ 2.35364679e-03  4.65715397e-03 -2.40499480e-03 ...  4.06059343e-03
   1.04251318e-04  4.71033901e-03]]
