<a href="https://colab.research.google.com/github/zhihong1224/RNN_demo/blob/master/skip_gram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [0]:
import os
import math
import numpy as np
import torch 
from torch import nn,optim
import random
from torch.utils.data import Dataset,DataLoader
from collections import Counter
import torch.nn.functional as F
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
from  google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
ROOT='gdrive/My Drive/Colab Notebooks/MXNet/MX_data/ptb'
train_file=os.path.join(ROOT,'ptb.train.txt')
test_file=os.path.join(ROOT,'ptb.test.txt')
valid_file=os.path.join(ROOT,'ptb.valid.txt')

# 读取及处理数据

In [0]:
def tokenize(file):
  with open(file) as f:
    text=f.readlines()
  result,token=[],[]
  for line in text:
    line_token=line.lower().strip().split()
    result.append(line_token)
    token.extend(line_token)
  return result,token

In [0]:
train_data,token=tokenize(train_file)

In [7]:
len(train_data)

42068

In [0]:
# 建立词典
def word_idx(token):
  counter=Counter(token)
  word_count=dict(filter(lambda x:x[1]>=5,counter.items()))
  idx_to_char=[ch for ch,_ in word_count.items()]
  char_to_idx={ch:idx for idx,ch in enumerate(idx_to_char)}
  vocab_size=len(idx_to_char)
  word_count=np.array([count for _,count in word_count.items()])
  word_freq=word_count/np.sum(word_count)
  return idx_to_char,char_to_idx,vocab_size,word_freq

In [0]:
idx_to_char,char_to_idx,vocab_size,word_freq=word_idx(token)

In [0]:
# 将文本转换为数字表示
def get_corpus(data,char_to_idx,idx_to_char):
  result=[]
  for line in data:
    result.append([char_to_idx[ch] for ch in line if ch in char_to_idx])
  return result

In [0]:
train_corpus=get_corpus(train_data,char_to_idx,idx_to_char)

In [12]:
num_tokens=sum([len(st) for st in train_corpus]);num_tokens

887100

In [0]:
# 二次采样
def subsampling(corpus,word_freq):
  result=[]
  for line in corpus:
    temp=[]
    for idx in line:
      if 1-np.sqrt(1e-4/word_freq[idx])<np.random.uniform(0,1):
        temp.extend([idx])
    result.append(temp)
  return result

In [0]:
subsampled_corpus=subsampling(train_corpus,word_freq)

In [15]:
sum([len(st) for st in subsampled_corpus])

376245

In [0]:
# 比较一个词在二次采样前后出现的次数
def compare_count(word,char_to_idx,corpus,subsampled_corpus):
  before_count=sum([line.count(char_to_idx[word]) for line in corpus])
  after_count=sum([line.count(char_to_idx[word]) for line in subsampled_corpus])
  print('before_count:{},after_count:{}'.format(before_count,after_count))

In [17]:
compare_count('the',char_to_idx,train_corpus,subsampled_corpus)
compare_count('join',char_to_idx,train_corpus,subsampled_corpus)

before_count:50770,after_count:2043
before_count:45,after_count:45


In [0]:
# 提取中心词、背景词
def get_centers_contents(subsampled_corpus,max_win=5):
  centers=[]
  contents=[]
  for h,line in enumerate(subsampled_corpus):
    if len(line)<2:
      continue
    for idx,center in enumerate(line):
      win_size=random.randint(1,max_win)
      centers.extend([center]) # 中心词列表
      indices=list(range(max(0,idx-win_size),min(len(line),idx+win_size+1)))
      indices.remove(idx)
      content=[line[index] for index in indices]  # 背景词
      contents.append(content)   # 背景词列表
  return centers,contents

In [21]:
# 测试中心词，背景词提取
tiny_dataset=[list(range(7)),list(range(7,10))]
print(tiny_dataset)
centers,contents=get_centers_contents(tiny_dataset,max_win=2)
print('centers:{},\ncontens:{}\n'.format(centers,contents))

[[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
centers:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
contens:[[1], [0, 2], [1, 3], [2, 4], [2, 3, 5, 6], [4, 6], [5], [8, 9], [7, 9], [8]]



In [0]:
all_centers,all_contents=get_centers_contents(subsampled_corpus,max_win=5)

In [20]:
max([len(c) for c in all_contents]);len(all_contents)

375342

In [0]:
# 提取负例词（此函数运行效率低下）
def get_negatives(centers,contents,vocab_size,word_freq,K=5):
  negatives=[]
  for i in range(len(centers)):
    print('i',i)
    neg_num=K*len(contents[i])  # 负例词个数
    prob_idx=random.choices(list(range(vocab_size)),word_freq**0.75,k=200) # 候选负例词索引
    negative=[]
    neg_count=0
    for prob_neg in prob_idx:
      if prob_neg not in set(contents[i]):
        negative.extend([prob_neg])   # 负例词索引
        neg_count=neg_count+1
        if neg_count==neg_num:
          break
      else:
        continue
    negatives.append(negative)   # 负例词列表

  return negatives

In [0]:
# 测试负例词
negatives=get_negatives(centers,contents,vocab_size,word_freq)
max([len(n) for n in negatives])

In [0]:
all_negatives=get_negatives(all_centers,all_contents,vocab_size,word_freq,K=5)

In [24]:
print(len(all_centers),len(all_contents[0]),len(all_negatives[0]))

375342 2 10


In [0]:
# 数据集
class Data(Dataset):
  def __init__(self,all_centers,all_contents,all_negatives):
    self.all_centers=all_centers
    self.all_contents=all_contents
    self.all_negatives=all_negatives

  def __len__(self):
    return len(self.all_centers)

  def __getitem__(self,item):
    return (self.all_centers[item],self.all_contents[item],self.all_negatives[item])

def batchify(data):
  max_content=max([len(c) for _,c,n in data])
  max_negative=max([len(n) for _,c,n in data])
  centers,contents,negatives=[],[],[]
  mask_contents,mask_negatives=[],[]
  for center,content,negative in data:
    centers.extend([center])
    contents.append(content+[0]*(max_content-len(content)))
    negatives.append(negative+[0]*(max_negative-len(negative)))
    mask_contents.append([1]*len(content)+[0]*(max_content-len(content)))
    mask_negatives.append([1]*len(negative)+[0]*(max_negative-len(negative)))
  return (torch.tensor(centers).view(-1,1),torch.tensor(contents),torch.tensor(negatives),torch.tensor(mask_contents),torch.tensor(mask_negatives))


In [0]:
dataset=Data(all_centers,all_contents,all_negatives)
batch_size=512
train_iter=DataLoader(dataset,batch_size=batch_size,shuffle=True,collate_fn=batchify)

In [28]:
for batch in train_iter:
  for name,data in zip(['centers','contents','negatives','mask_contents','mask_negatives'],batch):
    print(name,' shape:',data.shape)
  break

centers  shape: torch.Size([512, 1])
contents  shape: torch.Size([512, 10])
negatives  shape: torch.Size([512, 50])
mask_contents  shape: torch.Size([512, 10])
mask_negatives  shape: torch.Size([512, 50])


# 模型

In [0]:
class Net(nn.Module):
  def __init__(self,vocab_size,embed_size):
    super(Net,self).__init__()
    self.in_embed=nn.Embedding(vocab_size,embed_size)
    self.out_embed=nn.Embedding(vocab_size,embed_size)
  def forward(self,centers,contents,negatives,mask_contents,mask_negatives):
    # centers:(batch_size,1)
    # contents:(batch_size,max_c)
    # negatives:(batch_size,max_n)
    # mask_contents:(batch_size,max_c)
    # mask_negatives:(batch_size,max_n)
    centers_embed=self.in_embed(centers)    # (batch_size,1,embed_size)
    contents_embed=self.out_embed(contents)   # (batch_size,max_c,embed_size)
    negatives_embed=self.out_embed(negatives)  # (batch_size,max_n,embed_size) 
    sim_cc=torch.bmm(contents_embed,centers_embed.permute(0,2,1)) # (batch_size,max_c,1)
    sim_cn=torch.bmm(negatives_embed,centers_embed.permute(0,2,1)) #(batch_size,max_n,1)
    sim_cc=sim_cc.squeeze(-1)  # (batch_size,max_c)
    sim_cn=sim_cn.squeeze(-1)  # (batch_size,max_n)
    sim_cc=sim_cc*mask_contents
    sim_cn=sim_cn*mask_negatives 
    loss=-F.logsigmoid(sim_cc).sum(1)-F.logsigmoid(-sim_cn).sum(1)
    return loss
class Loss_fn(nn.Module):
  def __init__(self):
    super(Loss_fn,self).__init__()
  def forward(self,sim_cc,sim_cn):
    return -torch.log(torch.sigmoid(sim_cc)).sum(dim=1)-torch.log(1-torch.sigmoid(sim_cn)).sum(dim=1)


# 训练

In [0]:
def train(model,Loss_fn,num_epochs,lr,train_iter):
  model=model.to(device)
  criterion=Loss_fn()
  optimizer=optim.Adam(model.parameters(),lr=lr)

  for epoch in range(num_epochs):
    train_loss,n=0.0,0
    for centers,contents,negatives,mask_contents,mask_negatives in train_iter:
      centers=centers.to(device)
      contents=contents.to(device)
      negatives=negatives.to(device)
      mask_contents=mask_contents.to(device)
      mask_negatives=mask_negatives.to(device)
      loss=model(centers,contents,negatives,mask_contents,mask_negatives).mean()
      # loss=criterion(sim_cc,sim_cn).sum()
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      train_loss+=loss.item()
      n+=1
    train_loss=train_loss/(n*60)
    print('epoch:{},train loss:{}'.format(epoch+1,train_loss))

In [57]:
model=Net(vocab_size,100)
num_epochs,lr=10,0.01
train(model,Loss_fn,num_epochs,lr,train_iter)

epoch:1,train loss:1.2155407589419553
epoch:2,train loss:0.6503230510680487
epoch:3,train loss:0.581870673722727
epoch:4,train loss:0.560213195703768
epoch:5,train loss:0.5501134004948033
epoch:6,train loss:0.5440090482609582
epoch:7,train loss:0.5393522541486166
epoch:8,train loss:0.5354487695875869
epoch:9,train loss:0.5321821521565007
epoch:10,train loss:0.5292974558664819


# 应用

In [61]:
def get_similar_tokens(query_token,k,model):
  W=model.in_embed.weight.data
  x=W[char_to_idx[query_token]]
  cos=torch.matmul(W,x)/(torch.sum(W*W,dim=1)*torch.sum(x*x)+1e-9).sqrt()
  _,topk=torch.topk(cos,k=k+1)
  topk=topk.cpu().numpy()
  for i in topk[1:]:
    print('cosine sim=%.3f:%s'%(cos[i],(idx_to_char[i])))
get_similar_tokens('chip',3,model)

cosine sim=0.570:intel
cosine sim=0.541:microprocessor
cosine sim=0.507:computer
