In [1]:
import tensorflow as tf
from nltk.tokenize import WordPunctTokenizer
import numpy as np
from collections import Counter
import random

In [11]:
def prepocess(text, freq=5):
    """输入文本并对文本进行预处理。处理包括：将文本句子转为单个的按原有文本顺序排列的列表，
    同时去除频数低于5的噪声，此外对高频词汇进行subsampling处理
    
    Arguments:
        text {str} -- 输入文本
        freq {int} -- 频数上线
    
    Returns:
        list -- 训练集
    """
    # 输入为文本,str类型
    # 输出为处理后的单词列表
    # 同时进行subsampling
    words = [word for word in text.split()]
    # 使用nltk便于处理
    tokenizer = WordPunctTokenizer()
    words_count = Counter(words)
    #删除单词数目少于5的噪声
    words = [tokenizer.tokenize(word.lower())[0]  for word in words if words_count[word] > freq]
    
    # subsampling
    threshold = 1e-5
    total_words = len(words)
    freqs = {word:count/total_words for word, count in words_count.items()}
    p_drop = {word:1-np.sqrt(threshold/freqs[word]) for word in words_count}
    train_words = [word for word in words if random.random() < (1-p_drop[word])]
    
    return train_words

In [12]:
def get_targets(words, idx, window_size=5):
    '''
    获得input word的上下文单词列表
    words: 单词列表
    idx: input words的索引号
    window_size: 窗口大小
    '''
    target_window = np.random.randint(1, window_size+1) # 随机生成目标窗口大小
    # 这里要考虑input word前面单词不够的情况
    start_point = idx - target_window if (idx - target_window) > 0 else 0
    end_point = idx + target_window
    targets = set(words[start_point:idx] + words[idx+1:end_point+1])
    return list(targets)

In [13]:
def get_batches(words, batch_size, window_size = 5):
    '''
    构造一个获取batch的生成器
    '''
    # 有多少个小批量，每个批量的单词数是batch_size
    n_batches = len(words) // batch_size
    
    # 仅取full batches
    words = words[:n_batches * batch_size]
    
    for idx in range(0, len(words), batch_size):
        x,y = [], []
        batch = words[idx:idx+batch_size]
        for i in range(len(batch)):
            # input word
            batch_x = batch[i]
            # output word
            batch_y = get_targets(batch, i, window_size)
            # 由于一个input word会对应多个output word，因此需要统一长度
            x.extend([batch_x] * len(batch_y))
            y.extend(batch_y)
        yield x,y

In [28]:
# 进行图定义
def define_graph(train_graph):
    with train_graph.as_default():
        # 输入，定义placeholder
        with tf.name_scope("inputs"):
            inputs = tf.placeholder(tf.int32, shape=[None], name='inputs')
            labels = tf.placeholder(tf.int32, shape=[None, None], name='labels')

        # 嵌入层权重矩阵
        embedding = tf.Variable(tf.random_uniform([embedding_row_dim, embedding_col_dim]), name='embedding')
        # 实现lookup
        embed = tf.nn.embedding_lookup(embedding, inputs)

        # 运算层
        # 注意name_scope中不能有“：”，否则会报错
        with tf.name_scope('layer_with_learn_skip_' + str(learn_skip)):
            # 权重矩阵
            with tf.name_scope("weights"):
                Weight = tf.Variable(tf.truncated_normal([embedding_row_dim, embedding_col_dim], stddev = 0.1), name='Weight')
                tf.summary.histogram('embedding/weights', Weight)
            # 偏差。官方推荐不初始化为0    
            with tf.name_scope('biases'):
                biase = tf.Variable(tf.zeros(embedding_row_dim)+0.1, name='biase')
                tf.summary.histogram('embedding/biases', biase)
            # 定义损失函数    
            with tf.name_scope('loss'):
                loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(Weight, biase, labels, embed, num_sampled, vocab_size))
                tf.summary.scalar('loss', loss)
            # 优化器
            with tf.name_scope('train'):
                optimizer = tf.train.AdamOptimizer(learn_skip).minimize(loss)
                
        # 保存图
        saver = tf.train.Saver()

In [30]:
# 训练
def train(train_graph, dataset, batch_size=100, window_size=5, save_graph_path="final_skip_gram_model"):
    
    with tf.Session(graph=train_graph) as sess:
        
        sess.run(tf.global_variables_initializer())
        
        for epoch in range(epoches):
            batches = get_batches(dataset, batch_size, window_size) #迭代器
            for x,y in batches:#小批量随机梯度下降
                feed = {inputs:x, labels:np.array(y)[:None]}
                sess.run([loss,optimizer], feed_dict = feed)
                
            # 每个周期都进行一次保存操作
            save_path = saver.save(sess, save_graph_path+'/skip_gram'+str(epoch)+'.ckpt')

In [16]:
with open("E:/aboutme/STUDY/python-learn/word2vec_skipgram/text8", "r") as f:
    text = f.read()

#print(len(text))
#print(text[:50])

In [18]:
words = prepocess(text) # 得到的最终的训练集
#print("train words:", words[:5])

# 构建映射表
vocabulary = set(words) # 独特词汇表
vocab_int = {word:num for num,word in enumerate(vocabulary)}
int_vocab = {num:word for num,word in enumerate(vocabulary)}


# embedding 的大小
vocab_size = len(vocabulary)
embedding_row_dim = vocab_size
embedding_col_dim = 200

save_graph_path = "final_skip_gram_model"

learn_skip = 0.001 # 学习率
epoches = 100 # 训练周期

num_sampled = 5 # 负例数量
window_size = 5 # 上下文数目


In [29]:
train_graph = tf.Graph()
define_graph(train_graph)

In [31]:
train(train_graph, dataset=words[:100*10])

TypeError: Cannot interpret feed_dict key as Tensor: Tensor Tensor("inputs/inputs:0", shape=(?,), dtype=int32) is not an element of this graph.