# Skip-Gram模型

## 导入包

In [1]:
import time
import numpy as np
import tensorflow as tf
import random
from collections import Counter

## 加载数据

In [6]:
with open("data/text8.txt",encoding="utf-8")as f:
          text = f.read()

## 数据预处理

In [30]:
# 定义函数来完成数据的预处理
def preprocess(text,freq = 5):
    """
    对文本进行预处理
    
    参数
    ---
    text:文本数据
    freq:词频阈值
    """
    #对文本中的符号进行替换
    text = text.lower()
    text = text.replace("."," <PERIOD> ")
    text = text.replace(","," <COMMA> ")
    text = text.replace('"'," <QUOTATION_MARK> ")
    text = text.replace(";"," <SEMICOLON> ")
    text = text.replace("!"," <EXCLAMATION_MARK> ")
    text = text.replace("?"," <QUESTION_MARK> ")
    text = text.replace("("," <LEFT_PAREN> ")
    text = text.replace(")"," <RIGHT_PAREN> ")
    text = text.replace("--"," <HYPHENS> ")
    text = text.replace(":"," <COLON> ")
    words = text.split()
    
    # 删除低频词，减少噪音影响
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word]>freq]
    return trimmed_words

In [31]:
words = preprocess(text)

## 查看筛选后的单词数

In [34]:
print("原单词数：{}".format(len(words)))
print("筛选后的单词数：{}".format(len(set(words))))


原单词数：16680599
筛选后的单词数：63641


## 构建单词映射表

In [38]:
# 整形编码
vocab = set(words)
vocab_to_int = {w:i for i,w in enumerate(vocab)}
int_to_vocab = {i:w for i,w in enumerate(vocab)}

## 查看单词映射表

In [42]:
print("{}的整形编码为{}".format("anarchism",vocab_to_int["anarchism"]))
print("{}表示{}".format(4732,int_to_vocab[4732]))

anarchism的整形编码为21175
4732表示ov


## 对原文进行vocab到int的转换

In [41]:
print("原文：",words[:30])
int_words = [vocab_to_int[word] for word in words]
print(int_words[:30])

原文： ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst']
[21175, 54489, 3004, 17834, 60666, 63196, 10739, 61785, 9234, 27854, 60492, 31774, 40864, 18129, 4285, 43796, 4819, 63196, 43796, 17263, 31216, 22113, 43796, 18471, 60213, 63196, 43796, 13904, 31216, 45958]


## 采样
#### 对停用词进行采样，例如"the","or"等停用词，剔除这些单词以后可以加快我们的训练过程，同时减少训练过程中的噪音
#### 公式![image.png](attachment:image.png)

In [45]:
t = 1e-5 # t值
threshold  =0.8 # 剔除概率阈值

#统计单词出现频次
int_word_counts = Counter(int_words)
total_count = len(int_words)
#计算单词频率
word_freqs = {w:int_word_counts[w]/total_count for w in int_word_counts}
#计算被删除的概率
prob_drop = {w:1 - np.sqrt(t/word_freqs[w] for w in int_word_counts)}
# # 对单词进行采样
# train_words = [w for w in int_words if prob_drop[w]<threshold]