# pytroch 学习过程

## 基本数据构建

In [3]:
from __future__ import print_function
import torch

In [4]:
# 不加初始化构造一个 5 * 3的矩阵
x = torch.empty(5,3)
print(x)
# 初始化构造 5 * 3矩阵
y = torch.rand(5,3)
print(y)

tensor([[5.9694e-39, 7.2551e-39, 6.2449e-39],
        [8.4490e-39, 9.6429e-39, 8.4490e-39],
        [9.6429e-39, 9.2755e-39, 1.0286e-38],
        [9.0919e-39, 8.9082e-39, 9.2755e-39],
        [8.4490e-39, 1.0194e-38, 9.0919e-39]])
tensor([[0.9984, 0.8208, 0.1584],
        [0.0465, 0.5781, 0.0802],
        [0.7116, 0.2185, 0.1064],
        [0.5044, 0.1026, 0.6697],
        [0.2475, 0.1576, 0.1387]])


注：`torch.Tensor` 是一种包含单一数据类型元素的多维矩阵。

In [5]:
# Construct a tensor directly from data:
x = torch.tensor([5.5,3])
print(x)

tensor([5.5000, 3.0000])


In [8]:
# create a tensor based on an existing tensor. These methods will reuse properties of the input tensor, e.g. dtype, unless new values are provided by user
x = x.new_ones(5, 3, dtype=torch.double)      # new_* methods take in sizes
print(x)

x = torch.randn_like(x, dtype=torch.float)    # override dtype!
print(x)                                      # result has the same size

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], dtype=torch.float64)
tensor([[ 0.0723,  0.0453, -0.6463],
        [-0.6740, -1.0280,  0.3183],
        [-0.2123, -0.6022, -1.6899],
        [ 0.2951,  0.0717,  0.3614],
        [ 0.9633, -0.8296, -0.3745]])


In [10]:
# Get its size
print(x.size())

torch.Size([5, 3])


In [14]:
y = torch.rand(5,3)
print(x+y)
print(torch.add(x,y))
result = torch.empty(5,3)
torch.add(x,y,out=result)
print(result)

tensor([[ 0.5473,  0.4169,  0.0296],
        [-0.6419, -0.0466,  1.3012],
        [ 0.1793,  0.1926, -1.5360],
        [ 0.5820,  0.9546,  1.0707],
        [ 1.4848, -0.7317,  0.4464]])
tensor([[ 0.5473,  0.4169,  0.0296],
        [-0.6419, -0.0466,  1.3012],
        [ 0.1793,  0.1926, -1.5360],
        [ 0.5820,  0.9546,  1.0707],
        [ 1.4848, -0.7317,  0.4464]])
tensor([[ 0.5473,  0.4169,  0.0296],
        [-0.6419, -0.0466,  1.3012],
        [ 0.1793,  0.1926, -1.5360],
        [ 0.5820,  0.9546,  1.0707],
        [ 1.4848, -0.7317,  0.4464]])


In [16]:
y.add_(x)
print(y)

tensor([[ 0.6195,  0.4621, -0.6166],
        [-1.3159, -1.0746,  1.6195],
        [-0.0330, -0.4096, -3.2260],
        [ 0.8770,  1.0262,  1.4320],
        [ 2.4481, -1.5613,  0.0719]])


In [18]:
print(x[:,1])

tensor([ 0.0453, -1.0280, -0.6022,  0.0717, -0.8296])


In [21]:
# Converting a Torch Tensor to a NumPy Array
a = torch.ones(5);
print(a)

tensor([1., 1., 1., 1., 1.])


In [22]:
b = a.numpy()
print(b)

[1. 1. 1. 1. 1.]


In [6]:
# Converting NumPy Array to Torch Tensor
import numpy as np
a = np.ones(5)
b = torch.from_numpy(a)
np.add(a,1,out=a)
print(a)
print(b)

[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)


In [25]:
if torch.cuda.is_available():
    device = torch.device("cuda")          # a CUDA device object
    y = torch.ones_like(x, device=device)  # directly create a tensor on GPU
    x = x.to(device)                       # or just use strings ``.to("cuda")``
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))       # ``.to`` can also change dtype together!

# 基于深度学习的pytorch

`torch.Tensor` is the central class of the package. If you set its attribute `.requires_grad` as True, it starts to track all operations on it. When you finish your computation you can call `.backward()` and have all the gradients computed automatically. The gradient for this tensor will be accumulated into `.grad` attribute.

To stop a tensor from tracking history, you can call `.detach()` to detach it from the computation history, and to prevent future computation from being tracked.

To prevent tracking history (and using memory), you can also wrap the code block in with `torch.no_grad()`:. This can be particularly helpful when evaluating a model because the model may have trainable parameters with `requires_grad=True`, but for which we don’t need the gradients.

There’s one more class which is very important for autograd implementation - a Function.

Tensor and Function are interconnected and build up an acyclic graph, that encodes a complete history of computation. Each tensor has a `.grad_fn` attribute that references a Function that has created the Tensor (except for Tensors created by the user - their grad_fn is None).

If you want to compute the derivatives, you can call `.backward()` on a Tensor. If Tensor is a scalar (i.e. it holds a one element data), you don’t need to specify any arguments to backward(), however if it has more elements, you need to specify a gradient argument that is a tensor of matching shape.

In [1]:
import torch

In [2]:
x = torch.ones(2,2,requires_grad=True)
print(x)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)


In [3]:
y = x + 2
print(y)

tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)


y was created as a result of an operation, so it has a `grad_fn`.

In [4]:
z = y * y * 3 
out = z.mean()
print(z,out)

tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>) tensor(27., grad_fn=<MeanBackward0>)


In [5]:
a = torch.randn(2,2)
a = ((a*3)/(a-1))
print(a.requires_grad)
a.requires_grad_(True)
print(a.requires_grad)
b = (a*a).sum()
print(b.grad_fn)

False
True
<SumBackward0 object at 0x000002BAA9B946D8>


In [6]:
out.backward()
print(x.grad)

tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])


In [10]:
x = torch.randn(3,requires_grad=True)
y = x * 2 ;
while y.data.norm() < 1000:
    print(y.data)
    print(y.data.norm())
    y = y * 2;
print(y)

tensor([-2.0176, -0.4099,  0.3306])
tensor(2.0852)
tensor([-4.0353, -0.8197,  0.6612])
tensor(4.1704)
tensor([-8.0705, -1.6395,  1.3224])
tensor(8.3408)
tensor([-16.1410,  -3.2789,   2.6448])
tensor(16.6817)
tensor([-32.2820,  -6.5579,   5.2896])
tensor(33.3634)
tensor([-64.5641, -13.1158,  10.5791])
tensor(66.7268)
tensor([-129.1282,  -26.2315,   21.1582])
tensor(133.4535)
tensor([-258.2563,  -52.4630,   42.3165])
tensor(266.9071)
tensor([-516.5126, -104.9261,   84.6330])
tensor(533.8141)
tensor([-1033.0253,  -209.8521,   169.2659], grad_fn=<MulBackward0>)


In [11]:
v = torch.tensor([0.1,1.0,0.0001],dtype=torch.float)
y.backward(v)
print(x.grad)

tensor([1.0240e+02, 1.0240e+03, 1.0240e-01])


In [15]:
print(x.requires_grad)
print((x**2).requires_grad)
print(x.grad)
with torch.no_grad():
    print((x**2).requires_grad)

True
True
tensor([1.0240e+02, 1.0240e+03, 1.0240e-01])
False


## word embeddings

### word embeding 简介

A word embedding is an approach to provide a dense vector representation of words that capture something about their meaning.

Word embeddings are an improvement over simpler bag-of-word model word encoding schemes like word counts and frequencies that result in large and sparse vectors (mostly 0 values) that describe documents but not the meaning of the words.

Word embeddings work by using an algorithm to train a set of fixed-length dense and continuous-valued vectors based on a large corpus of text. Each word is represented by a point in the embedding space and these points are learned and moved around based on the words that surround the target word.

It is defining a word by the company that it keeps that allows the word embedding to learn something about the meaning of words. The vector space representation of the words provides a projection where words with similar meanings are locally clustered within the space.

The use of word embeddings over other text representations is one of the key methods that has led to breakthrough performance with deep neural networks on problems like machine translation.

**注**:  
`word embeding` 其实是词袋模型的改进，在 task 1 中选择使用的`词袋模型`建立的向量中的 0 可能会占用过多的空间，导致我们浪费太多的无用空间在向量的保存上面，同时词袋模型也不能很好反应`词与词之间的相似性关系`，而且词袋模型很容易`隔断词与词之间的相互联系`；

### 基于Gensim库构建 `word embeding`

#### 构建 word2vec embeding

Word2vec is one algorithm for learning a word embedding from a text corpus.

There are two main training algorithms that can be used to learn the embedding from text; they are continuous bag of words (CBOW) and skip grams.

Gensim provides the Word2Vec class for working with a Word2Vec model.

`Learning a word embedding` from text involves loading and organizing the text into sentences and providing them to the constructor of a new Word2Vec() instance. For example:

In [43]:
from gensim.models import Word2Vec
sentences = "like play basketball dance rap music"
model = Word2Vec(sentences)

  return f(*args, **kwds)
  "C extension not loaded, training will be slow. "


There are many parameters on this constructor; a few noteworthy arguments you may wish to configure are:

- size: (default 100) The number of dimensions of the embedding, e.g. the length of the dense vector to represent each token (word).
- window: (default 5) The maximum distance between a target word and words around the target word.
- min_count: (default 5) The minimum count of words to consider when training the model; words with an occurrence less than this count will be ignored.
- workers: (default 3) The number of threads to use while training.
sg: (default 0 or CBOW) The training algorithm, either CBOW (0) or skip gram (1).

上面所给出的这些参数，统统是应用于word2vec函数中的，规定创建 word embeding 过程中的一些特殊需求

In [10]:
# 打印学习好的词汇 token
words = list(model.wv.vocab)
print(words)

[' ', 'a']


In [21]:
from gensim.models import Word2Vec
# 定义数据
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
    ['this', 'is', 'the', 'second', 'sentence'],
    ['yet', 'another', 'sentence'],
    ['one', 'more', 'sentence'],
    ['and', 'the', 'final', 'sentence']]
# 训练模型
model = Word2Vec(sentences,min_count=1)
# 汇总加载的模型
print(model)
words = list(model.wv.vocab)
print(words)
# 访问向量
print(model['sentence'])


Word2Vec(vocab=14, size=100, alpha=0.025)
['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec', 'second', 'yet', 'another', 'one', 'more', 'and', 'final']
[-1.4843451e-03  1.7421880e-03 -3.6965567e-04  5.4466451e-04
  4.9114134e-03 -4.1461815e-03  1.5052444e-04  3.5633883e-03
  2.4346111e-04  1.1773163e-03 -2.4946758e-03 -2.9253820e-03
  8.3326653e-04 -2.1874896e-05 -4.8536160e-03  2.3529611e-03
 -1.0252636e-03 -5.6460599e-04 -3.8007323e-03  2.0722076e-03
  2.8510750e-03 -2.1302493e-03  2.4644409e-03 -3.4067023e-04
 -7.8525330e-04  4.7079628e-04  2.7827937e-03  3.6107708e-04
  3.3716476e-03  1.5742212e-03  3.7341474e-03 -4.1354182e-03
  1.6504226e-03 -1.0222066e-04 -3.8867055e-03  1.0258600e-03
  2.4221262e-03 -7.0633326e-04  3.9879270e-03 -2.3659850e-03
  3.2991129e-03  2.0806962e-03 -2.6877198e-04  3.5716174e-03
  2.0463958e-03  3.6212353e-03  7.0278713e-04  1.9959810e-03
  3.7956703e-03 -6.5343286e-04  9.1706769e-04  2.4014800e-03
 -1.6702205e-03  6.2492833e-04 -3.6832022e-0

  "C extension not loaded, training will be slow. "
  from ipykernel import kernelapp as app


#### Visualize Word Embedding

After you learn word embedding for your text data, it can be nice to explore it with visualization.

You can use classical projection methods to reduce the high-dimensional word vectors to two-dimensional plots and plot them on a graph.

The visualizations can provide a qualitative diagnostic for your learned model.

In [22]:
X = model[model.wv.vocab]

  """Entry point for launching an IPython kernel.


We can then train a projection method on the vectors, such as those methods offered in scikit-learn, then use matplotlib to plot the projection as a scatter plot.

Let’s look at an example with `Principal Component Analysis`（主成分分析） or PCA.

In [2]:
from gensim.models import  Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
# 定义语句
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
    ['this', 'is', 'the', 'second', 'sentence'],
    ['yet', 'another', 'sentence'],
    ['one', 'more', 'sentence'],
    ['and', 'the', 'final', 'sentence']]
# 训练模型
model = Word2Vec(sentences,min_count=1)
# 使用一个二维PCA模型去适应embeding vector
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# 创建散点图
pyplot.scatter(result[:,0],result[:,1])
words = list(model.wv.vocab)
for i , word in enumerate(words):
    pyplot.annotate(word,xy=(result[i,0],result[i,1]))
pyplot.show()

  "C extension not loaded, training will be slow. "
  del sys.path[0]


<Figure size 640x480 with 1 Axes>

### 装载谷歌 word2vec embeding

A pre-trained model is nothing more than a file containing tokens and their associated word vectors. The pre-trained Google word2vec model was trained on Google news data (about 100 billion words); it contains 3 million words and phrases and was fit using 300-dimensional word vectors.

In [1]:
# 预使用谷歌 word2vec embeding
from gensim.models import KeyedVectors
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename,binary=true)



NameError: name 'true' is not defined

Another interesting thing that you can do is do a little linear algebra arithmetic with words.

For example, a popular example described in lectures and introduction papers is:

In [None]:
queen = (king - man) + woman

That is the word queen is the closest word given the subtraction of the notion of man from king and adding the word woman. The “man-ness” in king is replaced with “woman-ness” to give us queen. A very cool concept.

Gensim provides an interface for performing these types of operations in the most_similar() function on the trained or loaded model.

In [None]:
result = model.most_similar(positive=['woman'])

In [None]:
from gensim.models import KeyedVectors
# 装载模型
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)
# 计算最接近
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

### 装载 Stanford glove embeding

Stanford researchers also have their own word embedding algorithm like word2vec called Global Vectors for Word Representation, or GloVe for short.

The first step is to convert the GloVe file format to the word2vec file format. The only difference is the addition of a small header line. This can be done by calling the glove2word2vec() function. For example:

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
# 装入给定glove
glove_input_file = 'C:/Users/WYX/Desktop/GCAE/model_files/glove.840B.300d.txt'
word2vec_output_file = 'word2vec.txt'
glove2word2vec(glove_input_file,word2vec_output_file)

In [None]:
from gensim.models import KeyedVectors
filename = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(filename,binary=False)
result = model.most_similar(positive=['woman','king'],negative=['man'],topn=1)
print(result)

### 创建临时训练的 embeding

In [3]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


We will define a small problem where we have 10 text documents, each with a comment about a piece of work a student submitted. Each text document is classified as positive “1” or negative “0”. This is a simple sentiment analysis problem.

In [4]:
docs = ['Well done!',
		'Good work',
		'Great effort',
		'nice work',
		'Excellent!',
		'Weak',
		'Poor effort!',
		'not good',
		'poor work',
		'Could have done better.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])

Keras provides the one_hot() function that creates a hash of each word as an efficient integer encoding. We will estimate the vocabulary size of 50, which is much larger than needed to reduce the probability of collisions from the hash function.

In [5]:
# 把整数编码成文档
vocab_size = 50
encoded_docs = [one_hot(d,vocab_size) for d in docs]
print(encoded_docs)

[[44, 43], [44, 29], [1, 47], [8, 29], [15], [1], [45, 47], [18, 44], [45, 29], [39, 49, 43, 46]]


The sequences have different lengths and Keras prefers inputs to be vectorized and all inputs to have the same length. We will pad all input sequences to have the length of 4. Again, we can do this with a built in Keras function, in this case the pad_sequences() function.

In [6]:
max_length = 4 
padded_docs = pad_sequences(encoded_docs,maxlen=max_length,padding= 'post')
print(padded_docs)

[[44 43  0  0]
 [44 29  0  0]
 [ 1 47  0  0]
 [ 8 29  0  0]
 [15  0  0  0]
 [ 1  0  0  0]
 [45 47  0  0]
 [18 44  0  0]
 [45 29  0  0]
 [39 49 43 46]]


The Embedding has a vocabulary of 50 and an input length of 4. We will choose a small embedding space of 8 dimensions.

The model is a simple binary classification model. Importantly, the output from the Embedding layer will be 4 vectors of 8 dimensions each, one for each word. We flatten this to a one 32-element vector to pass on to the Dense output layer.

In [7]:
model = Sequential()
model.add(Embedding(vocab_size,8,input_length=max_length))
model.add(Flatten())
model.add(Dense(1,activation='sigmoid'))
# 编译模型
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_1 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
model.fit(padded_docs,labels,epochs=50,verbose=0)
loss,accuracy = model.evaluate(padded_docs,labels,verbose=0)
print('Accuracy: %f'%(accuracy*100))

Accuracy: 100.000000


### 使用 Stanford 预训练glove embedding

Keras provides a Tokenizer class that can be fit on the training data, can convert text to sequences consistently by calling the texts_to_sequences() method on the Tokenizer class, and provides access to the dictionary mapping of words to integers in a word_index attribute.

In [17]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

In [24]:
# define documents
docs = ['Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]
[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]


In [14]:
embedings_index = dict()
f = open('C:/Users/WYX/Desktop/GCAE/model_files/glove.840B.300d.txt','rb')

In [18]:
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:],dtype='float32')
    embedings_index[word] = coefs 
f.close()
print('Loaded %s word vectors'% len(embedings_index))

Loaded 2196015 word vectors


In [28]:
# 创建权值矩阵
embedding_matrix = zeros((vocab_size,100))
for word ,i in t.word_index.items():
    embedding_vector = embedings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [29]:
# 定义模型
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=4, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [30]:
# 编译
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 100)            1500      
_________________________________________________________________
flatten_2 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 401       
Total params: 1,901
Trainable params: 401
Non-trainable params: 1,500
_________________________________________________________________
None


In [42]:
# 适应模型
model.fit(padded_docs, labels, epochs=100, verbose=0)
# 评估
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 50.000000


### 基于 wordembeding 的词语预测

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

word_to_ix = {'hello': 0, 'world': 1}
embeds = nn.Embedding(2, 5)
hello_idx = torch.LongTensor([word_to_ix['hello']])
hello_idx = Variable(hello_idx)
hello_embed = embeds(hello_idx)
print(hello_embed)

tensor([[ 0.7518,  0.5892, -0.3038, -0.3653, -1.1739]],
       grad_fn=<EmbeddingBackward>)


在一篇文章中，每一句话可以由很多个单词组成，而对于一句话而言，这些单词的组成顺序也很重要；

N - gram 模型计算公式：
$P(w_i|w_{i-1},w_{i-2},....,w_{i-n+1})$

从上述公式可以看出，N - gram 公式是一个计算条件概率的公式，也即给定前面几个单词，最大化我们想要预测的单词的概率

In [4]:
# 数据预处理 
# 设置通过 2 个单词进行预测
CONTEXT_SIZE = 2
# 设置 word_embeding 的维数
EMBEDDING_DIM = 10
# 设置给定语料
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# 使用前两个为一个元组  后一个为待预测数据进行保存
trigram = [((test_sentence[i],test_sentence[i+1]),test_sentence[i+2]) 
           for i in range(len(test_sentence)-2)]

In [5]:
# 使用 set 处理掉中间重复出现的词语 
vocb = set(test_sentence)
# 将数据整理好，也就是我们需要将单词三个分组，每个组前两个作为传入的数据，而最后一个作为预测的结果
word_to_idx = {word:i for i ,word in enumerate(vocb)}
idx_to_word = {word_to_idx[word]:word for word in word_to_idx}

In [6]:
# 定义模型
import torch.optim as optim
class NgramModel(nn.Module):
    def __init__(self, vocb_size, context_size, n_dim):
        super(NgramModel, self).__init__()
        self.n_word = vocb_size
        self.embedding = nn.Embedding(self.n_word, n_dim)
        self.linear1 = nn.Linear(context_size*n_dim, 128)
        self.linear2 = nn.Linear(128, self.n_word)

    def forward(self, x):
        emb = self.embedding(x)
        print((emb.shape))
        #print(emb)
        #print(emb.view(1,-1))
        emb = emb.view(1, -1)
        print((emb.shape))
        out = self.linear1(emb)
        out = F.relu(out)
        out = self.linear2(out)
        log_prob = F.log_softmax(out)
        return log_prob

ngrammodel = NgramModel(len(word_to_idx), CONTEXT_SIZE, 100)
criterion = nn.NLLLoss()
optimizer = optim.SGD(ngrammodel.parameters(), lr=1e-3)

In [None]:
# 训练
for epoch in range(100):
    print('epoch: {}'.format(epoch+1))
    print('*'*10)
    running_loss = 0
    for data in trigram:
        word, label = data
        word = Variable(torch.LongTensor([word_to_idx[i] for i in word]))
        #print(type(word),word)
        label = Variable(torch.LongTensor([word_to_idx[label]]))
        #print(type(label),label)
        # forward
        out = ngrammodel(word)
        #print(out)
        loss = criterion(out, label)
        # 拿到 tensor 中的python number
        running_loss += loss.item()
        # 反向传播的过程只需要调用loss.backgrad()函数即可．但是由于变量的梯度是累加的，所以在求backward之前应该先对现有的梯度清零
        optimizer.zero_grad()
        loss.backward()
        # 调用 step 更新梯度
        optimizer.step()
    print('Loss: {:.6f}'.format(running_loss / len(word_to_idx)))

In [11]:
# 测试
word, label = trigram[20]
word = Variable(torch.LongTensor([word_to_idx[i] for i in word]))
out = ngrammodel(word)
print(out)
_, predict_label = torch.max(out.data, 1)
#print(type(predict_label))
predict_word = idx_to_word[predict_label.item()]
print('real word is {}, predict word is {}'.format(label, predict_word))



tensor([[ -9.9988,  -8.0061,  -8.7443, -11.3829,  -9.6568,  -8.8099,  -9.7145,
          -9.6482,  -8.4003,  -8.1090,  -9.3973,  -9.3107,  -8.1866,  -9.6622,
          -8.0545, -10.0000,  -8.3407,  -8.4119,  -8.5360,  -8.5949,  -7.3294,
          -8.5396,  -8.5868,  -9.6320,  -8.6006,  -8.0478,  -8.9333,  -8.8886,
          -8.2759,  -8.8655,  -8.2310,  -8.1351,  -8.5338, -10.6826,  -6.5868,
          -7.3014,  -8.4470,  -8.0026,  -7.9392,  -6.8618,  -9.1761,  -7.2728,
          -6.7953,  -8.9893,  -8.3649,  -8.3360,  -8.8731,  -9.4642,  -9.1960,
          -9.2868, -10.2940,  -5.6262,  -9.4397, -10.5414,  -9.0552, -10.1858,
          -9.4227,  -8.6907,  -8.4254, -10.1058, -10.2626,  -9.0106, -10.9733,
          -9.7702,  -0.0251,  -8.9284,  -7.9546,  -7.8535,  -9.6066,  -8.8238,
          -9.0893,  -8.2352,  -8.0247,  -9.6315,  -8.5015,  -9.4875,  -8.8605,
          -6.1824,  -9.1412,  -9.2430, -10.7242,  -9.4411,  -7.5835, -10.5182,
          -8.9633,  -8.5399,  -9.3835,  -8.1014,  -8