# 基本设置

In [2]:
import os

import pandas as pd
from gensim.models import Word2Vec

# load model

In [None]:
# File read by Word2Vec.load below (original note: "output model").
model_path = 'model/cbirc_skip_gram.model'
# Original note: the model in the vector format of the original C word2vec.
# Not referenced by the cells visible in this notebook.
vector_path = 'model/cbirc_skip_gram.vector'

model = Word2Vec.load(model_path)

## new data

In [6]:
# Directory with the raw Excel files; every file is read for its 'title'
# and 'content' columns.
filepath_raw = 'datasets_raw/circ_new/'
filename_list = os.listdir(filepath_raw)
# Uncomment to try the pipeline on a single file:
# filename_list = filename_list[:1]
print('文件数：', len(filename_list))

# Running total of rows (documents) over all files.
file_num = 0
# NOTE(review): no explicit encoding is given here or in the cell that reads
# this file back, so both rely on the platform default -- confirm intended.
with open('datasets_pre/circ_pre_new.txt', 'w') as f:
    for file_index, filename in enumerate(filename_list):
        tmp_data = pd.read_excel(filepath_raw + filename)
        print('filename: ', file_index + 1, filename, tmp_data.shape)
        file_num += tmp_data.shape[0]
        # The row label gets its own name so it no longer rebinds the
        # file counter of the enclosing loop.
        for row_label in tmp_data.index:
            # One document = title + Chinese full stop + body text.
            content = str(tmp_data.loc[row_label, 'title']) + "。" + str(tmp_data.loc[row_label, 'content'])
            # presumably returns one preprocessed string per input document;
            # verify against pre_cor_circ.
            string_pre = pre_cor_circ.handle_contents([content])
            f.write(string_pre[0] + '\n')
        # Drop the frame inside the loop: a trailing `del` after the loop
        # raised NameError when the directory contained no files.
        del tmp_data

print('file_num: ', file_num)

In [None]:
# Read the preprocessed corpus back, one document per line, stripping
# surrounding whitespace (this removes the trailing '\n').
with open('datasets_pre/circ_pre_new.txt', 'r') as f:
    circ_pre_new = [raw_line.strip() for raw_line in f]
print('circ_pre_new file_num：', len(circ_pre_new))


# 由单词到段落，尝试1：向量平均

电影评论数据集处理起来一个比较麻烦的地方在于，评论的长度是不一样的。我们需要提取出每一个词的向量，然后把它们转换为一个特征集，而且每个评论的特征长度是一样的。

因为每一个单词都有一个固定维度的特征向量（原教程中为300维，本文代码中由`num_features`设定），我们可以用向量运算把一个评论中的单词合并起来。一个简单的方法就是对所有的词向量取平均。（如果取平均的话，我们需要移除stop words，因为它们会带来噪音）

下面是计算特征向量平均值的代码：

In [None]:
import numpy as np  # Make sure that numpy is imported

def makeFeatureVec(words, model, num_features):
    """Average the vectors of all in-vocabulary words of one paragraph.

    Args:
        words: Iterable of tokens making up the paragraph/review.
        model: Either an object exposing its keyed vectors as ``model.wv``
            or the keyed vectors themselves. The vectors object must support
            ``token in vectors`` and ``vectors[token]``.
        num_features: Length of each word vector.

    Returns:
        A float32 array of shape ``(num_features,)`` holding the mean of the
        matched word vectors, or all zeros when no token is in the
        vocabulary (previously this case divided by zero and produced NaNs).

    Raises:
        ValueError: If a word vector cannot be broadcast to
            ``(num_features,)``.
    """
    # Look words up through `.wv` when available: newer gensim releases no
    # longer support `model[word]` or `wv.index2word`. The membership test on
    # the vectors object also avoids rebuilding a set of the whole
    # vocabulary on every call.
    vectors = getattr(model, "wv", model)

    # Accumulator for the running sum of word vectors.
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in vectors:
            nwords += 1
            featureVec += vectors[word]

    # Nothing matched: return the zero vector instead of 0/0 = NaN.
    if nwords == 0:
        return featureVec

    return featureVec / nwords


def getAvgFeatureVecs(reviews, model, num_features):
    """Build a matrix holding one averaged feature vector per review.

    Args:
        reviews: Sized sequence of reviews, each one a list of words.
        model: Embedding model forwarded to ``makeFeatureVec``.
        num_features: Length of each feature vector (number of columns).

    Returns:
        A float32 array of shape ``(len(reviews), num_features)`` whose
        i-th row is ``makeFeatureVec(reviews[i], model, num_features)``.
    """
    total = len(reviews)

    # Allocate the whole result up front and fill it row by row.
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for row, review in enumerate(reviews):
        # Status message on every 1000th review, starting with the first.
        if row % 1000 == 0:
            print("Review %d of %d" % (row, total))

        reviewFeatureVecs[row] = makeFeatureVec(review, model, num_features)

    return reviewFeatureVecs

接下来我们调用上面的函数来给每一个评论创建一个平均向量。下面会运行几分钟：

In [None]:
# Parameters for the feature-extraction steps below.

# Word vector dimensionality.
# NOTE(review): presumably has to equal the vector size of the loaded
# model -- confirm.
num_features = 400

# Minimum word count.
min_word_count = 40

# Number of threads to run in parallel.
num_workers = 4

# Context window size (left disabled).
# context = 10

# Downsample setting for frequent words.
downsampling = 1e-3

In [None]:
# ----------------------------------------------------------------
# Average feature vectors for the training and test sets.
# Reviews are tokenised with stop word removal switched on before the
# word vectors are averaged.

clean_train_reviews = [
    review_to_wordlist(raw_review, remove_stopwords=True)
    for raw_review in train["review"]
]
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

print("Creating average feature vecs for test reviews")

clean_test_reviews = [
    review_to_wordlist(raw_review, remove_stopwords=True)
    for raw_review in test["review"]
]
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

数据处理结束，我们得到了每个评论的平均向量，我们用这个特征向量来训练一个随机森林模型。

In [None]:
# Fit a random forest to the training data, using 100 trees
from sklearn.ensemble import RandomForestClassifier
print("Fitting a random forest to labeled training data...")

# 100 trees; fit() hands back the fitted estimator, so construction and
# training are chained.
forest = RandomForestClassifier(n_estimators=100).fit(
    trainDataVecs,
    train["sentiment"],
)

# Predict a sentiment label for every test review.
result = forest.predict(testDataVecs)

# Save the predictions next to their review ids.
# quoting=3 is the numeric value of csv.QUOTE_NONE.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv(
    "result/Word2Vec_AverageVectors.csv",
    index=False,
    quoting=3,
)

#  由单词到段落，尝试2：聚类

word2vec会给词义上相近的单词进行聚类，所以另一个可行的方法是利用一个类中单词的相似性。这种对单词分组的方法叫做向量量化。第一步要做的，就是找到词聚类的中心，这可以通过聚类算法来完成，比如K-Means。

在K-Means中，我们唯一需要设置的参数是K，即聚类的数量。如何决定创建多少个聚类？反复试验的结果表明，规模较小的类（比如平均每个类里只有5个词左右）比规模较大的类效果更好。我们使用[scikit-learn来实现kmeans](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)

如果K很大的话，会非常慢；原文大概运行了40min。这里我们设置一个计时器来查看花了多长时间

In [None]:
from sklearn.cluster import KMeans
import time

start = time.time()  # Start time

# Embedding matrix with one row per vocabulary word. Prefer `wv.vectors`;
# `wv.syn0` is the legacy attribute name and is not available in gensim 4.
word_vectors = model.wv.vectors if hasattr(model.wv, "vectors") else model.wv.syn0

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an average
# of 5 words per cluster. Clamp to 1 so a vocabulary of fewer than five
# words does not request zero clusters.
num_clusters = max(1, word_vectors.shape[0] // 5)

# Initialize a k-means object and use it to extract centroids.
# `n_jobs` was removed from KMeans in scikit-learn 1.0 and makes the
# constructor raise TypeError there; keep passing it where it is still
# accepted and fall back to the default otherwise.
try:
    kmeans_clustering = KMeans(n_clusters=num_clusters, n_jobs=-2)
except TypeError:
    kmeans_clustering = KMeans(n_clusters=num_clusters)

# idx[i] is the cluster index assigned to row i of word_vectors.
idx = kmeans_clustering.fit_predict(word_vectors)

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: ", elapsed, "seconds.")

  


In [None]:
# Display the shape of the cluster-index array returned by fit_predict.
idx.shape

idx: Index of the cluster each sample belongs to.

现在每一个单词所属的聚类编号存储在了idx里，而原始Word2Vec模型的词汇表保存在model.wv.index2word里。为了方便，我们把二者合并到一个字典里：

In [None]:
# Create a Word / Index dictionary, mapping each vocabulary word to a
# cluster number. The vocabulary list is `wv.index_to_key` in gensim 4 and
# `wv.index2word` in older releases (where `index_to_key` does not exist),
# so try the new name first and fall back to the old one.
vocab_words = getattr(model.wv, "index_to_key", None)
if vocab_words is None:
    vocab_words = model.wv.index2word

# Pairs word i with idx[i]; both follow the row order of the vector matrix.
word_centroid_map = dict(zip(vocab_words, idx))

word_centroid_map包含的是每个单词所属于的类

上面有些抽象，现在我们看一下每个类里有什么。我们打印出类0~类9：

In [None]:
# For the first 10 clusters
for cluster in range(0, 10):

    # Print the cluster number
    print("\nCluster %d" % cluster)

    # Collect all words of this cluster in a single pass over the map.
    # The previous version rebuilt list(values()) and list(keys()) on every
    # iteration, which made each cluster quadratic in the vocabulary size.
    # Dict iteration order is unchanged, so the printed word order is too.
    words = [
        word
        for word, centroid in word_centroid_map.items()
        if centroid == cluster
    ]
    print(words)

我们可以看到这些类的质量差别很大。cluster 1基本包含的是名字；有些类里的词之间是有关系的，但也有些类里的词之间没什么关系。

不管怎么说，现在每个单词都有了一个类，我们可以写一个函数，把评论转化为重心袋(convert reviews into bags-of-centroids)。其实就像词袋一样，但是这种方法是用语义上相关的类，而不是单独的单词：

In [None]:
def create_bag_of_centroids(wordlist, word_centroid_map, num_centroids=None):
    """Count how many words of a review fall into each cluster.

    Args:
        wordlist: Iterable of tokens making up one review.
        word_centroid_map: Mapping from vocabulary word to cluster index.
        num_centroids: Optional length of the returned vector. When omitted
            it is derived as the highest cluster index in the map plus one,
            which scans the whole map; callers processing many reviews can
            compute it once and pass it in.

    Returns:
        A float32 array of length ``num_centroids`` whose i-th entry is the
        number of words in ``wordlist`` mapped to cluster ``i``. Words
        missing from the map are ignored. An empty map with no explicit
        ``num_centroids`` yields a zero-length array.

    Raises:
        IndexError: If ``num_centroids`` is given and a matched word maps to
            a cluster index outside ``[0, num_centroids)``.
    """
    if num_centroids is None:
        # The number of clusters is the highest cluster index plus one;
        # default=-1 turns an empty map into zero clusters instead of an error.
        num_centroids = max(word_centroid_map.values(), default=-1) + 1

    # Pre-allocate the bag of centroids vector
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")

    # For every word in the vocabulary, increment the count of its cluster.
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1

    return bag_of_centroids

上面的函数会对每一个评论返回一个numpy数组，这个数组代表特征，特征的数量和聚类的数量一样。`max(word_centroid_map.values())`得到的结果是3297，所以一共是3298个类。

然后我们给训练集和测试集构建bags of centroids。然后训练随机森林并检验结果：

In [None]:
# Bag-of-centroids matrix for the training set: one row per review, one
# column per cluster, allocated up front and filled row by row.
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")
for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

# Same construction for the test set.
test_centroids = np.zeros((test["review"].size, num_clusters), dtype="float32")
for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

In [None]:
# Train a 100-tree random forest on the bag-of-centroids features and
# predict the test set. Fitting may take a few minutes.
print("Fitting a random forest to labeled training data...")
forest = RandomForestClassifier(n_estimators=100).fit(
    train_centroids,
    train["sentiment"],
)
result = forest.predict(test_centroids)

# Save the predictions next to their review ids.
# quoting=3 is the numeric value of csv.QUOTE_NONE.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv(
    "result/BagOfCentroids.csv",
    index=False,
    quoting=3,
)