In [1]:
import numpy as np
import pandas as pd
import nltk.data
from gensim.models import Word2Vec
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [2]:
# 评论到单词列表清洗函数
def review_to_wordlist( review, remove_stopwords=False ):
    """Turn one raw review (or sentence) into a list of lower-case tokens.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup and keep only the text.
      2. Replace every character that is not an ASCII letter or digit with
         a space (digits are deliberately kept).
      3. Lower-case the text and split it on whitespace.
      4. Optionally drop NLTK's English stop words.

    Args:
        review: Raw text, possibly containing HTML.
        remove_stopwords: When True, English stop words are filtered out.
            Defaults to False.

    Returns:
        A list of word strings.
    """
    # 1. Remove HTML
    plain_text = BeautifulSoup(review).get_text()

    # 2. Drop punctuation, keep letters and digits
    alnum_text = re.sub("[^a-zA-Z0-9]"," ", plain_text)

    # 3. Normalise case and tokenise on whitespace
    tokens = alnum_text.lower().split()

    # 4. Nothing more to do unless stop-word removal was requested
    if not remove_stopwords:
        return tokens

    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

# 将完整的评论拆分成句子
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences, each returned as a list of words.

    Args:
        review: Raw review text.
        tokenizer: Object with a ``tokenize(text)`` method that returns the
            sentences of ``text`` (e.g. an NLTK sentence tokenizer).
        remove_stopwords: Forwarded to ``review_to_wordlist`` for every
            sentence. Defaults to False.

    Returns:
        A list of sentences, where each sentence is a list of word strings
        (i.e. a list of lists). Empty sentences are skipped.
    """
    # 1. Use the tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())

    # 2. Tokenise every non-empty sentence. The caller's remove_stopwords
    #    choice is passed through; it used to be hard-coded to False here,
    #    which silently ignored the argument.
    return [
        review_to_wordlist( raw_sentence, remove_stopwords=remove_stopwords )
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

In [3]:
# 从单词到段落，尝试1：矢量平均

# 创建特征向量
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of all in-vocabulary words of one paragraph.

    Args:
        words: Iterable of word strings (one tokenised review).
        model: Trained Word2Vec-style model exposing ``model.wv.index2word``
            (the vocabulary) and ``model.wv[word]`` (the vector of a word).
        num_features: Length of each word vector.

    Returns:
        A 1-D numpy array of length ``num_features`` holding the mean of the
        vectors of the words found in the vocabulary. If none of the words is
        in the vocabulary (or ``words`` is empty) the all-zero float32 vector
        is returned instead of dividing by zero, which would otherwise yield
        NaNs that break downstream classifiers.
    """
    # Accumulator for the sum of the word vectors
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0.

    # index2word is a list of the vocabulary words; a set makes the
    # membership test below fast
    index2word_set = set(model.wv.index2word)

    # Add up the vectors of every word that the model knows
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            # Look the vector up through model.wv, consistent with the other
            # model.wv accesses in this file (model[word] is deprecated)
            featureVec = np.add(featureVec, model.wv[word])

    # No known word: keep the zero vector rather than computing 0/0
    if nwords == 0:
        return featureVec

    # Divide the sum by the number of words to get the average
    return np.divide(featureVec, nwords)


def getAvgFeatureVecs(reviews, model, num_features):
    """Compute the average feature vector of every review.

    Args:
        reviews: Sized collection of reviews, each a list of words.
        model: Word-vector model passed through to ``makeFeatureVec``.
        num_features: Length of each word vector.

    Returns:
        A 2-D float32 numpy array of shape ``(len(reviews), num_features)``
        whose i-th row is ``makeFeatureVec`` of the i-th review.
    """
    # Preallocate the result matrix, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")

    for row, review_words in enumerate(reviews):
        # Print a status message every 5000 reviews
        if row % 5000 == 0:
            print("Review %d of %d" % (row, len(reviews)))

        # Average the word vectors of this review into its row
        reviewFeatureVecs[row] = makeFeatureVec(review_words, model, num_features)

    return reviewFeatureVecs

In [4]:
# Load the Word2Vec model created in Part 2 (read from the current working
# directory).
model = Word2Vec.load("300features_40minwords_10context")

# The trained Word2Vec model consists of one feature vector per vocabulary
# word, stored in a numpy array exposed as `model.wv.syn0`.
print(type(model.wv.syn0))

# The number of rows in syn0 is the number of words in the model's
# vocabulary; the number of columns is the feature-vector size (set to 300
# in Part 2).
# NOTE(review): an earlier version of this comment quoted a vocabulary of
# 16,492 words for a minimum word count of 40, but the output recorded with
# this cell shows 16731 rows -- the exact size depends on the trained model.
print("syn0.shape: ",model.wv.syn0.shape)
print(model.wv.syn0)


<class 'numpy.ndarray'>
syn0.shape:  (16731, 300)
[[-0.09709576  0.02011709  0.06497605 ... -0.0238334  -0.04807308
  -0.06135374]
 [-0.03287856  0.05827773 -0.09481616 ... -0.00039323  0.02297289
  -0.01942133]
 [-0.01214846  0.02901594  0.02997003 ... -0.01631646  0.00844716
  -0.0477585 ]
 ...
 [ 0.01183749  0.0275721  -0.0189919  ... -0.02713539  0.10740301
  -0.0229452 ]
 [-0.06867991 -0.02180096 -0.02346303 ... -0.03971877  0.04584648
  -0.03860654]
 [ 0.03090754  0.0717666  -0.08793098 ...  0.05066429 -0.00154953
  -0.07792217]]


  
  if sys.path[0] == '':
  del sys.path[0]


In [5]:
# Read the data. All three files are tab-separated with a header row;
# quoting=3 (csv.QUOTE_NONE) tells pandas to treat quote characters as
# ordinary text.
tsv_options = dict(header=0, delimiter="\t", quoting=3)
train = pd.read_csv( "F:\\NLP\\kaggle_data\\labeledTrainData.tsv", **tsv_options )
test = pd.read_csv( "F:\\NLP\\kaggle_data\\testData.tsv", **tsv_options )
unlabeled_train = pd.read_csv( "F:\\NLP\\kaggle_data\\unlabeledTrainData.tsv", **tsv_options )

# Report how many reviews each file contributed
review_counts = (
    train["review"].size,
    test["review"].size,
    unlabeled_train["review"].size,
)
print("Read %d labeled train reviews, %d labeled test reviews, "
 "and %d unlabeled reviews\n" % review_counts)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [6]:
# Parameter values.
# NOTE(review): among the cells visible here only num_features is referenced
# later; the remaining values look like the Part 2 training settings kept
# for reference -- confirm before removing.
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 6       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

In [7]:
# Compute the average feature vectors of the training set with the functions
# defined above. Note that stop-word removal is switched on here.
print("Creating average feature vecs for train reviews")

# First turn every review into a list of words
clean_train_reviews = [
    review_to_wordlist(raw_review, remove_stopwords=True)
    for raw_review in train["review"]
]
print("clean reviews is finished")

# Then average the word vectors of each cleaned review
trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )
print(trainDataVecs.shape)
print(trainDataVecs[0])

Creating average feature vecs for train reviews
clean reviews is finished
Review 0 of 25000




Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
(25000, 300)
[ 7.73013150e-03  6.11095363e-03  4.40710457e-03  1.20043028e-02
  1.10902777e-03  1.43982219e-02  1.64157581e-02  1.49366527e-03
 -1.01699894e-02 -1.73766930e-02  2.74553169e-02  7.99764600e-03
 -7.79353362e-03 -1.22647099e-02  6.16329350e-03  5.11194253e-03
 -1.17299343e-02 -2.59704422e-03 -6.29001111e-03 -2.40691565e-02
 -4.11423884e-04 -7.24892225e-03  1.65850706e-02  8.18479166e-04
 -9.63951647e-03  3.92386923e-03  8.12559295e-03  3.10213096e-03
 -6.26386702e-03 -1.87832937e-02 -1.00115137e-02  2.11877772e-03
 -5.93891321e-03 -1.00193387e-02 -1.40575003e-02 -9.36071668e-03
  1.25718303e-02 -1.33886933e-05 -4.46036505e-03  7.06611318e-04
 -1.66625660e-02  5.94374631e-03 -9.66841937e-04 -1.97779248e-03
 -7.34623941e-03 -5.59078343e-03 -1.53250052e-02  1.16132749e-02
 -9.58719011e-03  2.99465796e-03 -1.83002651e-02 -9.94403660e-03
  4.90120891e-03  1.33741293e-02 -1.46699445e-02 -1.605

In [8]:
# Build the test-set features the same way as the training set:
# clean every review (stop words removed), then average its word vectors.
print("Creating average feature vecs for test reviews")
clean_test_reviews = [
    review_to_wordlist( raw_review, remove_stopwords=True )
    for raw_review in test["review"]
]
print("clean reviews is finished")

testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )

Creating average feature vecs for test reviews
clean reviews is finished
Review 0 of 25000




Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000


In [9]:
# Train a random-forest classifier (100 trees) on the averaged word vectors
# of the training set.
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the bootstrap samples and feature choices -- and
# therefore the predictions written out below -- repeatable across runs.
forest = RandomForestClassifier( n_estimators = 100, random_state = 0 )

print("Fitting a random forest to labeled training data...")
forest = forest.fit( trainDataVecs, train["sentiment"] )
print("Fitting Finished")

Fitting a random forest to labeled training data...
Fitting Finished


In [10]:
# Test & extract results: predict a sentiment label for every test review
# from its averaged word vector.
result = forest.predict( testDataVecs )
print(len(result))
# Store the predictions as an id/sentiment CSV. quoting=3 is csv.QUOTE_NONE,
# so no field is wrapped in quote characters.
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "F:\\NLP\\kaggle_data\\Word2Vec_AverageVectors.csv", index=False, quoting=3 )

25000


In [11]:
# Word clustering: group the vocabulary's word vectors with K-Means.
from sklearn.cluster import KMeans
import time

start = time.time() # Start time

# Set the number of clusters (k) to one fifth of the vocabulary size,
# i.e. an average of 5 words per cluster
word_vectors = model.wv.syn0
num_clusters = int(word_vectors.shape[0] / 5)

# Create the K-Means object. A fixed random_state keeps the centroid
# initialisation -- and with it the cluster ids that every later
# bag-of-centroids feature is built on -- the same from run to run.
kmeans_clustering = KMeans( n_clusters = num_clusters, random_state = 0 )
# Fit the clusters; idx[i] is the cluster assigned to row i of word_vectors
idx = kmeans_clustering.fit_predict( word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: ", elapsed, "seconds.")

  


Time taken for K Means clustering:  1582.9706280231476 seconds.


In [26]:
# Create a Word / Index dictionary, mapping each vocabulary word to
# a cluster number
word_centroid_map = dict(zip( model.wv.index2word, idx ))

# Group the vocabulary by cluster in a single pass. Words keep the order they
# have in word_centroid_map. (The previous loop compared against an undefined
# name `temp`, which only worked with leftover kernel state, and rescanned
# the whole vocabulary for every cluster.)
words_by_cluster = {}
for word, cluster_id in word_centroid_map.items():
    words_by_cluster.setdefault(int(cluster_id), []).append(word)

# For the first 100 clusters
for cluster in range(0,100):
    #
    # Print the cluster number
    print("\nCluster %d" % cluster)
    #
    # Print all of the words for that cluster number (an empty list when no
    # word was assigned to it)
    words = words_by_cluster.get(cluster, [])
    print(words)


Cluster 0
['terms']

Cluster 1
['regal']

Cluster 2
['named', 'drunken', 'joshua', 'salesman', 'mechanic', 'runaway', 'kenny', 'hitchhiker', 'loony', 'mercedes', 'sperm', 'malik', 'rapping']

Cluster 3
['tunnel']

Cluster 4
['enlightenment']

Cluster 5
['battles', 'ships', 'bombs', 'planes', 'vehicles', 'aircraft', 'trains', 'submarine', 'helicopters', 'boats', 'planets', 'airplanes', 'automobiles', 'jets']

Cluster 6
['cameraman', 'pov', 'aerial', 'miniature', 'overhead', 'closeup', 'blocking']

Cluster 7
['todd', 'lopez', 'mira']

Cluster 8
['disappointing', 'surprising', 'satisfying', 'frustrating', 'rewarding', 'satisfactory']

Cluster 9
['imitating', 'kramer', 'stein', 'johnston', 'damien', 'schmidt', 'reuben', 'koch', 'rydell', 'ghai']

Cluster 10
['guidance', 'adviser', 'intervention', 'commission', 'chambers', 'functioning', 'hypnosis', 'concealed', 'parliament', 'communications']

Cluster 11
['program', 'nbc', 'schedule']

Cluster 12
['joan', 'davis', 'barrymore', 'olivia', '

['rani', 'vivek', 'oberoi', 'mukherjee']

Cluster 94
['remembered', 'viewed', 'recognized']

Cluster 95
['events', 'subplots', 'incidents', 'sentences', 'happenings', 'revelations', 'coincidences', 'occurrences', 'additions']

Cluster 96
['weirdness', 'cannibalism', 'sadism']

Cluster 97
['scarlet', 'scoop', 'alvin', 'kolchak', 'cypher', 'heath', 'pecker', 'ledger', 'villainy']

Cluster 98
['conflict', 'clash', 'gap', 'rivalry', 'bonding', 'tensions', 'bonds', 'bickering', 'feud', 'dispute', 'camaraderie', 'confrontations', 'clashes', 'dichotomy']

Cluster 99
['wizard', 'fury', 'hound', 'sinbad', 'wolfman', 'knockoff']


In [27]:
# 定义一个将评论转换为质心袋的函数。这就像单词袋一样，但是使用语义相关的簇而不是单个单词：
def create_bag_of_centroids( wordlist, word_centroid_map ):
    """Convert a tokenised review into a bag-of-centroids count vector.

    This is like a bag of words, except that the counts are kept per word
    cluster instead of per individual word.

    Args:
        wordlist: Iterable of word strings.
        word_centroid_map: Mapping from vocabulary word to its cluster index.

    Returns:
        A 1-D float32 numpy array with one slot per cluster index from 0 up
        to the highest index present in ``word_centroid_map``; slot ``i``
        holds how many words of ``wordlist`` belong to cluster ``i``. Words
        missing from the map are ignored.
    """
    # The number of clusters is the highest cluster index in the map, plus one
    highest_cluster = max( word_centroid_map.values() )
    cluster_counts = np.zeros( highest_cluster + 1, dtype="float32" )

    for token in wordlist:
        cluster_id = word_centroid_map.get(token)
        # Out-of-vocabulary words contribute nothing
        if cluster_id is None:
            continue
        cluster_counts[cluster_id] += 1

    return cluster_counts

In [31]:
# Preallocate the bag-of-centroids matrix for the training set:
# one row per review, one column per cluster
train_centroids = np.zeros( (train["review"].size, num_clusters), dtype="float32" )

# Transform the training-set reviews into bags of centroids
print("Start transforming the training set")
for row, review_words in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids( review_words, word_centroid_map )

# Repeat for the test-set reviews
print("Start transforming the testing set")
test_centroids = np.zeros(( test["review"].size, num_clusters), dtype="float32" )
for row, review_words in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids( review_words, word_centroid_map )

print(train_centroids[0])

Start transforming the training set
Start transforming the testing set
[0. 0. 0. ... 0. 0. 0.]


In [36]:
# Sanity-check the bag-of-centroids matrices: both shapes, then a 100-column
# slice (cluster indices 100-199) of the first training review's counts.
print(train_centroids.shape)
print(test_centroids.shape)
print(train_centroids[0][100:200])

(25000, 3346)
(25000, 3346)
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 11.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [37]:
# Fit a random forest on the bag-of-centroids features and extract predictions.
# A fixed random_state makes the forest, and therefore the CSV written below,
# repeatable across runs.
forest = RandomForestClassifier(n_estimators = 100, random_state = 0)

print("Fitting a random forest to labeled training data...")
forest = forest.fit(train_centroids,train["sentiment"])
result = forest.predict(test_centroids)

# Write the test results
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv( "F:\\NLP\\kaggle_data\\BagOfCentroids.csv", index=False, quoting=3 )

Fitting a random forest to labeled training data...
