# 参考地址
[https://github.com/apachecn/kaggle/tree/dev/competitions/getting-started/word2vec-nlp-tutorial](https://github.com/apachecn/kaggle/tree/dev/competitions/getting-started/word2vec-nlp-tutorial)

In [None]:
# 加载依赖包
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

In [None]:
root_dir = "../data"
# Load the datasets. quoting=3 is csv.QUOTE_NONE, so quote characters in the
# TSV files are kept as part of the field values.
train = pd.read_csv('%s/%s' % (root_dir, 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
test = pd.read_csv('%s/%s' % (root_dir, 'testData.tsv'), header=0, delimiter="\t", quoting=3)
# Drop the double quotes wrapping each test id. The original used eval() for
# this, which executes file contents as Python code; plain string stripping
# has the same effect on quoted ids without that risk.
# NOTE(review): assumes ids are stored as "<text>" in the file - confirm.
test["id"] = test["id"].apply(lambda x: str(x).strip('"'))
print(train.shape)
print(train.columns.values)
print(train.head(3))
print(test.head(3))

In [None]:
# Walk through the cleaning steps on the first training review:
# strip HTML, keep letters only, lowercase, split into words.
first_review = train['review'][0]
print('\n处理前: \n', first_review)

example1 = BeautifulSoup(first_review, "html.parser")

import re
# Replace every character that is not an ASCII letter with a space
letters_only = re.sub('[^a-zA-Z]', ' ', example1.get_text())
print(letters_only)

# Lowercase, then split on whitespace to get the word list
lower_case = letters_only.lower()
words = lower_case.split()

print('\n处理后: \n', words)

In [None]:
def review_to_wordlist(review):
    '''
    Turn one raw IMDB review into a list of lowercase words.

    HTML markup is removed first, every character that is not an ASCII
    letter is replaced by a space, and the result is lowercased and split
    on whitespace.
    Reference: http://blog.csdn.net/longxinchen_ml/article/details/50629613
    '''
    plain_text = BeautifulSoup(review, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)
    return letters_only.lower().split()


# Preprocess the data: every review becomes one space-joined string of
# cleaned words, which is the input format used for TF-IDF below.
label = train['sentiment']
# Iterate over the review values directly instead of looking them up by
# integer label, so this does not depend on the frames having a 0..n-1 index.
train_data = [' '.join(review_to_wordlist(review)) for review in train['review']]
test_data = [' '.join(review_to_wordlist(review)) for review in test['review']]

# Preview the data
print(train_data[0], '\n')
print(test_data[0])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
# 参考：http://blog.csdn.net/longxinchen_ml/article/details/50629613

"""
min_df: 最小文档频率为2（忽略出现在少于2篇文档中的词）
max_features: 默认为None，可设为int，对所有关键词的term frequency进行降序排序，只取前max_features个作为关键词集
strip_accents: 将使用ascii或unicode编码在预处理步骤去除raw document中的重音符号
analyzer: 设置返回类型
token_pattern: 表示token的正则表达式，需要设置analyzer == 'word'，默认的正则表达式选择2个及以上的字母或数字作为token，标点符号默认当作token分隔符，而不会被当作token
ngram_range: 词组切分的长度范围
use_idf: 默认为True，启用逆文档频率重新加权，权值是tf*idf；如果设为False，将不使用idf，只使用tf，类似于做了归一化的CountVectorizer。
smooth_idf: idf平滑参数，默认为True，idf=ln((文档总数+1)/(包含该词的文档数+1))+1，如果设为False，idf=ln(文档总数/包含该词的文档数)+1
sublinear_tf: 默认为False，如果设为True，则替换tf为1 + log(tf)
stop_words: 设置停用词，设为english将使用内置的英语停用词，设为一个list可自定义停用词，设为None不使用停用词，设为None且max_df∈[0.7, 1.0)将自动根据当前的语料库建立停用词表
"""
# Build the TF-IDF vectorizer. The three on/off switches are passed as real
# booleans: they are boolean options, and integer stand-ins such as 1 are
# rejected by the parameter validation of current scikit-learn releases.
tfidf = TFIDF(min_df=2,
              max_features=None,
              strip_accents='unicode',
              analyzer='word',
              token_pattern=r'\w{1,}',  # also keeps single-character tokens
              ngram_range=(1, 3),  # unigrams, bigrams and trigrams
              use_idf=True,
              smooth_idf=True,
              sublinear_tf=True,
              stop_words='english')  # drop English stop words

# Vectorize the training and test texts together so both share one
# vocabulary and one set of IDF weights.
data_all = train_data + test_data
len_train = len(train_data)

# fit_transform learns the vocabulary and produces the matrix in one pass
data_all = tfidf.fit_transform(data_all)
# Split the matrix back into its training and test parts
train_x = data_all[:len_train]
test_x = data_all[len_train:]
print('TF-IDF处理结束.')

print("train: \n", np.shape(train_x[0]))
print("test: \n", np.shape(test_x[0]))

In [None]:
# Naive Bayes training

from sklearn.naive_bayes import MultinomialNB as MNB

model_NB = MNB() # (alpha=1.0, class_prior=None, fit_prior=True)
# Fit on the full training set now; this fitted model is used for prediction later
model_NB.fit(train_x, label)

from sklearn.model_selection import cross_val_score
import numpy as np

# Run the 10-fold cross-validation once and reuse the scores for both
# printouts (the original repeated the whole evaluation for the mean).
nb_cv_scores = cross_val_score(model_NB, train_x, label, cv=10, scoring='roc_auc')
print("多项式贝叶斯分类器10折交叉验证得分:  \n", nb_cv_scores)
print("\n多项式贝叶斯分类器10折交叉验证得分: ", np.mean(nb_cv_scores))

In [None]:
# Predict sentiment for the test set with the fitted Naive Bayes model
test_predicted = np.array(model_NB.predict(test_x))
print('保存结果...')

# Assemble the submission table (id, sentiment) and write it to disk
submission_columns = ['id', 'sentiment']
submission_df = pd.DataFrame(data={'id': test['id'], 'sentiment': test_predicted})
print(submission_df.head(10))
submission_df.to_csv('../out/submission_br.csv', columns=submission_columns, index=False)
print('结束.')

'''
1.提交最终的结果到kaggle，AUC为：0.85728，排名300左右，50%的水平
2. ngram_range = 3, 三元文法，AUC为0.85924
'''

In [None]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV

# 设定grid search的参数
# Candidate values of C (inverse regularization strength) for the grid search
grid_values = {'C': [1, 15, 30, 50]}
# grid_values = {'C': [30]}
# Candidates are scored by ROC AUC
"""
penalty: l1 or l2, 用于指定惩罚中使用的标准。
"""
# dual=True is only implemented by the liblinear solver, so name the solver
# explicitly; the default solver of current scikit-learn rejects dual=True.
model_LR = GridSearchCV(LR(penalty='l2', dual=True, solver='liblinear', random_state=0),
                        grid_values, scoring='roc_auc', cv=20)
# 20-fold cross-validation for every candidate C
model_LR.fit(train_x, label)

# Print the per-candidate results, the best parameters and the best score
print(model_LR.cv_results_, '\n', model_LR.best_params_, model_LR.best_score_)

In [None]:
# dual=True is only implemented by the liblinear solver, so name it explicitly.
# NOTE(review): this refits with the default C rather than the best C found by
# the grid search in the previous cell - confirm that is intended.
model_LR = LR(penalty='l2', dual=True, solver='liblinear', random_state=0)
model_LR.fit(train_x, label)

test_predicted = np.array(model_LR.predict(test_x))
print('保存结果...')

# Build the submission in its own frame instead of overwriting `test`, so the
# test frame keeps its 'review' column and earlier cells can be re-run.
submission_df = pd.DataFrame(data={'id': test['id'], 'sentiment': test_predicted})
submission_df.to_csv('../out/submission_lr.csv', columns=['id', 'sentiment'], index=False)


'''
1. 提交最终的结果到kaggle，AUC为：0.88956，排名260左右，比之前贝叶斯模型有所提高
2. 三元文法，AUC为0.89076
'''

# 3.Word2vec向量
---

神经网络语言模型的目标函数为 $L = \sum_{w} \log p(w \mid \mathrm{context}(w))$，即在 w 的上下文下计算当前词 w 的概率。由公式可以看到，核心是计算 $p(w \mid \mathrm{context}(w))$，Word2vec 给出了构造这个概率的一个方法。

In [None]:
import gensim
import nltk
from nltk.corpus import stopwords

# Load the English Punkt tokenizer resource from NLTK's data files; it is
# passed to review_to_sentences below to split each review into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, built only once."""
    return frozenset(stopwords.words("english"))


def review_to_wordlist(review, remove_stopwords=False):
    """Split a piece of text into a list of lowercase words.

    Every character that is not an ASCII letter is replaced by a space
    before splitting. HTML is NOT stripped here; callers are expected to
    pass text that has already been cleaned of markup.

    Args:
        review: the text to tokenize.
        remove_stopwords: when True, drop NLTK's English stop words.

    Returns:
        The list of lowercase words, in their original order.
    """
    review_text = re.sub("[^a-zA-Z]", " ", review)
    words = review_text.lower().split()

    if remove_stopwords:
        # The stop-word set is cached: this function runs once per sentence,
        # and rebuilding the set on every call is wasted work.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    return words


def review_to_sentences(review, tokenizer, remove_stopwords=False):
    '''
    Break one review into sentences, each represented as a list of words.

    1. HTML markup is stripped, then the text is split into sentences with
       `tokenizer` (so there are many more sentences than reviews).
    2. Every non-empty sentence is converted to a word list with
       review_to_wordlist; the list of those word lists is returned.
    '''
    plain_text = BeautifulSoup(review, "html.parser").get_text()
    return [
        review_to_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(plain_text)
        if len(chunk) > 0
    ]


# Collect the sentences of every labeled training review into one flat list.
# NOTE(review): stop words are removed here (third argument True), while the
# unlabeled reviews processed in a later cell keep them - confirm the mix is
# intended.
sentences = []
for review in train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer, True))

In [None]:
print(np.shape(train["review"]))
# `sentences` is a ragged list of word lists; np.shape would first copy it
# into an array (slow, and an error on recent NumPy), so report its length.
print((len(sentences),))

In [None]:
# Add the sentences of the unlabeled reviews to the Word2Vec training corpus
unlabeled_path = "%s/%s" % (root_dir, "unlabeledTrainData.tsv")
unlabeled_train = pd.read_csv(unlabeled_path, header=0, delimiter="\t", quoting=3)
for review in unlabeled_train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))
print('预处理 unlabeled_train data...')

In [None]:
print(np.shape(train_data))
# `sentences` is a ragged list of word lists; np.shape would first copy it
# into an array (slow, and an error on recent NumPy), so report its length.
print((len(sentences),))

In [None]:
import time
from gensim.models import Word2Vec
# Model hyper-parameters (passed to Word2Vec when the model is trained)
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

In [None]:
%%time
# Train the Word2Vec model on the collected sentences
print("训练模型中...")
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
print("训练完成")

In [None]:
print('保存模型...')
# NOTE(review): per the gensim docs, init_sims(replace=True) keeps only the
# normalized vectors, after which the model cannot be trained further -
# confirm against the installed gensim version.
model.init_sims(replace=True)
# Store the model under the data directory; the file name encodes the settings
model_name = "/".join((root_dir, "300features_40minwords_10context"))
model.save(model_name)
print('保存结束')

In [None]:
# Sanity check of the trained vectors: ask which word does not fit the group
model.wv.doesnt_match("man woman child kitchen".split())

In [None]:
# Three countries and one city: ask which word does not fit the group
model.wv.doesnt_match("france england germany berlin".split())

In [None]:
# Three cities and one country: ask which word does not fit the group
model.wv.doesnt_match("paris berlin london austria".split())

In [None]:
# help(model.wv.most_similar)
# The 5 entries returned as most similar to "man"
model.wv.most_similar("man", topn=5)

In [None]:
# The 5 entries returned as most similar to "queen"
model.wv.most_similar("queen", topn=5)

In [None]:
# The 5 entries returned as most similar to "awful"
model.wv.most_similar("awful", topn=5)

In [None]:
# Analogy query: woman + king - man, top result only
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

In [None]:
def makeFeatureVec(words, model, num_features, index2word_set=None):
    '''
    Average the word vectors of all in-vocabulary words of one text.

    Args:
        words: a list of word tokens, or a whitespace-separated string
            (strings are split first; iterating a raw string would walk it
            character by character and match almost nothing).
        model: trained model exposing `model.wv.index2word` and `model.wv[word]`.
        num_features: dimensionality of the word vectors.
        index2word_set: optional pre-built set of vocabulary words, so a
            caller that processes many texts builds it only once.

    Returns:
        A float32 vector of length `num_features`. If no word of the text is
        in the vocabulary the zero vector is returned (a 0/0 division would
        otherwise produce NaNs).
    '''
    if isinstance(words, str):
        words = words.split()
    if index2word_set is None:
        # index2word holds the whole vocabulary; a set makes lookups fast
        index2word_set = set(model.wv.index2word)

    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec = np.add(featureVec, model.wv[word])

    # Take the mean, guarding against texts with no known words
    if nwords:
        featureVec = np.divide(featureVec, nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    '''
    Compute the averaged word vector of every text in `reviews`.

    Args:
        reviews: a sequence of texts, each a list of words or a
            whitespace-separated string.
        model: trained model exposing `model.wv.index2word` and `model.wv[word]`.
        num_features: dimensionality of the word vectors.

    Returns:
        A float32 array of shape (len(reviews), num_features), one row per text.
    '''
    # Build the vocabulary set once instead of once per review
    vocabulary = set(model.wv.index2word)
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")

    for counter, review in enumerate(reviews):
        # Progress report every 5000 reviews
        if counter % 5000 == 0:
            print("Review %d of %d" % (counter, len(reviews)))

        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features, vocabulary)

    return reviewFeatureVecs

In [None]:
%time trainDataVecs = getAvgFeatureVecs(train_data, model, num_features)
print(np.shape(trainDataVecs))

In [None]:
%time testDataVecs = getAvgFeatureVecs(test_data, model, num_features)
print(np.shape(testDataVecs))

In [None]:
from sklearn.naive_bayes import GaussianNB as GNB

# Gaussian Naive Bayes on the averaged word vectors
model_GNB = GNB()
model_GNB.fit(trainDataVecs, label)

# Import from sklearn.model_selection, matching the earlier cells; the legacy
# sklearn.cross_validation module is gone from current scikit-learn releases.
from sklearn.model_selection import cross_val_score
import numpy as np

print("高斯贝叶斯分类器10折交叉验证得分: ", np.mean(cross_val_score(model_GNB, trainDataVecs, label, cv=10, scoring='roc_auc')))

print('保存结果...')
result = model_GNB.predict(testDataVecs)
submission_df = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
print(submission_df.head(10))
# Write next to the other submissions instead of a user-specific Desktop path
submission_df.to_csv('../out/gnb_word2vec.csv', columns=['id', 'sentiment'], index=False)
print('结束.')

"""
从验证结果来看，没有超过基于TF-IDF多项式贝叶斯模型
"""

In [None]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier( n_estimators = 100, n_jobs=2)

print("Fitting a random forest to labeled training data...")
%time forest = forest.fit( trainDataVecs, label )
print("随机森林分类器10折交叉验证得分: ", np.mean(cross_val_score(forest, trainDataVecs, label, cv=10, scoring='roc_auc')))

# 测试集
result = forest.predict( testDataVecs )

print('保存结果...')
submission_df = pd.DataFrame(data ={'id': test['id'], 'sentiment': result})
print(submission_df.head(10))
submission_df.to_csv('/Users/jiangzl/Desktop/rf_word2vec.csv',columns = ['id','sentiment'], index = False)
print('结束.')

"""
改用随机森林之后，效果有提升，但是依然没有超过基于TF-IDF多项式贝叶斯模型
"""

In [None]:
# Load pre-trained word vectors

from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

# Vectors stored in the word2vec C format are loaded through KeyedVectors;
# Word2Vec.load_word2vec_format is deprecated in the gensim releases that
# provide the `model.wv` API used elsewhere in this notebook.
model = KeyedVectors.load_word2vec_format("vector.txt", binary=False)  # C text format
# model = KeyedVectors.load_word2vec_format("vector.bin", binary=True)  # C binary format

In [None]:
# Load Google's pre-trained word vectors to explore relations between words

from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

# Vectors stored in the word2vec C format are loaded through KeyedVectors;
# Word2Vec.load_word2vec_format is deprecated in the gensim releases that
# provide the `model.wv` API used elsewhere in this notebook.
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
# Try out the loaded vectors with three word-analogy queries; each prints
# the top 5 results for (positive words) minus (negative words).

print(model.most_similar(positive=["woman", "king"], negative=["man"], topn=5))
print(model.most_similar(positive=["biggest", "small"], negative=["big"], topn=5))
print(model.most_similar(positive=["ate", "speak"], negative=["eat"], topn=5))

In [None]:
import numpy as np

# Read the three word lists, one word per line. Each list gets its own
# variable; the original stored all three in `food_words`, which left
# `sports_words` and `weather_words` undefined.
with open("food_words.txt", "r") as infile:
    food_words = infile.readlines()

with open("sports_words.txt", "r") as infile:
    sports_words = infile.readlines()

with open("weather_words.txt", "r") as infile:
    weather_words = infile.readlines()
    
def getWordVecs(words):
    """Look up the vector of every known word and stack them into a matrix.

    Args:
        words: iterable of words; newline characters are removed from each
            word before the lookup.

    Returns:
        A float array with one row per word found in the global `model`.
        Words that raise KeyError on lookup are skipped.

    Raises:
        ValueError: if none of the words is found in the model.
    """
    vecs = []
    for word in words:
        word = word.replace("\n", "")
        try:
            # One row per word; -1 keeps this independent of the vector size
            vecs.append(model[word].reshape((1, -1)))
        except KeyError:
            continue

    if not vecs:
        raise ValueError("none of the given words are in the model vocabulary")

    # np.concatenate joins the (1, dim) rows along axis 0 into (n_words, dim)
    return np.array(np.concatenate(vecs), dtype="float")

# Vector matrices for the food, sports and weather word lists
food_vecs, sports_vecs, weather_vecs = map(
    getWordVecs, (food_words, sports_words, weather_words)
)

In [None]:
# 利用 TSNE 和 matplotlib 对分类结果进行可视化处理

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the word vectors of the three groups down to 2 dimensions
ts = TSNE(n_components=2)
reduced_vecs = ts.fit_transform(np.concatenate((food_vecs, sports_vecs, weather_vecs)))

# Rows keep the concatenation order: food first, then sports, then weather
n_food = len(food_vecs)
n_sports = len(sports_vecs)
for i, (x, y) in enumerate(reduced_vecs):
    if i < n_food:
        color = "b"   # food
    elif i < n_food + n_sports:
        color = "r"   # sports
    else:
        color = "g"   # weather

    plt.plot(x, y, marker="o", color=color, markersize=8)

In [None]:
# 首先，我们导入数据并构建 Word2Vec 模型：

from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec

# NOTE(review): the file paths were garbled in the original text and have been
# reconstructed as twitter_data/{pos,neg}_tweets.txt - confirm.
with open('twitter_data/pos_tweets.txt', 'r') as infile:
    pos_tweets = infile.readlines()

with open('twitter_data/neg_tweets.txt', 'r') as infile:
    neg_tweets = infile.readlines()

# Use 1 for positive sentiment, 0 for negative
y = np.concatenate((np.ones(len(pos_tweets)), np.zeros(len(neg_tweets))))

x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_tweets, neg_tweets)), y, test_size=0.2)


# Do some very minor text preprocessing
def cleanText(corpus):
    """Lowercase each document, drop newlines and split it into tokens."""
    corpus = [z.lower().replace('\n', '').split() for z in corpus]
    return corpus


x_train = cleanText(x_train)
x_test = cleanText(x_test)

n_dim = 300
# Initialize model and build vocab
imdb_w2v = Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(x_train)
# Train the model over the training tweets (this may take several minutes).
# NOTE(review): gensim releases from 2.0 on require total_examples and epochs
# to be passed to train() - confirm against the installed version.
imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count, epochs=imdb_w2v.iter)

In [None]:
# 接下来，为了利用下面的函数获得推文中所有词向量的平均值，我们必须构建作为输入文本的词向量。

def buildWordVector(text, size):
    """Return the mean word vector of a tokenized text.

    Args:
        text: iterable of word tokens.
        size: dimensionality of the word vectors.

    Returns:
        An array of shape (1, size) holding the average of the vectors of
        all words found in the global `imdb_w2v` model; words that raise
        KeyError on lookup are skipped. All zeros if no word is found.
    """
    vec = np.zeros(size).reshape((1, size))
    count = 0.

    for word in text:
        try:
            vec += imdb_w2v.wv[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    # Only divide when at least one word was found, to avoid 0/0
    if count != 0:
        vec /= count
    return vec

In [None]:
# 调整数据集的量纲是数据标准化处理的一部分，我们通常将数据集转化成服从均值为零的高斯分布，这说明数值大于均值表示乐观，反之则表示悲观。为了使模型更有效，许多机器学习模型需要预先处理数据集的量纲，特别是文本分类器这类具有许多变量的模型。

from sklearn.preprocessing import scale

# One averaged vector per training tweet, stacked into a matrix and standardized
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_train])
train_vecs = scale(train_vecs)

# Train word2vec on test tweets.
# NOTE(review): gensim releases from 2.0 on require total_examples and epochs
# to be passed to train() - confirm against the installed version.
imdb_w2v.train(x_test, total_examples=len(x_test), epochs=imdb_w2v.iter)

In [None]:
# 最后我们需要建立测试集向量并对其标准化处理：

# Build test tweet vectors then scale
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_test])
test_vecs = scale(test_vecs)

In [None]:
"""
接下来我们想要通过计算测试集的预测精度和 ROC 曲线来验证分类器的有效性。 ROC 曲线衡量当模型参数调整的时候，其真阳性率和假阳性率的变化情况。在我们的案例中，我们调整的是分类器模型截断阈值的概率。一般来说，ROC 曲线下的面积（AUC）越大，该模型的表现越好。你可以在这里找到更多关于 ROC 曲线的资料

（https://en.wikipedia.org/wiki/Receiver_operating_characteristic）

在这个案例中我们使用罗吉斯回归的随机梯度下降法作为分类器算法。
"""

#Use classification algorithm (i.e.Stochastic Logistic Regression) on training set, then assess model performance on test set

from sklearn.linear_model import SGDClassifier
# Logistic regression trained by stochastic gradient descent, L1 penalty.
# NOTE(review): newer scikit-learn releases call this loss 'log_loss' and no
# longer accept 'log' - switch if the installed version rejects it.
lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(train_vecs, y_train)
print('Test Accuracy: %.2f' % lr.score(test_vecs, y_test))

In [None]:
# 随后我们利用 matplotlib 和 metric 库来构建 ROC 曲线

# Create ROC curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Take column 1 of predict_proba as the score of the positive class (label 1)
pred_probas = lr.predict_proba(test_vecs)[:, 1]

fpr, tpr, _ = roc_curve(y_test, pred_probas)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
# Diagonal reference line: the ROC curve of a random classifier
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')
plt.show()