# 利用word2vec套件建立詞向量

In [None]:
import gensim
from gensim.models import word2vec
# Python's built-in logging module can take over the debugging role of print().
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)


## Word2vec 參數

* sentences：語料庫
* size：特徵向量的維度
* alpha：學習速率
* window：如同 n-gram 中前後要取幾個字詞
* min_count：字詞出現少於這個閾值則捨棄
* max_vocab_size：RAM的限制，如超過上限則捨棄不頻繁使用的， None為不限制
* sample：高頻字詞的取樣率
* seed：亂數產生器，與初始化向量有關係
* workers：多執行緒的數量
* sg： 0 為 CBOW ； 1 為 skip-gram
* hs： 0 為 negative sampling ； 1 為 hierarchical softmax
* iter：迭代次數
* batch_words：每個 batch的字詞量

In [None]:
# Read the corpus and build the model (sg=1 selects skip-gram).
# NOTE(review): hs=1 is combined with negative=3 — confirm that enabling both
# hierarchical softmax and negative sampling is intended.
sentences = word2vec.Text8Corpus('text8')
model = word2vec.Word2Vec(
    sentences,
    size=200,
    window=5,
    min_count=5,
    sg=1,
    hs=1,
    negative=3,
    sample=0.001,
    workers=4,
)
# Save the model, then print the vector of 'man' as a quick test.
model.save('text82.model')
print(model['man'])

In [None]:
# When the corpus has already been trained once, start directly from this cell.
import gensim
from gensim.models import word2vec
import logging
# The LogRecord attribute is 'levelname' (singular); any other key makes the
# handler fail to format every record.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Load the previously saved word2vec model.
model = word2vec.Word2Vec.load('text82.model')

In [None]:
import numpy as np
import pandas as pd
# Load the positive entries of the sentiment lexicon and keep the 'words' column.
positive_word = pd.read_csv(
    'positive_words.csv',
    encoding='ISO-8859-1',
)
words = positive_word.loc[:, 'words']

In [None]:
# Map every positive word that is present in the model to its word vector;
# words the model does not contain are skipped.
positive_dict = {word: model[word] for word in words if word in model}

In [None]:
# Convert the dict to a DataFrame; orient='index' turns each key (word) into a row.
positiveword = pd.DataFrame.from_dict(data=positive_dict, orient='index')

In [None]:
# Show the (rows, columns) shape of the positive-word DataFrame.
print(positiveword.shape)

In [None]:
# Label every positive word with a sentiment score of +1.
# The label column is sized from the table's own index instead of a fixed row
# count, so it stays aligned when the lexicon, corpus or min_count changes.
idx = positiveword.index
np_one = np.ones((1, len(idx)), dtype=int)
df_one = pd.DataFrame(np_one).T
df_one.index = idx
df_one.columns = ['sentiment']
# Append the label as the last column.
positiveword = pd.concat([positiveword, df_one], axis=1)

In [None]:
# Load the negative entries of the sentiment lexicon and keep the 'words' column.
negative_word = pd.read_csv(
    'negative_words.csv',
    encoding='ISO-8859-1',
)
words = negative_word.loc[:, 'words']

In [None]:
# Map every negative word that is present in the model to its word vector;
# words the model does not contain are skipped.
negative_dict = {word: model[word] for word in words if word in model}

In [None]:
# Convert the dict to a DataFrame; orient='index' turns each key (word) into a row.
negativeword = pd.DataFrame.from_dict(data=negative_dict, orient='index')

In [None]:
# Show the (rows, columns) shape of the negative-word DataFrame.
print(negativeword.shape)

In [None]:
# Label every negative word with a sentiment score of -1.
# The label column is sized from the table's own index instead of a fixed row
# count, so it stays aligned when the lexicon, corpus or min_count changes.
idx1 = negativeword.index
np_minusone = np.full((1, len(idx1)), -1)
df_minusone = pd.DataFrame(np_minusone).T
df_minusone.index = idx1
df_minusone.columns = ['sentiment']
# Append the label as the last column.
negativeword = pd.concat([negativeword, df_minusone], axis=1)

In [None]:
# Stack the positive and negative tables row-wise to form the training data.
training_data = pd.concat([positiveword, negativeword], axis='index')

In [None]:
# The last column is the target y; every other column is a feature in x.
train_x, train_y = training_data.iloc[:, :-1], training_data.iloc[:, -1]

In [None]:
# Load relation_word.csv (produced in step 2) and take its 'relationword' column.
relation_word = pd.read_csv(
    'relation_word.csv',
    encoding="ISO-8859-1",
)
words = relation_word.loc[:, 'relationword']

## words 的 data type 為 Series

* Series.str.lower(self): Convert strings in the Series/Index to lowercase

* Series.values: Return Series as ndarray or ndarray-like depending on the dtype.

In [None]:
# Collect the distinct relation words after lower-casing them.
unique = set(words.str.lower().tolist())
len(unique)

In [None]:
# Map every distinct relation word that is present in the model to its word
# vector; words the model does not contain are skipped.
relation_dict = {word: model[word] for word in unique if word in model}

In [None]:
# Convert the dict to a DataFrame; orient='index' turns each key (word) into a row.
df_relationword = pd.DataFrame.from_dict(data=relation_dict, orient='index')
len(df_relationword)

In [None]:
# df_relationword serves as the test set; idx2 keeps its index (the words)
# so the predictions can be re-labelled afterwards.
test_x, idx2 = df_relationword, df_relationword.index

In [None]:
# Use a random forest to predict a sentiment score of +1 or -1 for each row of test_x.
from sklearn.ensemble import RandomForestClassifier
# Build one hundred trees.
forest = RandomForestClassifier(n_estimators = 100)
# Do not bind the fitted classifier to `model`: that name holds the word2vec
# model, which later cells still query for word vectors.
forest.fit(train_x, train_y)
pred_y = forest.predict(test_x)

In [None]:
# Wrap the predictions in a one-column DataFrame labelled by the test words.
pred_y = pd.DataFrame(pred_y, index=idx2, columns=['sentiment'])

In [None]:
# Join test_x with pred_y column-wise and export the result as sentiment_pred.csv.
sentiment_pred = pd.concat([test_x, pred_y], axis='columns')
sentiment_pred.to_csv(
    'sentiment_pred.csv',
    encoding='utf8',
)

---

## neg_word 的情緒得分需要額外校正

In [None]:
# Load relation_neg_word.csv (produced in step 2) and take its 'relationword' column.
relation_word_neg = pd.read_csv(
    'relation_neg_word.csv',
    encoding="ISO-8859-1",
)
neg_words = relation_word_neg.loc[:, 'relationword']

In [None]:
# Collect the distinct neg_words after lower-casing them.
neg_unique = set(neg_words.str.lower().tolist())
len(neg_unique)

In [None]:
# Map every distinct word of neg_unique that is present in the model to its
# word vector; words the model does not contain are skipped.
# NOTE(review): `model` must still be the word2vec model at this point —
# confirm no earlier cell has rebound the name.
neg_relation_dict = {word: model[word] for word in neg_unique if word in model}

In [None]:
# Convert the dict to a DataFrame; orient='index' turns each key (word) into a row.
df_relationword_neg = pd.DataFrame.from_dict(data=neg_relation_dict, orient='index')
len(df_relationword_neg)

In [None]:
# df_relationword_neg serves as the test set; idx2 keeps its index (the words)
# so the predictions can be re-labelled afterwards.
test_x, idx2 = df_relationword_neg, df_relationword_neg.index

In [None]:
# Use a random forest to predict a sentiment score of +1 or -1 for each row of test_x.
from sklearn.ensemble import RandomForestClassifier
# Build one hundred trees.
forest = RandomForestClassifier(n_estimators = 100)
# Do not bind the fitted classifier to `model`: that name holds the word2vec
# model, and rebinding it breaks every vector-lookup cell on a re-run.
forest.fit(train_x, train_y)
pred_y = forest.predict(test_x)

In [None]:
# Wrap the predictions in a one-column DataFrame labelled by the test words.
pred_y = pd.DataFrame(pred_y, index=idx2, columns=['sentiment'])

In [None]:
# Join test_x with pred_y column-wise and export the result as sentiment_neg_pred.csv.
sentiment_pred = pd.concat([test_x, pred_y], axis='columns')
sentiment_pred.to_csv(
    'sentiment_neg_pred.csv',
    encoding='utf8',
)