In [32]:
import pandas as pd 
import numpy as np 
import xgboost 
from sklearn import model_selection
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from pyhanlp import *
import gensim
from gensim.models import KeyedVectors, TfidfModel
from gensim.similarities import SparseMatrixSimilarity

from WordVectorFetcher import WordVectorFetcher

In [2]:
### load data
# The corpus is a headerless, tab-separated file: first field is the textual
# sentiment label, second field is the document text.
label2id = {'negative': -1, 'neutral': 0, 'positive': 1}
df_data = pd.read_csv(
    'data/sentiment_corpus_20191108.txt',
    sep='\t',
    names=['label', 'content'],
    encoding='utf8',
)
# Sequential row number, assigned in file order.
df_data['content_id'] = range(len(df_data))
# Direct dict indexing (not Series.map) so an unexpected label raises KeyError
# instead of silently turning into NaN.
df_data['label_id'] = df_data['label'].apply(label2id.__getitem__)
print(df_data.shape)
df_data[:3]

(3000, 4)


Unnamed: 0,label,content,content_id,label_id
0,negative,[img]http://img.autohome.com.cn/album/smiles/s...,0,-1
1,negative,“戏说”奔驰女再次向奔驰维权：要求赔偿240万--致广大网友的一封公开信广大支持过我的网友，...,1,-1
2,negative,“这辆二手车多少钱买的?”因为家门口修车店维修工的这一句话，车主殷小姐憋了一肚子气，开着新买...,2,-1


In [6]:
class TfidfWordVectorCombiner:
    """Embed documents as TF-IDF-weighted sums of pre-trained word vectors.

    ``fit`` / ``fit_transform`` segment the ``content`` column of a DataFrame
    with HanLP's NLPTokenizer and build a gensim ``Dictionary`` and
    ``TfidfModel`` from the result.  ``transform`` maps a DataFrame with a
    ``content`` column to a 2-D array with one row per document, each row
    being ``sum(tfidf_weight * word_vector(token))`` over the entries of the
    document's TF-IDF bag-of-words.

    Note that constructing an instance immediately loads the word-vector file
    through ``WordVectorFetcher.init()``.
    """

    def __init__(self):
        # gensim Dictionary (token <-> id); populated by fit()/fit_transform().
        self.dictionary = None
        # gensim TfidfModel; populated by fit()/fit_transform().
        self.tfidf_model = None
        # Alternative embedding file: 'tmp/sgns.sogou.word.bz2'
        self.fetcher = WordVectorFetcher('tmp/sgns.zhihu.bigram-char.bz2')
        print('Loading word vector file...')
        self.fetcher.init()
        print('Done')
        self.NLPTokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")

    def seg(self, doc):
        """Segment ``doc`` into a list of words, dropping unwanted tag classes.

        Every term returned by ``NLPTokenizer.segment`` is kept unless its
        part-of-speech tag starts with 'w', 'x' or 'y'.
        Tag reference: http://www.hankcs.com/nlp/part-of-speech-tagging.html#h2-8
        (presumably punctuation, non-word strings and modal particles --
        confirm against that table).
        """
        tokens = []
        for item in self.NLPTokenizer.segment(doc):
            word = item.word
            tag = item.nature.toString()
            # Index rather than str.startswith: the tag may be a Java string
            # wrapper that does not expose Python's str methods.
            if tag[0] not in ('w', 'x', 'y'):
                tokens.append(word)
        return tokens

    def fit(self, df_train):
        """Build the dictionary and TF-IDF model from ``df_train['content']``.

        Returns None; the fitted state is stored on the instance.
        """
        self._fit_vocabulary(self._tokenize(df_train))
        return

    def transform(self, df):
        """Return the weighted-vector matrix for ``df['content']``.

        Uses the dictionary / TF-IDF model from a previous ``fit`` or
        ``fit_transform``.  The result has one row per row of ``df``; an empty
        ``df`` yields an array with zero rows.

        Raises:
            AttributeError: if the combiner has not been fitted yet.
        """
        self._require_fitted()
        corpus_bow = [self.dictionary.doc2bow(tokens) for tokens in self._tokenize(df)]
        return self._combine(corpus_bow)

    def fit_transform(self, df_train):
        """Fit on ``df_train`` and return its weighted-vector matrix.

        Equivalent to ``fit(df_train)`` followed by ``transform(df_train)``,
        but the texts are segmented only once.
        """
        corpus_train_bow = self._fit_vocabulary(self._tokenize(df_train))
        return self._combine(corpus_train_bow)

    def _tokenize(self, df):
        """Segment every entry of ``df['content']``; returns a list of token lists."""
        return [self.seg(doc) for doc in df['content']]

    def _fit_vocabulary(self, corpus):
        """Fit dictionary and TF-IDF model on token lists; returns their bag-of-words corpus."""
        self.dictionary = gensim.corpora.Dictionary(corpus)
        corpus_bow = [self.dictionary.doc2bow(tokens) for tokens in corpus]
        self.tfidf_model = TfidfModel(corpus_bow)
        return corpus_bow

    def _require_fitted(self):
        """Fail with a readable message when transform() is called before fitting."""
        if self.dictionary is None or self.tfidf_model is None:
            # AttributeError is what the unfitted object raised before
            # (attribute access on None), so the exception type is unchanged.
            raise AttributeError(
                'TfidfWordVectorCombiner is not fitted; call fit() or fit_transform() first'
            )

    def _combine(self, corpus_bow):
        """Turn a bag-of-words corpus into a (n_docs, dim) array of weighted vector sums."""
        # The vector returned for the empty string only serves as a shape/dtype
        # template for the all-zero accumulator; look it up once, not per document.
        template = np.zeros_like(self.fetcher.get_word_vector(u""))
        if not corpus_bow:
            # np.concatenate / np.vstack refuse an empty list, so build the
            # zero-row result explicitly.
            return np.zeros((0, len(template)), dtype=template.dtype)
        rows = []
        for tfidf_doc in self.tfidf_model[corpus_bow]:
            vec = template.copy()
            for token_id, token_tfidf in tfidf_doc:
                token = self.dictionary[token_id]
                vec += token_tfidf * self.fetcher.get_word_vector(token)
            rows.append(vec)
        if not rows:
            return np.zeros((0, len(template)), dtype=template.dtype)
        return np.vstack(rows)

# Shared feature extractor for the cells below. Instantiation runs
# WordVectorFetcher.init() on the word-vector file (see the "Loading word
# vector file..." / "Done" prints in __init__).
combiner = TfidfWordVectorCombiner()

Loading word vector file...
Done


In [11]:
### split dataset
# Stratified 80/20 split on label_id. The seed is fixed so that a fresh
# "Restart & Run All" reproduces the same partition; without it every run
# trains and evaluates on a different split.
df_train, df_val = model_selection.train_test_split(
    df_data,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=df_data['label_id'],
)
print(df_train.shape, df_val.shape)

# Class balance of each split.
print(df_train['label_id'].value_counts())
print(df_val['label_id'].value_counts())

y_train = df_train['label_id'].values
y_val = df_val['label_id'].values
print(y_train.shape, y_val.shape)

(2400, 4) (600, 4)
 1    800
-1    800
 0    800
Name: label_id, dtype: int64
 0    200
 1    200
-1    200
Name: label_id, dtype: int64
(2400,) (600,)


In [12]:
# The dictionary and TF-IDF model are fitted on the training split only;
# the validation split is transformed with that already-fitted state.
X_train = combiner.fit_transform(df_train)
X_val = combiner.transform(df_val)
print(X_train.shape, X_val.shape)

(2400, 300) (600, 300)


In [56]:
# Candidate classifiers. All four are constructed, but only the one bound to
# `model` is trained; rebind `model` to try another.
svc = SVC(kernel='rbf', gamma='scale')
lr = LogisticRegression(solver='lbfgs', multi_class='auto', n_jobs=-1)
rf = RandomForestClassifier(n_jobs=-1, n_estimators=500)
xgb = xgboost.XGBClassifier(n_jobs=-1, n_estimators=500)

model = xgb
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=-1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [57]:
# Predict on both splits, then print one classification report per split.
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)

for split_header, split_truth, split_pred in (
    ('###### Train', y_train, y_train_pred),
    ('###### Val', y_val, y_val_pred),
):
    print(split_header)
    print(metrics.classification_report(y_true=split_truth, y_pred=split_pred))

###### Train
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       800
           0       1.00      1.00      1.00       800
           1       1.00      1.00      1.00       800

    accuracy                           1.00      2400
   macro avg       1.00      1.00      1.00      2400
weighted avg       1.00      1.00      1.00      2400

###### Val
              precision    recall  f1-score   support

          -1       0.82      0.89      0.85       200
           0       0.77      0.70      0.73       200
           1       0.86      0.86      0.86       200

    accuracy                           0.82       600
   macro avg       0.82      0.82      0.82       600
weighted avg       0.82      0.82      0.82       600



In [60]:
# Smoke test on three short hand-written sentences.
df_test = pd.DataFrame({
    'content': [
        u'这车不错',    # "this car is pretty good"
        u'这车好！',    # "this car is good!"
        u'这车不行啊',  # "this car is no good"
    ],
})
X_test = combiner.transform(df_test)
model.predict(X_test)

array([-1, -1, -1], dtype=int64)