In [1]:
import pandas as pd 
import numpy as np 
import xgboost 
from sklearn import model_selection
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import codecs
import json

from pyhanlp import *
import gensim
from gensim.models import KeyedVectors, TfidfModel
from gensim.similarities import SparseMatrixSimilarity

from WordVectorFetcher import WordVectorFetcher

In [2]:
### load data
# The corpus is a header-less, tab-separated file: one "<label>\t<content>" row per document.
df_data = pd.read_csv(
    'data/sentiment_corpus_20191108.txt',
    sep='\t',
    encoding='utf8',
    names=['label', 'content'],
)

# Numeric target for each textual label (a label missing from this map raises KeyError).
label2id = {'negative': -1, 'neutral': 0, 'positive': 1}

# Running row number used as a document identifier.
df_data['content_id'] = range(len(df_data))
df_data['label_id'] = df_data['label'].apply(lambda label: label2id[label])

print(df_data.shape)
df_data.head(3)

(3000, 4)


Unnamed: 0,label,content,content_id,label_id
0,negative,[img]http://img.autohome.com.cn/album/smiles/s...,0,-1
1,negative,“戏说”奔驰女再次向奔驰维权：要求赔偿240万--致广大网友的一封公开信广大支持过我的网友，...,1,-1
2,negative,“这辆二手车多少钱买的?”因为家门口修车店维修工的这一句话，车主殷小姐憋了一肚子气，开着新买...,2,-1


In [54]:
# HanLP.segment is the segmenter in use; NLPTokenizer
# (JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")) was an alternative tried earlier.
def seg(doc):
    """Segment ``doc`` with HanLP and return the words that survive a POS filter.

    A term is dropped when the first character of its part-of-speech tag is
    one of ``b, m, p, q, u, x``; every other word is kept, in document order.
    Tag set reference:
    http://www.hankcs.com/nlp/part-of-speech-tagging.html#h2-8

    :param doc: raw text of one document.
    :return: list of the remaining word strings.
    """
    skipped_tag_initials = ['b', 'm', 'p', 'q', 'u', 'x']
    return [
        term.word
        for term in HanLP.segment(doc)
        if term.nature.toString()[0] not in skipped_tag_initials
    ]

In [14]:
class TfidfWordVectorCombiner:
    """Represent documents as TF-IDF-weighted sums of pre-trained word vectors.

    ``fit`` learns a gensim ``Dictionary`` and ``TfidfModel`` from the
    ``content`` column of a DataFrame. ``transform`` then maps every document
    to ``sum(tfidf(token) * word_vector(token))`` over the tokens that are in
    the learned dictionary, giving one row per document.
    """

    # File the learned vocabulary is dumped to (one "<id>\t<token>" line per
    # entry) every time the combiner is fitted.
    VOCAB_PATH = 'tmp/tfidf_vocab.csv'

    def __init__(self):
        """Create an unfitted combiner and load the word-vector file."""
        self.dictionary = None
        self.tfidf_model = None
        # Alternative embedding file tried earlier: 'tmp/sgns.zhihu.bigram-char.bz2'
        self.fetcher = WordVectorFetcher('tmp/sgns.sogou.word.bz2')
        print('Loading word vector file...')
        self.fetcher.init()
        print('Done')

    def _write_vocab(self):
        """Dump the fitted dictionary to ``VOCAB_PATH``, ordered by token id."""
        # ``Dictionary.id2token`` is filled lazily by gensim and is still empty
        # right after construction, so iterate ``token2id`` (always populated).
        entries = sorted(self.dictionary.token2id.items(), key=lambda kv: kv[1])
        with codecs.open(self.VOCAB_PATH, 'w', encoding='utf8') as fout:
            for token, token_id in entries:
                fout.write("{}\t{}\n".format(token_id, token))

    def _fit_bow(self, df_train):
        """Learn dictionary and TF-IDF model; return the training bag-of-words corpus."""
        corpus_train = list(df_train['content'].apply(seg))
        self.dictionary = gensim.corpora.Dictionary(corpus_train)
        self._write_vocab()
        corpus_train_bow = [self.dictionary.doc2bow(tokens) for tokens in corpus_train]
        self.tfidf_model = TfidfModel(corpus_train_bow)
        return corpus_train_bow

    def _embed(self, corpus_bow):
        """Turn a bag-of-words corpus into a ``(n_docs, dim)`` feature matrix."""
        # The vector returned for the empty string is only used as a
        # shape/dtype template for the all-zero starting rows.
        template = np.zeros_like(self.fetcher.get_word_vector(u""))
        X = np.zeros((len(corpus_bow), len(template)), dtype=template.dtype)
        if not corpus_bow:
            # Nothing to embed: return an empty matrix instead of failing.
            return X
        for row, tfidf_doc in enumerate(self.tfidf_model[corpus_bow]):
            for token_id, token_tfidf in tfidf_doc:
                token = self.dictionary[token_id]
                X[row] += token_tfidf * self.fetcher.get_word_vector(token)
        return X

    def fit(self, df_train):
        """Learn the dictionary and TF-IDF model from ``df_train['content']``.

        Also rewrites the vocabulary dump at ``VOCAB_PATH``. Returns ``None``.
        """
        self._fit_bow(df_train)
        return

    def transform(self, df):
        """Vectorise ``df['content']`` with the already fitted models.

        :param df: DataFrame with a ``content`` column of raw text.
        :return: array with one row per document (zero rows for an empty frame).
        """
        corpus = list(df['content'].apply(seg))
        corpus_bow = [self.dictionary.doc2bow(tokens) for tokens in corpus]
        return self._embed(corpus_bow)

    def fit_transform(self, df_train):
        """Fit on ``df_train`` and return its feature matrix.

        Equivalent to ``fit`` followed by ``transform`` on the same frame, but
        the documents are segmented only once.
        """
        corpus_train_bow = self._fit_bow(df_train)
        return self._embed(corpus_train_bow)

# Build the shared combiner once: its constructor loads the word-vector file
# and prints "Loading word vector file..." / "Done" while doing so.
combiner = TfidfWordVectorCombiner()

Loading word vector file...
Done


In [44]:
# Dump the fitted vocabulary as "<id>\t<token>" lines for inspection.
# `combiner.dictionary` stays None until the combiner has been fitted, so this
# cell is skipped (instead of crashing) when it runs before the fitting cell.
if combiner.dictionary is None:
    print('combiner is not fitted yet; skipping vocabulary dump')
else:
    # gensim fills `Dictionary.id2token` lazily, so it can still be empty here;
    # `token2id` is always populated once the dictionary is built.
    vocab_entries = sorted(combiner.dictionary.token2id.items(), key=lambda kv: kv[1])
    with codecs.open('tmp/tfidf_vocab.csv', 'w', encoding='utf8') as fout:
        for token, token_id in vocab_entries:
            fout.write("{}\t{}\n".format(token_id, token))

In [55]:
# Settings shared by the two tree ensembles.
ensemble_params = {'n_estimators': 500, 'n_jobs': -1}

# Candidate classifiers.
xgb = xgboost.XGBClassifier(**ensemble_params)
rf = RandomForestClassifier(**ensemble_params)
lr = LogisticRegression(solver='lbfgs', multi_class='auto', n_jobs=-1)
svc = SVC(kernel='rbf', gamma='scale')

# Classifier used by the cells that follow; point this at rf / lr / svc to compare.
model = xgb
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [56]:
# Repeated hold-out validation: each repeat draws a fresh 80/20 split, rebuilds
# the features from the training part only, refits `model` and prints the
# classification report on both parts.
NUM_OF_VALIDATION = 3
# Each repeat uses its own seed (BASE_SEED + run index): the repeats see
# different splits, yet a Restart & Run All reproduces exactly the same ones.
BASE_SEED = 42
for run_idx in range(NUM_OF_VALIDATION):
    print('### Spliting data...')
    df_train, df_val = model_selection.train_test_split(
        df_data, test_size=0.2,
        random_state=BASE_SEED + run_idx,
        shuffle=True,
        # stratify=df_data['label_id']  # left disabled, as in the original experiment
    )
    y_train = df_train['label_id'].values
    y_val = df_val['label_id'].values
    print('y_train:', y_train.shape, 'y_val:', y_val.shape)

    print('### Preparing features...')
    # Dictionary and TF-IDF weights are learned on the training part only and
    # then applied unchanged to the validation part.
    X_train = combiner.fit_transform(df_train)
    X_val = combiner.transform(df_val)
    print('X_train:', X_train.shape, 'X_val:',  X_val.shape)

    print('### fitting...')
    model.fit(X_train, y_train)

    print(' Classification Report on Train set ')
    y_train_pred = model.predict(X_train)
    print(metrics.classification_report(y_true=y_train, y_pred=y_train_pred))
    print(' Classification Report on Val set ')
    y_val_pred = model.predict(X_val)
    print(metrics.classification_report(y_true=y_val, y_pred=y_val_pred))

### Spliting data...
y_train: (2400,) y_val: (600,)
### Preparing features...
X_train: (2400, 300) X_val: (600, 300)
### fitting...
 Classification Report on Train set 
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       787
           0       1.00      1.00      1.00       817
           1       1.00      1.00      1.00       796

    accuracy                           1.00      2400
   macro avg       1.00      1.00      1.00      2400
weighted avg       1.00      1.00      1.00      2400

 Classification Report on Val set 
              precision    recall  f1-score   support

          -1       0.83      0.88      0.85       213
           0       0.72      0.68      0.70       183
           1       0.87      0.86      0.87       204

    accuracy                           0.81       600
   macro avg       0.81      0.81      0.81       600
weighted avg       0.81      0.81      0.81       600

### Spliting data...
y_train: (240

In [None]:
# Refit on the complete labelled corpus. `X_data` / `y_data` were never
# defined in this notebook, so they are built here the same way the
# validation cell builds `X_train` / `y_train`, but from all of `df_data`.
X_data = combiner.fit_transform(df_data)
y_data = df_data['label_id'].values
model.fit(X_data, y_data)
# NOTE(review): `X_test` is not defined anywhere in the visible notebook.
# Presumably it should come from `combiner.transform(<unlabelled DataFrame>)`;
# confirm and load that data before running this line.
# Despite its name, `y_test` holds the model's predictions for `X_test`.
y_test = model.predict(X_test)