In [1]:
import pandas as pd 
import numpy as np
from collections import Counter
from sklearn import model_selection
from sklearn import metrics

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost 
import lightgbm

import codecs
import json

from pyhanlp import *
import gensim
from gensim.models import KeyedVectors, TfidfModel
from gensim.similarities import SparseMatrixSimilarity

from WordVectorFetcher import WordVectorFetcher

In [2]:
### load data 
df_data = pd.read_csv('data/sentiment_corpus_20191108.txt', encoding='utf8', sep='\t', names=['label', 'content'])
label2id = {'negative': -1, 'neutral': 0, 'positive': 1}
df_data['content_id'] = range(len(df_data))
df_data['label_id'] = df_data['label'].apply(lambda x: label2id[x])
print(df_data.shape)
df_data[:3]

(3000, 4)


Unnamed: 0,label,content,content_id,label_id
0,negative,[img]http://img.autohome.com.cn/album/smiles/s...,0,-1
1,negative,“戏说”奔驰女再次向奔驰维权：要求赔偿240万--致广大网友的一封公开信广大支持过我的网友，...,1,-1
2,negative,“这辆二手车多少钱买的?”因为家门口修车店维修工的这一句话，车主殷小姐憋了一肚子气，开着新买...,2,-1


In [3]:
# NLPTokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
def seg(doc):
    tokens = []
#     for item in NLPTokenizer.segment(doc):
    for item in HanLP.segment(doc):
        word = item.word
        tag = item.nature.toString()
        # http://www.hankcs.com/nlp/part-of-speech-tagging.html#h2-8
        if tag[0] not in ['b','m','p','q','u','x']:
            tokens.append(word)
#         tokens.append(word)
    return tokens

In [4]:
class TfidfWordVectorCombiner:
    def __init__(self):
        self.dictionary = None
        self.tfidf_model = None
        self.fetcher = WordVectorFetcher('tmp/sgns.sogou.word.bz2')
#         self.fetcher = WordVectorFetcher('tmp/sgns.zhihu.bigram-char.bz2')
        print('Loading word vector file...')
        self.fetcher.init()
        print('Done')
    
    def fit(self, df_train):
        corpus_train = list(df_train['content'].apply(seg))
        self.dictionary = gensim.corpora.Dictionary(corpus_train)
        with codecs.open('tmp/tfidf_vocab.csv', 'w', encoding='utf8') as fout:
            for id, tok in self.dictionary.id2token.items():
                fout.write("{}\t{}\n".format(id, tok))
        corpus_train_bow = [self.dictionary.doc2bow(tokens) for tokens in corpus_train]
        self.tfidf_model = TfidfModel(corpus_train_bow)
        return

    def transform(self, df):
        corpus = list(df['content'].apply(seg))
        corpus_bow = [self.dictionary.doc2bow(tokens) for tokens in corpus]
        tfidf_corpus = [t for t in self.tfidf_model[corpus_bow]]
        arr = []
        for tfidf_doc in tfidf_corpus:
            vec = np.zeros_like(self.fetcher.get_word_vector(u""))
            for token_id, token_tfidf in tfidf_doc:
                token = self.dictionary[token_id]
                vec += token_tfidf * self.fetcher.get_word_vector(token)
            arr.append(vec.reshape((1, len(vec))))
        X = np.concatenate(arr)
        return X
    
    def fit_transform(self, df_train):
        corpus_train = list(df_train['content'].apply(seg))
        self.dictionary = gensim.corpora.Dictionary(corpus_train)
        with codecs.open('tmp/tfidf_vocab.csv', 'w', encoding='utf8') as fout:
            for id, tok in self.dictionary.id2token.items():
                fout.write("{}\t{}\n".format(id, tok))
        corpus_train_bow = [self.dictionary.doc2bow(tokens) for tokens in corpus_train]
        self.tfidf_model = TfidfModel(corpus_train_bow)
        
        tfidf_corpus = [t for t in self.tfidf_model[corpus_train_bow]]
        arr = []
        for tfidf_doc in tfidf_corpus:
            vec = np.zeros_like(self.fetcher.get_word_vector(u""))
            for token_id, token_tfidf in tfidf_doc:
                token = self.dictionary[token_id]
                vec += token_tfidf * self.fetcher.get_word_vector(token)
            arr.append(vec.reshape((1, len(vec))))
        X = np.concatenate(arr)
        return X

combiner = TfidfWordVectorCombiner()

Loading word vector file...
Done


In [11]:
class EnsembleModel():
    def __init__(self):
        self.xgb = xgboost.XGBClassifier(
            n_estimators=100, 
            n_jobs=-1, 
            objective='multi:softmax', 
            num_class=3,
            max_depth=3,
            subsample=0.8,
            gamma=0
        )
        self.lr = LogisticRegression(
            n_jobs=-1, 
            solver='lbfgs', 
            multi_class='auto',
            max_iter=500
        )
        self.svc = SVC(
            gamma='scale', 
            kernel='rbf'
        )
        self.knn = KNeighborsClassifier(
            n_neighbors=5, 
            n_jobs=-1
        )

    def fit(self, X_train, y_train):
        print('### fitting xgb...')
        self.xgb.fit(X_train, y_train)
        print('### fitting svc...')
        self.svc.fit(X_train, y_train)
        print('### fitting knn...')
        self.knn.fit(X_train, y_train)
        print('### fitting lr...')
        self.lr.fit(X_train, y_train)
        
    def predict(self, X_val, y_val=None):
        n = len(X_val)
        y_xgb = self.xgb.predict(X_val)
        y_svc = self.svc.predict(X_val)
        y_knn = self.knn.predict(X_val)
        y_lr = self.lr.predict(X_val)
        y_val_pred = np.concatenate([
            y_xgb.reshape((n, 1)), 
            y_svc.reshape((n, 1)),
            y_knn.reshape((n, 1)),
            y_lr.reshape((n, 1)),
        ], axis=1)
        if y_val is not None:
            print(
                metrics.accuracy_score(y_true=y_val, y_pred=y_xgb),
                metrics.accuracy_score(y_true=y_val, y_pred=y_svc),
                metrics.accuracy_score(y_true=y_val, y_pred=y_knn),
                metrics.accuracy_score(y_true=y_val, y_pred=y_lr),
            )
        y_val_pred = [Counter(i).most_common(1)[0][0] for i in y_val_pred]
        return y_val_pred

model = EnsembleModel()
# print(model)

In [12]:
for _ in range(5):
    print('### Spliting data...')
    df_train, df_val = model_selection.train_test_split(
        df_data, test_size=0.2, 
    #     random_state=42, 
        shuffle=True, 
        stratify=df_data['label']
    )
    y_train = df_train['label'].values
    y_val = df_val['label'].values

    print('### Preparing features...')
    X_train = combiner.fit_transform(df_train)
    X_val = combiner.transform(df_val)

    model.fit(X_train, y_train)

    print(metrics.classification_report(y_true=y_train, y_pred=model.predict(X_train, y_train)))
    print(metrics.classification_report(y_true=y_val, y_pred=model.predict(X_val, y_val)))

### Spliting data...
### Preparing features...
### fitting xgb...
### fitting svc...
### fitting knn...
### fitting lr...
0.9516666666666667 0.8441666666666666 0.8433333333333334 0.8833333333333333
              precision    recall  f1-score   support

    negative       0.92      0.98      0.95       800
     neutral       0.93      0.86      0.89       800
    positive       0.93      0.95      0.94       800

    accuracy                           0.93      2400
   macro avg       0.93      0.93      0.93      2400
weighted avg       0.93      0.93      0.93      2400

0.8066666666666666 0.82 0.7733333333333333 0.7783333333333333
              precision    recall  f1-score   support

    negative       0.87      0.87      0.87       200
     neutral       0.75      0.70      0.72       200
    positive       0.82      0.86      0.84       200

    accuracy                           0.81       600
   macro avg       0.81      0.81      0.81       600
weighted avg       0.81      0.81

KeyboardInterrupt: 

In [7]:
# model.fit(X_data, y_data)
# y_test = model.predict(X_test)

In [8]:
# df_test = pd.read_csv('data/real_senti_demo_nolabel.txt', encoding='utf8', sep='\t', names=['content'])
# X_test = combiner.transform(df_test)
# y_test_pred = model.predict(X_test)
# df_test['label'] = y_test_pred
# df_test[['label', 'content']].to_csv('data/submission.csv', encoding='utf8', sep='\t', index=False, header=False)


In [9]:

# params_for_svc = [
#     {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
#     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
# ]
# cv_params = {'n_estimators': [400, 500, 600, 700, 800]}
# other_params = {
#     'learning_rate': 0.1, 
#     'n_estimators': 500, 
#     'max_depth': 5, 
#     'min_child_weight': 1, 
#     'seed': 0,
#     'subsample': 0.8, 
#     'colsample_bytree': 0.8, 
#     'gamma': 0, 
#     'reg_alpha': 0, 
#     'reg_lambda': 1, 
#     'n_jobs': -1
# }
# model = xgboost.XGBClassifier(**other_params)
# optimized_GBM = model_selection.GridSearchCV(
#     estimator=model, param_grid=cv_params, 
#     scoring='accuracy', cv=5, verbose=True, n_jobs=-1
# )
# optimized_GBM.fit(X_train, y_train)
# evalute_result = optimized_GBM.grid_scores_
# print('每轮迭代运行结果:{0}'.format(evalute_result))
# print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
# print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

# y_true, y_pred = y_val, clf.predict(X_val)
# print(metrics.classification_report(y_true, y_pred))