In [70]:
import pandas as pd 
import numpy as np 
import xgboost 
import lightgbm
from sklearn import model_selection
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import codecs
import json

from pyhanlp import *
import gensim
from gensim.models import KeyedVectors, TfidfModel
from gensim.similarities import SparseMatrixSimilarity

from WordVectorFetcher import WordVectorFetcher

In [2]:
### load data
# The corpus file has no header row: each line is "<label>\t<content>".
label2id = {'negative': -1, 'neutral': 0, 'positive': 1}
df_data = pd.read_csv(
    'data/sentiment_corpus_20191108.txt',
    sep='\t',
    names=['label', 'content'],
    encoding='utf8',
)
# Positional document id plus the numeric encoding of the string label.
df_data['content_id'] = range(df_data.shape[0])
df_data['label_id'] = df_data['label'].map(lambda name: label2id[name])
print(df_data.shape)
df_data.head(3)

(3000, 4)


Unnamed: 0,label,content,content_id,label_id
0,negative,[img]http://img.autohome.com.cn/album/smiles/s...,0,-1
1,negative,“戏说”奔驰女再次向奔驰维权：要求赔偿240万--致广大网友的一封公开信广大支持过我的网友，...,1,-1
2,negative,“这辆二手车多少钱买的?”因为家门口修车店维修工的这一句话，车主殷小姐憋了一肚子气，开着新买...,2,-1


In [3]:
# NLPTokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
def seg(doc):
    """Segment ``doc`` with ``HanLP.segment`` and return the kept words as a list.

    A term is dropped when the first character of its part-of-speech tag
    (``term.nature.toString()``) is one of 'b', 'm', 'p', 'q', 'u' or 'x'.
    Tag set reference:
    http://www.hankcs.com/nlp/part-of-speech-tagging.html#h2-8

    ``NLPTokenizer.segment`` (com.hankcs.hanlp.tokenizer.NLPTokenizer) was the
    alternative segmenter tried here.
    """
    dropped_tag_initials = ['b', 'm', 'p', 'q', 'u', 'x']
    return [
        term.word
        for term in HanLP.segment(doc)
        if term.nature.toString()[0] not in dropped_tag_initials
    ]

In [4]:
class TfidfWordVectorCombiner:
    """Embed documents as TF-IDF-weighted sums of their tokens' word vectors.

    Each document in the ``'content'`` column is segmented with ``seg``,
    converted to a bag of words with a gensim ``Dictionary``, weighted by a
    gensim ``TfidfModel`` and reduced to one dense row: the sum over its
    tokens of ``tfidf_weight * word_vector``.

    Attributes:
        dictionary: gensim ``Dictionary`` built by ``fit`` / ``fit_transform``
            (``None`` until then).
        tfidf_model: gensim ``TfidfModel`` built by ``fit`` / ``fit_transform``
            (``None`` until then).
        fetcher: ``WordVectorFetcher`` used to look up word vectors.
        vocab_path: file the ``id<TAB>token`` vocabulary is dumped to on fit.
    """

    def __init__(self, word_vector_path='tmp/sgns.sogou.word.bz2',
                 vocab_path='tmp/tfidf_vocab.csv'):
        """Create the combiner and load the word-vector file.

        Args:
            word_vector_path: file handed to ``WordVectorFetcher``
                (another file tried here: 'tmp/sgns.zhihu.bigram-char.bz2').
            vocab_path: where the fitted vocabulary is written.
        """
        self.dictionary = None
        self.tfidf_model = None
        self.vocab_path = vocab_path
        self.fetcher = WordVectorFetcher(word_vector_path)
        print('Loading word vector file...')
        self.fetcher.init()
        print('Done')

    def _fit_tokens(self, corpus_tokens):
        """Fit dictionary and TF-IDF model on tokenised docs; return their bag-of-words corpus."""
        self.dictionary = gensim.corpora.Dictionary(corpus_tokens)
        self._write_vocab()
        corpus_bow = [self.dictionary.doc2bow(tokens) for tokens in corpus_tokens]
        self.tfidf_model = TfidfModel(corpus_bow)
        return corpus_bow

    def _write_vocab(self):
        """Dump the fitted vocabulary as ``id<TAB>token`` lines, ordered by id."""
        # Dictionary.id2token is filled lazily by gensim and is still empty
        # right after construction, so iterate token2id (always populated).
        vocab = sorted(self.dictionary.token2id.items(), key=lambda pair: pair[1])
        with codecs.open(self.vocab_path, 'w', encoding='utf8') as fout:
            for token, token_id in vocab:
                fout.write("{}\t{}\n".format(token_id, token))

    def _embed(self, corpus_bow):
        """Turn a bag-of-words corpus into an (n_docs, dim) matrix of weighted vector sums."""
        # The vector returned for the empty string only serves as a shape/dtype
        # template (assumed to be a 1-D numpy array -- TODO confirm in
        # WordVectorFetcher); fetch it once instead of once per document.
        template = np.zeros_like(self.fetcher.get_word_vector(u""))
        X = np.zeros((len(corpus_bow), len(template)), dtype=template.dtype)
        if not corpus_bow:
            # No documents: return an empty (0, dim) matrix instead of failing.
            return X
        for row, tfidf_doc in enumerate(self.tfidf_model[corpus_bow]):
            vec = X[row]  # view into X, so the in-place adds fill the matrix
            for token_id, token_tfidf in tfidf_doc:
                token = self.dictionary[token_id]
                vec += token_tfidf * self.fetcher.get_word_vector(token)
        return X

    def fit(self, df_train):
        """Fit the dictionary and TF-IDF model on ``df_train['content']``.

        Also writes the vocabulary to ``self.vocab_path``. Returns ``None``.
        """
        self._fit_tokens(list(df_train['content'].apply(seg)))
        return

    def transform(self, df):
        """Embed ``df['content']`` with the already fitted dictionary/TF-IDF model.

        Returns:
            numpy array with one row per document.
        """
        corpus = list(df['content'].apply(seg))
        corpus_bow = [self.dictionary.doc2bow(tokens) for tokens in corpus]
        return self._embed(corpus_bow)

    def fit_transform(self, df_train):
        """Fit on ``df_train['content']`` and return its embedding.

        Equivalent to ``fit`` followed by ``transform`` on the same frame, but
        the documents are segmented only once.
        """
        corpus_train = list(df_train['content'].apply(seg))
        return self._embed(self._fit_tokens(corpus_train))

# Build the shared feature extractor once; the constructor loads the
# word-vector file through WordVectorFetcher.init() and prints its progress.
combiner = TfidfWordVectorCombiner()

Loading word vector file...
Done


In [5]:
# with codecs.open('tmp/tfidf_vocab.csv', 'w', encoding='utf8') as fout:
#     for id, tok in combiner.dictionary.id2token.items():
#         fout.write("{}\t{}\n".format(id, tok))

AttributeError: 'NoneType' object has no attribute 'id2token'

In [85]:
# Candidate classifiers for the 3-class sentiment task; exactly one of them is
# bound to `model` at the bottom of this cell and used by the cells that follow.
xgb = xgboost.XGBClassifier(
    n_estimators=100,
    n_jobs=-1,
    objective='multi:softmax',
    num_class=3,
    max_depth=3,
    subsample=0.8,
    gamma=0
)
lgbm = lightgbm.LGBMClassifier(
    n_estimators=50,
    # LightGBM's documented multi-class objective is 'multiclass' (alias
    # 'softmax'); 'multi:softmax' is XGBoost's name, not a LightGBM one.
    objective='multiclass',
    n_jobs=-1,
    num_class=3
)
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, min_samples_leaf=5)
lr = LogisticRegression(n_jobs=-1, solver='lbfgs', multi_class='auto')
svc = SVC(gamma='scale', kernel='rbf')

# Estimator actually trained and evaluated below.
model = svc
print(model)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [86]:
# Hold-out evaluation of `model`: NUM_OF_VALIDATION independent rounds, each
# with a fresh split, freshly fitted features and a refit of the model.
# The variables assigned here (df_train, X_train, y_train, X_val, ...) stay in
# the notebook namespace after the loop, holding the values of the last round.
NUM_OF_VALIDATION = 1
for _ in range(NUM_OF_VALIDATION):
    print('### Spliting data...')
    # 80/20 split, stratified on the string label so class proportions match.
    df_train, df_val = model_selection.train_test_split(
        df_data, test_size=0.2, 
    #     random_state=42,  (left unset, so every round/run draws a different split)
        shuffle=True, 
        stratify=df_data['label']
    )
    # Targets are the raw string labels; the numeric 'label_id' column is not used here.
    y_train = df_train['label'].values
    y_val = df_val['label'].values
    print('y_train:', y_train.shape, 'y_val:', y_val.shape)

    print('### Preparing features...')
    # The dictionary and TF-IDF model are fitted on the training rows only;
    # the validation rows are transformed with that fitted state.
    X_train = combiner.fit_transform(df_train)
    X_val = combiner.transform(df_val)
    print('X_train:', X_train.shape, 'X_val:',  X_val.shape)
    
    print('### fitting...')
    print(model)
    model.fit(X_train, y_train)

    # Report on both sets so the train/validation gap is visible.
    print(' Classification Report on Train set ')
    y_train_pred = model.predict(X_train)
    print(metrics.classification_report(y_true=y_train, y_pred=y_train_pred))
    print(' Classification Report on Val set ')
    y_val_pred = model.predict(X_val)
    print(metrics.classification_report(y_true=y_val, y_pred=y_val_pred))

### Spliting data...
y_train: (2400,) y_val: (600,)
### Preparing features...
X_train: (2400, 300) X_val: (600, 300)
### fitting...
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
 Classification Report on Train set 
              precision    recall  f1-score   support

    negative       0.85      0.91      0.88       800
     neutral       0.80      0.73      0.76       800
    positive       0.87      0.89      0.88       800

    accuracy                           0.84      2400
   macro avg       0.84      0.84      0.84      2400
weighted avg       0.84      0.84      0.84      2400

 Classification Report on Val set 
              precision    recall  f1-score   support

    negative       0.84      0.85      0.85       200
     neutral       0.73      0.67      0.69       200
    positive       0

In [87]:
# xgb = xgboost.XGBClassifier(
#     n_estimators=50, 
#     n_jobs=-1, 
#     objective='multi:softmax', 
#     num_class=3,
#     max_depth=3,
#     subsample=1,
#     gamma=0
# )
# lgbm = lightgbm.LGBMClassifier(
#     n_estimators=50,
#     objective='multi:softmax',
#     n_jobs=-1,
#     num_class=3,
#     min_child_weight=0.01,
#     subsample=1
# )
# model = svc
# print(model)
# model.fit(X_train, y_train)
# print(' Classification Report on Train set ')
# y_train_pred = model.predict(X_train)
# print(metrics.classification_report(y_true=y_train, y_pred=y_train_pred))
# print(' Classification Report on Val set ')
# y_val_pred = model.predict(X_val)
# print(metrics.classification_report(y_true=y_val, y_pred=y_val_pred))

In [None]:
# Final fit on the entire labelled corpus. X_data / y_data were never defined
# in this notebook, so build them here: refit the feature extractor on all
# rows of df_data and use the string labels as targets (as the validation
# cell does).
X_data = combiner.fit_transform(df_data)
y_data = df_data['label'].values
model.fit(X_data, y_data)
# Prediction on the unlabelled file happens in the next cell, which is where
# X_test is first created; predicting here would fail on a fresh kernel.

In [14]:
# Load the unlabelled documents (a single 'content' column, no header row),
# embed them with the fitted combiner and predict with the current model.
df_test = pd.read_csv(
    'data/real_senti_demo_nolabel.txt',
    sep='\t',
    names=['content'],
    encoding='utf8',
)
X_test = combiner.transform(df_test)
y_test_pred = model.predict(X_test)
df_test['label'] = y_test_pred
# Submission format: "<label>\t<content>" per line, without header or index.
df_test.to_csv(
    'data/submission.csv',
    columns=['label', 'content'],
    sep='\t',
    encoding='utf8',
    header=False,
    index=False,
)


In [35]:

# params_for_svc = [
#     {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
#     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
# ]
# cv_params = {'n_estimators': [400, 500, 600, 700, 800]}
# other_params = {
#     'learning_rate': 0.1, 
#     'n_estimators': 500, 
#     'max_depth': 5, 
#     'min_child_weight': 1, 
#     'seed': 0,
#     'subsample': 0.8, 
#     'colsample_bytree': 0.8, 
#     'gamma': 0, 
#     'reg_alpha': 0, 
#     'reg_lambda': 1, 
#     'n_jobs': -1
# }
# model = xgboost.XGBClassifier(**other_params)
# optimized_GBM = model_selection.GridSearchCV(
#     estimator=model, param_grid=cv_params, 
#     scoring='accuracy', cv=5, verbose=True, n_jobs=-1
# )
# optimized_GBM.fit(X_train, y_train)
# evalute_result = optimized_GBM.grid_scores_
# print('每轮迭代运行结果:{0}'.format(evalute_result))
# print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
# print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

# y_true, y_pred = y_val, clf.predict(X_val)
# print(metrics.classification_report(y_true, y_pred))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 