In [3]:
import pandas as pd 
import numpy as np 
import xgboost 
from sklearn import model_selection

import jieba
from jieba import posseg
from pyhanlp import *
import gensim
from gensim.models import KeyedVectors, TfidfModel
from gensim.similarities import SparseMatrixSimilarity

# Load Data & Split Data

In [7]:
### load data
# The corpus is tab-separated with one `<label>\t<content>` record per line;
# column names are supplied here rather than read from the file.
label2id = {'negative': -1, 'neutral': 0, 'positive': 1}
df_data = pd.read_csv(
    'data/sentiment_corpus_20191108.txt',
    sep='\t',
    names=['label', 'content'],
    encoding='utf8',
)
# Running row number, kept as an explicit column so it survives shuffling.
df_data['content_id'] = range(len(df_data))
# Numeric class id; an unexpected label raises KeyError instead of passing silently.
df_data['label_id'] = [label2id[name] for name in df_data['label']]
print(df_data.shape)
df_data[:3]

(3000, 4)


Unnamed: 0,label,content,content_id,label_id
0,negative,[img]http://img.autohome.com.cn/album/smiles/s...,0,-1
1,negative,“戏说”奔驰女再次向奔驰维权：要求赔偿240万--致广大网友的一封公开信广大支持过我的网友，...,1,-1
2,negative,“这辆二手车多少钱买的?”因为家门口修车店维修工的这一句话，车主殷小姐憋了一肚子气，开着新买...,2,-1


In [8]:
### split dataset
# Hold out 20% for validation, stratified on label_id so that both parts keep
# the class proportions of the full corpus; the seed is fixed for reproducibility.
df_train, df_val = model_selection.train_test_split(
    df_data,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=df_data['label_id'],
)
print(df_train.shape, df_val.shape)
# Per-class counts of the training part, then of the validation part.
print(
    df_train['label_id'].value_counts(),
    df_val['label_id'].value_counts(),
    sep='\n',
)

(2400, 4) (600, 4)
 1    800
-1    800
 0    800
Name: label_id, dtype: int64
-1    200
 1    200
 0    200
Name: label_id, dtype: int64


In [4]:
### segmentation tokenize
# Quick look at the first three training rows before tokenizing.
df_train.head(3)

Unnamed: 0,label,content,content_id,label_id
2108,positive,。中东V97抢购天津夏季进口三菱帕杰罗外型设计新帕杰罗的外型突出了其开发的主题思想-即“面向...,2108,1
444,negative,偶的F2T958怠速开空调有时也有哒哒声，不过是有规律的响，一阵一阵的，偶试过别的车也有如富...,444,-1
514,negative,上下班市区里开，原来首选是1.8T，但看了不少帖子后，对1.8T的故障越看越害怕，2.0T要...,514,-1


In [5]:
# Handle to HanLP's Java NLPTokenizer class, obtained via JClass (from the
# pyhanlp star-import); seg() calls its segment() method to get words and tags.
NLPTokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
# print(NLPTokenizer.segment("我新造一个词叫幻想乡你能识别并正确标注词性吗？"))  # “正确” is tagged as an adverbial adjective.
# # Note the POS tags of the two occurrences of “希望” and of the two occurrences of “晚霞” below
# print(NLPTokenizer.analyze("我的希望是希望张晚霞的背影被晚霞映红").translateLabels())
# print(NLPTokenizer.analyze("支援臺灣正體香港繁體：微软公司於1975年由比爾·蓋茲和保羅·艾倫創立。"))

def seg(doc, tokenizer=None, skip_tag_prefixes=('w', 'x', 'y')):
    """Segment ``doc`` into words, dropping tokens with filtered POS tags.

    Parameters
    ----------
    doc : str
        Raw text to segment. ``None``, NaN (what pandas yields for a missing
        cell) and the empty string all produce an empty token list instead of
        failing inside the tokenizer.
    tokenizer : object, optional
        Anything exposing ``segment(text)`` that yields terms carrying
        ``.word`` and ``.nature`` (with a ``toString()`` method). Defaults to
        the module-level ``NLPTokenizer``.
    skip_tag_prefixes : iterable of str, optional
        A token is discarded when its tag starts with one of these prefixes.
        The default drops the 'w', 'x' and 'y' tag families (punctuation,
        non-word strings and modal particles in the HanLP tag set, see
        http://www.hankcs.com/nlp/part-of-speech-tagging.html#h2-8).

    Returns
    -------
    list
        The ``word`` of every kept term, in document order.
    """
    # `doc != doc` is only true for NaN, i.e. a missing value coming from pandas.
    if doc is None or doc == '' or (isinstance(doc, float) and doc != doc):
        return []
    if tokenizer is None:
        tokenizer = NLPTokenizer
    prefixes = tuple(skip_tag_prefixes)

    tokens = []
    for item in tokenizer.segment(doc):
        nature = item.nature
        # A term without a tag is kept: there is nothing to filter it on.
        tag = '' if nature is None else str(nature.toString())
        # startswith() is safe on an empty tag, unlike indexing tag[0].
        if not tag.startswith(prefixes):
            tokens.append(item.word)
    return tokens

In [6]:
# Tokenize every training document, then build the token <-> id vocabulary
# from the tokenized corpus.
corpus = [seg(text) for text in df_train['content']]
dictionary = gensim.corpora.Dictionary(corpus)
print(dictionary)

Dictionary(77213 unique tokens: ['1%', '17694994877', '184', '18802221755', '2']...)


In [7]:
# Encode each tokenized training document as a bag-of-words using `dictionary`.
corpus_bow = [dictionary.doc2bow(tokens) for tokens in corpus]

In [8]:
# Fit a TF-IDF weighting model on the bag-of-words training corpus.
tfidf = TfidfModel(corpus_bow)

In [9]:
# tfidf[dictionary.doc2bow(corpus[1])]
# tfidf[dictionary.doc2bow(seg(df_val['content'].values[1]))]

In [10]:
# index = SparseMatrixSimilarity(tfidf[corpus_bow[:10]], num_features=12)
# index[tfidf[dictionary.doc2bow(seg(df_val['content'].values[1]))]]

In [None]:
### embedding

In [4]:
class WordVectorFetcher:
    """Word and sentence embedding lookup backed by a word2vec-format file.

    The vectors are loaded by :meth:`init`; if a lookup method is called
    before that, the file is loaded on first use instead of failing on a
    ``None`` model.

    Parameters
    ----------
    filename : str
        Path of the vector file handed to
        ``KeyedVectors.load_word2vec_format``.
    tokenizer : callable, optional
        ``tokenizer(sentence)`` must return an iterable of word strings.
        When omitted, sentences are segmented with ``HanLP.segment``.
    """

    def __init__(self, filename, tokenizer=None):
        self.wv_filename = filename
        self.wv = None  # populated by init()
        self.tokenizer = tokenizer

    def init(self):
        """Load the word vectors from ``self.wv_filename`` into ``self.wv``."""
        self.wv = KeyedVectors.load_word2vec_format(self.wv_filename)

    def _ensure_loaded(self):
        """Load the vectors on first use if init() has not been called yet."""
        if self.wv is None:
            self.init()

    def _tokenize(self, sentence):
        """Split ``sentence`` into a list of word strings."""
        if self.tokenizer is not None:
            return list(self.tokenizer(sentence))
        return [item.word for item in HanLP.segment(sentence)]

    def get_word_vector(self, word):
        """Return the vector of ``word``, or a zero vector if it is unknown."""
        self._ensure_loaded()
        if word in self.wv:
            return self.wv[word]
        return np.zeros(self.wv.vector_size)

    def get_sentence_vector(self, sentence):
        """Return the mean vector of the in-vocabulary words of ``sentence``.

        Out-of-vocabulary words are ignored. A sentence that is empty/None or
        contains no known word yields a zero vector of size ``vector_size``.
        """
        self._ensure_loaded()
        vec_fin = np.zeros(self.wv.vector_size)
        if not sentence:
            return vec_fin
        cnt = 0
        for w in self._tokenize(sentence):
            if w in self.wv:
                # Single lookup per word; membership was just checked.
                vec_fin += self.wv[w]
                cnt += 1
        if cnt > 0:
            vec_fin = vec_fin / cnt
        return vec_fin

    def get_sentence_similarity(self, s1, s2):
        """Cosine similarity between the mean vectors of two sentences.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(1,)`` holding the similarity. If either
            sentence maps to the zero vector the cosine is undefined, so
            ``[0.0]`` is returned rather than NaN.
        """
        v1 = self.get_sentence_vector(s1)
        v2 = self.get_sentence_vector(s2)
        # Guard against 0/0 inside cosine_similarities.
        if not v1.any() or not v2.any():
            return np.zeros(1)
        return self.wv.cosine_similarities(v1, [v2])
        # return self.wv.wmdistance(s1, s2)

# Load the pretrained vectors, then sanity-check sentence similarity on a few
# hand-picked sentence pairs (one similarity array is printed per pair).
fn = 'data/sgns.sogou.word.bz2'
fetcher = WordVectorFetcher(fn)
fetcher.init()
print(
    *(
        fetcher.get_sentence_similarity(first, second)
        for first, second in (
            (u'今天天气算不错的了', u'今天在北京没下雨'),
            (
                u'车头大面积进气格栅用镀铬材质进行装饰后年轻化效果显著',
                u'同时，在车头两侧，还有LED光源的头灯进行加持，夜间点亮后辨识度也很高',
            ),
            (
                u'方向盘低速灵活高速平稳，就算18寸的大脚跑高速120都稳稳得一点都不飘',
                u'在路上不放音乐听发动机声音很平顺，高速过弯车身倾斜也很小，高速120会有风噪声',
            ),
        )
    ),
    sep='\n',
)


In [11]:
# fetcher.get_sentence_vector(df_val['content'].values[0])
# todo sen_vec * tfidf ==> 300d

In [13]:
### modeling

In [None]:
### training 