In [1]:
import io
import logging

import gensim
import jieba
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

Using gpu device 0: GeForce GTX 980 Ti (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5103)


In [2]:
# Punctuation characters that are excluded from the vocabulary.
# The literals are unicode (u"...") so the same cell runs on Python 2 and 3
# without the Python-2-only ``str.decode`` call, and they are joined by
# adjacent-literal concatenation instead of a backslash continuation inside
# the string.
chinese_punc = (
    u"！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》﹑"
    # NOTE(review): the original literal was continued onto an indented line,
    # which put these four spaces into the set; they are kept so that
    # whitespace tokens are still treated as punctuation -- confirm intent.
    u"    "
    u"「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.。"
)
punc = chinese_punc
# ASCII punctuation (this already contains the backslash character).
english_punc = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
punc += english_punc

In [3]:
def loadDisease(d):
    """Read the text file for one disease.

    Args:
        d: Disease name; the file ``d + '.txt'`` is opened.

    Returns:
        A tuple ``(texts, d)`` where ``texts`` is the list of non-blank
        lines of the file (as unicode strings) and ``d`` is returned
        unchanged so it can be used as the label.
    """
    # io.open gives decoded text on both Python 2 and 3, independent of the
    # platform's default encoding.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with io.open(d + '.txt', encoding='utf-8') as p:
        s = p.read()
    # Skip blank lines (including the empty string produced by a trailing
    # newline) so they do not become empty, all-zero samples.
    return [line for line in s.split('\n') if line.strip()], d

def loadData(d_list):
    """Load every disease in ``d_list`` and flatten the results.

    Args:
        d_list: Iterable of disease names, each passed to ``loadDisease``.

    Returns:
        A tuple ``(texts, labels)`` of two parallel lists: all texts from
        all diseases, and for each text the label returned by
        ``loadDisease`` for the disease it came from.
    """
    texts = []
    labels = []
    for name in d_list:
        lines, label = loadDisease(name)
        texts += lines
        # One copy of the label per text keeps both lists aligned.
        labels += [label] * len(lines)
    return texts, labels

# def tokenize(texts):
#     return [list(jieba.cut(t, cut_all=False)) for t in texts]

def tokenize(text):
    """Segment one text with jieba and return the tokens as a list.

    ``jieba.cut`` is called with ``cut_all=False``; its result is
    materialised into a list before being returned.
    """
    segments = jieba.cut(text, cut_all=False)
    return list(segments)

In [4]:
class Corpus:
    """Vocabulary built from tokenized texts, with bag-of-words encoding.

    Attributes:
        word2index: dict mapping each vocabulary word to its column index.
        word2count: dict mapping each vocabulary word to its occurrence count.
        index2word: dict mapping each column index back to its word.
        n_words: number of words in the vocabulary.
        voc: list of vocabulary words, ordered by first appearance.
    """

    def __init__(self, texts, min_tf = 1, max_tf = 1):
        """Build the vocabulary from ``texts`` and optionally prune it.

        Args:
            texts: Iterable of token sequences (one sequence per document).
            min_tf: Minimum occurrence count a word needs to be kept.
            max_tf: Fraction of the distinct words, ranked by ascending
                count, that may be kept; the words above that rank (the most
                frequent ones) are dropped.

        Pruning only runs when ``min_tf > 1`` or ``max_tf < 1.0``.
        """
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.n_words = 0
        self.voc = []
        self.addTexts(texts)

        if min_tf > 1 or max_tf < 1.0:
            self._prune(min_tf, max_tf)

    def _prune(self, min_tf, max_tf):
        """Drop too-rare and too-frequent words and re-number the rest."""
        # Rank by ascending count. Sorting ``self.voc`` (first-appearance
        # order) with a stable sort makes ties deterministic, which sorting
        # ``dict.keys()`` does not guarantee on every Python version.
        ranked = sorted(self.voc, key=self.word2count.get)
        # A set makes the membership test below O(1) instead of a list scan.
        too_frequent = set(ranked[int(max_tf * len(ranked)):])
        kept = [
            word for word in self.voc
            if word not in too_frequent and self.word2count[word] >= min_tf
        ]
        self.word2count = {word: self.word2count[word] for word in kept}
        self.word2index = {word: index for index, word in enumerate(kept)}
        self.index2word = {index: word for index, word in enumerate(kept)}
        self.voc = kept
        self.n_words = len(kept)

    def addTexts(self, texts):
        """Add every word of every token sequence in ``texts``."""
        for text in texts:
            for word in text:
                self.addWord(word)

    def filterWord(self, word):
        """Return True when ``word`` must be left out of the vocabulary.

        The check is ``word in punc`` against the module-level ``punc``
        value, i.e. a substring test when ``punc`` is a string; an empty
        ``word`` is therefore filtered as well.
        """
        return word in punc

    def addWord(self, word):
        """Count one occurrence of ``word``, registering it if it is new."""
        if self.filterWord(word):
            return
        if word in self.word2index:
            self.word2count[word] += 1
            return
        self.word2index[word] = self.n_words
        self.word2count[word] = 1
        self.index2word[self.n_words] = word
        self.n_words += 1
        self.voc.append(word)

    def bagOfWords(self, tokens):
        """Encode token sequences as a count matrix.

        Args:
            tokens: Sequence of token sequences (one per document).

        Returns:
            A float ``numpy`` array of shape ``(len(tokens), len(self.voc))``
            whose entry ``[i, j]`` is how often the word with index ``j``
            occurs in document ``i``. Words outside the vocabulary are
            ignored.
        """
        X = np.zeros((len(tokens), len(self.voc)))
        for row, document in enumerate(tokens):
            for word in document:
                # Dict lookup instead of scanning the ``voc`` list per token.
                column = self.word2index.get(word)
                if column is not None:
                    X[row, column] += 1
        return X

In [5]:
# Disease names; each one is both the file stem read by loadData and the
# class label of the texts in that file.
d_list = ['冠心病', '心脏神经官能症', '病毒性心肌炎']
texts, labels = loadData(d_list)

# One token list per text.
tokens = list(map(tokenize, texts))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.323 seconds.
Prefix dict has been built succesfully.


In [6]:
# Keep only words that occur at least twice; max_tf=1 leaves the
# high-frequency cut-off disabled.
corpus = Corpus(
    tokens,
    min_tf=2,
    max_tf=1,
)
print(len(corpus.voc))

1456


In [7]:
# a = corpus.word2count.keys()
# print(len(a))
# a.sort(key=lambda k: corpus.word2count[k])
# for x in a:
#     print x, corpus.word2count[x]

In [8]:
# Feature matrix: one row per text, one column per vocabulary word.
X = corpus.bagOfWords(tokens)

# Sort the distinct labels so the label -> index mapping is the same on every
# run; iterating a set of strings directly can change order between kernel
# restarts.
u_labels = sorted(set(labels))
label2index = {label: index for index, label in enumerate(u_labels)}

# Class index of every text, stored as floats like the original np.zeros array.
y = np.array([label2index[label] for label in labels], dtype=float)

In [9]:
# Sanity check: y has one entry per row of X.
for array in (y, X):
    print(array.shape)

(476,)
(476, 1456)
