In [1]:
#!/usr/local/bin/python3
import pickle
from prep import Review
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
from sklearn.naive_bayes import MultinomialNB
import numpy as np

swset = None


def load_test_set():
    trainfile = open('test.pkl', 'rb')
    test = pickle.load(trainfile)
    trainfile.close()
    return test


def load_train_set():
    trainfile = open('train.pkl', 'rb')
    train_en = pickle.load(trainfile)
    train_cn = pickle.load(trainfile)
    trainfile.close()
    return train_en, train_cn


def load_stopwords_set():
    stopwords = set()
    swfname = 'ChineseStopWords.txt'
    with open(swfname, 'r') as file:
        for line in file.readlines():
            # if line[-1] == '\0':
            # print(line)
            if line[-1] == '\n':
                stopwords.add(line[:-1])
            else:
                stopwords.add(line)
            # print(line[:-1] + '-----')
    return stopwords


def mycut(text):
    words = jieba.cut(text, cut_all=False)
    ret = []
    for word in words:
        if word not in swset:
            ret.append(word)
    return ' '.join(ret)



train_en, train_cn = load_train_set()
train = train_en + train_cn
test = load_test_set()
swset = load_stopwords_set()
# pre load jieba dict
mycut('xxx')
ntimes = 30
p = 1000
n = 400
en_train = []
cn_train = []
en_test = []
cn_test = []
label = []
for review in train:
    en_train.append(review.en_summary + '.' + review.en_text)
    cn_train.append(mycut(review.cn_summary + '。' + review.cn_text))
    label.append(review.polarity)
for review in test:
    en_test.append(review.en_summary + '.' + review.en_text)
    cn_test.append(mycut(review.cn_summary + '。' + review.cn_text))

en_vectorizer = TfidfVectorizer(min_df=1)
cn_vectorizer = TfidfVectorizer(min_df=1)
train_enx = en_vectorizer.fit_transform(en_train).toarray()
train_cnx = cn_vectorizer.fit_transform(cn_train).toarray()
test_enx = en_vectorizer.transform(en_test).toarray()
test_cnx = cn_vectorizer.transform(cn_test).toarray()
print('Training for %d times. p=%d, n=%d' % (ntimes, p, n))


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/t6/xzky4z1j31l0pypw608w2j_m0000gn/T/jieba.cache
Loading model cost 1.170 seconds.
Prefix dict has been built succesfully.


Training for 30 times. p=1000, n=400


In [2]:
# for itime in range(ntimes):
# print('----------Train #%d----------' % itime)

print('Sample number of train set: %d' % len(label))
print('Sample number of unlabeled set: %d' % len(en_test))

if len(label) < p or len(en_test) < n:
    print('p or n is too large now, break')
#     break

en_clf = MultinomialNB(alpha=0.01)
cn_clf = MultinomialNB(alpha=0.01)

en_clf.fit(train_enx, label)
cn_clf.fit(train_cnx, label)

en_predict = en_clf.predict(test_enx)
en_predict_proba = en_clf.predict_proba(test_enx)
cn_predict = cn_clf.predict(test_cnx)
cn_predict_proba = cn_clf.predict_proba(test_cnx)

Sample number of train set: 12120
Sample number of unlabeled set: 12000


In [3]:
# get index of top p predict_proba in en_predict
en_idx_proba = []
for idx, proba in enumerate(en_predict_proba):
    en_idx_proba.append((idx, proba[en_predict[idx]]))
# print(en_idx_proba)
sorted_en_proba = sorted(
    en_idx_proba,
    key=lambda tmp: tmp[1],
    reverse=True
)
selected_en_idx = [item[0] for item in sorted_en_proba[:p]]

# get index of top n predict_proba in en_predict
cn_idx_proba = []
for idx, proba in enumerate(cn_predict_proba):
    cn_idx_proba.append((idx, proba[cn_predict[idx]]))
sorted_cn_proba = sorted(
    cn_idx_proba,
    key=lambda tmp: tmp[1],
    reverse=True
)
selected_cn_idx = [item[0] for item in sorted_cn_proba[:n]]
idx_merge = set(selected_en_idx) | set(selected_cn_idx)

In [6]:
print(train_enx.shape)
print(train_cnx.shape)
print(test_enx.shape)
print(test_cnx.shape)
print(test_enx[0].shape)

testx = np.vstack((train_enx, test_enx[0]))
print(testx.shape)

(12120, 54836)
(12120, 54234)
(12000, 54836)
(12000, 54234)
(54836,)
(12121, 54836)


In [8]:
cnt = 0
selected_idx = []
for idx in idx_merge:
    if en_predict[idx] != cn_predict[idx]:
        continue
    selected_idx.append(idx)
    label.append(en_predict[idx])
    
# train_enx.append(test_enx[idx])
train_enx = np.vstack((train_enx, test_enx[selected_idx]))
train_cnx = np.vstack((train_cnx, test_cnx[selected_idx]))
test_enx = np.delete(test_enx, selected_idx, axis=0)
test_cnx = np.delete(test_cnx, selected_idx, axis=0)
print('Moving %d sample from unlabeled to labeled set' % len(selected_idx))

IndexError: index 10877 is out of bounds for axis 0 with size 10877

In [9]:
print(len(selected_idx))
print(train_enx.shape, test_enx.shape)

1123
(13243, 54836) (10877, 54836)
