In [2]:
import numpy as np

In [3]:
# Load the pre-tokenised corpus (programs) and the multiple-choice questions.
# The data is indexed below as ragged nested lists, which NumPy can only store
# as pickled object arrays; since NumPy 1.16.3 np.load refuses to read those
# unless allow_pickle=True is passed explicitly.
# NOTE: unpickling can execute arbitrary code -- only load .npy files you trust.
cut_programs = np.load('cut_Programs.npy', allow_pickle=True)
cut_Question = np.load('cut_Questions.npy', allow_pickle=True)

### Preprocessing: Word Dictionary & Out-of-Vocabulary

In [4]:
# Frequency table filled by add_word_dict: token -> number of occurrences.
word_dict = {}

In [5]:
def add_word_dict(w):
    """Increment the occurrence count of token `w` in the global `word_dict`.

    A token seen for the first time starts at 1.
    """
    word_dict[w] = word_dict.get(w, 0) + 1

In [6]:
# Count every token of the corpus. The nesting is
# program -> episode -> line -> token, matching the way cut_programs is
# indexed when training pairs are sampled later on.
for program in cut_programs:
    for episode in program:
        for tokens in episode:
            for token in tokens:
                add_word_dict(token)

In [7]:
# Add the tokens of the questions to the same frequency table.
# Element 0 of a question is a list of lines; elements 1..6 are single lines
# (presumably the context and the six candidate answers -- confirm against
# the data description).
for question in cut_Question:
    for context_tokens in question[0]:
        for token in context_tokens:
            add_word_dict(token)

    for option_idx in range(1, 7):
        for token in question[option_idx]:
            add_word_dict(token)

In [8]:
import operator

# Turn the frequency table into a list of (word, count) pairs ordered from the
# most frequent to the least frequent word.
# NOTE: this rebinds `word_dict` from a dict to a list, so add_word_dict can no
# longer be called afterwards unless `word_dict` is re-created as a dict.
word_dict = list(word_dict.items())
word_dict.sort(key=operator.itemgetter(1), reverse=True)

In [9]:
# Vocabulary selection: skip the VOC_START highest-ranked entries of the
# frequency-sorted word list and keep the VOC_SIZE entries that follow.
VOC_START = 20
VOC_SIZE = 30000

voc_dict = word_dict[VOC_START:][:VOC_SIZE]
print(voc_dict[:10])
# Persist the selected (word, count) pairs for later reuse.
np.save('voc_dict.npy', voc_dict)

[('都', 92133), ('說', 91976), ('這個', 91211), ('到', 84288), ('他', 80208), ('也', 78973), ('去', 75817), ('什麼', 62431), ('喔', 61455), ('那', 60232)]


In [10]:
# Reload the vocabulary written by the previous cell; `voc_dict` is now a NumPy
# array instead of the Python list of (word, count) tuples it was before.
# NOTE(review): the saved pairs mix str and int, so the array is presumably a
# 2-D string array (word, count-as-text) -- confirm before using `in` on it.
voc_dict = np.load('voc_dict.npy')

### Preprocessing: Generating Training Data

Although each question asks us to select one answer out of six, our training data only contains continuous dialogue lines. As a naive first approach, I recast the whole problem as binary classification: given two lines, the model judges whether the second line follows the first in context or not.

In [11]:
import random

# Number of sentence pairs produced by generate_training_data (train + valid).
NUM_TRAIN = 100000
# Fraction of the generated pairs used for training; the rest is validation.
TRAIN_VALID_RATE = 0.7
# Programs with index 0 .. NUM_PROGRAM-1 of cut_programs are sampled from.
NUM_PROGRAM = 8

In [12]:
def generate_training_data(num_samples=None, programs=None, seed=None):
    """Sample labelled sentence pairs for the "is this the next line?" task.

    Every sample is a pair ``[first_line, second_line]``. With probability 1/2
    it is a positive sample (label 1: two consecutive lines of one episode),
    otherwise a negative sample (label 0: two independently drawn lines).

    Episodes that are too short to draw from are skipped instead of crashing
    the sampler, and a negative pair is redrawn if it happens to be a real
    consecutive pair, so label 0 never contradicts the data.

    Parameters
    ----------
    num_samples : int, optional
        Number of pairs to generate. Defaults to the global ``NUM_TRAIN``.
    programs : sequence, optional
        Nested sequence indexed as ``programs[program][episode][line]``.
        Defaults to the first ``NUM_PROGRAM`` entries of ``cut_programs``.
    seed : optional
        When given, a private ``random.Random(seed)`` is used so the result is
        reproducible; otherwise the module-level ``random`` generator is used.

    Returns
    -------
    (Xs, Ys) : tuple of lists
        ``Xs[i]`` is the pair of lines and ``Ys[i]`` its label (0 or 1).

    Raises
    ------
    ValueError
        If samples are requested but no episode contains two or more lines.
    """
    if num_samples is None:
        num_samples = NUM_TRAIN
    if programs is None:
        programs = cut_programs[:NUM_PROGRAM]
    rng = random if seed is None else random.Random(seed)

    def episodes_with(min_lines):
        # List (program_id, [episode ids]) for episodes holding >= min_lines lines.
        candidates = []
        for program_id, program in enumerate(programs):
            episode_ids = [episode_id for episode_id, episode in enumerate(program)
                           if len(episode) >= min_lines]
            if episode_ids:
                candidates.append((program_id, episode_ids))
        return candidates

    def pick_line(candidates, reserve):
        # Draw program, then episode, then a line id that leaves `reserve`
        # further lines after it inside the episode.
        program_id, episode_ids = rng.choice(candidates)
        episode_id = rng.choice(episode_ids)
        last_line_id = len(programs[program_id][episode_id]) - 1 - reserve
        return program_id, episode_id, rng.randint(0, last_line_id)

    any_line = episodes_with(1)   # usable for negative samples
    has_next = episodes_with(2)   # usable for positive samples
    if num_samples > 0 and not has_next:
        raise ValueError('need at least one episode with two or more lines')

    Xs, Ys = [], []

    for _ in range(num_samples):
        if rng.randint(0, 1) == 1:
            program_id, episode_id, line_id = pick_line(has_next, reserve=1)
            episode = programs[program_id][episode_id]
            Xs.append([episode[line_id], episode[line_id + 1]])
            Ys.append(1)
        else:
            first = pick_line(any_line, reserve=0)
            second = pick_line(any_line, reserve=0)
            # Redraw if the "random" second line is really the successor of
            # the first one; that pair would be a mislabelled positive.
            while second == (first[0], first[1], first[2] + 1):
                second = pick_line(any_line, reserve=0)
            Xs.append([programs[first[0]][first[1]][first[2]],
                       programs[second[0]][second[1]][second[2]]])
            Ys.append(0)

    return Xs, Ys

In [13]:
Xs, Ys = generate_training_data()

# The first TRAIN_VALID_RATE share of the generated pairs is the training set,
# the remainder is the validation set.
split_at = int(NUM_TRAIN*TRAIN_VALID_RATE)
x_train, x_valid = Xs[:split_at], Xs[split_at:]
y_train, y_valid = Ys[:split_at], Ys[split_at:]

print ("size of train :", len(x_train))
print ("size of test :", len(x_valid))

size of train : 70000
size of test : 30000


> 從cut_programs去生成資料，並建立模型。再將cut_Question變成兩句兩句為一個row，判斷0或1。

In [14]:
# Example: print three randomly chosen pairs of consecutive lines together
# with their (program, episode, line) ids.
for sample_no in range(3):
    program_id = random.randint(0, NUM_PROGRAM-1)
    episodes = cut_programs[program_id]
    episode_id = random.randint(0, len(episodes)-1)
    episode = episodes[episode_id]
    # Stop one short of the last line so that line_id+1 is always valid.
    line_id = random.randint(0, len(episode)-2)

    print ("Sentence id :", program_id, episode_id, line_id)
    print ([episode[line_id], episode[line_id+1]])

Sentence id : 0 31 647
[['起'], ['一顆', '大西瓜']]
Sentence id : 2 44 145
[['其實', '又', '很', '違背', '你', '的', '人', '生觀'], ['就', '這種', '小氣', '的', '人']]
Sentence id : 1 132 953
[['那要', '熬', '就', '要', '熬', '的', '值', '得', '呀'], ['我要', '做', '一個', '完整', '的', '女人']]


### Text vectorization (turning words into vectors)

In [15]:
# Build one space-separated string per line of the first episode of the first
# program, keeping only the words that belong to the vocabulary.
#
# The vocabulary was saved as (word, count) pairs, so the words are collected
# into a set first: membership tests become O(1) instead of a scan over the
# whole array, and a token can no longer be accepted just because it happens
# to equal one of the stored counts.
voc_words = {entry[0] for entry in voc_dict}

example_doc = []

for line in cut_programs[0][0]:
    # Every kept word is followed by one space (non-empty lines end with one).
    example_doc.append(''.join(w + ' ' for w in line if w in voc_words))

print( example_doc[:10] )

['還好 天氣 不錯 ', '昨天 晚上 流星雨 ', '到 多 流星 ', '這次 收 穫 真 豐富 ', '當然 豐富 啦 ', '說 嘛 ', '精 心 製 作 宵 夜 ', '被 一個 人 吃 掉 一大半 ', '真 嗎 ', '不要 忘 記要 做 秘密 檔案 ']


### Text vectorization: BoW (Bag-Of-Words)

In [16]:
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer

# Unigram bag-of-words: ngram_range=(min, max), and (1, 1) is also the default.
# The output of the next cells shows that single-character tokens get no
# vocabulary entry with these settings.
count = CountVectorizer(ngram_range=(1, 1))

count.fit(example_doc)
# vocabulary_ maps each term to its column index in the document-term matrix.
BoW = count.vocabulary_
print('[vocabulary]\n')
for term, column in list(BoW.items())[:10]:
    print('%s %d' % (term, column))

[vocabulary]

還好 427
天氣 172
不錯 52
昨天 255
晚上 258
流星雨 289
流星 288
這次 415
豐富 392
當然 317


In [17]:
# get matrix (doc_id, vocabulary_id) --> tf
doc_bag = count.transform(example_doc)
print('(did, vid)\ttf')
print(doc_bag[:10])

print('\nIs document-term matrix a scipy.sparse matrix? {}'.format(sp.sparse.issparse(doc_bag)))

(did, vid)	tf
  (0, 52)	1
  (0, 172)	1
  (0, 427)	1
  (1, 255)	1
  (1, 258)	1
  (1, 289)	1
  (2, 288)	1
  (3, 392)	1
  (3, 415)	1
  (4, 317)	1
  (4, 392)	1
  (7, 9)	1
  (7, 11)	1
  (9, 49)	1
  (9, 273)	1
  (9, 331)	1
  (9, 376)	1

Is document-term matrix a scipy.sparse matrix? True


In [18]:
# Convert the sparse document-term matrix to a dense array.
# NOTE: this rebinds `doc_bag`, so the cell is not idempotent -- a dense array
# has no .toarray() method if the cell is run a second time.
doc_bag = doc_bag.toarray()
first_docs = doc_bag[:10]
print(first_docs)

is_sparse = sp.sparse.issparse(doc_bag)
print('\nAfter calling .toarray(), is it a scipy.sparse matrix? {}'.format(is_sparse))

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]

After calling .toarray(), is it a scipy.sparse matrix? False


In [19]:
doc_bag = count.fit_transform(example_doc).toarray()

print("[most frequent vocabularies]")
# Total count of every vocabulary term over all documents.
bag_cnts = np.sum(doc_bag, axis=0)
top = 10
# inverse_transform on an all-ones row yields every vocabulary term in column
# order, i.e. a column-index -> term lookup array.
vocab_terms = count.inverse_transform(np.ones(bag_cnts.shape[0]))[0]
# argsort is ascending, so reverse it and keep the `top` largest columns.
top_columns = bag_cnts.argsort()[::-1][:top]
for tok, v in zip(vocab_terms[top_columns], bag_cnts[top_columns]):
    print('%s: %d' % (tok, v))

[most frequent vocabularies]
蟋蟀: 99
聲音: 19
這樣: 19
這個: 16
還有: 16
可以: 15
所以: 14
什麼: 14
探索: 13
時候: 12


In [20]:
# Shape of the dense document-term matrix: (documents, vocabulary terms).
print (doc_bag.shape)

(635, 473)


### Text vectorization: TF-IDF (Term-Frequency & Inverse-Document-Frequency)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(1,1))
tfidf.fit(example_doc)

top = 10
# idf_ holds one inverse-document-frequency score per vocabulary term.
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()
# Fetch the term list once instead of rebuilding it on every iteration.
feature_names = tfidf.get_feature_names()
for rank in range(top):
    column = sorted_idx[rank]
    print('%s: %.2f' % (feature_names[column], idf[column]))

doc_tfidf = tfidf.transform(example_doc).toarray()
# Sum of the tf-idf weights of every term over all documents.
tfidf_sum = np.sum(doc_tfidf, axis=0)
print("\n[vocabularies with highest tf-idf scores]")
# All-ones row -> every vocabulary term in column order.
vocab_terms = tfidf.inverse_transform(np.ones(tfidf_sum.shape[0]))[0]
top_columns = tfidf_sum.argsort()[::-1][:top]
for tok, v in zip(vocab_terms[top_columns], tfidf_sum[top_columns]):
    print('%s: %f' % (tok, v))

[vocabularies with smallest idf scores]
蟋蟀: 2.86
這樣: 4.46
聲音: 4.51
這個: 4.62
還有: 4.62
可以: 4.68
所以: 4.75
什麼: 4.75
探索: 4.82
時候: 4.89

[vocabularies with highest tf-idf scores]
蟋蟀: 44.998239
這樣: 13.750685
這個: 10.831737
聲音: 10.469564
還有: 10.007670
可以: 9.158488
豆油伯: 8.888211
什麼: 8.864458
知道: 8.284927
時候: 7.708157


In [22]:
# Shape of the dense tf-idf matrix: (documents, vocabulary terms).
print(doc_tfidf.shape)

(635, 473)


### Text vectorization: Feature Hashing

In [23]:
from sklearn.feature_extraction.text import HashingVectorizer

# Feature hashing with 2**6 = 64 output columns. No fit step is used: the
# vectorizer is applied directly with transform().
hashvec = HashingVectorizer(n_features=2**6)

doc_hash = hashvec.transform(example_doc)
# One row per document, one column per hash bucket.
print(doc_hash.shape)

(635, 64)


### Applying feature hashing to the sentence-pair task

In [24]:
# Show the first five validation pairs together with their labels
# (1 = consecutive lines, 0 = randomly combined lines).
for i in range(5):
    print (x_valid[i], y_valid[i])

[['所以', '我', '覺得', '他', '的', '成績', '這麼', '好'], ['或', '是', '曾經', '看過']] 0
[['可是', '我', '還是', '不', '懂', '說'], ['為', '什麼', '要', '全部', '把', '它', '拆開']] 1
[['非常', '盡心', '盡力', '的', '在', '照', '顧亞亞'], ['老師', ',', ' ', '那現', '在', '我們', '燈架', '好', '啦']] 0
[['我', '嗎'], ['我覺', '得', '你', '們', '做', '出版社']] 0
[['我', '跟', '平凡', '人', '是', '一模', '一樣', '的'], ['然', '後', '這裡', '可以', '看', '到']] 0


In [None]:
# Turn every validation pair into a single space-separated string that holds
# the in-vocabulary words of the first sentence followed by those of the
# second sentence.
#
# The vocabulary was saved as (word, count) pairs, so the words are collected
# into a set first: membership tests become O(1) instead of a scan over the
# whole array for every token, and a token can no longer be accepted just
# because it happens to equal one of the stored counts.
voc_words = {entry[0] for entry in voc_dict}

valid_doc = []

for line in x_valid:
    # line[0] and line[1] are the two tokenised sentences of the pair; every
    # kept word is followed by one space.
    valid_doc.append(''.join(w + ' ' for i in [0, 1] for w in line[i] if w in voc_words))

print( valid_doc[:10] )

In [None]:
# Turn every training pair into a single space-separated string that holds
# the in-vocabulary words of the first sentence followed by those of the
# second sentence.
#
# The vocabulary was saved as (word, count) pairs, so the words are collected
# into a set first: membership tests become O(1) instead of a scan over the
# whole array for every token, and a token can no longer be accepted just
# because it happens to equal one of the stored counts.
voc_words = {entry[0] for entry in voc_dict}

train_doc = []

for line in x_train:
    # line[0] and line[1] are the two tokenised sentences of the pair; every
    # kept word is followed by one space.
    train_doc.append(''.join(w + ' ' for i in [0, 1] for w in line[i] if w in voc_words))

print( train_doc[:10] )

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash the pair documents into 2**20 columns; the same vectorizer is applied
# to the training and the validation documents so their columns line up.
# NOTE(review): each document is the two sentences of a pair concatenated into
# one string, so the features cannot tell which sentence a word came from.
# NOTE(review): the vectorizer uses its default tokenisation; the BoW demo
# above shows single-character tokens being dropped under the default
# CountVectorizer settings -- presumably the same happens here, confirm.
hashvec = HashingVectorizer(n_features=2**20)

doc_hash_train = hashvec.transform(train_doc)
doc_hash_val = hashvec.transform(valid_doc)

# Sanity check: matrix shapes, then sample counts and number of positive labels.
print (doc_hash_train.shape, doc_hash_val.shape)
print (len(y_train), sum(y_train), len(y_valid), sum(y_valid))

In [43]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score,roc_auc_score

# Two linear classifiers that are trained and evaluated in the cells below.
lr = LogisticRegression(C=1000)
# NOTE(review): later scikit-learn releases renamed `n_iter` to `max_iter` and
# loss='log' to loss='log_loss' -- confirm against the installed version.
sgd = SGDClassifier(loss='log', n_iter=100)

In [49]:
# Fit logistic regression on the hashed training pairs.
lr.fit(doc_hash_train, y_train)

# Evaluate on the validation pairs: hard predictions for the error count and
# accuracy, probability of class 1 for the ROC AUC.
y_pred = lr.predict(doc_hash_val)
positive_proba = lr.predict_proba(doc_hash_val)[:,1]
score = roc_auc_score(y_valid, positive_proba)

n_misclassified = (y_valid != y_pred).sum()
accuracy = accuracy_score(y_valid, y_pred)
print ('Misclassified samples: %d' % n_misclassified)
print ('Accuracy: %.2f' % accuracy)
print ('roc_auc_score:', score)

Misclassified samples: 1458
Accuracy: 0.51
roc_auc_score: 0.504827042291


In [50]:
# Fit the SGD classifier on the hashed training pairs.
sgd.fit(doc_hash_train, y_train)

# Evaluate on the validation pairs: hard predictions for the error count and
# accuracy, probability of class 1 for the ROC AUC.
y_pred = sgd.predict(doc_hash_val)
positive_proba = sgd.predict_proba(doc_hash_val)[:,1]
score = roc_auc_score(y_valid, positive_proba)

n_misclassified = (y_valid != y_pred).sum()
accuracy = accuracy_score(y_valid, y_pred)
print ('Misclassified samples: %d' % n_misclassified)
print ('Accuracy: %.2f' % accuracy)
print ('roc_auc_score:', score)

Misclassified samples: 1487
Accuracy: 0.50
roc_auc_score: 0.506629926906
