In [1]:
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

In [2]:
import gensim
from gensim import models, corpora
from gensim.utils import simple_preprocess

In [3]:
import pandas as pd
import time
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
ps = PorterStemmer()
import re
from sklearn.decomposition import PCA
from termcolor import colored
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
print(type(punctuation))
punct = punctuation + '’' + '—' + '“' + '”' + '‘'
punct = punct.replace('-', "")
print(punct)

<class 'str'>
!"#$%&'()*+,./:;<=>?@[\]^_`{|}~’—“”‘


# CBOW Model

In [5]:
def preprocess(df_file, df_weight, sen_list, unst_sen_list):
    s_cnt = 0
    for i, row in df_file.iterrows():
        #['id', 'url', 'headline', 'abstract'] 

        sen = str(row['headline'])
        sen = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', sen)
        for s in sen:
            s_cnt += 1
            ori_s = s
            # print('ori_s:', ori_s)

            stem_w = []
            s_p = ''.join(w for w in s if w not in punct)
            s_p = s_p.replace('-', " ")
            token = word_tokenize(s_p)

            # filter numbers, 移除 4碼年份以外的 word
            for w in token:
                try:
                    if str(int(float(w))).isnumeric() and len(w) != 4:
                        token.remove(w)
                except:
                    pass

            num_word = len(token)
            unst_s = " ".join(token)
            # print('unst_s:', unst_s)

            for w in token:
                stem_w.append(ps.stem(w))

            # print('stem_w:', stem_w)
            s = " ".join(stem_w)
            sen_list.append(s)
            unst_sen_list.append(unst_s)
            # print('s_cnt:', s_cnt, '\n ori_s:', ori_s, '\n s:', s)
            df_weight.loc[len(df_weight)] = [s_cnt, ori_s, stem_w, num_word]


        sen = str(row['abstract'])  
        sen = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', sen)
        for s in sen:
            s_cnt += 1
            ori_s = s

            stem_w = []
            s_p = ''.join(w for w in s if w not in punct)
            s_p = s_p.replace('-', " ")
            token = word_tokenize(s_p)

            for w in token:
                try:
                    if str(int(float(w))).isnumeric() and len(w) != 4:
                        token.remove(w)
                except:
                    pass
            
            num_word = len(token)
            unst_s = " ".join(token)

            for w in token:
                stem_w.append(ps.stem(w))

            s = " ".join(stem_w)
            sen_list.append(s)
            unst_sen_list.append(unst_s)
            df_weight.loc[len(df_weight)] = [s_cnt, ori_s, stem_w, num_word]
            
    return df_weight, sen_list, unst_sen_list

In [6]:
# preprocess data, including stemming, tokenizing for model
df_file = pd.read_csv('NYTimes_200.csv') 

col = ['s_id', 'original_sent', 'stemmed_sent', 'num_word']
df_weight = pd.DataFrame(columns = col)
sen_list = []
unst_sen_list = []

df_weight, sen_list, unst_sen_list = preprocess(df_file, df_weight, sen_list, unst_sen_list)
print(df_weight.tail())

token = Tokenizer() 
token.fit_on_texts(sen_list) 
seq = token.texts_to_sequences(sen_list)
print('after seq:', seq)
total_word = sum(len(w) for w in seq) # sum of words
print('total_word:', total_word)
word_count = len(token.word_index) # sum of distinct words
print('word_count:', word_count)

    s_id                                      original_sent  \
471  472  When President Barack Obama gave his farewell ...   
472  473                     For Elite Golfers, Money Talks   
473  474  Sponsors have long paid players to compete in ...   
474  475  Art Basel, Swiss Centerpiece of the Trade’s Ye...   
475  476  Dealers were looking to the event as a bellwet...   

                                          stemmed_sent num_word  
471  [when, presid, barack, obama, gave, hi, farewe...       56  
472                   [for, elit, golfer, money, talk]        5  
473  [sponsor, have, long, paid, player, to, compet...       24  
474  [art, basel, swiss, centerpiec, of, the, trade...       10  
475  [dealer, were, look, to, the, event, as, a, be...       18  
after seq: [[499, 265, 55, 11, 500, 2, 388, 107], [501, 11, 16, 389, 6, 15, 108, 11, 145, 34, 146, 170, 502, 55, 11, 266, 2, 1, 503, 116, 48, 504, 2, 147, 505], [506, 117, 1, 45, 507, 1, 508, 3, 390], [509, 148, 3, 1, 26, 12,

In [7]:
def cbow_model(seq, window_size, total_word):
    total_length = window_size*2
    for text in seq:
        text_len = len(text)
        for idx, word in enumerate(text):
            context_word = []
            target   = []            
            begin = idx - window_size
            end = idx + window_size + 1
            context_word.append([text[i] for i in range(begin, end) if 0 <= i < text_len and i != idx])
            target.append(word)
            # print('context_word:', context_word, '\ntarget:', target)
            contextual = sequence.pad_sequences(context_word, maxlen=total_length)
            # print('contextual:', contextual)
            final_target = np_utils.to_categorical(target, total_word)
            # print('final_target:', final_target)
            yield(contextual, final_target)

In [8]:
def train_model(seq, total_word, window_size, epoch):
    model = Sequential()
    model.add(Embedding(input_dim=total_word, output_dim=100, input_length=window_size*2))
    model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,)))
    model.add(Dense(total_word, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    for i in range(epoch):
        # timer
        start = time.time()
        loss = 0
        for x, y in cbow_model(seq, window_size, total_word):
            loss += model.train_on_batch(x, y)
        print(i, loss)
        
        end = time.time()
        print('epoch:', i, 'takes time:', end - start)
        
    return model

In [9]:
window_size = 3
model = train_model(seq, total_word, window_size, 8)

0 50729.89162158966
epoch: 0 takes time: 69.67118692398071
1 40411.339516460896
epoch: 1 takes time: 69.31519651412964
2 36257.94778418541
epoch: 2 takes time: 69.22339510917664
3 31400.5911808908
epoch: 3 takes time: 67.86318898200989
4 26490.209703564644
epoch: 4 takes time: 69.08726477622986
5 21954.890511725098
epoch: 5 takes time: 69.91128540039062
6 17954.72737303935
epoch: 6 takes time: 67.49894571304321
7 14499.854848544579
epoch: 7 takes time: 69.0541512966156


In [10]:
model.save('win02_model_travel.bin')

INFO:tensorflow:Assets written to: win02_model_travel.bin\assets


In [11]:
weights = model.get_weights()[0]
list_sw = stopwords.words('english')
# list_sw = list_sw + ['’','—', '“', '”', '‘']

list_text = []
list_weight = []

for text, i in token.word_index.items():
    
    if text in list_sw:
        # print('remove', i, text)
        # pass
        continue
    
    list_text.append(text)
    list_weight.append((weights[i]))
        
list_weight = np.array(list_weight)

In [12]:
# fit a 3d PCA model to the vectors
pca = PCA(n_components=3)
result = pca.fit_transform(list_weight)

df_3d = pd.DataFrame(columns = ['word', '3d'])
for i, r in enumerate(result):
    df_3d.loc[len(df_3d)] = [list_text[i], r]

print(df_3d.tail())

            word                                     3d
1322  centerpiec  [0.30827895, -0.43908545, 0.29263514]
1323       trade  [-0.3004399, 0.16807397, 0.024923302]
1324      dealer  [-0.67597497, -0.7331393, -0.2187654]
1325    bellweth    [-1.2260638, 0.22698018, 0.5379685]
1326      normal   [-1.5819566, 0.15461136, 0.30871683]


In [13]:
vect_file_3 = open('3d_vectors_travel.txt' ,'w')
vect_file_3.write('{} {}\n'.format(len(list_text), 3))

for i, word in enumerate(list_text):

    final_vec_3 = ' '.join(map(str, list(result[i, :])))
    vect_file_3.write('{} {}\n'.format(word, final_vec_3))
    
vect_file_3.close()

# Sklearn TF-IDF

In [14]:
# sklearn uses (by default)
# tf(t) = No. of times term ‘t’ occurs in a document
# idf(t) = log e [ (1+n) / ( 1 + df(t) ) ] + 1 (default i:e smooth_idf = True)
# In Scikit-learn ,The log is not base 10 , though it is the natural logarithm (which has a base e, e is an irrational and transcendental number approximately equal to 2.718)
vectorizer = TfidfVectorizer(use_idf=True)
vectors = vectorizer.fit_transform(unst_sen_list)
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
df_tfidf = pd.DataFrame(denselist, columns=feature_names)

df_temp = pd.DataFrame(vectors.T.todense()) #為了計算各word的weight加總，轉置
df_temp['word'] = feature_names
print (df_temp.head())

w_list = []
term_list = []
for i in range(len(df_temp)):
    w = 0
    if df_temp.iloc[i,476] in list_sw:
        # print('remove', i, text)
        continue
        
    for j in range(len(df_temp.columns)-1):
        w +=  df_temp.iloc[i,j]
    
    term_list.append(df_temp.iloc[i,476])
    w_list.append(w)

df_temp = pd.DataFrame(term_list, columns=['word'])
df_temp['weight'] = w_list
# df_temp = df_temp[['word', 'weight']]
df_temp = df_temp.sort_values('weight', ascending=False)

     0    1    2    3    4    5    6    7    8    9  ...  467  468  469  470  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

   471  472  473  474  475   word  
0  0.0  0.0  0.0  0.0  0.0  1920s  
1  0.0  0.0  0.0  0.0  0.0  1930s  
2  0.0  0.0  0.0  0.0  0.0   1977  
3  0.0  0.0  0.0  0.0  0.0   19th  
4  0.0  0.0  0.0  0.0  0.0   2000  

[5 rows x 477 columns]


In [15]:
test = df_tfidf.iloc[471,5] + df_tfidf.iloc[len(df_tfidf)-1,len(df_tfidf.columns)-1]
test

0.15691360584736375

In [16]:
len(df_tfidf.columns)

1652

In [17]:
tfidf_list = []

for i in range(len(df_tfidf)):
    w = 0
    for j in range(len(df_tfidf.columns)):
        w +=  df_tfidf.iloc[i,j]
    
    tfidf_list.append(w)

# print(tfidf_list)

In [18]:
df_weight['skl_weight'] = tfidf_list
df_weight.tail()

Unnamed: 0,s_id,original_sent,stemmed_sent,num_word,skl_weight
471,472,When President Barack Obama gave his farewell ...,"[when, presid, barack, obama, gave, hi, farewe...",56,6.414338
472,473,"For Elite Golfers, Money Talks","[for, elit, golfer, money, talk]",5,2.172869
473,474,Sponsors have long paid players to compete in ...,"[sponsor, have, long, paid, player, to, compet...",24,4.296807
474,475,"Art Basel, Swiss Centerpiece of the Trade’s Ye...","[art, basel, swiss, centerpiec, of, the, trade...",10,2.968558
475,476,Dealers were looking to the event as a bellwet...,"[dealer, were, look, to, the, event, as, a, be...",18,3.599777


In [19]:
total_w = 0
norm_w_list = []
for i, row in df_weight.iterrows():   
    num = row['num_word']
    weight = row['skl_weight']
    if num != 0:
        norm_w_list.append(weight/num)
    else:
        print('There is no sentence.')
        
    
df_weight['nskl_weight'] = norm_w_list
print(df_weight.tail())

for i, row in df_weight.iterrows():           
    total_w += row['nskl_weight']
    
ave_skl_weight = total_w / len(df_weight)

    s_id                                      original_sent  \
471  472  When President Barack Obama gave his farewell ...   
472  473                     For Elite Golfers, Money Talks   
473  474  Sponsors have long paid players to compete in ...   
474  475  Art Basel, Swiss Centerpiece of the Trade’s Ye...   
475  476  Dealers were looking to the event as a bellwet...   

                                          stemmed_sent num_word  skl_weight  \
471  [when, presid, barack, obama, gave, hi, farewe...       56    6.414338   
472                   [for, elit, golfer, money, talk]        5    2.172869   
473  [sponsor, have, long, paid, player, to, compet...       24    4.296807   
474  [art, basel, swiss, centerpiec, of, the, trade...       10    2.968558   
475  [dealer, were, look, to, the, event, as, a, be...       18    3.599777   

     nskl_weight  
471     0.114542  
472     0.434574  
473     0.179034  
474     0.296856  
475     0.199988  


# Gensim TF-IDF

In [20]:
g_token = [simple_preprocess(s) for s in unst_sen_list]
print(g_token[475])

['dealers', 'were', 'looking', 'to', 'the', 'event', 'as', 'bellwether', 'for', 'return', 'to', 'normality', 'in', 'the', 'art', 'world']


In [44]:
# weight_{i,j} = frequency_{i,j} * log_2(D / document_freq_{i})
g_token = [simple_preprocess(s) for s in unst_sen_list]
dictionary = corpora.Dictionary()
g_bow = [dictionary.doc2bow(t, allow_update=True) for t in g_token]
g_map = dictionary.token2id
# g_map = dict((y,x) for x,y in g_map.iteritems())
g_map = { g_map[k]:k for k in g_map}
# print(g_map)

g_tfidf = models.TfidfModel(g_bow, smartirs='ntc')
# 儲存、載入模型
# g_tfidf.save("g_tfidf.tfidf")
# g_tfidf = models.TfidfModel.load("g_tfidf.tfidf")

genw_list = []
genw_norm_list = []
gen_dict = {}
for i, sen in enumerate(g_tfidf[g_bow]):
    # print([[dictionary[word], freq] for word, freq in sen])
    w = 0
    for j, freq in sen:
        w += freq
        word = g_map[j]
        if word not in list_sw:
            if word in gen_dict:
                gen_dict[word] =  gen_dict[word] + freq
                # print('existed word:', word, 'word_w:', gen_dict[word])
            else:
                gen_dict[word] = freq
                # print('word:', word, 'word_w:', gen_dict[word])
        
    norm_w = w / len(sen)
    genw_list.append(w)
    genw_norm_list.append(norm_w)
        
df_weight['gen_weight'] = genw_list 
df_weight['ngen_weight'] = genw_norm_list
print(df_weight.tail())
gen_dict = dict(sorted(gen_dict.items(), key=lambda item: item[1], reverse=True)[:20])

    s_id                                      original_sent  \
471  472  When President Barack Obama gave his farewell ...   
472  473                     For Elite Golfers, Money Talks   
473  474  Sponsors have long paid players to compete in ...   
474  475  Art Basel, Swiss Centerpiece of the Trade’s Ye...   
475  476  Dealers were looking to the event as a bellwet...   

                                          stemmed_sent num_word  skl_weight  \
471  [when, presid, barack, obama, gave, hi, farewe...       56    6.414338   
472                   [for, elit, golfer, money, talk]        5    2.172869   
473  [sponsor, have, long, paid, player, to, compet...       24    4.296807   
474  [art, basel, swiss, centerpiec, of, the, trade...       10    2.968558   
475  [dealer, were, look, to, the, event, as, a, be...       18    3.599777   

     nskl_weight  gen_weight  ngen_weight  
471     0.114542    6.285033     0.146164  
472     0.434574    2.130364     0.426073  
473     0.1790

In [22]:
for i, row in df_weight.iterrows():           
    total_w += row['ngen_weight']
    
ave_gen_weight = total_w / len(df_weight)

# Get Relevant Sentence(s)

In [23]:
def find_most_similar(output, query):
    r = output.most_similar(positive=[query])
    # print(query,'(3):\n', r)
    # for i in range(num):
    #    top_list.append(r[i][0])
    top = r[0][0]
    print('query:', query, ', the closest:', top)
    return top

In [33]:
# original_sent  stemmed_sent num_word skl_weight  nskl_weight  gen_weight  ngen_weigh
def get_evidence_sentence(df_weight, query, closest, col):
    top_w = 0
    top_sen = ''
    q_w = 0
    q_sen = ''
    c_w = 0
    c_sen = ''
    for i, row in df_weight.iterrows():   
        s = row['stemmed_sent']
        ori_s = row['original_sent']
        w = row[col] #norm

        if query in s and w > q_w:
            q_sen = ori_s
            q_w = w

        if closest in s and w > c_w:
            c_sen = ori_s
            c_w = w

        if query in s and closest in s and w > top_w:
            top_w = w
            top_sen = ori_s

        if i == len(df_weight) -1:
            if top_w == 0 and top_sen == '':
                colored_text = []
                for c in re.split(r'(;|,|:|\s|[()])\s*', q_sen):
                    c_stem = ps.stem(c)
                    if query in c_stem:
                        colored_text.append(colored(c, 'grey','on_yellow'))

                    else:
                        colored_text.append(c)

                q_sen = "".join(colored_text)

                colored_text = []            
                for c in re.split(r'(;|,|:|\s|[()])\s*', c_sen):
                    c_stem = ps.stem(c)
                    if closest in c_stem:
                        colored_text.append(colored(c, 'grey','on_yellow'))

                    else:
                        colored_text.append(c)

                c_sen = "".join(colored_text)

                print('\nThere is no sentence including both',query, 'and', closest + '.')
                print('However, we can get sentences with the maximum weight seperately: \na.', 
                      q_sen, '(', q_w, ') \nand \nb.',  c_sen, '(', c_w, ').' )


            else:
                colored_text = []            
                for c in re.split(r'(;|,|:|\s|[()])\s*', top_sen):
                    c_stem = ps.stem(c)
                    if query in c_stem or closest in c_stem:
                        colored_text.append(colored(c, 'grey','on_yellow'))

                    else:
                        colored_text.append(c)

                top_sen = "".join(colored_text)
                print('\nThe proof sentence is: \n"', top_sen, '"\n, whose', col,'is:', str(top_w) + '.')

In [45]:
print('top cbow words:', list_text[0:20])
print('\ntop sklearn tf-idf: \n', df_temp.head(20))
print('\ntop gensim tf-idf:',list(gen_dict.keys()))

top cbow words: ['travel', 'coronaviru', 'presid', 'new', 'trump', 'pandem', 'ha', 'biden', 'wa', 'year', 'thi', 'state', 'countri', 'week', 'world', 'hi', 'one', 'said', 'inaugur', 'place']

top sklearn tf-idf: 
               word    weight
1422        travel  8.421899
320    coronavirus  7.652863
900            new  6.285331
137          biden  6.179987
957       pandemic  5.994815
1043     president  5.951106
999         places  5.652918
1508          week  4.913102
1542          year  4.882280
1533         world  4.543223
8             2021  4.488951
1186          said  4.201798
1435         trump  4.147086
934            one  4.110811
658   inauguration  3.678143
779           list  3.572476
1465            us  3.569469
613          heres  3.532351
522          first  3.462417
1308        states  3.417919

top gensim tf-idf: ['travel', 'coronavirus', 'places', 'biden', 'president', 'new', 'pandemic', 'week', 'year', 'world', 'said', 'trump', 'one', 'inauguration', 'heres', 'list'

In [72]:
query = input('please input a query:')
# travel, coronaviru, world

please input a query:travel


In [73]:
cbow_output_3 = gensim.models.KeyedVectors.load_word2vec_format('3d_vectors.txt', binary=False, encoding='unicode_escape')

closest = find_most_similar(cbow_output_3, query)

get_evidence_sentence(df_weight, query, closest, 'nskl_weight')
print('The average sklweight of all sentences is', str(ave_skl_weight) +'.')

get_evidence_sentence(df_weight, query, closest, 'ngen_weight')
print('The average genweight of all sentences is', str(ave_gen_weight) +'.')

query: travel , the closest: pro

There is no sentence including both travel and pro.
However, we can get sentences with the maximum weight seperately: 
a. The [43m[30mTravel[0m Industry Pitches In ( 0.4053430552034163 ) 
and 
b. With Positive Tests,Australia’s Summer of [43m[30mPro[0m Tennis Has a Rocky Start ( 0.26804844827199165 ).
The average sklweight of all sentences is 0.2870859267054441.

There is no sentence including both travel and pro.
However, we can get sentences with the maximum weight seperately: 
a. It’s a very 2020 — wait,2021 — [43m[30mtravel[0m predicament. ( 0.4262432052060518 ) 
and 
b. With Positive Tests,Australia’s Summer of [43m[30mPro[0m Tennis Has a Rocky Start ( 0.28575333317091034 ).
The average genweight of all sentences is 0.5881379576898975.


In [55]:
# df_weight = df_weight.sort_values('nskl_weight', ascending= False)
# df_skl = df_weight[['original_sent','nskl_weight']]
# print(df_skl.head(10))