In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import json
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation notices — narrow it if warnings are needed.
warnings.filterwarnings('ignore')

# Directory holding the dataset files. Edit this single value to relocate the data.
DATASET_DIR = '/home/wjunneng/Ubuntu/2019-NLP/abstractive-summarization/dataset'

# Article-text and headline files; the `_cn` / `_en` suffixes presumably denote
# the Chinese and English variants (only the `_en` pair is read further down).
ctexts_cn_path = DATASET_DIR + '/ctexts_cn.txt'
ctexts_en_path = DATASET_DIR + '/ctexts_en.json'
headlines_cn_path = DATASET_DIR + '/headlines_cn.txt'
headlines_en_path = DATASET_DIR + '/headlines_en.json'

def _top_tfidf_terms(document, n):
    """
    Rank the terms of a single document.

    :param document: text to analyse; it is vectorised on its own.
    :param n: maximum number of terms to return.
    :return: list of at most ``n`` terms ordered by descending value in the
        first TruncatedSVD component, or ``None`` when the document cannot be
        vectorised/decomposed.
    """
    try:
        vectorizer = TfidfVectorizer()
        # Sparse TF-IDF matrix with a single row (this document only).
        tf_matrix = vectorizer.fit_transform(raw_documents=[document])
        # One-component decomposition of that matrix.
        tf_svd = TruncatedSVD(1).fit(X=tf_matrix)
    except (ValueError, AttributeError, TypeError):
        # Presumably an empty / non-text entry; the caller skips it.
        return None

    # `get_feature_names` was replaced by `get_feature_names_out` in newer
    # scikit-learn releases; support whichever one is available.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tf_features = vectorizer.get_feature_names_out()
    else:
        tf_features = vectorizer.get_feature_names()

    # argsort is ascending, so walk it backwards and keep the last n indices.
    ranked = tf_svd.components_[0].argsort()[: -n - 1: -1]
    return [tf_features[i] for i in ranked]


def get_topic_modelling(ctexts_path, headlines_path, n=500):
    """
    Extract the topic words of every document and pair them with its headline.

    Both files are read as JSON (UTF-8, optional BOM) and walked in parallel.
    If the two sequences differ in length, the surplus entries of the longer
    one are ignored so that the outputs always stay aligned.

    :param ctexts_path: path of the JSON file holding the document texts.
    :param headlines_path: path of the JSON file holding the headlines.
    :param n: maximum number of topic words kept per document.
    :return: ``(ctexts, headlines)`` — two equally long lists; ``ctexts[i]`` is
        the space-joined topic words of a document and ``headlines[i]`` its
        headline. Documents that cannot be processed are dropped together
        with their headline.
    """
    with open(file=ctexts_path, encoding='utf-8-sig', mode='r') as file:
        ctexts_filelines = json.load(file)
    with open(file=headlines_path, encoding='utf-8-sig', mode='r') as file:
        headlines_filelines = json.load(file)

    ctexts = []
    headlines = []
    for document, headline in zip(ctexts_filelines, headlines_filelines):
        top_terms = _top_tfidf_terms(document, n)
        if top_terms is None:
            continue
        # Append to both lists together so a failure can never desynchronise them.
        ctexts.append(" ".join(top_terms))
        headlines.append(headline)

    return ctexts, headlines
            
# Build the aligned (topic words, headline) lists from the English dataset files.
ctexts, headlines = get_topic_modelling(
    ctexts_path=ctexts_en_path,
    headlines_path=headlines_en_path,
)
# Sanity check: both lists are expected to have the same length.
print(len(ctexts), len(headlines))
        

4395 4395


In [23]:
from collections import Counter 

def build_vocabulary(words:list, n_words:int):
    """
    Build a word <-> id vocabulary and encode ``words`` with it.

    Ids 0-3 are reserved for the special tokens PAD, GO, EOS and UNK; the
    ``n_words`` most frequent words follow in descending frequency order.

    :param words: flat list of tokens.
    :param n_words: number of most frequent words to keep.
    :return: ``(data, count, dictionary, rev_dictionary)`` where
        ``data`` is ``words`` encoded as ids (out-of-vocabulary words become
        the UNK id), ``count`` is the list of special-token entries followed
        by ``(word, frequency)`` pairs (the UNK entry carries the number of
        out-of-vocabulary words), ``dictionary`` maps word -> id and
        ``rev_dictionary`` maps id -> word.
    """
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    count.extend(Counter(words).most_common(n_words))

    dictionary = dict()
    for word, _ in count:
        # setdefault keeps the reserved ids stable (and ids unique) even when
        # the corpus itself contains a string such as 'EOS'.
        dictionary.setdefault(word, len(dictionary))

    unk_index = dictionary['UNK']
    data = []
    unk_count = 0
    for word in words:
        if word not in dictionary:
            unk_count += 1
        # Unknown words map to UNK, not to another special token.
        data.append(dictionary.get(word, unk_index))
    print('unk_count:', unk_count)
    # Record the tally on the UNK entry (the PAD entry is left untouched).
    count[unk_index][1] = unk_count

    rev_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, rev_dictionary

# Source side: every token of the extracted topic-word strings.
concat_from = ' '.join(ctexts).split()
vocabulary_size_from = len(set(concat_from))
(data_from, count_from,
 dictionary_from, rev_dictionary_from) = build_vocabulary(concat_from, vocabulary_size_from)

sample_from = data_from[:10]
print('vocab from size: %d'%(vocabulary_size_from))
# The first four entries of count_* are the special tokens, hence the 4:10 slice.
print('Most common words', count_from[4:10])
print('Sample data', sample_from, [rev_dictionary_from[i] for i in sample_from])

# Target side: every token of the headlines.
concat_to = ' '.join(headlines).split()
vocabulary_size_to = len(set(concat_to))
(data_to, count_to,
 dictionary_to, rev_dictionary_to) = build_vocabulary(concat_to, vocabulary_size_to)

sample_to = data_to[:10]
print('vocab to size: %d'%(vocabulary_size_to))
print('Most common words', count_to[4:10])
print('Sample data', sample_to, [rev_dictionary_to[i] for i in sample_to])

unk_count: 0
vocab from size: 49585
Most common words [('dot', 4394), ('the', 4379), ('comma', 4349), ('to', 4280), ('in', 4268), ('of', 4262)]
Sample data [5, 7, 4, 6, 10, 9, 11, 8, 15, 2062] ['the', 'to', 'dot', 'comma', 'and', 'of', 'on', 'in', 'was', 'festival']
unk_count: 0
vocab to size: 8534
Most common words [('to', 1388), ('in', 1196), ('comma', 876), ('s', 785), ('for', 733), ('of', 596)]
Sample data [2797, 14, 2798, 2799, 656, 2800, 5, 1642, 657, 2086] ['daman', 'and', 'diu', 'revokes', 'mandatory', 'rakshabandhan', 'in', 'offices', 'order', 'malaika']


In [26]:
from sklearn.model_selection import train_test_split

# Append the end-of-sequence marker to every headline. The guard makes the cell
# idempotent: re-running it does not stack a second ' EOS' onto the same strings.
for i, headline in enumerate(headlines):
    if not headline.endswith(' EOS'):
        headlines[i] = headline + ' EOS'

# Ids of the reserved tokens, looked up in the source-side dictionary.
GO = dictionary_from['GO']
PAD = dictionary_from['PAD']
EOS = dictionary_from['EOS']
UNK = dictionary_from['UNK']

def str_idx(corpus, dictionary, UNK=3):
    """
    Encode sentences as lists of ids.

    :param corpus: iterable of strings; each one is split on whitespace.
    :param dictionary: mapping word -> id.
    :param UNK: id used for words missing from ``dictionary``.
    :return: one list of ids per input string, in the same order.
    """
    return [
        [dictionary.get(token, UNK) for token in sentence.split()]
        for sentence in corpus
    ]

# Encode inputs with the source vocabulary and headlines with the target vocabulary.
X, Y = str_idx(ctexts, dictionary_from), str_idx(headlines, dictionary_to)

# Fixed seed so the 80/20 split is identical on every Restart & Run All.
SPLIT_SEED = 42
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2, random_state=SPLIT_SEED)