In [2]:
import numpy as np
import pickle
import operator
import pandas as pd
# import jieba
# from language.langconv import *
import os

In [3]:
# Load the raw article/summary pairs from CSV. The preview of this frame shows two
# leftover index columns ('Unnamed: 0.1', 'Unnamed: 0') next to 'text' and 'summary'.
text_summary_data=pd.read_csv('./data/textsummary/data.csv')

In [4]:
text_summary_data.head()

Unnamed: 0.1,Unnamed: 0,text,summary
0,0,"Saurav Kant, an alumnus of upGrad and IIIT-B's...",upGrad learner switches to career in ML & Al w...
1,1,Kunal Shah's credit card bill payment platform...,Delhi techie wins free food from Swiggy for on...
2,2,New Zealand defeated India by 8 wickets in the...,New Zealand end Rohit Sharma-led India's 12-ma...
3,3,"With Aegon Life iTerm Insurance plan, customer...",Aegon life iTerm insurance plan helps customer...
4,4,Speaking about the sexual harassment allegatio...,"Have known Hirani for yrs, what if MeToo claim..."


In [5]:
import re

# Ordered clean-up rules as (compiled pattern, replacement) pairs.
# ORDER MATTERS: each rule runs on the output of the previous one.
_TEXT_STRIP_RULES = [
    (re.compile(r"\t"), ' '),      # escape characters
    (re.compile(r"\r"), ' '),
    (re.compile(r"\n"), ' '),
    (re.compile(r"__+"), ' '),     # runs of two or more '_'
    (re.compile(r"--+"), ' '),     # runs of two or more '-'
    (re.compile(r"~~+"), ' '),     # runs of two or more '~'
    (re.compile(r"\+\++"), ' '),   # runs of two or more '+'
    (re.compile(r"\.\.+"), ' '),   # runs of two or more '.'
    (re.compile(r"[<>()|&©ø\[\]\'\",;?~*!]"), ' '),  # assorted punctuation
    (re.compile(r"mailto:"), ' '),
    (re.compile(r"\\x9\d"), ' '),  # literal "\x9<digit>" residue in the text
    (re.compile(r"[iI][nN][cC]\d+"), 'inc_num'),                      # INC numbers
    (re.compile(r"([cC][mM]\d+)|([cC][hH][gG]\d+)"), 'cm_num'),       # CM / CHG numbers
    (re.compile(r"\.\s+"), ' '),   # full stop at the end of a word (not inside one)
    (re.compile(r"-\s+"), ' '),    # '-' at the end of a word (not inside one)
    (re.compile(r":\s+"), ' '),    # ':' at the end of a word (not inside one)
    (re.compile(r"\s+.\s+"), ' '), # single characters hanging between whitespace
]

# https://abc.xyz.net/browse/sdf-5327 ====> abc.xyz.net  (group 3 is the host)
_URL_RE = re.compile(r'((https*:\/*)([^\/\s]+))(.[^\s]+)')
_MULTISPACE_RE = re.compile(r"\s+")
_LONE_CHAR_RE = re.compile(r"\s+.\s+")


def text_strip(column):
    """Lazily clean every entry of ``column`` and yield the cleaned strings.

    Each entry is converted with ``str()``, lower-cased, passed through the
    ordered rules in ``_TEXT_STRIP_RULES``, has every URL reduced to its host
    name, and finally has whitespace runs collapsed and lone characters removed.

    Args:
        column: any iterable of values (e.g. a pandas Series); non-string
            values are stringified first.

    Yields:
        str: the cleaned, lower-cased text for each entry, in input order.
    """
    for row in column:
        row = str(row).lower()

        for pattern, replacement in _TEXT_STRIP_RULES:
            row = pattern.sub(replacement, row)

        # Replace each URL by ITS OWN host. A callable replacement is used so that
        # (a) rows without any URL simply pass through (no exception handling needed),
        # (b) several URLs in one row are not all rewritten to the first URL's host,
        # (c) backslashes in the host are not interpreted as replacement escapes.
        row = _URL_RE.sub(lambda match: match.group(3), row)

        row = _MULTISPACE_RE.sub(' ', row)  # collapse multiple spaces

        # Should always be last
        row = _LONE_CHAR_RE.sub(' ', row)   # single characters hanging between 2 spaces

        yield row

In [6]:
# text_strip is a generator function, so no cleaning happens here yet: these are
# lazy, single-pass iterators that do their work when a later cell iterates them.
brief_cleaning1 = text_strip(text_summary_data['text'])
brief_cleaning2 = text_strip(text_summary_data['summary'])

In [None]:
from time import time
import spacy
# The NER and parser components are disabled for speed.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

# Stream the lazily cleaned rows through spaCy's .pipe() method in batches.
# If data loss seems to be happening (i.e. len(text) = 50 instead of 75 etc.) in this
# cell, decrease the batch_size parameter.

t = time()

# Batches of 5000 rows. `n_threads` is deliberately not passed: spaCy has ignored it
# since v2.1 and spaCy 3 rejects it with a TypeError.
text = [str(doc) for doc in nlp.pipe(brief_cleaning1, batch_size=5000)]

#Takes 7-8 mins
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# Stream the lazily cleaned summaries through spaCy's .pipe() method in batches.


t = time()

# Batches of 5000 rows; every summary is wrapped in '_START_ ' / ' _END_' markers.
# `n_threads` is deliberately not passed: spaCy has ignored it since v2.1 and
# spaCy 3 rejects it with a TypeError.
summary = ['_START_ '+ str(doc) + ' _END_' for doc in nlp.pipe(brief_cleaning2, batch_size=5000)]

#Takes 7-8 mins
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# Collect the cleaned texts and the marker-wrapped summaries side by side.
# NOTE(review): the two Series are aligned on their index, so `text` and `summary`
# are presumably expected to have the same length — confirm.
pre=pd.DataFrame()
pre['cleaned_text'] = pd.Series(text)
pre['cleaned_summary'] = pd.Series(summary)

In [None]:
text_count = []
summary_count = []

In [None]:
# Whitespace-separated word count of every cleaned text and every cleaned summary.
# The lists are rebuilt from scratch (instead of appended to) so that re-running
# this cell does not accumulate duplicate counts.
text_count = [len(sent.split()) for sent in pre['cleaned_text']]
summary_count = [len(sent.split()) for sent in pre['cleaned_summary']]

In [None]:
# Word counts of texts and summaries as two columns of one frame (for plotting)
graph_df = pd.DataFrame({'text': text_count, 'summary': summary_count})

In [None]:
import matplotlib.pyplot as plt

# One 5-bin histogram per column of graph_df
graph_df.hist(bins = 5)
plt.show()

In [None]:
# Fraction of cleaned summaries that are at most 15 words long
cnt = sum(1 for summary_str in pre['cleaned_summary'] if len(summary_str.split()) <= 15)
print(cnt/len(pre['cleaned_summary']))

In [None]:
# Fraction of cleaned texts that are at most 100 words long
cnt = sum(1 for text_str in pre['cleaned_text'] if len(text_str.split()) <= 100)
print(cnt/len(pre['cleaned_text']))

In [None]:
#Model to summarize the text between 0-15 words for Summary and 0-100 words for Text
max_text_len=100
max_summary_len=15

In [None]:
# Keep only the (text, summary) pairs where both sides fit within the configured maximum lengths

cleaned_text = np.array(pre['cleaned_text'])
cleaned_summary = np.array(pre['cleaned_summary'])

# Pairs that pass both word-count limits, in their original row order
kept_pairs = [
    (article, abstract)
    for article, abstract in zip(cleaned_text, cleaned_summary)
    if len(abstract.split()) <= max_summary_len and len(article.split()) <= max_text_len
]

short_text = [article for article, _ in kept_pairs]
short_summary = [abstract for _, abstract in kept_pairs]

post_pre = pd.DataFrame({'text': short_text, 'summary': short_summary})

In [None]:
# Wrap every summary in 'sostok ' ... ' eostok' (start / end markers)
post_pre['summary'] = post_pre['summary'].map(lambda summary_str: 'sostok ' + summary_str + ' eostok')

In [6]:
# Plain Python list holding the str() of every filtered text
text_texts = list(map(str, post_pre['text'].values))

In [7]:
# Plain Python list holding the str() of every filtered summary
summary_texts = list(map(str, post_pre['summary'].values))

In [8]:
text_texts[1]

'你写的是什么语言'

In [11]:
# NOTE(review): `i` is not assigned in this cell; it is presumably left over from a
# loop in another cell, so the row picked here depends on execution history — confirm.
e=text_texts[i]
# text_texts holds str values, so ''.join(e) yields a string equal to e
str1 = ''.join(e)

In [14]:
type(str1)

str

In [15]:
jieba.lcut(str1)

['什么', '是', 'ai']

In [25]:
a=' '.join(jieba.lcut(str1, cut_all=False))
# ' '.join(jieba.lcut(str1, cut_all=False))

In [27]:
len(a)

7

In [39]:
import jieba  # otherwise only imported in the final interactive cell of this notebook

# Segment every text and every summary with jieba and re-join the pieces with single
# spaces, so that later cells can recover the tokens with str.split(' ').
#
# The lists are iterated directly instead of via range(len(text_summary_data)):
# text_texts / summary_texts come from the length-filtered frame, so they can be
# shorter than the raw data and index-based access would raise IndexError.
source_tokens = [' '.join(jieba.lcut(article, cut_all=False)) for article in text_texts]
target_tokens = [' '.join(jieba.lcut(abstract, cut_all=False)) for abstract in summary_texts]

In [40]:
# Build the vocabulary (token -> integer id) for one language
def build_token_dict(token_list):
    """Assign an integer id to every distinct token found in ``token_list``.

    Ids 0, 1 and 2 are reserved for '<PAD>', '<START>' and '<END>'; all other
    tokens are numbered in order of first appearance.

    Args:
        token_list: iterable of strings whose tokens are separated by single
            spaces (each string is split with ``str.split(' ')``).

    Returns:
        dict mapping token -> id.
    """
    vocab = {'<PAD>': 0, '<START>': 1, '<END>': 2}
    for sentence in token_list:
        for word in sentence.split(' '):
            # len(vocab) is evaluated before insertion, i.e. it is the next free id
            vocab.setdefault(word, len(vocab))
    return vocab

In [41]:
# Separate vocabularies for the source (text) and target (summary) side
source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
# Reverse lookup id -> token, used to turn decoded ids back into text
target_token_dict_inv = dict(zip(target_token_dict.values(), target_token_dict.keys()))

In [43]:
# source_token_dict

In [45]:
# target_token_dict

In [47]:
# target_token_dict_inv

In [48]:
# Add the special tokens
def _pad_rows(rows, width):
    """Right-pad every token list in ``rows`` with '<PAD>' up to ``width`` tokens."""
    return [row + ['<PAD>'] * (width - len(row)) for row in rows]


encode_tokens = [['<START>', *line.split(' '), '<END>'] for line in source_tokens]
decode_tokens = [['<START>', *line.split(' '), '<END>'] for line in target_tokens]
# The expected output is the decoder input shifted left by one position
output_tokens = [[*line.split(' '), '<END>', '<PAD>'] for line in target_tokens]

source_max_len = max(len(row) for row in encode_tokens)
target_max_len = max(len(row) for row in decode_tokens)

encode_tokens = _pad_rows(encode_tokens, source_max_len)
decode_tokens = _pad_rows(decode_tokens, target_max_len)
output_tokens = _pad_rows(output_tokens, target_max_len)

# Token -> id conversion; every output id is wrapped in its own one-element list
encode_input = [[source_token_dict[tok] for tok in row] for row in encode_tokens]
decode_input = [[target_token_dict[tok] for tok in row] for row in decode_tokens]
decode_output = [[[target_token_dict[tok]] for tok in row] for row in output_tokens]

print(len(encode_input))

430


In [50]:
import numpy as np
import pickle
import operator
path = 'data/middle_data/transformer/qa/'

# Persist the prepared data, one pickle file per object, named '<name>.pkl' under `path`
_to_save = {
    'encode_input': encode_input,
    'decode_input': decode_input,
    'decode_output': decode_output,
    'source_token_dict': source_token_dict,
    'target_token_dict': target_token_dict,
    'source_tokens': source_tokens,
}
for _name, _obj in _to_save.items():
    with open(path + _name + '.pkl', 'wb') as f:
        pickle.dump(_obj, f, pickle.HIGHEST_PROTOCOL)

In [52]:
import numpy as np
import pickle
import operator
from keras_transformer import get_model, decode
# main_path = '/content/drive/My Drive/Colab Notebooks/'    #Google Colab FilePath
# path = main_path + 'middle_data/'
path = 'data/middle_data/transformer/qa/'


def _load_pickle(name):
    """Return the object stored in '<path><name>.pkl'."""
    # NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
    with open(path + name + '.pkl', 'rb') as f:
        return pickle.load(f)


# Restore the prepared data from the pickle files under `path`
encode_input = _load_pickle('encode_input')
decode_input = _load_pickle('decode_input')
decode_output = _load_pickle('decode_output')
source_token_dict = _load_pickle('source_token_dict')
target_token_dict = _load_pickle('target_token_dict')
source_tokens = _load_pickle('source_tokens')
print('Done')

Done


In [53]:
# Report the source vocabulary size, the target vocabulary size and the sample count
print(len(source_token_dict))
print(len(target_token_dict))
print(len(encode_input))
# Build the model
model = get_model(
    # one shared size: the larger of the two vocabularies
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=64,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=256,
    dropout_rate=0.05,
    use_same_embed=False,  # different languages need separate word embeddings
)
model.compile('adam', 'sparse_categorical_crossentropy')
# model.summary()
print('Done')

518
1134
430
Done


In [None]:
# Train the model
import os
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

# `main_path` is only assigned in a commented-out Colab line of the loading cell, so on
# a fresh kernel it is undefined; fall back to the current working directory then.
main_path = globals().get('main_path', '')

# Upper bound on the number of training pairs. It is applied to ALL three arrays:
# slicing only one of them gives Keras inputs/targets with different sample counts.
max_train_samples = 20000

# Weights are checkpointed under <main_path>modles/; create the folder up front
# because the checkpoint callback does not create missing directories.
os.makedirs(main_path + "modles", exist_ok=True)
filepath = main_path + "modles/W-" + "-{epoch:3d}-{loss:.4f}-.h5"
# Save the weights every 2 epochs, but only when the training loss improved
checkpoint = ModelCheckpoint(filepath,
                    monitor='loss',
                    verbose=1,
                    save_best_only=True,
                    mode='min',
                    period=2,
                    save_weights_only=True
                    )
# Multiply the learning rate by 0.2 when the loss has not improved for 2 epochs
reduce_lr = ReduceLROnPlateau(monitor='loss', 
                    factor=0.2, 
                    patience=2, 
                    verbose=1, 
                    mode='min', 
                    min_delta=0.0001, 
                    cooldown=0, 
                    min_lr=0
                    )
callbacks_list = [checkpoint, reduce_lr]
model.fit(
    x=[np.array(encode_input[:max_train_samples]), np.array(decode_input[:max_train_samples])],
    y=np.array(decode_output[:max_train_samples]),
    epochs=100,
    batch_size=64, 
    verbose=1,
    callbacks=callbacks_list, 
    # class_weight=None, 
    # max_queue_size=5, 
#    workers=1, 
#    use_multiprocessing=False,
    # shuffle=False,
#    initial_epoch=initial_epoch_
    )
# model.save(main_path+'modles/model.h5')

In [None]:
# Load the trained weights into the model
model.load_weights('model/transformer/qa/qa.h5')
# Reverse lookup id -> token, used to turn decoded ids back into text
target_token_dict_inv = {v: k for k, v in target_token_dict.items()}
print('Done')

In [None]:
from keras.preprocessing import sequence
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import jieba
import requests

def get_input(seq, max_len=34):
    """Tokenize a raw query and convert it to source-vocabulary ids.

    The text is segmented with jieba, wrapped in '<START>' / '<END>' and
    right-padded with '<PAD>' up to ``max_len`` tokens. Inputs that are already
    longer than ``max_len`` are neither padded nor truncated. The token list is
    printed before and after the special tokens are added.

    Args:
        seq: the raw input string.
        max_len: target length for padding. Defaults to 34, the value that was
            previously hard-coded here.
            NOTE(review): presumably this should equal the padded source length
            used for training — confirm.

    Returns:
        (flag, seq): ``flag`` is True when every token is in
        ``source_token_dict``, and ``seq`` is then the list of token ids.
        Otherwise ``flag`` is False and ``seq`` is the padded token list.
    """
    # join + split (rather than using the jieba list directly) keeps the exact
    # tokenization that was used when the vocabulary was built
    tokens = ' '.join(jieba.lcut(seq, cut_all=False)).split(' ')
    # seq = ' '.join(seq)
    print(tokens)
    tokens = ['<START>'] + tokens + ['<END>']
    tokens = tokens + ['<PAD>'] * (max_len - len(tokens))
    print(tokens)
    flag = all(token in source_token_dict for token in tokens)
    if flag:
        tokens = [source_token_dict[token] for token in tokens]
    return flag, tokens
def get_ans(seq):
    """Decode the id sequence ``seq`` with the model and print the resulting tokens.

    The first and last decoded ids are dropped before the remaining ids are
    mapped back to tokens and printed, joined by single spaces.
    """
    decoded = decode(
        model,
        [seq],
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>'],
        # top_k=10,
        # temperature=1.0,
    )
    answer_ids = decoded[0][1:-1]
    print(' '.join(target_token_dict_inv[token_id] for token_id in answer_ids))

# Interactive loop: read one line per turn until the user enters 'x'
while True:
    seq = input()
    if seq == 'x':
        break
    flag, seq = get_input(seq)
    if not flag:
        # at least one token is missing from the source vocabulary
        # (the message below means "I don't understand.")
        print('听不懂呢。')
        continue
    get_ans(seq)