In [None]:
# !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8.1-cp37-cp37m-linux_x86_64.whl
!pip install transformers
from transformers import BertTokenizer
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import re
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm_notebook
# import torch_xla.utils.serialization as xser
import torch.nn as nn
!pip install pytorch_pretrained_bert
from pytorch_pretrained_bert import BertModel, BertConfig
from torch.nn.init import xavier_uniform_

from __future__ import division

import argparse
import glob
import os
import random
import signal
import time

import distributed
# import torch_xla.core.xla_model as xm
# import torch_xla.distributed.parallel_loader as pl
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


import gensim.downloader as api
word_vectors = api.load("glove-wiki-gigaword-100")
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
class BertData():
    """Turns word-tokenized sentences into the id sequences fed to the BERT summarizer.

    The tokenizer is `bert-base-uncased`; the vocabulary ids of `[SEP]`,
    `[CLS]` and `[PAD]` are cached, together with the sentence/token limits
    applied by `preprocess`.
    """

    def __init__(self):
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        self.tokenizer = tokenizer
        # Vocabulary ids of the special tokens.
        self.sep_vid = tokenizer.vocab['[SEP]']
        self.cls_vid = tokenizer.vocab['[CLS]']
        self.pad_vid = tokenizer.vocab['[PAD]']
        # A document needs at least `min_nsents` usable sentences; at most
        # `max_nsents` are kept.
        self.min_nsents = 3
        self.max_nsents = 100
        # A sentence is usable when it has MORE than `min_src_ntokens` words;
        # usable sentences are clipped to `max_src_ntokens` words.
        self.min_src_ntokens = 3
        self.max_src_ntokens = 200

    def preprocess(self, src):
        """Build model inputs from `src`, a list of sentences (each a list of word strings).

        Returns:
            None when `src` is empty or fewer than `min_nsents` sentences
            survive filtering; otherwise a 4-tuple
            `(token_ids, segment_ids, cls_positions, sentence_texts)` where
            `sentence_texts` holds the space-joined text of every sentence
            that passed the length filter.
        """
        if len(src) == 0:
            return None

        # Joined text of every input sentence, indexed like `src`.
        joined_all = [' '.join(words) for words in src]

        # Positions of sentences long enough to keep.
        kept = [pos for pos, words in enumerate(src) if len(words) > self.min_src_ntokens]

        clipped = [src[pos][:self.max_src_ntokens] for pos in kept][:self.max_nsents]
        if len(clipped) < self.min_nsents:
            return None

        # Sentences are separated by "[SEP] [CLS]" so each one starts with its own [CLS].
        text = ' [SEP] [CLS] '.join(' '.join(words) for words in clipped)
        # 510 sub-tokens plus the leading [CLS] and trailing [SEP] give 512 in total.
        subtokens = ['[CLS]'] + self.tokenizer.tokenize(text)[:510] + ['[SEP]']
        token_ids = self.tokenizer.convert_tokens_to_ids(subtokens)

        # Segment ids alternate 0/1 per sentence; a sentence ends at its [SEP].
        segments_ids = []
        previous_sep = -1
        sentence_no = 0
        for pos, token_id in enumerate(token_ids):
            if token_id != self.sep_vid:
                continue
            segments_ids.extend([sentence_no % 2] * (pos - previous_sep))
            previous_sep = pos
            sentence_no += 1

        cls_ids = [pos for pos, token_id in enumerate(token_ids) if token_id == self.cls_vid]
        src_txt = [joined_all[pos] for pos in kept]
        return token_ids, segments_ids, cls_ids, src_txt

In [None]:
class Classifier(nn.Module):
    """Scores each sentence vector with one linear unit followed by a sigmoid."""

    def __init__(self, hidden_size):
        super(Classifier, self).__init__()
        self.linear1 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, mask_cls):
        """Return sigmoid scores for `x`, multiplied by `mask_cls` so masked slots score 0."""
        logits = self.linear1(x).squeeze(-1)  # drop the size-1 output dimension
        keep = mask_cls.float()
        return self.sigmoid(logits) * keep
class Bert(nn.Module):
    """Thin wrapper around `BertModel` that returns only the last encoder layer."""

    def __init__(self, temp_dir, load_pretrained_bert, bert_config):
        super(Bert, self).__init__()
        # Either fetch the pretrained 'bert-base-uncased' weights (cached under
        # `temp_dir`) or build an untrained model from `bert_config`.
        self.model = (
            BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)
            if load_pretrained_bert
            else BertModel(bert_config)
        )

    def forward(self, x, segs, mask):
        """Run the wrapped model on ids `x`, segment ids `segs` and attention `mask`."""
        all_layers, _unused = self.model(x, segs, attention_mask=mask)
        # The model returns a sequence of layer outputs; keep the final one.
        return all_layers[-1]


# Cache directory that Summarizer hands to Bert (and on to BertModel.from_pretrained).
temp_dir = '/content/drive/MyDrive/Pro2/temp'
class Summarizer(nn.Module):
    """BERT encoder followed by a per-sentence scoring head (`Classifier`)."""

    def __init__(self, device, load_pretrained_bert=False, bert_config=None):
        super(Summarizer, self).__init__()
        self.bert = Bert(temp_dir, load_pretrained_bert, bert_config)
        self.encoder = Classifier(self.bert.model.config.hidden_size)

        # Xavier-initialise only the matrix-shaped (dim > 1) parameters of the head.
        matrices = [param for param in self.encoder.parameters() if param.dim() > 1]
        for matrix in matrices:
            xavier_uniform_(matrix)

        self.to(device)

    def load_cp(self, pt):
        """Load the state dict `pt`; keys must match exactly (strict)."""
        self.load_state_dict(pt, strict=True)

    def forward(self, x, segs, clss, mask, mask_cls, sentence_range=None):
        """Score the sentences of each document in the batch.

        `clss` indexes, per document, the token positions whose vectors are
        used as sentence representations; `mask_cls` marks which of those
        positions are real. `sentence_range` is accepted but not used.

        Returns `(sent_scores, mask_cls)`.
        """
        top_vec = self.bert(x, segs, mask)
        # Pair every document row with its own list of sentence positions.
        row_index = torch.arange(top_vec.size(0)).unsqueeze(1)
        keep = mask_cls[:, :, None].float()
        sents_vec = top_vec[row_index, clss] * keep
        sent_scores = self.encoder(sents_vec, mask_cls).squeeze(-1)
        return sent_scores, mask_cls
def train(device,train_from):
    """Build a `Summarizer` on `device`, optionally restoring weights from a file.

    Despite its name this function does not run any training and does not
    build an optimizer; it only constructs the model.

    Args:
        device: torch device the model is moved to.
        train_from: path of a checkpoint (a state dict saved with torch.save)
            to restore, or an empty string / None to keep the pretrained
            BERT weights and a freshly initialised scoring head.

    Returns:
        The constructed `Summarizer`.
    """
    model = Summarizer(device, load_pretrained_bert=True)

    if train_from:
        # map_location makes a checkpoint saved on another device (e.g. GPU)
        # loadable when this session only has a CPU.
        checkpoint = torch.load(train_from, map_location=device)
        model.load_cp(checkpoint)
        # The state dict has been copied into the model; release our reference.
        del checkpoint

    return model


In [None]:
# Shared preprocessor instance; BertSumSummary calls bert.preprocess on every request.
bert = BertData()

In [None]:
def convert1(datasets):
    """Collate preprocessed examples into padded tensors.

    Each element of `datasets` is a dict with keys 'src', 'segs', 'clss' and
    'src_txt'. Token and segment id lists are right-padded with 0, sentence
    position lists with -1 (then rewritten to 0 once the mask is taken).

    Returns:
        (src, segs, clss, mask, mask_cls, src_txt) where `mask` is True at
        non-zero token ids, `mask_cls` is True at real sentence positions and
        `src_txt` is the list of per-example sentence texts.
    """
    def _pad_rows(rows, fill):
        # Right-pad every row to the length of the longest one.
        longest = max(len(row) for row in rows)
        return [row + [fill] * (longest - len(row)) for row in rows]

    examples = datasets[:]
    token_rows = [example['src'] for example in examples]
    segment_rows = [example['segs'] for example in examples]
    position_rows = [example['clss'] for example in examples]
    src_txt = [example['src_txt'] for example in examples]

    src = torch.tensor(_pad_rows(token_rows, 0))
    segs = torch.tensor(_pad_rows(segment_rows, 0))
    mask = src.ne(0)

    padded_positions = torch.tensor(_pad_rows(position_rows, -1))
    mask_cls = padded_positions.ne(-1)
    # Padding slots get position 0 so they remain valid indices; mask_cls
    # records that they are not real sentences.
    clss = padded_positions.masked_fill(~mask_cls, 0)

    return src, segs, clss, mask, mask_cls, src_txt

In [None]:
def _get_ngrams(n, text):
            ngram_set = set()
            text_length = len(text)
            max_index_ngram_start = text_length - n
            for i in range(max_index_ngram_start + 1):
                ngram_set.add(tuple(text[i:i + n]))
            return ngram_set

def _block_tri(c, p):
    """Tell whether candidate sentence `c` shares a word trigram with any sentence in `p`.

    Sentences are split on whitespace; returns True on the first overlap
    found and False when there is none.
    """
    candidate_trigrams = _get_ngrams(3, c.split())
    return any(
        not candidate_trigrams.isdisjoint(_get_ngrams(3, chosen.split()))
        for chosen in p
    )

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Show which device was selected.
device

device(type='cuda')

In [None]:
# Fine-tuned summarizer weights (a state dict) stored on Google Drive.
path ='/content/drive/MyDrive/Pro2/WeightBert/bertWeight1'
# map_location lets the checkpoint load on the selected device even when it
# was saved on a different one (e.g. saved on GPU, loaded on a CPU-only session).
checkpoint = torch.load(path, map_location=device)
model = train(device,'')
model.load_cp(checkpoint)
# Inference only: switch off dropout so the same text always yields the same scores.
model.eval()

In [None]:
def BertSumSummary(X, max_sentences=5):
    """Summarize the text `X` with the fine-tuned BertSum model.

    The text is split into sentences and words, converted to model inputs by
    the shared `bert` preprocessor, scored by `model`, and the highest
    scoring sentences are selected. A candidate is skipped when it shares a
    word trigram with an already selected sentence. The selected sentences
    are returned in their original document order.

    Args:
        X: raw text to summarize.
        max_sentences: maximum number of sentences in the summary.

    Returns:
        The summary, each sentence followed by a single space, or an empty
        string when `X` is empty or the preprocessor rejects it (it returns
        None when too few usable sentences remain).
    """
    if not X or not X.strip():
        return ""

    source = [word_tokenize(sentence) for sentence in sent_tokenize(X)]
    b_data = bert.preprocess(source)
    if b_data is None:
        # Previously execution continued and crashed unpacking None.
        print('None data')
        return ""

    indexed_tokens, segments_ids, cls_ids, src_txt = b_data
    example = {"src": indexed_tokens, "segs": segments_ids, 'clss': cls_ids,
               'src_txt': src_txt}
    src, segs, clss, mask, mask_cls, src_txt = convert1([example])

    # Make sure dropout is disabled; otherwise scores vary between calls.
    model.eval()
    with torch.no_grad():
        sent_scores, score_mask = model(
            src.to(device), segs.to(device), clss.to(device),
            mask.to(device), mask_cls.to(device))
        sent_scores = sent_scores + score_mask.float()
    # Sentence slots of the single document, best score first.
    ranking = np.argsort(-sent_scores.cpu().numpy(), 1)[0]

    sentences = src_txt[0]
    chosen = {}  # sentence index -> sentence text
    for position in ranking[:len(sentences)]:
        position = int(position)
        if position >= len(sentences):
            continue
        candidate = sentences[position].strip()
        if not _block_tri(candidate, list(chosen.values())):
            chosen[position] = candidate
        if len(chosen) == max_sentences:
            break

    # Emit the selected sentences in document order.
    return ''.join(chosen[position] + ' ' for position in sorted(chosen))

In [None]:
# Load the article table from Google Drive; the cell below reads its 'text' column.
data  = pd.read_csv('/content/drive/MyDrive/Pro2/wikiP1.csv')

In [None]:
# Display the 'text' field of the row at position 101 as a sample input.
data.iloc[101]['text']

' As you move forward in the letter, be upfront about your feelings. A love letter is not the place to be shy. Tell the girl why you admire her and what qualities about her most intrigue you. Go into specifics when possible. You may not know a lot about this person, but mention small things about her that you love. For example, maybe you think the buttons she has on her purse are hilarious. Maybe you notice she listens to a particular band on her headphones that you also enjoy.\n\n\nBe upfront about why you\'ve never approached her in person. While many people enjoy anonymous love letters, there is always the risk of coming off the wrong way. You don\'t want the recipient to feel like she\'s being watched. It can help if you assure her, at some point, you\'re a relatively normal person who happens to feel more comfortable expressing feelings in writing.There are a variety of reasons you may prefer writing a love letter. You could be shy, for example, and find expressing yourself easier

In [None]:
!pip install gevent
!pip install flask_ngrok



In [None]:
# Word -> entry lookup used for membership tests in KmeanSumary.
# gensim >= 4 removed KeyedVectors.vocab in favour of key_to_index, so prefer
# the new attribute and fall back to the old one on gensim 3.x.
vocab = word_vectors.key_to_index if hasattr(word_vectors, 'key_to_index') else word_vectors.vocab

# Number of clusters, i.e. the number of sentences in a K-means summary.
n_clusters = 5

kmeans = KMeans(n_clusters=n_clusters)
def clearData(content):
    """Lowercase `content`, turn newlines into spaces, drop ',' and ':', and trim the ends."""
    cleaned = content.lower()  # convert everything to lowercase
    # Newlines become a single space; commas and colons are removed outright.
    for old, new in (('\n', ' '), (',', ''), (':', '')):
        cleaned = cleaned.replace(old, new)
    return cleaned.strip()
def clearData1(content):
    """Like `clearData` but keeps the original letter case.

    Newlines become spaces, ',' and ':' are removed, and surrounding
    whitespace is stripped.
    """
    # One pass over the text: newline -> space, comma and colon -> deleted.
    substitutions = {ord('\n'): ' ', ord(','): None, ord(':'): None}
    return content.translate(substitutions).strip()
def KmeanSumary(text):
    """Summarize `text` by clustering sentence embeddings with K-means.

    Every sentence is represented by the sum of the word vectors of its
    (lowercased) in-vocabulary words. The sentences are clustered with the
    module-level `kmeans` estimator and, for each cluster, the sentence
    closest to the cluster centre is kept. Clusters are emitted in order of
    the mean position of their member sentences.

    Args:
        text: raw text to summarize.

    Returns:
        The selected sentences joined by spaces. When the text has fewer
        sentences than clusters, all of its sentences are returned; an empty
        string is returned for empty input.
    """
    if not text:
        return ''

    # Split the cased text ONCE and derive the lowercase form per sentence, so
    # the vector at index i always belongs to the sentence at index i.
    # (Splitting a separately lowercased copy can yield different boundaries.)
    original = nltk.sent_tokenize(clearData1(text))

    k = kmeans.n_clusters
    if len(original) < k:
        # KMeans cannot fit fewer samples than clusters; nothing to shorten anyway.
        print("article has fewer than %d sentences" % k)
        return ' '.join(original)

    X = []
    for sentence in original:
        sentence_vec = np.zeros(word_vectors.vector_size)
        for word in word_tokenize(sentence.lower()):
            if word in vocab:
                sentence_vec += word_vectors[word]
        X.append(sentence_vec)
    X = np.array(X)

    fitted = kmeans.fit(X)
    closest, _ = pairwise_distances_argmin_min(fitted.cluster_centers_, X)

    # (mean member position, representative sentence index) per non-empty cluster.
    ranked = []
    for cluster in range(len(fitted.cluster_centers_)):
        members = np.where(fitted.labels_ == cluster)[0]
        if len(members) == 0:
            # Skip clusters without members; their mean position would be NaN.
            continue
        ranked.append((np.mean(members), int(closest[cluster])))
    ranked = sorted(ranked, key=lambda item: item[0])

    # Never repeat a sentence if two centres share the same nearest sentence.
    picked = []
    for _, sentence_index in ranked:
        if sentence_index not in picked:
            picked.append(sentence_index)

    summary = ' '.join(original[sentence_index] for sentence_index in picked)
    return summary



In [None]:
from flask import Flask
from flask_ngrok import run_with_ngrok
from http.client import error
from flask import Flask
from gevent.pywsgi import WSGIServer
from flask import request, jsonify, render_template

# Flask application; HTML templates are read from the Google Drive folder below.
app = Flask(__name__, template_folder='/content/drive/MyDrive/Pro2/templates') 
# Wrap the app with flask_ngrok; the run log shows it being served on a public *.ngrok.io URL.
run_with_ngrok(app)

@app.route('/', methods=['GET'])
def index1():
    """Serve the input form (`index.html`) for GET requests to the site root."""
    return render_template('index.html')

# Only POST is registered here: GET on '/' is already served by index1, and a
# GET request carries no form data for this handler to summarize.
@app.route('/', methods=['POST'])
def my_form_post():
    """Summarize the submitted text and render the result page.

    Reads the form field "message" (the text) and the list field "options".
    BertSum is used when no option is submitted or the first option is
    'option1'; any other value selects the K-means summarizer. The original
    text and the labelled summary are passed to `index1.html` as `errors`.
    A missing or blank message re-renders the input form instead of failing.
    """
    text = request.form.get("message")
    if not text or not text.strip():
        # Nothing to summarize; show the form again rather than crash in the tokenizer.
        return render_template('index.html')

    option = request.form.getlist('options')
    if option and option[0] != 'option1':
        print("Kmeans predicting...")
        output = "Kmean prediction:" + "\n" + KmeanSumary(text)
    else:
        print("BertSum predicting...")
        output = "BertSum prediction:" + "\n" + BertSumSummary(text)

    print(output)
    return render_template('index1.html', errors=[text, output])
	
# Start the web server; this call blocks the cell while the app is serving requests.
app.run()


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://5d6b02cfa45b.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [16/Jul/2021 09:26:57] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [16/Jul/2021 09:26:57] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


BertSum predicting...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

127.0.0.1 - - [16/Jul/2021 09:27:06] "[37mPOST / HTTP/1.1[0m" 200 -



BertSum prediction:
; , Some studios offer short-term programs for people who want to learn more about VFX artistry without pursuing a college degree . Stay informed about the newest software advances by following VFX blogs and taking online computer tutorials.For example , VFX artists are expected to be well-versed in graphics and animation programs , such as Adobe Creative Suite and JavaScript.Clearly list every program that you can work with on your resume . , Watch all of these creations with an eye for detail . Look for the techniques used and any original approaches that you see . Try to recreate any scenes that you find particularly interesting . 


127.0.0.1 - - [16/Jul/2021 09:27:07] "[33mGET /main.css HTTP/1.1[0m" 404 -
127.0.0.1 - - [16/Jul/2021 09:27:13] "[37mPOST / HTTP/1.1[0m" 200 -


Kmeans predicting...
Kmean prediction:
Stay informed about the newest software advances by following VFX blogs and taking online computer tutorials.For example VFX artists are expected to be well-versed in graphics and animation programs such as Adobe Creative Suite and JavaScript.Clearly list every program that you can work with on your resume. Even geometry skills can come in handy when creating a particular type of background or even a person’s face.Make a choice to become an observer of the world around you. As you gain more experience you’ll likely find yourself gravitating toward a certain aspect of design. Some of these videos will focus on a particular skill set such as shading which you then can practice on your own. Watch all of these creations with an eye for detail.


127.0.0.1 - - [16/Jul/2021 09:27:14] "[33mGET /main.css HTTP/1.1[0m" 404 -
