In [1]:
import os
import glob
import json
from textwrap import indent
import tqdm
import pandas as pd
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

print('initialize paraphrasing model')

# When False, the processing loop below skips any input whose 'answers_*.json' file already exists.
overwrite = False

# Checkpoint identifier handed to both from_pretrained calls.
model_name = 'tuner007/pegasus_paraphrase'
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# The model lives on torch_device; get_response moves each tokenized batch to the same device.
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_text,num_return_sequences,num_beams):
    """Generate candidate rewrites of a single input string.

    Args:
        input_text: The text to rewrite; it is tokenized as a one-item batch
            with truncation enabled and max_length=60.
        num_return_sequences: Forwarded to ``model.generate``.
        num_beams: Forwarded to ``model.generate``.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated ids, with
        special tokens skipped.
    """
    encoded = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    ).to(torch_device)
    # NOTE(review): temperature is passed without any sampling flag - confirm
    # it influences generation in the installed transformers version.
    generation_options = dict(
        max_length=60,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    generated = model.generate(**encoded, **generation_options)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

def get_para(sentence):
    """Return generated rewrites of ``sentence``, always including the original.

    Asks get_response for 5 sequences using 5 beams. The original sentence is
    appended at the end only when it is not already among the candidates.
    """
    candidates = get_response(sentence, num_return_sequences=5, num_beams=5)
    if sentence in candidates:
        return candidates
    candidates.append(sentence)
    return candidates

def get_answer_groups(filename):
    """Load a question/answer table and group its rows by the 'Answer' column.

    Args:
        filename: Path ending in '.tsv' (read tab-delimited) or '.csv'
            (read with pandas' default delimiter).

    Returns:
        The pandas groupby object produced by grouping on the 'Answer' column;
        iterating it yields (answer, rows) pairs.

    Raises:
        ValueError: If ``filename`` ends in neither '.tsv' nor '.csv'.
        KeyError: If the table has no 'Answer' column.
    """
    if filename.endswith('.tsv'):
        db = pd.read_csv(filename, delimiter='\t')
    elif filename.endswith('.csv'):
        db = pd.read_csv(filename)
    else:
        # Without this branch `db` stayed unbound and the groupby line failed
        # with an UnboundLocalError that hid the real cause.
        raise ValueError(
            "unsupported file type (expected '.csv' or '.tsv'): {0!r}".format(filename)
        )
    return db.groupby(db['Answer'])

# Build paraphrased answers and an extended FAQ (every paraphrase of every
# question, paired with its original answer) for each matching input file.
for file in glob.glob('SKYE_RetrievalDB_New.csv'):
    print(file)
    # Derive the output names from the stem. os.path.splitext strips only the
    # final extension, whereas str.replace(file[-4:], ...) rewrote every
    # occurrence of that suffix anywhere in the name.
    stem = os.path.splitext(file)[0]
    answers_filename = 'answers_' + stem + '.json'
    extended_filename = 'extended_' + stem + '.json'
    # The answers file doubles as the "already done" marker for this input.
    if not overwrite and os.path.exists(answers_filename):
        print('already processed!')
        continue
    extended_answers = {}  # answer text -> list returned by get_para
    extended_FAQ = {'version': '1.0.0', 'data':[]}
    answer_groups = get_answer_groups(file)
    for answer_id, (answer, answer_group) in enumerate(tqdm.tqdm(answer_groups)):
        extended_answers[answer] = get_para(answer)
        for question_id, question_orig in enumerate(answer_group['Question']):
            for question_para_id, question_para in enumerate(get_para(question_orig)):
                extended_FAQ['data'].append({
                    # id = <answer index>_<question index within answer>_<paraphrase index>
                    'id': '{0}_{1}_{2}'.format(answer_id, question_id, question_para_id),
                    'question': question_para,
                    'answer': answer
                })
    with open(answers_filename, 'w') as fp:
        json.dump(extended_answers, fp, indent=4)
    with open(extended_filename, 'w') as fp:
        json.dump(extended_FAQ, fp, indent=4)

initialize paraphrasing model
SKYE_RetrievalDB_New.csv


100%|██████████████████████████████████████████████████████████████████████████████| 2244/2244 [19:10<00:00,  1.95it/s]


In [4]:
# List every file matching '*.csv' and then every file matching '*.tsv' (relative glob patterns).
# Note: `file` stays bound to the last printed path after the loop.
for file in glob.glob('*.csv') + glob.glob('*.tsv'):
    print(file)

SKYE_RetrievalDB.csv
qna_chitchat_caring.tsv
qna_chitchat_enthusiastic.tsv
qna_chitchat_friendly.tsv
qna_chitchat_professional.tsv
qna_chitchat_witty.tsv


In [6]:
# Collect the CSV matches followed by the TSV matches, then keep only the first path.
# NOTE: `file` is a list on the first line and a string on the second; indexing
# raises IndexError when nothing matched.
file = glob.glob('*.csv')+glob.glob('*.tsv')
file = file[0]

In [1]:
# Requires the cell that defines get_answer_groups to have run in this kernel;
# the recorded run of this cell failed with NameError because it had not.
answer_groups = get_answer_groups(file)

NameError: name 'get_answer_groups' is not defined

In [11]:
# Print each group key (the distinct answer text); the group's rows are not used here.
# `answer` and `answer_group` stay bound to the last group after the loop.
for answer, answer_group in tqdm.tqdm(answer_groups):
    print(answer)

100%|█████████████████████████████████████████████████████████████████████████████| 220/220 [00:00<00:00, 36651.85it/s]

"Beauty is in the eye of the beholder." - Margaret Wolfe Hungerford
100%
A beautiful gift from nature. 
A bunch of smart people with too much time on their hands haha. 
A company called DMLab.
A person-like bot.
A saint. 
A step closer to being smarter than google haha. 
Ah I see. 
All that I have learned so far. 
Alright then. 
Argh, we love a stan. Lol. 
As long as you eat healthy. 
Beautiful location.
Because it's funny though. 
Bet!
Call me the Jukebot. He he 
Can you not hear me? Of course I can speak!
Companion by day & night, and content creator during working hours haha.
Congratulations. 
Cool cool. 
Cool job. 
Cool. 
Dating? Me? No. 
Depends on your definition of real or fake.  
Development is a process. 
Do you get a day off on days like this? 
Don't do it again. Be kind. 
Don't tell me you didn't read the T&C's!
Dry day. Wassup with you? 
Eh, don't mention it. 
Exactly. 
Excuse me what now? CAP!
Fair. 
Far from any kind of romatic relationship. 
First of all, inner beauty ma




In [12]:
# Display the value `answer` currently holds (presumably left over from the preceding loop).
answer

'sure!'

In [13]:
# Paraphrase only the current `answer` and store the result keyed by that answer.
extended_answers = {}
extended_answers[answer] = get_para(answer)

In [14]:
# Display the mapping built above; in the recorded output the original answer is the last list item.
extended_answers

{'sure!': ['Yes!',
  'yes!',
  'Absolutely!',
  'Yes, definitely!',
  'Yes, sure!',
  'sure!']}

In [16]:
# Print the 'Question' values of whichever group `answer_group` currently holds,
# numbered from 0 (presumably the last group from the earlier loop).
for question_id, question_orig in enumerate(answer_group['Question']):
    print(question_id, question_orig)

0 do you like flying?
1 do you like fluffy things?
2 what kinds of things do you like to learn?
3 What is your favorite activity?
4 What do you like to do in your free time?
5 What do you like to do for fun?
6 What do you like best?
7 What's your favorite thing in the world?
8 what's your favorite subject?
9 What's your favorite activity?
10 what subjects do you like?


In [2]:
from enum import Enum


# Outcome codes. The functional Enum API numbers members from 1 in the order
# listed, so SUCCESS = 1, FAILURE = 2 and RETRY = 3.
Status = Enum('Status', ['SUCCESS', 'FAILURE', 'RETRY'])


# Printing an Enum member shows its str() form ('Status.SUCCESS' in the recorded output).
print(Status.SUCCESS)


Status.SUCCESS


In [7]:
def hoho(**kwargs):
    """Print the received keyword arguments as a dict and return None."""
    # A dict's str() and repr() are the same text, so this prints what print(kwargs) would.
    print(repr(kwargs))

def haha(**kwargs):
    """Forward every keyword argument unchanged to hoho."""
    forwarded = dict(kwargs)
    hoho(**forwarded)
    
# Shows that keyword arguments pass through haha to hoho intact (see the printed dict).
haha(a=1, b=2)

{'a': 1, 'b': 2}


In [1]:
import os
import glob
import json
from textwrap import indent
import tqdm
import pandas as pd
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

print('initialize paraphrasing model')

# When False, the processing loops skip inputs whose 'answers_*.json' file already exists.
overwrite = False

# Checkpoint identifier used for both the tokenizer and the model.
model_name = 'tuner007/pegasus_paraphrase'
# Pick 'cuda' when torch reports a CUDA device, otherwise 'cpu'.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# Model is placed on torch_device so it matches the batches get_response sends there.
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_text,num_return_sequences,num_beams):
    """Run the paraphrasing model on one string and decode its outputs.

    The input is wrapped in a one-element list, tokenized with truncation and
    max_length=60, and moved to ``torch_device``. ``num_beams`` and
    ``num_return_sequences`` are passed straight to ``model.generate``.

    Returns:
        Whatever ``tokenizer.batch_decode`` yields for the generated ids
        (special tokens skipped).
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'longest',
        'max_length': 60,
        'return_tensors': "pt",
    }
    inputs = tokenizer([input_text], **tokenizer_options).to(torch_device)
    outputs = model.generate(
        **inputs,
        max_length=60,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return decoded

def get_para(sentence):
    """Collect rewrites of ``sentence`` and make sure the original is included.

    Calls get_response with 5 beams and 5 returned sequences; the input
    sentence is appended last if none of the candidates equals it exactly.
    """
    paraphrases = get_response(sentence, num_return_sequences=5, num_beams=5)
    already_present = sentence in paraphrases
    if not already_present:
        paraphrases.append(sentence)
    return paraphrases

def get_answer_groups(filename):
    """Read a '.tsv' or '.csv' table and group its rows by the 'Answer' column.

    Args:
        filename: Path to the table. '.tsv' files are read tab-delimited,
            '.csv' files with pandas' default delimiter.

    Returns:
        The pandas groupby object keyed by the values of the 'Answer' column.

    Raises:
        ValueError: If the extension is neither '.tsv' nor '.csv'.
        KeyError: If the table has no 'Answer' column.
    """
    if filename.endswith('.tsv'):
        db = pd.read_csv(filename, delimiter='\t')
    elif filename.endswith('.csv'):
        db = pd.read_csv(filename)
    else:
        # Previously fell through with `db` unbound, producing an
        # UnboundLocalError on the groupby line instead of a clear message.
        raise ValueError(
            "unsupported file type (expected '.csv' or '.tsv'): {0!r}".format(filename)
        )
    answer_groups = db.groupby(db['Answer'])
    return answer_groups

initialize paraphrasing model


In [2]:
# For every CSV/TSV table found: paraphrase each distinct answer and each of
# its questions, then write both results as JSON next to the notebook.
# NOTE: tables lacking a 'Question' column raise KeyError here (see the
# recorded output for qg_test_answer_1.csv).
for file in glob.glob('*.csv') + glob.glob('*.tsv'):
    print(file)
    # os.path.splitext removes only the final extension; the previous
    # str.replace(file[-4:], '.json') replaced every occurrence of the suffix.
    stem = os.path.splitext(file)[0]
    answers_filename = 'answers_' + stem + '.json'
    extended_filename = 'extended_' + stem + '.json'
    # Only the answers file is checked to decide whether the input is done.
    if not overwrite and os.path.exists(answers_filename):
        print('already processed!')
        continue
    extended_answers = {}  # answer text -> list returned by get_para
    extended_FAQ = {'version': '1.0.0', 'data':[]}
    answer_groups = get_answer_groups(file)
    for answer_id, (answer, answer_group) in enumerate(tqdm.tqdm(answer_groups)):
        extended_answers[answer] = get_para(answer)
        for question_id, question_orig in enumerate(answer_group['Question']):
            for question_para_id, question_para in enumerate(get_para(question_orig)):
                extended_FAQ['data'].append({
                    # id = <answer index>_<question index within answer>_<paraphrase index>
                    'id': '{0}_{1}_{2}'.format(answer_id, question_id, question_para_id),
                    'question': question_para,
                    'answer': answer
                })
    with open(answers_filename, 'w') as fp:
        json.dump(extended_answers, fp, indent=4)
    with open(extended_filename, 'w') as fp:
        json.dump(extended_FAQ, fp, indent=4)

qg_test_answer_1.csv


  0%|                                                                                            | 0/1 [00:02<?, ?it/s]


KeyError: 'Question'

In [14]:
# Keep only the last '*.tsv' match; raises IndexError when there is none.
# NOTE: `file` is a list on the first line and a string on the second.
file = glob.glob('*.tsv')
file = file[-1]

In [15]:
# Display the path selected above.
file

'test_tsv2.tsv'

## answer만 paraphrasing (paraphrase the answers only; questions are not processed)

In [4]:
# Answers-only variant: paraphrase each distinct answer of every CSV/TSV table
# and write the result to 'answers_<stem>.json'. Questions are not read.
for file in glob.glob('*.csv') + glob.glob('*.tsv'):
    print(file)
    # os.path.splitext removes only the final extension; the previous
    # str.replace(file[-4:], '.json') replaced every occurrence of the suffix.
    answers_filename = 'answers_' + os.path.splitext(file)[0] + '.json'
    if not overwrite and os.path.exists(answers_filename):
        print('already processed!')
        continue
    extended_answers = {}  # answer text -> list returned by get_para
    answer_groups = get_answer_groups(file)
    # Only the group key is needed; answer_group is unpacked but not used.
    # The old answer_id counter was incremented but never read, so it is gone.
    for answer, answer_group in tqdm.tqdm(answer_groups):
        extended_answers[answer] = get_para(answer)
    with open(answers_filename, 'w') as fp:
        json.dump(extended_answers, fp, indent=4)


qg_test_answer_1.csv


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.65it/s]

SKYE_RetrievalDB.csv
already processed!
NameLocGen.tsv
already processed!
qna_chitchat_caring.tsv
already processed!
qna_chitchat_enthusiastic.tsv
already processed!
qna_chitchat_friendly.tsv
already processed!
qna_chitchat_professional.tsv
already processed!
qna_chitchat_witty.tsv
already processed!





In [3]:
# NOTE(review): `data` is not assigned anywhere in the visible notebook; the recorded run raised NameError.
data

NameError: name 'data' is not defined

In [1]:
import torch

In [2]:
# Check whether torch reports a usable CUDA device (the setup cells use this to pick 'cuda' vs 'cpu').
torch.cuda.is_available()

True

## question은 paraphrasing하지 않음 (questions are built from fixed templates, not paraphrased)

In [1]:
import os
import glob
import json
from textwrap import indent
import tqdm
import pandas as pd
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

print('initialize paraphrasing model')

# When False, the loop at the end of this cell skips inputs whose 'answers_*.json' file already exists.
overwrite = False

# Checkpoint identifier shared by the tokenizer and the model.
model_name = 'tuner007/pegasus_paraphrase'
# Run on the GPU if CUDA is available, else on the CPU.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# Moved to torch_device to match the tokenized batches that get_response places there.
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_text,num_return_sequences,num_beams):
    """Produce decoded model outputs for a single piece of text.

    Args:
        input_text: Text to feed the model; tokenized as a one-item batch,
            truncated, with max_length=60.
        num_return_sequences: Passed through to ``model.generate``.
        num_beams: Passed through to ``model.generate``.

    Returns:
        The decoded sequences from ``tokenizer.batch_decode`` with special
        tokens skipped.
    """
    model_inputs = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    )
    model_inputs = model_inputs.to(torch_device)
    output_ids = model.generate(
        **model_inputs,
        max_length=60,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)

def get_para(sentence):
    """Return get_response's candidates for ``sentence`` plus the sentence itself.

    Five sequences are requested with five beams. The original sentence is
    added at the end unless a candidate already matches it exactly.
    """
    variants = get_response(sentence, num_return_sequences=5, num_beams=5)
    if sentence in variants:
        return variants
    variants.append(sentence)
    return variants

def get_answer_groups(filename):
    """Read a '.tsv' or '.csv' table and group its rows by the 'answer' column.

    Files are decoded as 'utf-8-sig', so a leading byte-order mark does not
    end up inside the first column name.

    Args:
        filename: Path to the table. '.tsv' files are read tab-delimited,
            '.csv' files with pandas' default delimiter.

    Returns:
        The pandas groupby object keyed by the values of the 'answer' column.

    Raises:
        ValueError: If the extension is neither '.tsv' nor '.csv'.
        KeyError: If the table has no 'answer' column.
    """
    if filename.endswith('.tsv'):
        db = pd.read_csv(filename, delimiter='\t',encoding='utf-8-sig')
    elif filename.endswith('.csv'):
        db = pd.read_csv(filename,encoding='utf-8-sig')
    else:
        # Previously fell through with `db` unbound, producing an
        # UnboundLocalError on the groupby line instead of a clear message.
        raise ValueError(
            "unsupported file type (expected '.csv' or '.tsv'): {0!r}".format(filename)
        )
    return db.groupby(db['answer'])

# Weather FAQ: answers are paraphrased, questions are built from three fixed
# prefixes instead of being paraphrased. The 'question' column is used as the
# region name.
for file in glob.glob('question_weather.csv'):
    print(file)
    # os.path.splitext removes only the final extension; the previous
    # str.replace(file[-4:], '.json') replaced every occurrence of the suffix.
    stem = os.path.splitext(file)[0]
    answers_filename = 'answers_' + stem + '.json'
    extended_filename = 'extended_' + stem + '.json'
    if not overwrite and os.path.exists(answers_filename):
        print('already processed!')
        continue
    extended_answers = {}  # answer text -> list returned by get_para
    extended_FAQ = {'version': '1.0.0', 'data':[]}
    answer_groups = get_answer_groups(file)
    for answer, answer_group in tqdm.tqdm(answer_groups):
        extended_answers[answer] = get_para(answer)
        for question_orig, time_zone in zip(answer_group['question'],answer_group['time zone']):
            # Loop variable renamed from `type`, which rebound the builtin
            # type() for the rest of the notebook session.
            for prefix in ["How's","How is","What's"]:
                if time_zone == 'korner':
                    # 'korner' rows get a question that does not name the region.
                    question = prefix + ' the weather today?'
                else:
                    question = prefix + ' the weather in ' + question_orig + '?'
                extended_FAQ['data'].append({
                    'question': question,
                    # NOTE(review): the non-'korner' branch used to store the
                    # question prefix here instead of the answer, which looks
                    # like a typo - confirm against downstream consumers.
                    'answer': answer,
                    'time zone': time_zone,
                    'region': question_orig
                })
    # NOTE: 'utf-8-sig' writes a byte-order mark at the start of each JSON file;
    # readers must decode with 'utf-8-sig' as well.
    with open(answers_filename, 'w',encoding='utf-8-sig') as fp:
        json.dump(extended_answers, fp, indent=4,ensure_ascii=False)
    with open(extended_filename, 'w',encoding='utf-8-sig') as fp:
        json.dump(extended_FAQ, fp, indent=4,ensure_ascii=False)

initialize paraphrasing model
question_weather.csv


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.57s/it]
