In [6]:
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pdfminer.high_level import extract_text
import openai
import argparse

# Add your API key to a file named .openai_api.txt
with open(".openai_api.txt", "r") as f:
    openai.api_key = f.read().strip()

def remove_unnecessary_parts(text):
    text = re.sub(r"(?is)\b(acknowledgments|acknowledgement|references)\b.*", "", text)
    text = re.sub(r"(?is)\bappendix\b.*", "", text)
    return text

def preprocess_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # tokenization
    words = word_tokenize(text)

    # remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    return " ".join(words)

def reduce_length_with_gpt35(text):
    prompt = "Summarize this paper into 8000 tokens or less:"
    response = openai.Completion.create(
        engine="gpt-3.5-turbo-16k",
        prompt=prompt + text
    )
    return response.choices[0].text.strip()

def generate_summary_with_gpt4(text):
    prompt = "この論文の要約を生成してください："
    response = openai.ChatCompletion.create(
                model='gpt-4',
                messages=[
                    {'role': 'system', 'content': prompt},
                    {'role': 'user', 'content': text}
                ],
                temperature=0.25,
            )
    summary = response['choices'][0]['message']['content']
    return summary

In [16]:
prompt = """与えられた論文の要点をまとめ、以下の項目で日本語で出力せよ。それぞれの項目は最大でも180文字以内に要約せよ。
```
論文名:タイトルの日本語訳
キーワード:この論文のキーワード
課題:この論文が解決する課題
手法:この論文が提案する手法
結果:提案手法によって得られた結果
```"""

def get_summary(text):
    print("### input text", text)
    #print("### input prompt", prompt)
    response = openai.ChatCompletion.create(
                #model="gpt-3.5-turbo",
                model='gpt-4',
                messages=[
                    {'role': 'system', 'content': prompt},
                    {'role': 'user', 'content': text}
                ],
                temperature=0.25,
            )
    summary = response['choices'][0]['message']['content']
    print("#### GPT", summary)
    dict = {}    
    for b in summary.split('\n'):
        print("****", b)
        if b.startswith("論文名"):
            dict['title_jp'] = b[4:].lstrip()
        if b.startswith("キーワード"):
            dict['keywords'] = b[6:].lstrip()
        if b.startswith("課題"):
            dict['problem'] = b[3:].lstrip()
        if b.startswith("手法"):
            dict['method'] = b[3:].lstrip()
        if b.startswith("結果"):
            dict['result'] = b[3:].lstrip()
    print("Dict by ChatGPT", dict)
    return dict

In [7]:
pdf_filepath = "papers/2303.01639.pdf"
text = extract_text(pdf_filepath)
text

'WESPER: Zero-shot and Realtime Whisper to Normal Voice\nConversion for Whisper-based Speech Interactions\n\nJun Rekimoto\nThe University of Tokyo\n7-3-1, Hongo, Bunkyo-ku, Tokyo, Japan\nSony Computer Science Laboratories, Kyoto\n13-1 Hontoro-cho, Shimogyo-ku, Kyoto-shi, Kyoto, Japan\nrekimoto@acm.org\n\n3\n2\n0\n2\n\nr\na\n\nM\n3\n\n]\n\nD\nS\n.\ns\nc\n[\n\n1\nv\n9\n3\n6\n1\n0\n.\n3\n0\n3\n2\n:\nv\ni\nX\nr\na\n\nFigure 1: WESPER is a real-time whisper-to-normal speech conversion mechanism consisting of a speech-to-unit (STU) en-\ncoder that generates common speech units for whispered and normal utterances using self-supervised pre-training, and a\nunit-to-speech (UTS) decoder that recovers speech from the speech units. It achieves user-independent voice conversion in\nreal time.\n\nABSTRACT\nRecognizing whispered speech and converting it to normal speech\ncreates many possibilities for speech interaction. Because the sound\npressure of whispered speech is significantly lower than that

In [8]:
len(text)

62376

In [9]:
# Remove unnecessary parts
text = remove_unnecessary_parts(text)

# check the length of the text
print(len(text.split()))

# Preprocess text
text = preprocess_text(text)

# check the length of the text
print(len(text.split()))

7027
4871


In [11]:
# If the text is too long, reduce it with GPT-3.5
if len(text.split()) > 8000:
    text = reduce_length_with_gpt35(text)

else:
    print("### input text", text)
    pass

### input text WESPER Zeroshot Realtime Whisper Normal Voice Conversion Whisperbased Speech Interactions Jun Rekimoto The University Tokyo 731 Hongo Bunkyoku Tokyo Japan Sony Computer Science Laboratories Kyoto 131 Hontorocho Shimogyoku Kyotoshi Kyoto Japan rekimotoacmorg 3 2 0 2 r M 3 D S c 1 v 9 3 6 1 0 3 0 3 2 v X r Figure 1 WESPER realtime whispertonormal speech conversion mechanism consisting speechtounit STU en coder generates common speech units whispered normal utterances using selfsupervised pretraining unittospeech UTS decoder recovers speech speech units It achieves userindependent voice conversion real time ABSTRACT Recognizing whispered speech converting normal speech creates many possibilities speech interaction Because sound pressure whispered speech significantly lower normal speech used semisilent speech interaction public places without audible others Converting whispers normal speech also improves speech quality people speech hearing impairments However conventional 

In [17]:
# Generate summary with GPT-4
summary = get_summary(text)

print(summary)

### input text WESPER Zeroshot Realtime Whisper Normal Voice Conversion Whisperbased Speech Interactions Jun Rekimoto The University Tokyo 731 Hongo Bunkyoku Tokyo Japan Sony Computer Science Laboratories Kyoto 131 Hontorocho Shimogyoku Kyotoshi Kyoto Japan rekimotoacmorg 3 2 0 2 r M 3 D S c 1 v 9 3 6 1 0 3 0 3 2 v X r Figure 1 WESPER realtime whispertonormal speech conversion mechanism consisting speechtounit STU en coder generates common speech units whispered normal utterances using selfsupervised pretraining unittospeech UTS decoder recovers speech speech units It achieves userindependent voice conversion real time ABSTRACT Recognizing whispered speech converting normal speech creates many possibilities speech interaction Because sound pressure whispered speech significantly lower normal speech used semisilent speech interaction public places without audible others Converting whispers normal speech also improves speech quality people speech hearing impairments However conventional 

In [22]:
def extract_elements(summary):
    dict = {}    
    for b in summary.split('\n'):
        print("****", b)
        if b.startswith("論文名"):
            dict['title_jp'] = b[4:].lstrip()
        if b.startswith("キーワード"):
            dict['keywords'] = b[6:].lstrip()
        if b.startswith("課題"):
            dict['problem'] = b[3:].lstrip()
        if b.startswith("手法"):
            dict['method'] = b[3:].lstrip()
        if b.startswith("結果"):
            dict['result'] = b[3:].lstrip()
    print("Dict by ChatGPT", dict)
    return dict


AttributeError: 'dict' object has no attribute 'split'

dict

In [19]:
# test extract_elements

extract_elements(summary)

AttributeError: 'dict' object has no attribute 'split'

In [29]:
schema = {
  "type": "object",
  "properties": {
    "title": {"type": "string"},
    "author": {"type": "array", "items": {"type": "string"}},
    "jornal/conference": {"type": "string"},
    "year": {"type": "string"},
    "abstract": {"type": "string"},
    "CCS_concept": {"type": "string"},
    "bib": {"type": "string"},
    "metadata": {"type": "object"},  # ここにさらなるメタデータが入ります
    "keywords": {"type": "string"},
    "problem": {"type": "string"},
    "method": {"type": "string"},
    "interaction": {"type": "string"},
    "technical_contribution": {"type": "string"},
    "result": {"type": "string"},
    "github": {"type": "string"},
    "doi": {"type": "string"}
  },
  "required": ["title", "metadata", "keywords", "problem", "method", "interaction", "technical_contribution", "result", "github", "doi", "texts"]
}


In [50]:
prompt = "Analyze the following paper and provide the required information. metaデータに関しては元の言語を保持せよ．内容に関する項目は日本語で，それぞれの項目は最大でも180文字以内に要約せよ."

completion = openai.ChatCompletion.create(
  model="gpt-4-0613",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content":f"{prompt}"},
    {"role": "user", "content": "Analyze the following paper and provide the required information."},
    {"role": "user", "content": f"The paper is: [{text}]"}
  ],
  functions=[{"name": "analyze_paper", "parameters": schema}],
  function_call={"name": "analyze_paper"},
  temperature=0,
)

print(completion.choices[0].message.function_call.arguments)

{
"title": "WESPER: Zeroshot Realtime Whisper-to-Normal Voice Conversion for Whisper-based Speech Interactions",
"author": ["Jun Rekimoto"],
"jornal/conference": "CHI ’ 23",
"year": "2023",
"abstract": "Recognizing whispered speech and converting it to normal speech creates many possibilities for speech interaction. However, conventional speech conversion techniques do not provide sufficient conversion quality and require speaker-dependent datasets. To address these problems, we propose WESPER, a zeroshot realtime whisper-to-normal speech conversion mechanism based on self-supervised learning. WESPER consists of a speech-to-unit (STU) encoder that generates hidden speech units common to whispered and normal speech, and a unit-to-speech (UTS) decoder that reconstructs speech from the encoded speech units. Unlike existing methods, our conversion is user-independent and does not require a paired dataset of whispered and normal speech.",
"keywords": "speech interaction, whispered voice, wh

In [36]:
json_summary = completion.choices[0].message.function_call.arguments

print(json_summary)

{
"title": "WESPER: Zeroshot Realtime Whisper-to-Normal Voice Conversion for Whisper-based Speech Interactions",
"author": ["Jun Rekimoto"],
"jornal/conference": "CHI ’ 23",
"year": "2023",
"abstract": "Recognizing whispered speech and converting it to normal speech creates many possibilities for speech interaction. Because the sound pressure of whispered speech is significantly lower than normal speech, it can be used for semi-silent speech interaction in public places without being audible to others. Converting whispers to normal speech also improves speech quality for people with speech and hearing impairments. However, conventional speech conversion techniques do not provide sufficient conversion quality and require speaker-dependent datasets consisting of pairs of whispered and normal speech utterances. To address these problems, we propose WESPER, a zero-shot, real-time whisper-to-normal speech conversion mechanism based on self-supervised learning. WESPER consists of a speech-to

In [37]:
type(json_summary)

str

In [42]:
import json



try:
  json_result = json.loads(completion.choices[0].message.function_call.arguments)

except:
  print("error")
  print(completion.choices[0].message.function_call.arguments)


json_result["text"] = text

In [43]:
json_result

{'title': 'WESPER: Zeroshot Realtime Whisper-to-Normal Voice Conversion for Whisper-based Speech Interactions',
 'author': ['Jun Rekimoto'],
 'jornal/conference': 'CHI ’ 23',
 'year': '2023',
 'abstract': 'Recognizing whispered speech and converting it to normal speech creates many possibilities for speech interaction. Because the sound pressure of whispered speech is significantly lower than normal speech, it can be used for semi-silent speech interaction in public places without being audible to others. Converting whispers to normal speech also improves speech quality for people with speech and hearing impairments. However, conventional speech conversion techniques do not provide sufficient conversion quality and require speaker-dependent datasets consisting of pairs of whispered and normal speech utterances. To address these problems, we propose WESPER, a zero-shot, real-time whisper-to-normal speech conversion mechanism based on self-supervised learning. WESPER consists of a speech

In [44]:
# save json_result
import json

# FILEPATH: /Users/naoki/Library/CloudStorage/GoogleDrive-naoki.kimura.ac@gmail.com/My Drive/2023/summarize_arxv/playground.ipynb
# save json_result

with open("json_result.json", "w") as f:
    json.dump(json_result, f)


In [45]:
# load the json file and translate it to japanese

import json

with open('json_result.json', 'r') as f:
    json_result = json.load(f)

print(json_result)



{'title': 'WESPER: Zeroshot Realtime Whisper-to-Normal Voice Conversion for Whisper-based Speech Interactions', 'author': ['Jun Rekimoto'], 'jornal/conference': 'CHI ’ 23', 'year': '2023', 'abstract': 'Recognizing whispered speech and converting it to normal speech creates many possibilities for speech interaction. Because the sound pressure of whispered speech is significantly lower than normal speech, it can be used for semi-silent speech interaction in public places without being audible to others. Converting whispers to normal speech also improves speech quality for people with speech and hearing impairments. However, conventional speech conversion techniques do not provide sufficient conversion quality and require speaker-dependent datasets consisting of pairs of whispered and normal speech utterances. To address these problems, we propose WESPER, a zero-shot, real-time whisper-to-normal speech conversion mechanism based on self-supervised learning. WESPER consists of a speech-to-

In [34]:
# add text to json_summary

json_summary.update({'text': text})

#show
json_summary

AttributeError: 'str' object has no attribute 'update'