In [1]:
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pdfminer.high_level import extract_text
import openai
import argparse

# Add your API key to a file named .openai_api.txt
with open(".openai_api.txt", "r") as f:
    openai.api_key = f.read().strip()

def remove_unnecessary_parts(text):
    text = re.sub(r"(?is)\b(acknowledgments|acknowledgement|references)\b.*", "", text)
    text = re.sub(r"(?is)\bappendix\b.*", "", text)
    return text

def preprocess_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # tokenization
    words = word_tokenize(text)

    # remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    return " ".join(words)

def reduce_length_with_gpt35(text):
    prompt = "Summarize this paper into 8000 tokens or less:"
    response = openai.Completion.create(
        engine="gpt-3.5-turbo-16k",
        prompt=prompt + text
    )
    return response.choices[0].text.strip()

def generate_summary_with_gpt4(text):
    prompt = "この論文の要約を生成してください："
    response = openai.ChatCompletion.create(
                model='gpt-4',
                messages=[
                    {'role': 'system', 'content': prompt},
                    {'role': 'user', 'content': text}
                ],
                temperature=0.25,
            )
    summary = response['choices'][0]['message']['content']
    return summary

In [2]:
# read text from target.text
def read_text_from_file(file_name):
    with open(file_name, "r") as f:
        text = f.read()
    return text

text = read_text_from_file("target.txt")

len(text)

25994

In [3]:
# Remove unnecessary parts
text = remove_unnecessary_parts(text)

text

'1. INTRODUCTION\nA Silent Speech Interface, or SSI, is defined as a device enabling speech processing in the absence of an exploitable audio signal – for example, speech recognition obtained exclusively from video images of the mouth, or from electromyographic sensors (EMA) glued to the tongue. Classic applications targeted by SSIs include: 1) Voice-replacement for persons who have lost the ability to vocalize through illness or an accident, yet who retain the ability to articulate; 2) Speech communication in environments where silence is either necessary or desired: responding to cellphone in meetings or public places without disturbing others; avoiding interference in call centers, conferences and classrooms; private communications by police, military, or business personnel.\nThe SSI concept was first identified as an outgrowth of speech production research, in tandem with the proliferation of the use of cellular telephones, in 2010 in a special issue of Speech Communication [1], wh

In [4]:
# Remove unnecessary parts
#text = remove_unnecessary_parts(text)

# check the length of the text
print(len(text.split()))

# Preprocess text
text = preprocess_text(text)

# check the length of the text
print(len(text.split()))

# write the preprocessed text to a file
with open("preprocessed.txt", "w") as f:
    f.write(text)

1641
1112
