In [None]:
# All imports live together at the top of the setup cell: stdlib first,
# then third-party packages.
import json
import os

import nltk
import openai
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available before it is used below.
nltk.download('stopwords')

# Tokenizer that keeps runs of word characters (regex \w+) as tokens.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# Prefer the key from the environment so a real credential never has to be
# saved inside the notebook; 'key' stays as the placeholder fallback.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'key')

In [None]:
# Directory that is scanned for the input .txt files.
input_data_path = './data'

In [None]:
# Read .txt files and arrange them in a list of dicts: [{"text": ...}, ...].
# Each file is expected to contain a title, a blank line, then the body text.
data = []  # defined up front so later cells do not hit a NameError when the folder is missing
if os.path.isdir(input_data_path):
    # sorted() makes the sample order reproducible; os.listdir order is arbitrary.
    file_list = sorted(os.listdir(input_data_path))
    for file_name in file_list:
        if not file_name.endswith(".txt"):
            continue
        file_path = os.path.join(input_data_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # partition() never raises, unlike unpacking split('\n\n', 1), which
        # fails with ValueError on a file that has no blank line.
        title, separator, text = content.partition('\n\n')
        if not separator:
            # No blank line: there is no title, the whole file is the body.
            title, text = '', content
        data.append({"text": text})
else:
    print(f"Input directory not found: {input_data_path!r}; no files were loaded.")

In [None]:
# Preprocess text: flatten line breaks, tokenize, then drop stop words and
# non-alphanumeric tokens. Each sample's "text" is replaced in place by the
# cleaned, space-joined string.
stopwords_set = set(stopwords.words('english'))  # loop-invariant, so built once
for data_sample in data:
    # 1. remove unnecessary formatting; replacing with a space (not '') keeps
    #    the words on either side of a line break from being glued together
    flattened_text = data_sample["text"].replace('\n', ' ')
    # 2. tokenize text and remove stopwords and special characters
    tokenized_text = tokenizer.tokenize(flattened_text)
    # Both conditions are applied to the same token stream in one pass, so the
    # stop-word filter is no longer overwritten by the alphanumeric filter.
    kept_tokens = [
        token for token in tokenized_text
        # NLTK's English stop words are lowercase, so compare case-insensitively
        if token.lower() not in stopwords_set and token.isalnum()
    ]
    data_sample["text"] = ' '.join(kept_tokens)

In [None]:
# Candidate industries offered to the model as labels. Adapt to task needs.
business_entities = [
    'agriculture',
    'mining',
    'oil and gas extraction',
    'construction',
    'manufacturing',
    'retail',
    'information',
    'finance',
    'real estate',
    'health care',
    'public administration',
    'automotive',
    'aerospace',
    'transportation',
    'telecommunications',
    'banking',
    'security',
    'insurance',
    'rental and leasing services',
    'legal services',
    'accounting',
    'tax preparation',
    'architecture',
    'engineering',
    'office administrative services',
    'education',
    'arts and entertainment',
    'food services',
    'defence',
]


In [None]:
# Ask the chat model to pick, for every note in `data`, the best-fitting
# industry from `business_entities`. Results are collected as
# {'TEXT': <note text>, 'ENTITY': <raw model reply>} rows.

# prompt - define system role (identical for every note, so it is built once)
system_prompt = f"""You are a business entity extractor. Analyze the given text and suggest the business industry from provided list that best fits the text.\
           business entities: {business_entities},
           You are given a text in a format 'text:' ''' text to analyze '''.
           Respond in a pattern: ''' choosen entity ''' """

business_notes_with_entities = []
for note in data:
    note_text = note['text']
    messages = [
        {"role": "system", "content": system_prompt},
        # Wrap the note in the input format that the system prompt announces.
        {"role": "user", "content": f"text: ''' {note_text} '''"},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        # Classification task: turn sampling randomness down to reduce
        # run-to-run variation in the chosen entity.
        temperature=0,
    )
    GPT_reply = response["choices"][0]["message"]["content"]
    # Each request is independent, so the reply is stored directly rather than
    # appended to a conversation history that would be discarded anyway.
    business_notes_with_entities.append({'TEXT': note_text, 'ENTITY': GPT_reply})

In [None]:
# Persist the model's answers: serialise the collected rows as JSON on disk.
output_data_path = './extracted-entities'
with open(output_data_path, 'w') as results_handle:
    serialised_rows = json.dumps(business_notes_with_entities)
    results_handle.write(serialised_rows)