In [1]:
# Gems this notebook depends on (install them from a shell before running the cells below):
# gem install spacy-ruby
# gem install ruby-openai
# gem install tiktoken_ruby
# gem install pandas-ruby
# gem install tqdm-ruby

## Function Calling with Azure OpenAI to extract organization names from documents

_For more function-calling examples, see https://github.com/Azure-Samples/openai/blob/main/Basic_Samples/Functions/working_with_functions.ipynb_

In [1]:
require 'openai'
require 'spacy'
require 'tiktoken'
require 'json'
require 'pandas'
require 'tqdm'

# Ruby's memoization for caching
require 'memoist'

# Initialize tiktoken encoder
# NOTE(review): `enc` is a top-level local variable. Methods defined with `def`
# do not capture top-level locals, so code inside a method cannot read `enc`
# directly.
enc = Tiktoken.encoding_for_model("gpt-3.5-turbo")

# Load spaCy models
# NOTE(review): `nlplg` is not referenced by any of the cells shown here; only
# `nlpsm` is used later — confirm whether the large model is still needed.
nlpsm = Spacy.load('en_core_web_sm')
nlplg = Spacy.load('en_core_web_lg')

# Load configuration
# Reads the API credentials from config.json (expects "api_key" and "api_base"
# keys) and applies them to the OpenAI client.
config = JSON.parse(File.read('config.json'))
# The block parameter must not be named `config`: that would shadow the parsed
# JSON hash above, and `config["api_key"]` would then index the configuration
# object instead of the hash.
# NOTE(review): the setter names below are kept as written; the ruby-openai
# README documents `access_token` / `uri_base` for Azure — confirm against the
# gem version in use.
OpenAI.configure do |openai_config|
    openai_config.api_key = config["api_key"]
    openai_config.api_base = config["api_base"]
    openai_config.api_version = "2023-07-01-preview"
    openai_config.api_type = "azure"
end

In [2]:
# Placeholder with the same name as the function schema advertised to the model
# in get_gpt_completion. It ignores its argument and always returns an empty
# array; no call to it appears in the cells shown here.
def extract_organization_entities(organization)
    []
end

extend Memoist

# Sends +context+ as a single user message to the chat endpoint together with
# the "extract_organization_entities" function schema, and returns whatever the
# client call returns. Temperature is 0.0 and function calling is left on
# "auto", so the model decides whether to answer with a function call.
def get_gpt_completion(context)
    # Schema for the single argument: an array of organization name strings.
    org_names_schema = {
        type: "array",
        items: {
            type: "string"
        },
        description: "The organization entity names"
    }

    # Function definition advertised to the model.
    extractor_spec = {
        name: "extract_organization_entities",
        description: "Extracts all the Organization (ORG) named entities from the context.",
        parameters: {
            type: "object",
            properties: {
                organization: org_names_schema
            },
            required: ["organization"]
        }
    }

    OpenAI::Client.chat(
        engine: "gpt-35-turbo-4k",
        temperature: 0.0,
        messages: [{ role: "user", content: context }],
        functions: [extractor_spec],
        function_call: "auto"
    )
end

# Cache completions per distinct +context+ so repeated calls reuse the result.
memoize :get_gpt_completion

In [11]:
# Asks the model to extract organizations from +context+ and returns the parsed
# function-call arguments (a Hash decoded from JSON).
#
# Returns nil when there is no response, when the first choice carries no
# function call, or when the call targets a function other than
# "extract_organization_entities".
def extract_organizations(context)
    completion = get_gpt_completion(context)
    return unless completion

    call = completion['choices'][0]['message']["function_call"]
    return unless call
    return unless call["name"] == "extract_organization_entities"

    # The arguments arrive as a JSON-encoded string.
    JSON.parse(call["arguments"])
end

In [4]:
# Load Excel file using pandas-ruby
# Reads the gold-label spreadsheet into `df` and prints it for inspection.
# NOTE(review): a later cell reads a 'context' value from each row — confirm
# the sheet provides that column.
df = Pandas::DataFrame.read_excel('data/organization_gold_labels.xlsx')
puts df

In [5]:
# Returns the number of tokens +context+ encodes to.
#
# context - the String to tokenize.
# encoder - optional object responding to `encode(text)` and returning a
#           collection; defaults to the tiktoken encoding for "gpt-3.5-turbo".
#
# The default encoder is built lazily inside the method because a `def` body
# cannot see top-level local variables, so a top-level `enc` local is not
# reachable from here.
def count_tokens(context, encoder: nil)
    encoder ||= (@token_encoder ||= Tiktoken.encoding_for_model("gpt-3.5-turbo"))
    encoder.encode(context).length
end

# Quick sanity check: print the token count of a short sample string.
puts count_tokens('hello world!')

In [12]:
# Locates every case-insensitive occurrence of each organization name in
# +context+ and returns the character offsets.
#
# context       - the String to search.
# organizations - an Array of organization name Strings.
#
# Returns an Array of Hashes. Each occurrence yields
# { "PARTY" => name, "start" => begin_offset, "end" => end_offset }; a name
# with no occurrence yields { "PARTY" => name, "error" => "not found" } and a
# message is printed.
def sync_offsets(context, organizations)
    offsets = []
    organizations.each do |org|
        pattern = /#{Regexp.escape(org)}/i
        matches = []

        # Use the block form of scan: Regexp.last_match is only valid for the
        # current occurrence while scan is yielding. Mapping over scan's result
        # afterwards would see the final match for every element.
        context.scan(pattern) do
            found = Regexp.last_match
            matches << {
                "PARTY" => org,
                "start" => found.begin(0),
                "end" => found.end(0)
            }
        end

        if matches.empty?
            puts "No match found for \"#{org}\" in context"
            offsets << { "PARTY" => org, "error" => "not found" }
        else
            offsets.concat(matches)
        end
    end
    offsets
end

# Run the extraction over every row of `df` and collect [context, offsets]
# pairs for the spaCy comparison step.
context_offset_tuples = []
df.each_row do |row|
    context = row['context']
    # NOTE(review): token_count is computed but never read in this cell.
    token_count = count_tokens(context)
    # nil unless the model answered with an extract_organization_entities call.
    function_args = extract_organizations(context)
    puts "GPT response: #{function_args}"

    # Rows for which no 'organization' list came back are skipped entirely.
    if function_args && function_args['organization']
        gpt_orgs = function_args['organization']
        offsets = sync_offsets(context, gpt_orgs)
        puts "Offsets: #{offsets}"
        context_offset_tuples << [context, offsets]
    end
end

In [15]:
# Converts character-offset entities into token-aligned spans over +doc+.
#
# doc               - object exposing `tokens`; each token responds to `i`,
#                     `idx` and `text`.
# original_entities - Array of Hashes read via the 'start', 'end' and 'PARTY'
#                     keys.
#
# An entity becomes a span only if some token begins exactly at its 'start'
# offset and some token finishes exactly at its 'end' offset; all other
# entities are dropped. Returns the Array of Spacy::Span objects, labelled with
# the entity's 'PARTY' value.
def get_spacy_true_positives(doc, original_entities)
    original_entities.each_with_object([]) do |entity, spans|
        first_index = nil
        stop_index = nil

        doc.tokens.each do |tok|
            first_index = tok.i if tok.idx == entity['start']
            # Span ends are exclusive, hence the + 1.
            stop_index = tok.i + 1 if tok.idx + tok.text.length == entity['end']
        end

        next unless first_index && stop_index

        spans << Spacy::Span.new(doc, first_index, stop_index, label: entity['PARTY'])
    end
end

# Returns the entities in +doc.ents+ that are labelled 'ORG' and whose
# [start, end] pair does not coincide with any span in +true_positives+.
#
# doc            - object exposing `ents`; each entity responds to `label`,
#                  `start` and `end`.
# true_positives - collection of spans responding to `start` and `end`.
def get_spacy_false_positives(doc, true_positives)
    confirmed_ranges = true_positives.map { |span| [span.start, span.end] }
    unmatched = []

    doc.ents.each do |candidate|
        next unless candidate.label == 'ORG'
        next if confirmed_ranges.include?([candidate.start, candidate.end])

        unmatched << candidate
    end

    unmatched
end

# Run the small spaCy pipeline over the collected pairs and print, per document,
# the spans built from the GPT offsets (TRUE) and the remaining spaCy 'ORG'
# entities that do not coincide with them (FALSE).
# NOTE(review): assumes `pipe` accepts [context, offsets] pairs and yields
# [doc, offsets] pairs — confirm against the spaCy binding in use.
nlpsm.pipe(context_offset_tuples).each do |doc, orgs|
    spacy_true_positives = get_spacy_true_positives(doc, orgs)
    spacy_false_positives = get_spacy_false_positives(doc, spacy_true_positives)

    spacy_true_positives.each do |ent|
        puts "TRUE: '#{ent}', start:#{ent.start}, end:#{ent.end}"
    end

    spacy_false_positives.each do |ent|
        puts "FALSE: '#{ent}', start:#{ent.start}, end:#{ent.end}"
    end
end