In [22]:
import os
import openai
import jsonlines
import spacy
import spacy.displacy
import numpy as np
import bisect
import collections
import re

sail_openai_api_key = os.getenv("SAIL_OPENAI_API_KEY")

In [21]:
sail_openai_api_key

'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'

In [2]:
openai.api_key = sail_openai_api_key

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
movie_data = None
with jsonlines.open("/proj/sbaruah/mica_text_coref/data/example/coref_input.coref_output.jsonl", "r") as f:
    for obj in f:
        movie_data = obj

In [5]:
movie_data.keys()

dict_keys(['movie', 'rater', 'token', 'pos', 'ner', 'parse', 'speaker', 'dep_head', 'dep', 'sent_offset', 'clusters'])

In [6]:
clusters = movie_data["clusters"]
words = movie_data["token"]
parse = movie_data["parse"]
sentence_offsets = movie_data["sent_offset"]
ner = movie_data["ner"]

In [7]:
top_characters = sorted(clusters, key=lambda cluster: len(cluster), reverse=True)[:10]

In [8]:
scene_description_offsets = []
i = 0
while i < len(parse):
    if parse[i] == "N":
        j = i + 1
        while j < len(parse) and parse[j] == "N":
            j += 1
        scene_description_offsets.append([i, j - 1])
        i = j
    else:
        i += 1

In [9]:
scene_description_starts, scene_description_ends = [], []
for i, j in scene_description_offsets:
    scene_description_starts.append(i)
    scene_description_ends.append(j)

In [10]:
limit = 2048
character_names = []
character_descriptions = []

for character in top_characters:
    # find scene descriptions where character is mentioned
    character_description_ids = set()
    named_mentions = []
    for i, j in character:
        mention = " ".join(words[i: j + 1])
        is_person = len([ner[l] == "PERSON" for l in range(i, j + 1)]) > 0
        named_mentions.append(mention)

        k = bisect.bisect_right(scene_description_starts, i) - 1
        if scene_description_starts[k] <= i <= j <= scene_description_ends[k]:
            character_description_ids.add(k)
    
    character_description = ""
    for k in character_description_ids:
        description = " ".join(words[scene_description_starts[k]: scene_description_ends[k] + 1])
        if len(character_description.split()) + len(description.split()) < limit:
            character_description += description + "\n"
        else:
            break
    character_description = character_description.strip()

    named_mentions_counter = collections.Counter(named_mentions).items()
    named_mentions_counter = sorted(named_mentions_counter, key=lambda x: x[1], reverse=True)
    name = named_mentions_counter[0][0]
    print(name)

    n_tokens = len(character_description.split())
    print(f"{n_tokens} words in character description")
    
    character_names.append(name)
    character_descriptions.append(character_description)

COLIN
2038 words in character description
BILLY
1995 words in character description
COSTELLO
1964 words in character description
MADOLYN
888 words in character description
MISTER FRENCH
1211 words in character description
QUEENAN
658 words in character description
COLIN
601 words in character description
SGT . DIGNAM
298 words in character description
MADOLYN
335 words in character description
QUEENAN
256 words in character description


In [11]:
print(character_descriptions[0])

COLIN stands in the door like a gunfighter . Everyone looks at him .
YELLOW RIPPLES PAST THE CAMERA AND WHEN IT CLEARS WE SEE THROUGH DIESEL SMOKE : A BUSING PROTEST IN PROGRESS . THE SCHOOL - BUS , FULL OF BLACK KIDS , IS HIT WITH BRICKS , ROCKS . N . B . : ( THIS IS NOT SETTING THE LIVE ACTION IN 1974 ; IT IS A HISTORICAL MONTAGE , THE BACKGROUND FOR COSTELLO ' S V . O . ) .
COSTELLO ' s profile passes in a dark room .
The neighborhood . 1980 ' s . We won ' t be here long . This isn ' t where Costello ends up . It ' s where he began . Liquor stores with shamrocked signs . MEN FISHING near Castle Island . Catholic SCHOOLKIDS playing in an asphalted schoolyard .
COSTELLO comes in . The shop is one that sells papers , sundries , fountain drinks ... and fronts a bookie operation .
The frightened proprietor hands over money . Fifty bucks , a hundred , doesn ' t matter . COSTELLO is never the threatener . His demeanor is gentle , philosophical . Almost a shrink ' s probing bedside manner .

In [19]:
response = openai.Completion.create(
            model="text-davinci-003", 
            prompt=f"{character_descriptions[0]}\n\nFor each of the following attribute type, answer what attribute "
            f"{character_names[0]} portrays.\nAnswer NA if enough information is not available.\n\n"
            "1. Name\n2. Gender\n3. Race\n4. Age\n5. Appearance\n6. Profession\n8. Archetype\n9. Trope\n"
            "10. Personality\n11. Educational Qualification\n12. Goal\n13. Nationality\n14. Political Affiliation"
            "\n15. Religion",
            temperature=0,
            max_tokens=256,
            )

In [20]:
response

<OpenAIObject text_completion id=cmpl-6smMMnoiqIIEp4FQPtTTxuj5fJZhz at 0x7fd98779ca40> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": "\n\n1. Name: Colin\n2. Gender: Male\n3. Race: NA\n4. Age: 14-15, then 18-19, then mid-30s\n5. Appearance: Bookish, small for his age, vest\n6. Profession: Police Officer\n8. Archetype: Hero\n9. Trope: Gunfighter\n10. Personality: Gentle, philosophical, observant, analytical\n11. Educational Qualification: NA\n12. Goal: NA\n13. Nationality: NA\n14. Political Affiliation: NA\n15. Religion: Catholic"
    }
  ],
  "created": 1678513626,
  "id": "cmpl-6smMMnoiqIIEp4FQPtTTxuj5fJZhz",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 122,
    "prompt_tokens": 2560,
    "total_tokens": 2682
  }
}

In [17]:
prompt=(f"{character_description[0]}\n\nFor each of the following attribute type, answer what attribute "
            f"{character_names[0]} portrays.\nAnswer NA if enough information is not available.\n\n"
            "1. Name\n2. Gender\n3. Race\n4. Age\n5. Appearance\n6. Profession\n8. Archetype\n9. Trope\n"
            "10. Personality\n11. Educational Qualification\n12. Goal\n13. Nationality\n14. Political Affiliation"
            "\n15. Religion")

In [18]:
prompt

'H\n\nFor each of the following attribute type, answer what attribute COLIN portrays.\nAnswer NA if enough information is not available.\n\n1. Name\n2. Gender\n3. Race\n4. Age\n5. Appearance\n6. Profession\n8. Archetype\n9. Trope\n10. Personality\n11. Educational Qualification\n12. Goal\n13. Nationality\n14. Political Affiliation\n15. Religion'

In [26]:
response["choices"][0]["text"]

'\n\n1. Name: Colin\n2. Gender: Male\n3. Race: NA\n4. Age: 14-15, then 18-19, then mid-30s\n5. Appearance: Bookish, small for his age, vest\n6. Profession: Police Officer\n8. Archetype: Hero\n9. Trope: Gunfighter\n10. Personality: Gentle, philosophical, observant, analytical\n11. Educational Qualification: NA\n12. Goal: NA\n13. Nationality: NA\n14. Political Affiliation: NA\n15. Religion: Catholic'

In [28]:
attributes = ["Name", "Gender", "Race", "Age", "Appearance", "Profession", "Archetype", "Trope", "Personality", 
              "Educational Qualification", "Goal", "Nationality", "Political Affiliation", "Religion"]
for attribute in attributes:
    match = re.search(f"{attribute}: ([^\n]+)\n", response["choices"][0]["text"])
    if match is not None:
        print(f"{attribute} = {match.group(1)}")

Name = Colin
Gender = Male
Race = NA
Age = 14-15, then 18-19, then mid-30s
Appearance = Bookish, small for his age, vest
Profession = Police Officer
Archetype = Hero
Trope = Gunfighter
Personality = Gentle, philosophical, observant, analytical
Educational Qualification = NA
Goal = NA
Nationality = NA
Political Affiliation = NA
