In [None]:
# Install required libraries

# If you get any errors about missing libraries in other cells,
# just add the name of the library to the install statement

!pip install openai PyGithub github

In [1]:
# Import libraries

import os
import requests
from github import Github
from github import Auth
from openai import OpenAI

In [3]:
# Mount Google Drive at /content/drive so that output files can be saved there.
# `google.colab` is imported here rather than with the other imports; it is
# presumably only available inside the Colab runtime.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Prompt for the OpenAI API key without echoing it, then publish it through
# the OPENAI_API_KEY environment variable for the rest of the notebook.

from getpass import getpass

api_key = getpass('Enter your OpenAI API key: ')
os.environ.update({'OPENAI_API_KEY': api_key})

Enter your OpenAI API key: ··········


In [9]:
# Where to fetch transcripts/scores from, if they need to be downloaded
# from a GitHub repository.

# Repository to pull from; "Adam" can also be replaced with "Eve" or "Sarah".
GITHUB_REPO_URL = "https://github.com/lfedronic/Adam"

# Folder of transcripts inside that repository (replace with the desired folder).
GITHUB_FOLDER_PATH = ""

In [20]:
# Configuration for the OpenAI chat-completion requests

MAX_TOKENS = 8_000  # Sized for the current prompt so that the model fully completes ipsyn
TEMP = 0
MODEL_NAME = "gpt-4o-mini"
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")



In [10]:
# Function to download files from GitHub

def download_files_from_github(repo_url, folder_path):
    """Download every ``.txt`` file in one folder of a GitHub repository.

    Files are written to the current working directory under their original
    names. Entries that are not plain files (e.g. sub-directories) and files
    without a ``.txt`` extension are skipped.

    Authentication: a personal access token is read from the ``GITHUB_TOKEN``
    environment variable when it is set; otherwise the GitHub API is called
    without credentials.

    Args:
        repo_url: Repository URL such as ``https://github.com/<owner>/<repo>``.
            A trailing slash or a ``.git`` suffix is tolerated.
        folder_path: Path of the folder inside the repository to list.

    Raises:
        requests.HTTPError: If downloading a file returns an error status.
        Errors raised by PyGithub while resolving the repository or listing
        the folder propagate unchanged.
    """
    # Credentials must never be hard-coded in a shared notebook.
    # NOTE(review): a token used to be embedded here; it should be revoked.
    token = os.environ.get('GITHUB_TOKEN')
    g = Github(auth=Auth.Token(token)) if token else Github()

    # Reduce the URL to "<owner>/<repo>".
    trimmed_url = repo_url.rstrip('/')
    if trimmed_url.endswith('.git'):
        trimmed_url = trimmed_url[:-len('.git')]
    repo_name = '/'.join(trimmed_url.split('/')[-2:])

    repo = g.get_repo(repo_name)
    contents = repo.get_contents(folder_path)
    # A path that names a single file yields one object instead of a list.
    if not isinstance(contents, list):
        contents = [contents]

    for content_file in contents:
        if content_file.type == 'file' and content_file.name.endswith('.txt'):
            reply = requests.get(content_file.download_url, timeout=30)
            # Never save an HTTP error page as if it were a transcript.
            reply.raise_for_status()
            # Write the raw bytes so the saved file matches the repository copy.
            with open(content_file.name, 'wb') as f:
                f.write(reply.content)
            print(f'Downloaded {content_file.name}')

In [None]:
# Fetch the transcripts from the configured repository folder.
download_files_from_github(
    repo_url=GITHUB_REPO_URL,
    folder_path=GITHUB_FOLDER_PATH,
)

In [12]:
# Scoring tables inserted into the prompt, one multi-line string per group of
# items (N, V, Q and S item codes). Every line reads "<code>: <description>".
scoring_categories = [
    # N items
    '''N1: Noun
N2: Pronoun
N3: Modifier
N4:Two-word Noun Phrase
N5: Determiner + Noun
N6: Verb + Two-word Noun Phrase
N7: Noun Plural
N8: Two-word Noun Phrase + Verb
N9: Three-word Noun Phrase
N10: Noun Phrase + Adverb
N11: Bound Morpheme''',
    # V items
    '''V1: Verb
V2: Verb Particle or Preposition
V3: Prepositional Phrase
V4: Noun + Copula + Noun
V5: Catenative
V6: Auxiliary Be, Do, Have
V7: Progressive -ing
V8: Adverb
V9: Modal + Verb
V10: Third-person Singular Present
V11: Past Tense Modal
V12: Regular Past Tense
V13: Past Tense Auxiliary
V14: Medial Adverb
V15: Ellipsis
V16: Past Copula
V17: Bound Morpheme''',
    # Q items
    '''Q1: contains a question mark
Q2: wh-word alone; routine question with or without a verb
Q3: Simple Negation
Q4: Wh-Question + Verb
Q5: Subject + negation + Verb
Q6: Wh-Question with Subject-Auxiliary inversion
Q7: Negation copula, modal or auxiliary
Q8: Yes/No Question with Subject–Auxiliary Inversion
Q9: Wh-Question
Q10: Tag Question
Q11: Negation Question with Subject–Auxiliary Inversion''',
    # S items
    '''S1: Any two words
S2: Subject + Verb
S3: Verb + Object
S4: Subject + Verb + Object
S5: Any Conjunction
S6: Any Two Verbs
S7: Conjoined Phrases
S8: Infinitive
S9: Let/Make/Help/Watch
S10: Subordinating Conjunction
S11: Mental State Verb
S12: Conjoined Clauses
S13: If or Wh-Clause
S14: Bitransitive Predicate
S15: Three or More Verbs
S16: Relative Clause
S17: Infinitival Clause
S18: Gerund
S19: Left or Center-Embedded Clause
S20: Passive''',
]

# Answer templates shown to the model, parallel to scoring_categories: one
# "<code>, #" line per item, in the same order. They are derived from the
# tables above (the code is the text before the first colon) so the two
# lists cannot drift apart.
category_templates = [
    '\n'.join(f"{entry.split(':', 1)[0]}, #" for entry in table.splitlines())
    for table in scoring_categories
]

In [13]:
# Function to process transcript and call OpenAI API

def _split_into_chunks(lines, split_factor):
    """Split ``lines`` into at most ``split_factor`` consecutive, non-empty chunks."""
    chunk_size = len(lines) // split_factor
    chunks = [
        lines[chunk_size * i:chunk_size * (i + 1)]
        for i in range(split_factor - 1)
    ]
    # The final chunk runs to the end of the transcript so that the remainder
    # left by the integer division is scored instead of silently dropped.
    chunks.append(lines[chunk_size * (split_factor - 1):])
    # Transcripts shorter than split_factor produce empty leading chunks;
    # drop them rather than paying for requests that contain no transcript.
    return [chunk for chunk in chunks if chunk]


def _build_prompt(category, category_template, chunk_text):
    """Fill the scoring prompt for one scoring table and one transcript chunk."""
    return f"""Objective:
You are to evaluate the following transcript according to the Index of Productive Syntax. Specifically, you should go through the transcript one line at a time, checking for the presence of each grammatical item in the scoring table. You should keep track of how many times each scoring table item is satisfied by the line from the transcript. Each instance adds one to the item’s score. Do not award points based on implied or inferred words; just use what is explicitly stated in the transcript.

Scoring Table:
{category}

Transcript:
{chunk_text}

Steps:
1) Go through the transcript line by line, checking for the presence of each grammatical item. For each item you encounter, **only list the item(s) positively identified and their scores**, like so:
1. mine .
   - N2: 1 (Pronoun)

2. Mommy .
   - N1: 1 (Noun)
2) Once you have finished parsing the entire transcript, compile each item's score
3) Finally, present the final score adhering to this exact format, making sure to include every item even if its score is 0:
Final score:
{category_template}

"""


def process_transcript(file_name, split_factor):
    """Score one transcript with the OpenAI chat API, one request per table and chunk.

    The transcript is read line by line and split into up to ``split_factor``
    consecutive chunks. For every pair from ``scoring_categories`` /
    ``category_templates`` and every chunk, a prompt is sent to ``MODEL_NAME``
    with ``MAX_TOKENS`` and ``TEMP``.

    Args:
        file_name: Path of the transcript text file.
        split_factor: Number of parts to split the transcript into (>= 1).

    Returns:
        The model replies concatenated in request order, each followed by a
        newline. An empty transcript yields an empty string.

    Raises:
        ValueError: If ``split_factor`` is smaller than 1.
    """
    if split_factor < 1:
        raise ValueError(f'split_factor must be at least 1, got {split_factor!r}')

    client = OpenAI()
    with open(file_name, 'r', encoding='utf-8') as file:
        transcript = file.readlines()

    # The chunking does not depend on the scoring table, so compute it once.
    chunks = _split_into_chunks(transcript, split_factor)

    replies = []
    for category, category_template in zip(scoring_categories, category_templates):
        for chunk in chunks:
            # Send the transcript as plain text; interpolating the list itself
            # would put its Python repr (brackets, quotes, "\n") in the prompt.
            chunk_text = ''.join(chunk).rstrip('\n')
            prompt = _build_prompt(category, category_template, chunk_text)

            completion = client.chat.completions.create(
                messages=[
                    {"role": "user", "content": prompt}
                ],
                model=MODEL_NAME,
                max_tokens=MAX_TOKENS,
                temperature=TEMP
            )

            # The message content can be None; treat that as an empty reply.
            replies.append((completion.choices[0].message.content or '') + "\n")

    return ''.join(replies)


In [14]:
# Transcripts to evaluate. Feel free to replace these with whichever files you
# want to run; just make sure to use the 6-7 digit number that is in the filename.

test_suite = [
    '0203041',
    '0207011',
    '0211131',
    '0303041',
    '0307071',
    '0311141',
    '0403091',
    '0407011',
    '0502122',
]

In [24]:
# Split factor determines how many parts each transcript is split into.
# Empirically, raising the split factor raises both the scores and the
# Spearman coefficient. Note that a higher split factor also takes longer to run.

SPLIT_FACTOR = 4
OUTPUT_FOLDER = '/content/drive/MyDrive'
FOLDER_PATH = '' # If you put files in a folder, put the folder name here

for test_id in test_suite:
    # os.path.join returns the bare file name when FOLDER_PATH is empty,
    # so no special case is needed.
    file_name = os.path.join(FOLDER_PATH, test_id + '_ipsyn.txt')
    result = process_transcript(file_name, SPLIT_FACTOR)
    # Name the output after the transcript's base name only. Embedding
    # FOLDER_PATH in it would point into a non-existent
    # "evaluation_<folder>" directory under OUTPUT_FOLDER.
    output_file_name = os.path.join(
        OUTPUT_FOLDER, f'evaluation_{os.path.basename(file_name)}'
    )
    # Model output may contain non-ASCII characters; fix the encoding explicitly.
    with open(output_file_name, 'w', encoding='utf-8') as output_file:
        output_file.write(result)
    print(f'Saved evaluation for {file_name} to {output_file_name}')


Saved evaluation for 0203041_ipsyn.txt to /content/drive/MyDrive/evaluation_0203041_ipsyn.txt


KeyboardInterrupt: 