# IPSYN with Meta LLaMa

In [1]:
!unzip transcripts.zip

Archive:  transcripts.zip
   creating: transcripts/Sarah/
  inflating: transcripts/Sarah/020305.cha  
  inflating: transcripts/Sarah/030216.cha  
  inflating: transcripts/Sarah/040128.cha  
  inflating: transcripts/Sarah/050106.cha  


In [2]:
!apt install -q nvidia-cuda-toolkit
!pip install -q -U accelerate bitsandbytes peft transformers

Reading package lists...
Building dependency tree...
Reading state information...
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libaccinj64-11.5 libatk-wrapper-java libatk-wrapper-java-jni
  libbabeltrace1 libcub-dev libcublas11 libcublaslt11 libcudart11.0 libcufft10 libcufftw10
  libcuinj64-11.5 libcupti-dev libcupti-doc libcupti11.5 libcurand10 libcusolver11 libcusolvermg11
  libcusparse11 libdebuginfod-common libdebuginfod1 libegl-dev libfontenc1 libgail-common libgail18
  libgl-dev libgl1-mesa-dev libgles-dev libgles1 libglvnd-core-dev libglvnd-dev libglx-dev
  libgtk2.0-0 libgtk2.0-bin libgtk2.0-common libipt2 libnppc11 libnppial11 libnppicc11 libnppidei11
  libnppif11 libnppig11 libnppim11 libnppist11 libnppisu11 libnppitc11 libnpps11 libnvblas11
  libnvidia-compute-495 libnvidia-compute-510 libnvidia-compute-535 libnvidia-ml-dev libnvjpeg11
  libnvrtc-builtins11.5 libnvrtc11.2 libnvtoolsext1 libnvvm4 libopengl-dev librsvg2-common
  l

In [3]:
import torch
from google.colab import userdata
from os import listdir, mkdir
from os.path import isdir, isfile, join
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [4]:
HF_TOKEN = userdata.get('HF_TOKEN')
model = 'meta-llama/Llama-3.2-1B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(model, token=HF_TOKEN, padding_side='left')
model = AutoModelForCausalLM.from_pretrained(model, load_in_4bit=True, device_map='auto', token=HF_TOKEN)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [5]:
pipe = pipeline(task='text-generation', model=model, tokenizer=tokenizer, max_new_tokens=118)
pipe.tokenizer.pad_token_id = model.config.eos_token_id[0]

In [6]:
ipsyn_category = [
'''
    N1: noun
    N2: pronoun or prolocative
    N3: modifier
    N4: two-word noun phrase
    N5: article + noun
    N6: verb + two-word noun phrase
    N7: plural noun
    N8: two-word noun phrase + verb
    N9: three-word noun phrase
    N10: adverb + noun phrase
    N11: bound morpheme in noun phrase
''',
'''
    V1: verb
    V2: particle or preposition
    V3: prepositional phrase
    V4: copula linking two nominals
    V5: catenative + verb
    V6: auxiliary be, do, or have
    V7: progressive suffix
    V8: adverb
    V9: modal + verb
    V10: third person singular present suffix
    V11: past tense modal
    V12: past tense suffix
    V13: past tense auxiliary
    V14: medial adverb
    V15: copula, modal, or auxiliary for emphasis or ellipsis
    V16: past tense copula
    V17: bound morpheme in verb phrase
''',
'''
    Q1: question
    Q2: wh-pronoun
    Q3: simple negation
    Q4: wh-pronoun + verb
    Q5: subject + negation + verb
    Q6: wh-question with inversion
    Q7: negation of copula, modal, or auxiliary
    Q8: yes/no question with inversion
    Q9: why, when, which, or whose
    Q10: tag question
    Q11: question with both negation and inversion
''',
'''
    S1: two word combination
    S2: subject + verb
    S3: verb + object
    S4: subject + verb + object
    S5: conjunction
    S6: two verb phrases
    S7: conjoined phrases
    S8: infinitive without catenative
    S9: let/make/help/watch introducer
    S10: adverbial conjunction
    S11: propositional complement
    S12: conjoined sentences
    S13: wh-clause
    S14: bitransitive predicate
    S15: three or more verb phrases
    S16: relative clause
    S17: infinitive clause
    S18: gerund
    S19: front or center-embedded subordinate clause
    S20: passive construction or intrusion
'''
]

ipsyn_whole = ''.join([ipsyn_category[i].rstrip() for i in range(len(ipsyn_category))]) + '\n'
ipsyn_phenomenon = ['\n' + phenomenon + '\n' for phenomenon in ipsyn_whole.strip('\n').splitlines()]

In [7]:
prompt = '''
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert in child language development. You will be presented with a list
of utterances said by a child and a list of linguistic phenomena in JSON format. Each
linguistic phenomenon will consist of an ID followed by the name of the phenomenon.
For each phenomenon, consider whether the utterance exhibits it. Respond with the IDs
of the phenomena exhibited by the excerpt in a comma-separated list and NOTHING ELSE.

For example, you may be presented with the following:
QUESTION: Which of the given linguistic phenomena are exhibited by these utterances?
UTTERANCES:
[
    in my room .
]
LINGUISTIC PHENOMENA:
[
    N1: Noun,
    N2: Pronoun,
    N3: Modifier,
    N4: Two-word Noun Phrase
]

In this case, your response should be: N1, N4
You SHOULD NOT include any other text in your response.
<|eot_id|><|start_header_id|>user<|end_header_id|>
QUESTION: Which of the given linguistic phenomena are exhibited by these utterances?
UTTERANCES: [{}]
LINGUISTIC PHENOMENA: [{}]
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
'''

In [14]:
def process_transcript(child, filename, utterance_chunk, ipsyn_chunk):
    utterances = []
    with open(join('transcripts', child, filename)) as transcript:
        utterances = ['\n    ' + line.removeprefix('*CHI:').strip() \
                      for line in list(transcript) if line.startswith('*CHI:')][:100]
    for i in range(100 // utterance_chunk):
        utterances = utterances[:i] + [''.join(utterances[i:i + utterance_chunk]) + '\n'] + \
                     utterances[i + utterance_chunk:]
    ipsyn = ipsyn_whole if ipsyn_chunk == 'whole' else \
            ipsyn_phenomenon if ipsyn_chunk == 'phenomenon' else ipsyn_category
    pairs = [(utterance, phenomena) for utterance in utterances for phenomena in ipsyn]

    response_header = '<|start_header_id|>assistant<|end_header_id|>'
    header_length = len(response_header)
    with open(join('output', child, filename.removesuffix('cha') + 'txt'), 'a') as output:
        for out in pipe([prompt.format(*pair) for pair in pairs], batch_size=128):
            text = out[0]['generated_text']
            output.write(text[text.index(response_header) + header_length:].strip() + '\n')

    complete_output = ''
    with open(join('output', child, filename.removesuffix('cha') + 'txt')) as output:
        complete_output = output.read()

    with open(join('scores', child + '.csv'), 'a') as scores:
        score = 0
        for phenomenon in ipsyn_phenomenon:
                score += min(complete_output.count(phenomenon[:phenomenon.index(':')].strip()), 2)
        scores.write(filename.removesuffix('.cha') + ',' + str(utterance_chunk) + ',' + \
                     ipsyn_chunk + ',' + str(score) + '\n')

def process_child(child, utterance_chunk, ipsyn_chunk):
    # mkdir(join('output', child))
    with open(join('scores', child + '.csv'), 'a') as scores:
        scores.write('age,utterance_chunk,ipsyn_chunk,score\n')
    filenames = [f for f in listdir(join('transcripts', child)) if isfile(join('transcripts', child, f))]
    for filename in filenames:
        process_transcript(child, filename, utterance_chunk, ipsyn_chunk)

def process_children(utterance_chunk, ipsyn_chunk):
    # mkdir('output')
    # mkdir('scores')
    children = [f for f in listdir('transcripts') if isdir(join('transcripts', f))]
    for child in children:
        process_child(child, utterance_chunk, ipsyn_chunk)

In [15]:
process_children(1, 'category')

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin