<a href="https://colab.research.google.com/github/vifirsanova/phat-llm/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

0. **Download and Prepare the Data:**
   - Transcribe a set of audio recordings with OpenAI Whisper
   - Annotate the audio files with IPA transcriptions via GPT-4
   - Use Praat and ELAN-annotated speech samples

In [None]:
!pip install transformers
!pip install datasets
!pip install git+https://github.com/huggingface/peft.git
!pip install praatio
!pip install pydub
from google.colab import drive
drive.mount('/content/drive')
!git clone https://github.com/vifirsanova/phat-llm.git
%cd phat-llm

1. **Load the Pre-trained Model:**

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Checkpoint identifier shared by every cell that (re)loads Whisper.
model_name = "openai/whisper-base"

# The processor and the seq2seq model are loaded from the same checkpoint
# name; the two loads are independent of each other.
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

2. **Add LoRA Adapters:**

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA hyperparameters:
#   r              - rank of the low-rank approximation
#   lora_alpha     - scaling factor
#   lora_dropout   - dropout probability
#   target_modules - modules that receive LoRA adapters
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
)

# Wrap the model loaded above with the adapters described by lora_config.
model = get_peft_model(model, lora_config)

3. **Prepare the Training Data:**

In [None]:
from datasets import load_dataset

def preprocess_function(examples):
    """Turn a batch of raw examples into Whisper model inputs.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch dict with an ``"audio"`` column and a ``"text"``
            column. Each audio entry may be either a decoded-audio dict
            holding the waveform under ``"array"`` or the raw waveform itself.
            NOTE(review): the waveforms are assumed to already be sampled at
            16 kHz - confirm against the dataset (or cast the column first).

    Returns:
        Dict with ``"input_features"`` (one log-mel feature array per example)
        and ``"labels"`` (one list of token ids per example). Labels are left
        unpadded so that padding can be done per batch at collation time.
    """
    # The feature extractor needs waveforms, not the decoded-audio dicts.
    waveforms = [
        clip["array"] if isinstance(clip, dict) else clip
        for clip in examples["audio"]
    ]
    input_features = processor.feature_extractor(
        waveforms, sampling_rate=16000
    ).input_features

    # Call the tokenizer directly instead of the deprecated
    # `processor.as_target_processor()` context manager. No `return_tensors`:
    # transcripts differ in length, so stacking them into one tensor without
    # padding would raise.
    labels = processor.tokenizer(examples["text"]).input_ids

    return {"input_features": input_features, "labels": labels}

def load_and_preprocess_dataset(dataset_name, task):
    """Load a dataset and return its preprocessed ``train`` split.

    Args:
        dataset_name: Name or path handed to ``load_dataset``.
        task: Task label supplied by the caller; not used by this function.

    Returns:
        The ``train`` split after ``preprocess_function`` has been mapped over
        it in batched mode.
    """
    train_split = load_dataset(dataset_name)["train"]
    return train_split.map(preprocess_function, batched=True)

# One preprocessed training set per annotation task, loaded in the order
# IPA -> prosody -> non-verbal.
ipa_dataset, prosody_dataset, non_verbal_dataset = (
    load_and_preprocess_dataset(dataset_name, task)
    for dataset_name, task in (
        ('ipa_transcription_dataset', 'ipa'),
        ('prosody_dataset', 'prosody'),
        ('non_verbal_dataset', 'non_verbal'),
    )
)

4. **Fine-Tune the Model for Each Task:**

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

def _pad_speech_batch(features, decoder_start_token_id):
    """Collate preprocessed examples into one padded seq2seq batch.

    Only the ``"input_features"`` and ``"labels"`` keys of each example are
    read, so extra dataset columns are ignored.
    """
    batch = processor.feature_extractor.pad(
        [{"input_features": feature["input_features"]} for feature in features],
        return_tensors="pt",
    )
    label_batch = processor.tokenizer.pad(
        [{"input_ids": feature["labels"]} for feature in features],
        return_tensors="pt",
    )
    # Padding positions are set to -100 so that the loss ignores them.
    labels = label_batch["input_ids"].masked_fill(
        label_batch["attention_mask"].ne(1), -100
    )
    # If every sequence already starts with the decoder start token, drop it:
    # the model prepends it again when it shifts the labels during training.
    if (labels[:, 0] == decoder_start_token_id).all().cpu().item():
        labels = labels[:, 1:]
    batch["labels"] = labels
    return batch


def fine_tune_model(dataset, task, model_class, output_dir, num_train_epochs, per_device_train_batch_size, learning_rate):
    """Fine-tune a fresh LoRA-wrapped copy of the base model on one task.

    Args:
        dataset: Either a single preprocessed training split, or a
            dict-like collection of splits with a ``"train"`` entry and an
            optional ``"validation"`` entry.
        task: Task label supplied by the caller; not used by this function.
        model_class: Model class exposing ``from_pretrained``.
        output_dir: Directory for checkpoints.
        num_train_epochs: Number of training epochs.
        per_device_train_batch_size: Batch size, used for both training and
            evaluation.
        learning_rate: Optimizer learning rate.

    Returns:
        The fine-tuned, LoRA-wrapped model.
    """
    import dataclasses

    import torch

    base_model = model_class.from_pretrained(model_name)
    decoder_start_token_id = base_model.config.decoder_start_token_id
    model = get_peft_model(base_model, lora_config)

    # A single split has no "validation" entry to index into; only evaluate
    # when the caller actually supplied one.
    if isinstance(dataset, dict):
        train_split = dataset["train"]
        eval_split = dataset.get("validation")
    else:
        train_split, eval_split = dataset, None

    # `evaluation_strategy` was renamed to `eval_strategy` in newer
    # transformers releases; use whichever name this installation defines.
    argument_names = {field.name for field in dataclasses.fields(Seq2SeqTrainingArguments)}
    strategy_key = "eval_strategy" if "eval_strategy" in argument_names else "evaluation_strategy"

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        logging_dir="./logs",
        logging_steps=10,
        save_total_limit=2,
        save_strategy="epoch",
        # Mixed precision is only usable on a GPU.
        fp16=torch.cuda.is_available(),
        learning_rate=learning_rate,
        # The PEFT wrapper hides the base model's forward signature, so keep
        # all columns (the collator picks what it needs) and name the labels.
        remove_unused_columns=False,
        label_names=["labels"],
        **{strategy_key: "epoch" if eval_split is not None else "no"},
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=eval_split,
        # The processor is not a collator; pad features and labels per batch.
        data_collator=lambda features: _pad_speech_batch(features, decoder_start_token_id),
    )

    trainer.train()
    return model

# Train one adapter-wrapped Whisper model per task with identical settings:
# 3 epochs, batch size 16, learning rate 5e-5.
ipa_model, prosody_model, non_verbal_model = (
    fine_tune_model(task_dataset, task, WhisperForConditionalGeneration, output_dir, 3, 16, 5e-5)
    for task_dataset, task, output_dir in (
        (ipa_dataset, 'ipa', './results/ipa'),
        (prosody_dataset, 'prosody', './results/prosody'),
        (non_verbal_dataset, 'non_verbal', './results/non_verbal'),
    )
)

5. **Evaluate the Models:**

In [None]:
def evaluate_model(model, dataset):
    """Compute and print the mean loss of ``model`` over ``dataset``.

    The model itself has no ``evaluate()`` method, so the loss is computed
    here with a forward pass per example (batch size 1, which needs no
    padding).

    Args:
        model: Model whose forward pass accepts ``input_features`` and
            ``labels`` and returns an object with a ``loss`` attribute.
        dataset: Iterable of examples, each providing ``"input_features"``
            and ``"labels"``.

    Returns:
        Dict with ``"eval_loss"`` (mean per-example loss) and
        ``"eval_samples"`` (number of examples evaluated).

    Raises:
        ValueError: If ``dataset`` yields no examples.
    """
    import torch

    reference_param = next(model.parameters())
    was_training = model.training
    model.eval()

    total_loss = 0.0
    n_examples = 0
    try:
        with torch.no_grad():
            for example in dataset:
                # Add the batch dimension and match the model's device/dtype.
                input_features = torch.as_tensor(example["input_features"]).unsqueeze(0).to(
                    device=reference_param.device, dtype=reference_param.dtype
                )
                labels = torch.as_tensor(example["labels"], dtype=torch.long).unsqueeze(0).to(
                    reference_param.device
                )
                outputs = model(input_features=input_features, labels=labels)
                total_loss += float(outputs.loss)
                n_examples += 1
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    if n_examples == 0:
        raise ValueError("Cannot evaluate on an empty dataset.")

    eval_results = {"eval_loss": total_loss / n_examples, "eval_samples": n_examples}
    print(eval_results)
    return eval_results

# Evaluate each task model against its own dataset, in the order
# IPA -> prosody -> non-verbal.
for _task_model, _task_dataset in (
    (ipa_model, ipa_dataset),
    (prosody_model, prosody_dataset),
    (non_verbal_model, non_verbal_dataset),
):
    evaluate_model(_task_model, _task_dataset)

6. **Inference for Each Task:**

In [None]:
def transcribe_audio(model, audio_path):
    """Decode an audio file and run generation on it with ``model``.

    Args:
        model: Fine-tuned model exposing ``generate``.
        audio_path: Path to the audio file to transcribe.

    Returns:
        List of decoded strings (one per generated sequence) with special
        tokens removed.
    """
    # Local import: decodes raw file bytes into a mono float waveform at the
    # requested sampling rate (requires the ffmpeg binary to be installed).
    from transformers.pipelines.audio_utils import ffmpeg_read

    # The processor expects waveform samples, not a file path.
    with open(audio_path, "rb") as audio_file:
        waveform = ffmpeg_read(audio_file.read(), 16000)

    audio_input = processor(waveform, sampling_rate=16000, return_tensors="pt")

    # Match the model's device and dtype, otherwise generation fails when the
    # model lives on the GPU after training.
    reference_param = next(model.parameters())
    input_features = audio_input["input_features"].to(
        device=reference_param.device, dtype=reference_param.dtype
    )

    # Whisper consumes `input_features`; `input_ids` is not its audio input.
    generated_ids = model.generate(input_features=input_features)
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return transcription

# Placeholder path: point this at the recording to annotate.
audio_path = '/path/to/your/audio/file.wav'

# Run the same recording through each task model, in the order
# IPA -> prosody -> non-verbal.
ipa_transcription, prosody_analysis, non_verbal_annotation = (
    transcribe_audio(task_model, audio_path)
    for task_model in (ipa_model, prosody_model, non_verbal_model)
)

print(f'IPA Transcription: {ipa_transcription}')
print(f'Prosody Analysis: {prosody_analysis}')
print(f'Non-Verbal Annotation: {non_verbal_annotation}')

7. **Convert to XML through Prompt-Tuning:**

In [None]:
def convert_to_xml(ipa_transcription, prosody_analysis, non_verbal_annotation, output_path):
    """Write the three annotation layers to an XML file.

    Args:
        ipa_transcription: IPA layer, as a string or a list/tuple of strings.
        prosody_analysis: Prosody layer, same accepted types.
        non_verbal_annotation: Non-verbal layer, same accepted types.
        output_path: Destination file; it is overwritten if it exists.

    Lists and tuples are joined with single spaces so the element text is the
    annotation itself rather than a Python ``repr``. All text is XML-escaped,
    and the file is written as UTF-8 because IPA symbols are non-ASCII.
    """
    from xml.sax.saxutils import escape

    def _as_xml_text(value):
        # Flatten list/tuple results into one string, then escape the
        # characters that would otherwise break the markup (&, <, >).
        if isinstance(value, (list, tuple)):
            value = " ".join(str(item) for item in value)
        return escape(str(value))

    xml_content = f"""
    <transcription>
        <ipa>{_as_xml_text(ipa_transcription)}</ipa>
        <prosody>{_as_xml_text(prosody_analysis)}</prosody>
        <non_verbal>{_as_xml_text(non_verbal_annotation)}</non_verbal>
    </transcription>
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(xml_content)

# Placeholder path: choose where the combined annotation file is written.
output_path = '/path/to/your/output/file.xml'

convert_to_xml(
    ipa_transcription=ipa_transcription,
    prosody_analysis=prosody_analysis,
    non_verbal_annotation=non_verbal_annotation,
    output_path=output_path,
)
print(f'XML saved to {output_path}')