In [None]:
import pandas as pd

df = pd.read_csv("/kaggle/input/ipa-feat/IPA_Features_Result.tsv", sep="\t")

# Reproducible 80/20 split: sample the training rows with a fixed seed and use
# whatever is left for validation.
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)


def write_parallel_corpus(frame, src_path, tgt_path):
    """Write the ASR_Error/Correct columns as two line-parallel text files.

    Line i of ``src_path`` and line i of ``tgt_path`` always come from the
    same row of ``frame``.

    The text is written verbatim instead of through ``to_csv``: the CSV writer
    wraps any value containing a comma or a double quote in extra quote
    characters, which would silently change the sentences, and it renders a
    missing value as a literal ``""`` token.

    Args:
        frame: DataFrame holding the ``ASR_Error`` and ``Correct`` columns.
        src_path: Output path for the ``ASR_Error`` (source) side.
        tgt_path: Output path for the ``Correct`` (target) side.
    """
    # A pair with a missing side is not a usable example; drop it from both
    # files together so the two outputs stay aligned line by line.
    pairs = frame.dropna(subset=["ASR_Error", "Correct"])
    for column, path in (("ASR_Error", src_path), ("Correct", tgt_path)):
        with open(path, "w", encoding="utf-8") as corpus_file:
            for text in pairs[column]:
                # An embedded line break would split one example across two
                # lines and shift every following pair.
                flat = str(text).replace("\r", " ").replace("\n", " ")
                corpus_file.write(flat + "\n")


write_parallel_corpus(
    train_df, "/kaggle/working/err_syl.train", "/kaggle/working/crr_syl.train"
)
write_parallel_corpus(
    val_df, "/kaggle/working/err_syl.val", "/kaggle/working/crr_syl.val"
)


In [None]:
!pip3 install OpenNMT-py

In [None]:
!pip install "numpy<2"

In [None]:
# System libraries installed before building fast_align.
# -y answers apt-get's confirmation prompt; a notebook cell has no interactive
# stdin, so without it the install can stop at "Do you want to continue?".
!sudo apt-get install -y libgoogle-perftools-dev libsparsehash-dev

In [None]:
# Fetch and build fast_align; the binary ends up at fast_align/build/fast_align.
# The cell is safe to re-run: the clone is skipped when the checkout already
# exists and mkdir -p does not fail on an existing directory.
!test -d fast_align || git clone https://github.com/clab/fast_align.git

!mkdir -p fast_align/build
# The directory change happens inside the shell command only, so the kernel's
# working directory is left untouched and relative paths used by later cells
# keep resolving against the directory the notebook started in.
!cd fast_align/build && cmake .. && make -j4

In [None]:
!/kaggle/working/fast_align/build/fast_align \
    -i /kaggle/input/ipa-feat/IPA_Features_Result.tsv \
    -d -o -v > /kaggle/working/forward1.align

In [None]:
# Create the YAML configuration file used by the OpenNMT-py commands below.
# Outside a notebook you could instead write this file by hand (e.g. with nano).

# OpenNMT-py settings for the Transformer model, kept as one string.
# Everything between the triple quotes is file content, so notes about the
# settings are kept out here instead of inside the string:
# - save_data and the vocabulary files use absolute /kaggle/working paths,
#   while save_model and tensorboard_log_dir are relative paths and therefore
#   depend on the working directory the training command is started from.
# - share_vocab is True although separate src_vocab and tgt_vocab paths are
#   listed.
# - NOTE(review): batch_size is 64 with batch_type "tokens", while
#   src_seq_length/tgt_seq_length are 200 — confirm that a 64-token batch is
#   intended (rather than 64 sentences or a larger token budget).
# - NOTE(review): corpus_1 supplies path_align, but no alignment-loss options
#   appear in this file — confirm the alignment file is actually consumed.
config = '''

# mm_spell_t.yaml

save_data: /kaggle/working/transformer
overwrite: True

src_vocab: /kaggle/working/transformer/mm_spell.vocab.src
tgt_vocab: /kaggle/working/transformer/mm_spell.vocab.tgt
vocab_size_multiple: 8
src_words_min_frequency: 1
tgt_words_min_frequency: 1
share_vocab: True
n_sample: 0

src_seq_length: 200
tgt_seq_length: 200

src_feats: 1
n_src_feats: 1
feat_merge: "mlp"
feat_vec_size: 512

data:
    corpus_1:
        path_src: /kaggle/working/err_syl.train
        path_tgt: /kaggle/working/crr_syl.train
        path_align: /kaggle/working/forward1.align
        transforms: [inferfeats, filtertoolong]
        weight: 1
    valid:
        path_src: /kaggle/working/err_syl.val
        path_tgt: /kaggle/working/crr_syl.val
        transforms: [inferfeats]

reversible_tokenization: "joiner"

save_model: ./transformer/working/model/transformer.mmspell
save_checkpoint_steps: 10000
keep_checkpoint: 10
average_decay: 0
seed: 3435
train_steps: 200000
valid_steps: 10000
warmup_steps: 5000
report_every: 1000
early_stopping: 4


decoder_type: transformer
encoder_type: transformer
word_vec_size: 512
hidden_size: 512
enc_layers: 4
dec_layers: 4
transformer_ff: 2048
heads: 8

accum_count: 4

model_dtype: "fp16"
optim: adam
adam_beta1: 0.9
adam_beta2: 0.998
decay_method: noam
learning_rate: 0.1
max_grad_norm: 0.0

batch_size: 64
batch_type: tokens
normalization: tokens
dropout_steps: [0]
dropout: [0.3]
attention_dropout: [0.3]
position_encoding: true
label_smoothing: 0.1

max_generator_batches: 2

param_init: 0.0
param_init_glorot: true

world_size: 1
gpu_ranks: [0]

tensorboard: true
tensorboard_log_dir: ./transformer/logs


'''

with open("mm_spell_t.yaml", "w+") as config_yaml:
  config_yaml.write(config)

!cat mm_spell_t.yaml

In [None]:
# Build the vocabulary files for the corpora declared in mm_spell_t.yaml.
# "-n_sample -1" is passed explicitly even though the YAML sets "n_sample: 0";
# presumably the command-line value wins and -1 means "use every line of each
# corpus" — confirm against the OpenNMT-py documentation.
!onmt_build_vocab -config mm_spell_t.yaml -n_sample -1

In [None]:
vocab_path = "/kaggle/working/transformer/mm_spell.vocab.src"

# The vocabulary holds IPA symbols, so read and write it as UTF-8 explicitly
# instead of relying on the platform's default encoding.
with open(vocab_path, "r", encoding="utf-8") as f:
    lines = f.readlines()


def _is_token_count_line(line):
    """Return True when the line is exactly "<token> <count>" with a numeric count."""
    parts = line.strip().split()
    return len(parts) == 2 and parts[1].isdigit()


# Keep only well-formed "<token> <count>" entries; anything else (an empty
# token, or a token that itself contains whitespace) is removed. The lines to
# keep are computed before the file is reopened for writing, so the original
# content is fully in memory before the file is truncated.
kept_lines = [line for line in lines if _is_token_count_line(line)]

with open(vocab_path, "w", encoding="utf-8") as f:
    f.writelines(kept_lines)

In [None]:
# Train the model with the settings from mm_spell_t.yaml; the file name is
# relative, so it is looked up in the current working directory.
!onmt_train -config mm_spell_t.yaml


In [None]:
import os
import re

model_dir = "./transformer/working/model/"


def _checkpoint_sort_key(filename):
    """Order checkpoint files by the step number that precedes ".pt".

    Returns a ``(step, filename)`` tuple. Files without a trailing number get
    step -1, so they sort before every numbered checkpoint and fall back to
    alphabetical order among themselves.
    """
    match = re.search(r"(\d+)\.pt$", filename)
    step = int(match.group(1)) if match else -1
    return (step, filename)


checkpoints = [f for f in os.listdir(model_dir) if f.endswith(".pt")]
if not checkpoints:
    raise FileNotFoundError(f"No .pt checkpoints found in {model_dir}")

# Compare step numbers as integers: a plain string sort would rank
# "..._step_90000.pt" after "..._step_100000.pt" and pick the wrong file.
latest_checkpoint = max(checkpoints, key=_checkpoint_sort_key)
latest_model_path = os.path.join(model_dir, latest_checkpoint)

print("Using model:", latest_model_path)

In [None]:
import os
from huggingface_hub import HfApi, upload_file

model_dir = "./transformer/working/model/"

# Every ".pt" file in the model directory, in alphabetical order.
checkpoints = sorted(name for name in os.listdir(model_dir) if name.endswith(".pt"))

print(f"Found {len(checkpoints)} checkpoints: {checkpoints}")

repo_id = "LULab/whisper-synde-align-IPA-checkpoints"
# No token is passed here; presumably authentication comes from a previously
# stored Hugging Face login or environment token — confirm before running.
api = HfApi()
# exist_ok=True makes this a no-op when the repository is already there.
api.create_repo(repo_id, repo_type="model", exist_ok=True)

# Map each local checkpoint path to its destination under "checkpoints/" in
# the repository, then upload the files one at a time in that order.
upload_targets = {
    os.path.join(model_dir, name): f"checkpoints/{name}" for name in checkpoints
}
for ckpt_path, remote_path in upload_targets.items():
    print(f"Uploading {ckpt_path}...")
    upload_file(
        path_or_fileobj=ckpt_path,
        path_in_repo=remote_path,
        repo_id=repo_id,
        repo_type="model",
    )

print("All checkpoints uploaded.")
