<a href="https://colab.research.google.com/github/whkwls2653/Emotion-Recognition/blob/main/wav2vec_local.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/gdrive; the model and dataset paths used by
# the later cells all live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


from sklearn.model_selection import train_test_split

# Hold out 25% of the records for testing; random_state pins the shuffle so
# the split is reproducible.
# NOTE(review): `allfile_datalist` is only assigned in a later cell of this
# notebook (the pickle-loading cell), so on a fresh kernel this cell depends
# on that cell having been run first — confirm the intended cell order.
dataset_train, dataset_test = train_test_split(
    allfile_datalist,
    test_size=0.25,
    random_state=0,
)
print(len(dataset_train), len(dataset_test), sep="\n")

In [None]:
!pip install transformers==4.11.3
!pip install torchaudio
!pip install librosa
!pip install jiwer
!pip install dataset

In [5]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import torch
import ast
from transformers import Wav2Vec2Processor
from torch.nn.utils.rnn import pad_sequence

@dataclass
class DataCollatorCTCWithPadding:
    """Collate features into a padded batch of audio inputs and label ids.

    Attributes:
        processor: Object used to tokenize label strings and to pad both the
            audio inputs and the label ids. It must be supplied by the caller.
        padding: Forwarded as ``padding`` to ``processor.pad``.
        max_length: Forwarded as ``max_length`` when padding the inputs.
        max_length_labels: Forwarded as ``max_length`` when padding the labels.
        pad_to_multiple_of: Forwarded when padding the inputs.
        pad_to_multiple_of_labels: Forwarded when padding the labels.
    """

    # This must be a *type* annotation. The previous annotation was a
    # `Wav2Vec2Processor.from_pretrained(...)` call, which loaded a processor
    # from Drive as a side effect of merely defining the class and never
    # supplied a default. Quoted so it is not evaluated at definition time.
    processor: "Wav2Vec2Processor"
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def mapping(self, data):
        """Convert one stored label into a 1-D tensor of token ids.

        Args:
            data: String holding a Python literal (parsed with
                ``ast.literal_eval``) that iterates over characters. Every
                ``'\\x1b'`` element is replaced by ``'|'`` before tokenizing.

        Returns:
            ``torch.Tensor`` of ids; any ``None`` id returned by the
            processor is mapped to 0.
        """
        with self.processor.as_target_processor():
            ret = self.processor("".join([i if i != '\x1b' else '|' for i in ast.literal_eval(data)])).input_ids
            ret_torch = torch.tensor([int(0 if value is None else value) for value in ret])
        return ret_torch

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a padded batch from ``features``.

        Each feature must provide ``'input_values'`` and ``'label'``; the
        label is converted with :meth:`mapping`. The returned batch is the
        padded inputs plus a ``"labels"`` entry.
        """
        # Split inputs and labels since they have to be of different lengths
        # and need different padding methods.
        input_features = [{"input_values": feature['input_values']} for feature in features]
        # e.g. feature['label'] = "ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ"
        label_features = [{"input_ids": self.mapping(feature["label"])} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Replace padded label positions (attention_mask != 1) with -100 so
        # the loss ignores them.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch


# Use when the transcripts have not yet been split into jaso (jamo) units;
# run it once and save the result.
class DataProc:
    """Turn Korean sentences into jaso sequences and tokenize them."""

    # Jamo lookup tables, indexed by the values decomposed from a syllable's
    # offset in the precomposed Hangul block.
    _CHOSUNGS = ('ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ')
    _JOONGSUNGS = ('ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ')
    # Index 0 means "no final consonant".
    _JONGSUNGS = ('', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ')
    _N_JOONGSUNGS = 21
    _N_JONGSUNGS = 28
    _FIRST_HANGUL = 0xAC00  # '가'
    _LAST_HANGUL = 0xD7A3   # '힣'

    def __init__(self, model_name="/content/gdrive/MyDrive/감정인식_대회/wave2vec2"):
        """Load the processor stored at ``model_name``."""
        self.processor = Wav2Vec2Processor.from_pretrained(model_name, pad_token_id=49)

    def to_jaso(self, sentence):
        """Decompose ``sentence`` into jaso and return the processor's ``input_ids``.

        Each precomposed Hangul syllable ('가'..'힣') becomes its initial,
        medial and (when present) final jamo. Every other character,
        including spaces and punctuation, becomes a single ``'|'``.
        """
        pieces = []
        for ch in sentence:
            codepoint = ord(ch)
            if self._FIRST_HANGUL <= codepoint <= self._LAST_HANGUL:
                # offset = (initial * 21 + medial) * 28 + final
                rest, final_idx = divmod(codepoint - self._FIRST_HANGUL, self._N_JONGSUNGS)
                initial_idx, medial_idx = divmod(rest, self._N_JOONGSUNGS)
                pieces.append(self._CHOSUNGS[initial_idx])
                pieces.append(self._JOONGSUNGS[medial_idx])
                if final_idx != 0:
                    pieces.append(self._JONGSUNGS[final_idx])
            else:
                pieces.append('|')

        jaso_text = "".join(pieces)
        with self.processor.as_target_processor():
            encoded = self.processor(jaso_text)

        return encoded.input_ids

    def prepare_dataset(self, df):
        """
        df.cols = ['audio', 'sentence', 'path']

        Adds a 'label' column holding ``to_jaso`` of each 'sentence' and
        returns the same (mutated) frame.
        """
        jaso_labels = df['sentence'].apply(self.to_jaso)
        df['label'] = jaso_labels
        return df

In [21]:
!pip install datasets
# import datasets
from datasets import load_metric, load_from_disk
# from data_proc import  get_senior_data
# from data_collator import DataCollatorCTCWithPadding
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import TrainingArguments, Trainer
from tqdm import tqdm
import torch
import ast
import pickle
import numpy as np
# import os
# os.environ['CUDA_LAUNCH_BLOCKING']='1'
# os.environ['CUDA_VISIBLE_DEVICES'] = "2,3"

# Directory holding the pretrained model/processor; train_model also uses it
# as the Trainer output_dir.
repo_name = '/content/gdrive/MyDrive/감정인식_대회/wave2vec2'

# Shared objects used by compute_metrics and train_model below.
wer_metric = load_metric("wer")
processor = Wav2Vec2Processor.from_pretrained(repo_name)
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

def compute_metrics(pred):
    """Compute word error rate for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (logits) and ``label_ids``.

    Returns:
        ``{"wer": <value returned by wer_metric.compute>}``.

    Note:
        ``pred.label_ids`` is modified in place: every -100 becomes 49.
    """
    label_ids = pred.label_ids
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    # as fleek model has 2 pad tokens
    label_ids[label_ids == -100] = 49

    hypotheses = processor.batch_decode(predicted_ids)
    # Labels are decoded with group_tokens=False.
    references = processor.batch_decode(label_ids, group_tokens=False)

    return {"wer": wer_metric.compute(predictions=hypotheses, references=references)}

def get_model(model_name="/content/gdrive/MyDrive/감정인식_대회/wave2vec2"):
    """Load a ``Wav2Vec2ForCTC`` from ``model_name`` configured for fine-tuning.

    The checkpoint is loaded with the overrides below. Afterwards
    ``freeze_feature_extractor()`` and ``gradient_checkpointing_enable()``
    are called on the model before it is returned.
    """
    config_overrides = dict(
        attention_dropout=0.1,
        hidden_dropout=0.1,
        feat_proj_dropout=0.0,
        mask_time_prob=0.05,
        layerdrop=0.1,
        ctc_loss_reduction="mean",
        pad_token_id=49,
        vocab_size=50,
        ignore_mismatched_sizes=True,
    )
    ctc_model = Wav2Vec2ForCTC.from_pretrained(model_name, **config_overrides)

    ctc_model.freeze_feature_extractor()
    ctc_model.gradient_checkpointing_enable()
    return ctc_model

def train_model(train, test, model):
    """Fine-tune ``model`` with a ``Trainer``.

    Args:
        train: Training dataset handed to the ``Trainer``.
        test: Evaluation dataset handed to the ``Trainer``.
        model: Model to train.

    Uses the module-level ``repo_name``, ``processor``, ``data_collator`` and
    ``compute_metrics``. Returns nothing.
    """
    # Evaluation and checkpointing share the same 300-step cadence.
    step_schedule = dict(
        evaluation_strategy="steps",
        save_steps=300,
        eval_steps=300,
        logging_steps=50,
        warmup_steps=300,
    )
    training_args = TrainingArguments(
        output_dir=repo_name,
        group_by_length=True,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=4,
        num_train_epochs=30,
        fp16=True,
        learning_rate=3e-4,
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        dataloader_num_workers=6,
        **step_schedule,
    )

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train,
        eval_dataset=test,
        tokenizer=processor.feature_extractor,
        data_collator=data_collator,
    )
    hf_trainer = Trainer(**trainer_kwargs)
    print(f"build trainer on device {training_args.device} with {training_args.n_gpu} gpus")
    hf_trainer.train()

# Entry point of the training cell: load the pickled record list, split it by
# position, build the model and start training.
if __name__ == "__main__":
    
    # torch.cuda.empty_cache()
    
    # dataset = load_from_disk('/content/gdrive/MyDrive/감정인식_대회/데이터셋/2019wav_txt_label_zip_toteval.pkl')
    # dataset = dataset.remove_columns(['wav', 'text', 'labels'])

    # train = dataset['train'] #.select([i for i in range(0,5000)])
    # test =  dataset['valid'] #.select([i for i in range(0,5000)])
  wav_txt_label_root='/content/gdrive/MyDrive/감정인식_대회/데이터셋/2019wav_txt_label_zip.pkl'
  # NOTE(security): pickle.load can execute arbitrary code; only load files
  # that this project produced itself.
  with open(wav_txt_label_root,'rb') as f:
    allfile_datalist=pickle.load(f)

  # print(len(allfile_datalist))
  # print(allfile_datalist[0])
  # Positional split: first 10000 records for training, the rest for evaluation.
  train=allfile_datalist[:10000]
  test=allfile_datalist[10000:]

  # NOTE(review): the recorded run of this cell ended in a ValueError. The saved
  # output shows records shaped like [path, text, label] lists, whereas
  # DataCollatorCTCWithPadding.__call__ indexes feature['input_values'] and
  # feature['label'] — presumably the records need converting first; confirm.
  model = get_model()
  train_model(train, test, model)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
19257
['/content/gdrive/MyDrive/감정인식 대회/데이터셋/KEMDy19/wav/Session01/Sess01_script01/Sess01_script01_M001.wav', '어 저 지그 지금 사람 친 거야? 지금 사람 친 거 맞지? 그치?\n', '1']


Using amp fp16 backend


build trainer on device cuda:0 with 1 gpus


ValueError: ignored

In [17]:
# Normalise every record to [wav_path, transcript, label] order.
#
# The previous version swapped the first two fields unconditionally, so
# running the cell a second time silently put the records back into the wrong
# order. Swapping only when the path is not already first makes the cell safe
# to re-run.
# NOTE(review): detection relies on the audio paths ending in ".wav", as in the
# records printed by this notebook — confirm for the full dataset.
print(allfile_datalist[0])
for record in allfile_datalist:
    first_is_wav = str(record[0]).strip().lower().endswith('.wav')
    second_is_wav = str(record[1]).strip().lower().endswith('.wav')
    if second_is_wav and not first_is_wav:
        record[0], record[1] = record[1], record[0]

print(allfile_datalist[0])
print(allfile_datalist[1])

['어 저 지그 지금 사람 친 거야? 지금 사람 친 거 맞지? 그치?\n', '/content/gdrive/MyDrive/감정인식 대회/데이터셋/KEMDy19/wav/Session01/Sess01_script01/Sess01_script01_M001.wav', '1']
['/content/gdrive/MyDrive/감정인식 대회/데이터셋/KEMDy19/wav/Session01/Sess01_script01/Sess01_script01_M001.wav', '어 저 지그 지금 사람 친 거야? 지금 사람 친 거 맞지? 그치?\n', '1']
['/content/gdrive/MyDrive/감정인식 대회/데이터셋/KEMDy19/wav/Session01/Sess01_script01/Sess01_script01_F001.wav', 'b/ 몰라. o/ b/ 아 몰라 어떡해. o/\n', '0']


In [19]:

# Persist the (re-ordered) record list. This overwrites the same pickle file
# that the training cell loads.
pickle_path = '/content/gdrive/MyDrive/감정인식_대회/데이터셋/2019wav_txt_label_zip.pkl'
with open(pickle_path, 'wb') as pickle_file:
  pickle.dump(allfile_datalist, pickle_file)