In [None]:

!pip install -U datasets
!pip install evaluate
!pip install transformers
!pip install jiwer


# NOTE: The recordings, reading-passage.txt, and speakers_all.csv can be downloaded from:
# https://www.kaggle.com/code/danijel3/accent-classification/input?select=recordings
import torchaudio
import torch
from google.colab import drive # Link your drive if you are a colab user
from pathlib import Path
from datasets import Dataset, ClassLabel
from IPython.display import Audio
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC
from jiwer import wer, wil
import pandas as pd

# Mount Google Drive at /content/drive; every data path read or written below
# (recordings, reading passage, speakers CSV, results CSV) lives under it.
drive.mount('/content/drive') 

def load_data(recordings_dir='/content/drive/MyDrive/recordings/',
              max_seconds=40, target_rate=16000):
    """Yield one resampled mono recording per mp3 file in *recordings_dir*.

    The defaults point at the Speech Accent Archive recordings saved to
    Google Drive, so the function can still be passed without arguments
    to ``Dataset.from_generator``.

    Args:
        recordings_dir: Directory that is searched (non-recursively) for
            ``*.mp3`` files.
        max_seconds: Recordings longer than this many seconds are skipped.
        target_rate: Sampling rate, in Hz, every recording is resampled to.

    Yields:
        dict with keys ``'audio'`` (1-D numpy array at ``target_rate``),
        ``'name'`` (file name without extension) and ``'label'``
        (1 if the file name starts with ``'english'``, otherwise 0).
    """
    # Sort so the sample order is reproducible; glob order is filesystem-dependent.
    for file in sorted(Path(recordings_dir).glob('*.mp3')):
        audio, rate = torchaudio.load(str(file))

        # audio is (channels, frames); drop recordings that are too long.
        if audio.size(1) / rate > max_seconds:
            continue

        # squeeze(0) only removes the channel axis for mono input, so average
        # multi-channel recordings down to one channel first. For mono input
        # this branch is skipped and the result is unchanged.
        if audio.size(0) > 1:
            audio = audio.mean(dim=0, keepdim=True)

        # The source rate can differ per file, so the resampler is built here.
        resample = torchaudio.transforms.Resample(rate, target_rate)
        audio = resample(audio).squeeze(0).numpy()

        yield {
            'audio': audio,
            'name': file.stem,
            'label': int(file.name.startswith('english')),
        }


def make_prediction(sample, model, tokenizer, feature_extractor,
                    sampling_rate=16000):
    """Transcribe one audio sample and return its predicted words, upper-cased.

    Args:
        sample: Mapping whose ``"audio"`` entry holds the waveform to transcribe.
        model: CTC model; called on the extracted input values and expected
            to expose ``.logits`` on its output.
        tokenizer: Tokenizer whose ``decode(..., output_word_offsets=True)``
            result exposes ``word_offsets``.
        feature_extractor: Callable turning the raw waveform into
            ``input_values`` tensors.
        sampling_rate: Sampling rate of ``sample["audio"]`` in Hz. It is
            forwarded to the feature extractor, which otherwise warns that
            omitting it "can result in silent errors".

    Returns:
        list[str]: The predicted words in order, each converted to upper case.
    """
    input_values = feature_extractor(
        sample["audio"],
        sampling_rate=sampling_rate,
        return_tensors="pt",
    ).input_values

    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_values).logits[0]
    pred_ids = torch.argmax(logits, dim=-1)

    # Word offsets are requested because they carry the per-word segmentation;
    # only the word text is used here.
    outputs = tokenizer.decode(pred_ids, output_word_offsets=True)
    return [entry["word"].upper() for entry in outputs.word_offsets]

# Build the dataset from every recording yielded by load_data.
data = Dataset.from_generator(load_data)
# Cast the column instead of assigning into `data.features`, so the stored
# schema itself maps 0 -> 'foreign' and 1 -> 'native'.
data = data.cast_column('label', ClassLabel(names=['foreign', 'native']))

# Load the pretrained wav2vec model together with its tokenizer and
# feature extractor.
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# Make word predictions for all samples, keyed by recording name.
# A plain loop (not a comprehension) keeps the partial results if a sample fails.
filename_wordOffsets_dict = {}
for sample in data:
    filename_wordOffsets_dict[sample["name"]] = make_prediction(
        sample, model, tokenizer, feature_extractor
    )
  
  

# Build the reference transcript: every word of the reading passage,
# stripped of surrounding ',', '.' and ':' and converted to upper case.
ground_truth = []
with open('/content/drive/MyDrive/reading-passage.txt', encoding='utf-8') as f_passage:
    for line in f_passage:
        for word in line.split():
            word = word.strip(',.:')
            # A token made only of punctuation would otherwise become an empty word.
            if word:
                ground_truth.append(word.upper())

print(ground_truth)
ground_truth_str = " ".join(ground_truth)

# Compute the word error rate for native english speakers vs. non-native.
# NOTE: these two accumulators are initialised but nothing below updates them.
groupA_wer = 0.0
groupB_wer = 0.0

# Open the speaker csv and save the WER and WIL for each file.
# Rows without a prediction keep the default score of 1.0.
speakers = pd.read_csv('/content/drive/MyDrive/speakers_all.csv')
speakers['wer'] = 1.0
speakers['wil'] = 1.0

for filename, predicted_words in filename_wordOffsets_dict.items():
    print(filename)

    # Boolean mask selecting this recording's row(s) in the speaker table.
    row_mask = speakers['filename'] == filename
    if not row_mask.any():
        print(filename + " is missing.")
        continue

    hypothesis = " ".join(predicted_words)
    try:
        file_wer = wer(ground_truth_str, hypothesis)
        file_wil = wil(ground_truth_str, hypothesis)
    except ValueError as err:
        # Best effort: leave the default scores in place and keep going.
        print("{} could not be scored: {}".format(filename, err))
        continue

    # `.loc` accepts a boolean mask; `.at` is a scalar-only accessor.
    speakers.loc[row_mask, 'wer'] = file_wer
    speakers.loc[row_mask, 'wil'] = file_wil
    print("The WER of the filename {} is {}".format(filename, file_wer))

speakers.to_csv('/content/drive/MyDrive/wav2vec_results.csv')
print("finished")



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset({
    features: ['audio', 'name', 'label'],
    num_rows: 577
})


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failin

dict_keys(['english321', 'english322', 'english323', 'english324', 'english325', 'english326', 'english327', 'english328', 'english329', 'english331', 'english330', 'english332', 'english333', 'english334', 'english335', 'english336', 'english337', 'english338', 'english339', 'english340', 'english341', 'english342', 'english343', 'english344', 'english345', 'english346', 'english347', 'english348', 'english349', 'english350', 'english351', 'english352', 'english353', 'english354', 'english355', 'english356', 'english357', 'english358', 'english359', 'english360', 'english361', 'english362', 'english363', 'english364', 'english365', 'english366', 'english367', 'english368', 'english369', 'english370', 'english371', 'english372', 'english373', 'english374', 'english375', 'english376', 'english377', 'english378', 'english379', 'english380', 'english381', 'english382', 'english383', 'english384', 'english385', 'english386', 'english387', 'english388', 'english389', 'english390', 'english3