In [None]:
"""
Preamble for most code and jupyter notebooks
@author: tobinsouth
@notebook date: 9 Sep 2021
"""

import numpy as np, pandas as pd, matplotlib.pyplot as plt, matplotlib as mpl, seaborn as sns
import math, string, re, pickle, json, os, sys, datetime, itertools, glob
from collections import Counter
from tqdm import tqdm

# Set pandas display options: cap how many rows / columns are rendered for a DataFrame.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 120)

# Better graphics: ask the inline backend for the 'retina' figure format.
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('retina')

# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and later
# releases dropped the old names, so pick whichever spelling this installation
# provides. If neither is listed we still try the legacy name so that the
# original error surfaces instead of silently changing the look of the figures.
poster_style = next(
    (name for name in ('seaborn-v0_8-poster', 'seaborn-poster') if name in plt.style.available),
    'seaborn-poster',
)
plt.style.use(poster_style)

In [None]:
# Root directory of the project data. Useful for code, and you may also need to
# navigate there to run the shell snippets that follow.
# The location defaults to the original author's machine but can be overridden by
# setting the EDUCATIONTEXT_DATA_PATH environment variable before starting the kernel.
# os.path.join(..., '') guarantees a trailing separator, which the later cells rely
# on because they build file names by plain string concatenation (data_path + "...").
data_path = os.path.join(
    os.environ.get('EDUCATIONTEXT_DATA_PATH', '/u/tsouth/projects/educationText/data/'),
    '',
)

First we need to convert the file to a wav:

`ffmpeg -i SpanishMovie.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 fullmovieaudio.wav`

and

`ffmpeg -ss 3120 -t 60 -i fullmovieaudio.wav clip.wav`

Next, to run the transcription you need to split the audio into segments no longer than 120 seconds. This can be done via:
`ffmpeg -i fullmovieaudio.wav -f segment -segment_time 119 -c copy moviesplit/split%03d.wav`

Alternatively, you can first split the file where there are silences. Note that you will probably still need to split the result into 120s segments.

```bash
ffmpeg -i "fullmovieaudio.wav" -af silencedetect=noise=-50dB:d=0.5 -f null - 2> vol.txt
ffmpeg -ss <silence_end - 0.25> -t <next_silence_start - silence_end + 2 * 0.25> -i input.mov word-N.mov
```

As a one-liner:

```
ffmpeg -i fullmovieaudio.wav -filter_complex "[0:a]silencedetect=n=-40dB:d=0.3[outa]" -map [outa] -f s16le -y /dev/null |& F='-aq 70 -v warning' perl -ne 'INIT { $ss=0; $se=0; } if (/silence_start: (\S+)/) { $ss=$1; $ctr+=1; printf "ffmpeg -nostdin -i fullmovieaudio.wav -ss %f -t %f $ENV{F} -y moviesplit/%03d.wav\n", $se, ($ss-$se), $ctr; }  if (/silence_end: (\S+)/) { $se=$1; } END { printf "ffmpeg -nostdin -i fullmovieaudio.wav -ss %f $ENV{F} -y moviesplit/%03d.wav\n", $se, $ctr+1; }' | bash -x
```

You may want to remove extremely small files:
`find moviesplit/. -name "*.wav" -type 'f' -size -160k -delete`

In [3]:
from asrecognition import ASREngine

# Transcribe every clip in moviesplit/ with each candidate Spanish checkpoint and
# save one text file per model (named after the checkpoint, "/" replaced by "_").
# https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-spanish
models = ["facebook/wav2vec2-large-xlsr-53-spanish", "jonatasgrosman/wav2vec2-large-xlsr-53-spanish","flax-community/wav2vec2-spanish"]# "facebook/s2t-medium-mustc-multilingual-st",]

# The list of clips does not depend on the model, so build it once.
audio_paths = glob.glob(data_path+"moviesplit/*.wav")

for model in tqdm(models):
    asr = ASREngine("es", model_path=model)
    transcriptions = asr.transcribe(audio_paths)

    # The three characters before ".wav" are the clip's zero-padded sequence number
    # (files are produced as split%03d.wav / %03d.wav by the ffmpeg commands above).
    tmap = {int(t['path'][-7:-4]): t['transcription'] for t in transcriptions}
    # Join in numeric clip order. Iterating the sorted keys (rather than
    # range(len(tmap))) also works when numbering starts at 1 or has gaps, e.g.
    # after very small clips were deleted.
    full_transcription = ' '.join(tmap[i] for i in sorted(tmap))
    # Open for writing and close deterministically; use the current model's name
    # (the loop variable), not the list of all models.
    with open(model.replace('/','_')+'.txt', 'w') as f:
        f.write(full_transcription)

## Alt approach:

In [3]:
# Alternative: load a single checkpoint instead of looping over all candidates.
# ASREngine must already be imported by the earlier cell:
# from asrecognition import ASREngine
# NOTE(review): the output recorded below this cell reports that lm_head.weight and
# lm_head.bias were newly initialized for this checkpoint, so its transcriptions may
# be unreliable without fine-tuning — confirm before trusting the results.
asr_alt = ASREngine("es", model_path="flax-community/wav2vec2-spanish")
# Other checkpoint that was considered:
# asr_alt = ASREngine("es", model_path="facebook/s2t-medium-mustc-multilingual-st")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at flax-community/wav2vec2-spanish and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Transcribe every clip with the alternative engine and store the joined text.
audio_paths = glob.glob(data_path+"moviesplit/*.wav")
transcriptions = asr_alt.transcribe(audio_paths)

# The three characters before ".wav" are the clip's zero-padded sequence number.
tmap = {int(t['path'][-7:-4]): t['transcription'] for t in transcriptions}
# Join in numeric clip order; sorted keys tolerate numbering that starts at 1 or
# has gaps (range(len(tmap)) would raise KeyError in those cases).
full_transcription = ' '.join(tmap[i] for i in sorted(tmap))
# Write (mode 'w') into data_path, which is where the comparison cell reads
# 'tobin_transcribed.txt' from, and close the file when done.
with open(data_path + 'tobin_transcribed.txt', 'w') as f:
    f.write(full_transcription)


# Comparing Results

In [4]:
# Read the four texts to compare, all stored under data_path:
#   tobin_transcribed.txt - the ASR transcription
#   subtitles_final.txt   - subtitles (used as the reference text further down)
#   script_sp.txt / script_mx.txt - the two script variants
corpus_files = (
    'tobin_transcribed.txt',
    'subtitles_final.txt',
    'script_sp.txt',
    'script_mx.txt',
)

loaded_texts = []
for corpus_name in corpus_files:
    with open(data_path + corpus_name, 'r') as f:
        loaded_texts.append(f.read())

fb_trans, subtitles_final, script_sp, script_mx = loaded_texts

In [14]:
# Whitespace-tokenise each text and count how often every token occurs.
Pfb_trans, Psubtitles_final, Pscript_sp, Pscript_mx = (
    Counter(text.split())
    for text in (fb_trans, subtitles_final, script_sp, script_mx)
)

In [25]:
# Vocabulary of the subtitles (the reference text) as a list, plus the subtitle
# count of each word in that same order.
ground_truth = list(set(Psubtitles_final))
Pground_truth = list(map(Psubtitles_final.__getitem__, ground_truth))

In [29]:
from sklearn.metrics import mutual_info_score

# For each candidate text, score its per-word counts (over the subtitle
# vocabulary) against the subtitle counts. Words missing from a candidate
# count as 0 because Counter returns 0 for absent keys.
candidates = {
    "Facebook": Pfb_trans,
    "Spanish": Pscript_sp,
    "Mexican": Pscript_mx,
}
for label, counts in candidates.items():
    candidate_counts = [counts[word] for word in ground_truth]
    print(label, mutual_info_score(Pground_truth, candidate_counts))

Facebook 0.6370062146900024
Spanish 0.6995193559342343
Mexican 0.7882892806597096
