/
main_extractor_clean_audio.py
70 lines (62 loc) · 2.99 KB
/
main_extractor_clean_audio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import numpy as np
import cv2
import sys
import subprocess
import os
import wave
from scipy import signal
from scipy.io import wavfile
import matplotlib.pyplot as plt
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav
def format_time(msecs):
#Segurament es podira optimitzar fent operacions de modul
#en comptes de divisions i casts
h = int(msecs/3600000)
m = int((msecs/60000) - (h*60))
s = int((msecs/1000) - (m*60))
ms = int(msecs - s*1000)
return h,m,s,ms
audio_lenght = 35
audio_lenght2 = 65
face_margin = 1.25
if not os.path.exists('./output'):
os.makedirs('./output')
with open('vid_list.txt') as f:
for line in f:
print(line)
next_video = line.rstrip('\n')
input_vid = next_video
video_capture = cv2.VideoCapture(input_vid+'.mp4')
command = "ffmpeg -i " + input_vid + ".mp4 -vn "+input_vid+".wav"
subprocess.call(command, shell=True)
rate, sig = wavfile.read(input_vid+".wav")
sig = sig.sum(axis=1) / 2 # per passar a monochanel en principi
audio_lenght_n = int(round(audio_lenght*rate/1000))
audio_lenght2_n = int(round(audio_lenght2*rate/1000))
width = video_capture.get(cv2.CAP_PROP_FRAME_WIDTH) # float
height = video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT)
fps = video_capture.get(cv2.CAP_PROP_FPS)
num_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
video_duration = num_frames/fps
video_capture.set(cv2.CAP_PROP_POS_FRAMES, 2)
current_frame = 2
while (current_frame < num_frames-100): #deixem un marge de 100 frames per si mal codificat
current_frame = int(video_capture.get(cv2.CAP_PROP_POS_FRAMES))
current_time = video_capture.get(cv2.CAP_PROP_POS_MSEC)
video_capture.grab()
current_time_audio_sample=int(round(current_time*rate/1000))
aux = current_time_audio_sample-int(round(audio_lenght_n/2))
audio_seg = sig[current_time_audio_sample-int(round(audio_lenght_n/2)):current_time_audio_sample+int(round(audio_lenght_n/2))]
audio_seg2 = sig[current_time_audio_sample-int(round(audio_lenght2_n/2)):current_time_audio_sample+int(round(audio_lenght2_n/2))]
mfcc_feat = mfcc(audio_seg, rate,0.002, 0.001)
mfcc_feat = mfcc_feat[:,1:]
mfcc_feat2 = mfcc(audio_seg2, rate,0.004, 0.002)
mfcc_feat2 = mfcc_feat[:,1:]
cv2.imwrite("./output/" + input_vid + "_spectogram_" + str(current_frame) + ".jpg", mfcc_feat);
np.save('./output/' + input_vid + '_MFCC_' + str(current_frame), mfcc_feat)
cv2.imwrite("./output/" + input_vid + "_spectogram2_" + str(current_frame) + ".jpg", mfcc_feat2);
np.save('./output/' + input_vid + '_MFCC2_' + str(current_frame), mfcc_feat2)
video_capture.release()