# Лабораторная работа 2: Построение моделей на основе предобученных эмбеддингов

## Import libs

In [1]:
import warnings
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from transformers import Wav2Vec2Processor, Wav2Vec2Model

warnings.filterwarnings('ignore')





## Get data

In [2]:
# Keep only the video id ("link") and its category label from the raw listing.
df = pd.read_csv("data/youtube.csv").loc[:, ["link", "category"]]
# Expand every video id into a full YouTube watch URL.
df["sample_url"] = [
    f"https://www.youtube.com/watch?v={video_id}" for video_id in df["link"]
]
df


Unnamed: 0,link,category,sample_url
0,JLZlCZ0,travel,https://www.youtube.com/watch?v=JLZlCZ0
1,i9E_Blai8vk,travel,https://www.youtube.com/watch?v=i9E_Blai8vk
2,r284c-q8oY,travel,https://www.youtube.com/watch?v=r284c-q8oY
3,Qmi-Xwq-ME,travel,https://www.youtube.com/watch?v=Qmi-Xwq-ME
4,_lcOX55Ef70,travel,https://www.youtube.com/watch?v=_lcOX55Ef70
...,...,...,...
3594,#NAME?,history,https://www.youtube.com/watch?v=#NAME?
3595,d-2Trw8bCa0,history,https://www.youtube.com/watch?v=d-2Trw8bCa0
3596,RCKWarkUL,history,https://www.youtube.com/watch?v=RCKWarkUL
3597,MF6F3BxJIY,history,https://www.youtube.com/watch?v=MF6F3BxJIY


In [3]:
# Distinct category labels found in `df`.
categories = pd.unique(df['category'])
categories


array(['travel', 'food', 'art_music', 'history'], dtype=object)

In [68]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
# The encoder gets its own name: `model` is reassigned to the Keras classifier
# further down the notebook, which would otherwise make every later call to
# `get_embedding` run against the wrong object.
wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h")
model = wav2vec_model  # backward-compatible alias for the original name


def get_embedding(filename):
    """Return one fixed-size wav2vec 2.0 embedding for an audio file.

    The file is loaded with librosa at 16 kHz, passed through the processor
    and the pretrained encoder, and the encoder's last hidden state is
    averaged along axis 1.

    Args:
        filename: Path of the audio file to embed (anything `librosa.load`
            accepts).

    Returns:
        numpy array with singleton dimensions squeezed out.
        Assumes `last_hidden_state` is (batch=1, frames, hidden), so the
        result is 1-D of length `hidden` -- TODO confirm.
    """
    audio_input, _ = librosa.load(filename, sr=16000)
    input_values = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values
    # Inference only: skip building the autograd graph to save memory.
    with torch.no_grad():
        hidden_states = wav2vec_model(input_values).last_hidden_state
    return np.mean(hidden_states.numpy(), axis=1).squeeze()



emb = get_embedding("data/audio_from_videos/history_1.wav")
emb


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


array([-0.26720756,  0.19039923,  0.07929158, ..., -0.45387655,
        0.5132634 ,  0.20095946], dtype=float32)

## Download audio

In [5]:
from __future__ import unicode_literals
import yt_dlp
import ffmpeg
import pandas as pd

category = 'history'
output_directory = 'data/audio'
ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
    }],
    # Default output template: <output_directory>/<category>/<video title>.<ext>
    'outtmpl': f'{output_directory}/{category}/%(title)s.%(ext)s',
}

def download_from_url(url, category):
    """Download the audio track of one video into the folder of `category`.

    The shared `ydl_opts` are copied and their output template is rebuilt
    from the `category` argument, so the file lands in
    `<output_directory>/<category>/` instead of always using the
    module-level default category.

    Args:
        url: Video URL handed to yt-dlp.
        category: Name of the sub-folder of `output_directory` to save into.

    Returns:
        True when yt-dlp finished without raising `DownloadError`, False
        otherwise (the error is printed, not re-raised).
    """
    options = {
        **ydl_opts,
        'outtmpl': f'{output_directory}/{category}/%(title)s.%(ext)s',
    }
    with yt_dlp.YoutubeDL(options) as ydl:
        try:
            ydl.extract_info(url, download=True)
        except yt_dlp.DownloadError as e:
            # Best effort: report the failure and let the caller continue.
            print(f"Error downloading audio for {url}: {e}")
            return False
    return True
            
# filtered_df = df[df['category'] == category].head(50)

# for index, row in filtered_df.iterrows():
#     url = row['sample_url']
#     download_from_url(url, category)
    

In [12]:
import os
def rename_files(directory_path):
    """Give every ``.webm`` file in `directory_path` a ``.wav`` extension.

    Only the trailing extension is replaced; any other ``.webm`` occurrence
    inside the file name is left untouched. The files are renamed with
    `os.rename` only -- their contents are not converted.

    Args:
        directory_path: Directory whose entries are scanned (not recursive).
    """
    old_suffix = ".webm"
    new_suffix = ".wav"

    for file in os.listdir(directory_path):
        if not file.endswith(old_suffix):
            continue

        # Strip exactly the trailing extension, then append the new one.
        new_name = file[:-len(old_suffix)] + new_suffix

        os.rename(
            os.path.join(directory_path, file),
            os.path.join(directory_path, new_name),
        )

# Apply the .webm -> .wav rename to the files under data/audio/travel.
directory_path = "data/audio/travel"
rename_files(directory_path)


## Cut segments

In [18]:
from pydub import AudioSegment
import os

def cut_and_save_files(directory_path, segment_duration=5000):
    """Cut every ``.wav`` file in `directory_path` into fixed-length segments.

    Each source file ``<name>.wav`` produces ``<name>_segment_<k>.wav``
    (k starting at 1) in the same directory. A trailing remainder shorter
    than `segment_duration` is dropped. Files that are themselves segments
    (``_segment_`` in the name) are skipped, so running the function again
    on the same directory does not cut existing segments a second time.

    Segments are written as real WAV data so the content matches the
    ``.wav`` extension. Note: WAV output is uncompressed and therefore takes
    more disk space than the previous webm-encoded output.

    Args:
        directory_path: Directory holding the source ``.wav`` files.
        segment_duration: Segment length in milliseconds (default 5000).

    Raises:
        ValueError: If `segment_duration` is not positive.
    """
    if segment_duration <= 0:
        raise ValueError("segment_duration must be a positive number of milliseconds")

    for file in sorted(os.listdir(directory_path)):
        if not file.endswith(".wav") or "_segment_" in file:
            continue

        file_path = os.path.join(directory_path, file)
        audio = AudioSegment.from_file(file_path)
        stem = os.path.splitext(file)[0]

        # len(audio) is the duration in milliseconds; only full segments are kept.
        num_segments = len(audio) // segment_duration
        for i in range(num_segments):
            start_time = i * segment_duration
            segment = audio[start_time:start_time + segment_duration]
            new_path = os.path.join(directory_path, f"{stem}_segment_{i+1}.wav")
            segment.export(new_path, format="wav")

# Cut the .wav files under data/audio/art_music into segments.
directory_path = "data/audio/art_music"
cut_and_save_files(directory_path)


data/audio/art_music\1.wav
data/audio/art_music\2.wav
data/audio/art_music\3.wav
data/audio/art_music\37 BRIGHT AND EASY PAINTING IDEAS.wav
data/audio/art_music\5 Paintings For Beginners ｜｜ Complete Guide on Blending Techniques ｜｜ Painting on 5 Tiny Canvases.wav
data/audio/art_music\Abstract Floral Art using a String⧸Chain Pull on a Large Canvas.wav
data/audio/art_music\Acrylic Painting Tree Meadow Landscape.wav
data/audio/art_music\ART VLOG paint along with me, BIGGEST painting tip & paintings I don't want you to see!？...wav
data/audio/art_music\Como Pintar FLORES Fáciles con Pintura Acrilica.wav
data/audio/art_music\COOL DRAWING CHALLENGE ｜｜ CRAZY SCHOOL ART FOR 24 HOURS ｜｜ DIY Painting Hacks! By 123 GO! BOYS.wav
data/audio/art_music\Daily Challenge #34 ⧸ Easy Art  ⧸  Power lines at sunset painting.wav
data/audio/art_music\DIY ｜ Easy & Unique Bottle Painting Idea ｜Bottle Craft ｜ Beginners Bottle Art ｜ Jyoshita Ghate ｜.wav
data/audio/art_music\Drawing House form Shapes, easy acrylic p

In [19]:
# Cut the .wav files under data/audio/food into segments.
directory_path = "data/audio/food"
cut_and_save_files(directory_path)


data/audio/food\5 of the Best Street Food Finds in Paris.wav
data/audio/food\50 Traditional European Dishes You Have to Try (2020).wav
data/audio/food\A Taste of European Food Culture.wav
data/audio/food\Americans Try Bizarre European Food.wav
data/audio/food\Amsterdam Street Food Tour - DUTCH STREET FOOD of Holland ｜ UNIQUE Street Food in The Netherlands.wav
data/audio/food\Barcelona Food Tour at LA BOQUERIA and Sagrada Familia - Barcelona, Spain, Travel Guide!.wav
data/audio/food\Chicken Francaise Recipe over 200 Million Views.wav
data/audio/food\Common American Foods That Are Banned In Other Countries.wav
data/audio/food\COOKING EUROPEAN FOOD  ചോറും കറീം  കിട്ടില്ലേ ？ WITH GERMAN SUBTITLES MALAYALAM COOKING.wav
data/audio/food\DEATH BY EUROPEAN FOOD!.wav
data/audio/food\Delicious Europe.wav
data/audio/food\DESAFIO EM DOBRO!! 6KG DE BURRITO COM NICK WEHRY!.wav
data/audio/food\Eastern Europe Food Tour： Poland, Czech Republic, Slovakia, Hungary.wav
data/audio/food\European Fast Food Ch

In [20]:
# Cut the .wav files under data/audio/history into segments.
directory_path = "data/audio/history"
cut_and_save_files(directory_path)


data/audio/history\#AskAbhijit 1： Indian History, Physics, Geopolitics, Education, Spirituality.wav
data/audio/history\#AskAbhijit 2： Indian History, China, Quantum Mechanics, String Theory, Sanskrit.wav
data/audio/history\07_00 PM - NTA UGC NET 2021 ｜ Indian History by Shiv Meena ｜ Important Treaties in Indian History.wav
data/audio/history\7 brave queens in Indian History ｜｜ भारत के ईतिहास की 7 बहादुर रानीया ｜｜.wav
data/audio/history\Akhanda Bharatam Part 1 - Magadha Samrajyam 1 ｜ Magadha Kingdom ｜ Magadha Dynasty ｜ Indian History.wav
data/audio/history\All Indian states ｜ Political history of Indian State ｜ INM ｜  WBCS ｜ Police ｜ DFCCIL Exam.wav
data/audio/history\ANCIENT INDIAN HISTORY ｜ NCERT ｜ R.S.SHARMA ｜ CLASS 11 ｜ Lecture 7 ｜ Chapter 9 ｜ BUDDHISM.wav
data/audio/history\Complete Modern History In One Video ｜ संपूर्ण आधुनिक भारतीय इतिहास एक वीडियो में.wav
data/audio/history\In Memoriam - Jallianwala Bagh Massacre ｜ Indian History with BYJU'S.wav
data/audio/history\Indian history

In [21]:
# Cut the .wav files under data/audio/travel into segments.
directory_path = "data/audio/travel"
cut_and_save_files(directory_path)


data/audio/travel\Amazing NorthEast ｜ Assam Meghalaya Arunachal ｜ Complete Tour ｜  Pradesh ｜ Northeast India｜ TRAVEL.wav
data/audio/travel\EGYPT ： HOW PEOPLE TREAT AN INDIAN TOURIST 😡 #INDIANINEGYPT ｜EP-10｜..wav
data/audio/travel\EP 14 Meghalaya Tour complete Travel Guide ｜ North East India.wav
data/audio/travel\EXPLORING VARANASI ｜ Benaras Travel Vlog #1.wav
data/audio/travel\Gongoni tour #sangitabideshfamily.wav
data/audio/travel\How I Became a TRAVEL VLOGGER ｜ Make Money Even if You're Just Starting Out as a YouTuber!.wav
data/audio/travel\How to Start a Travel Blog [2021] Travel Blogging Full-Time.wav
data/audio/travel\I Went to Japan for the First Time ｜ Tokyo Travel Vlog (Pt.1).wav
data/audio/travel\Kenya Trip Begins ｜｜ Must Watch ｜｜.wav
data/audio/travel\Koh Phangan： 'New Thailand' for Indian Family and Couples in LOW BUDGET (trip cost⧸Flight).wav
data/audio/travel\London Travel Vlog 🇬🇧 ｜ MissMikaylaG.wav
data/audio/travel\MALDIVES ｜ Traveling after lockdown! ｜ Soneva Fushi - Pa

## Prepare dataset

In [51]:
# One record per segment file: walk each category folder and keep only the
# .wav paths whose string form contains 'segment'.
data = [
    {"category": category, "filename": wav_path}
    for category in categories
    for wav_path in Path(f'data/audio/{category}').glob("*.wav")
    if 'segment' in str(wav_path)
]

df_train = pd.DataFrame(data)
df_train


Unnamed: 0,category,filename
0,travel,data\audio\travel\Amazing NorthEast ｜ Assam Me...
1,travel,data\audio\travel\Amazing NorthEast ｜ Assam Me...
2,travel,data\audio\travel\Amazing NorthEast ｜ Assam Me...
3,travel,data\audio\travel\Amazing NorthEast ｜ Assam Me...
4,travel,data\audio\travel\Amazing NorthEast ｜ Assam Me...
...,...,...
31372,history,data\audio\history\ప్రాచీన భారత దేశ చరిత్ర ｜｜ ...
31373,history,data\audio\history\ప్రాచీన భారత దేశ చరిత్ర ｜｜ ...
31374,history,data\audio\history\ప్రాచీన భారత దేశ చరిత్ర ｜｜ ...
31375,history,data\audio\history\ప్రాచీన భారత దేశ చరిత్ర ｜｜ ...


In [41]:
# Write the segment table to audio_segments.csv (row index not included).
df_train.to_csv('audio_segments.csv', index=False)


In [42]:
# Reload the segment table from audio_segments.csv.
# NOTE(review): `filename` presumably comes back as plain strings rather than Path objects -- confirm.
df_train = pd.read_csv('audio_segments.csv')


In [52]:
# Collapse the category labels into a binary travel / not_travel target.
df_train["is_travel"] = [
    "travel" if label == "travel" else "not_travel"
    for label in df_train.category
]
df_train


Unnamed: 0,category,filename,is_travel
0,travel,data\audio\travel\Amazing NorthEast ｜ Assam Me...,travel
1,travel,data\audio\travel\Amazing NorthEast ｜ Assam Me...,travel
2,travel,data\audio\travel\Amazing NorthEast ｜ Assam Me...,travel
3,travel,data\audio\travel\Amazing NorthEast ｜ Assam Me...,travel
4,travel,data\audio\travel\Amazing NorthEast ｜ Assam Me...,travel
...,...,...,...
31372,history,data\audio\history\ప్రాచీన భారత దేశ చరిత్ర ｜｜ ...,not_travel
31373,history,data\audio\history\ప్రాచీన భారత దేశ చరిత్ర ｜｜ ...,not_travel
31374,history,data\audio\history\ప్రాచీన భారత దేశ చరిత్ర ｜｜ ...,not_travel
31375,history,data\audio\history\ప్రాచీన భారత దేశ చరిత్ర ｜｜ ...,not_travel


In [69]:
# Draw a reproducible random subset of segments for the embedding step.
# `sample` already returns the rows in random order, so no extra shuffle is
# needed; the seed makes the subset identical on every run, and the size is
# capped so a smaller table does not raise.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42
shuffled_df = (
    df_train
    .sample(n=min(SAMPLE_SIZE, len(df_train)), random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
shuffled_df


Unnamed: 0,category,filename,is_travel
0,history,data\audio\history\भारत का संपूर्ण इतिहास 1 ही...,not_travel
1,travel,data\audio\travel\MALDIVES ｜ Traveling after l...,travel
2,history,data\audio\history\Complete Modern History In ...,not_travel
3,art_music,data\audio\art_music\5 Paintings For Beginners...,not_travel
4,history,data\audio\history\All Indian states ｜ Politic...,not_travel
...,...,...,...
995,history,data\audio\history\#AskAbhijit 1： Indian Histo...,not_travel
996,history,data\audio\history\Complete Modern History In ...,not_travel
997,art_music,data\audio\art_music\Paint with me - Easy and ...,not_travel
998,history,data\audio\history\Complete Modern History In ...,not_travel


## Get results

In [70]:
from tqdm import tqdm

# Register `progress_apply` on pandas objects so the embedding pass shows a progress bar.
tqdm.pandas()
shuffled_df["embedding"] = shuffled_df["filename"].progress_apply(get_embedding)

# Feature matrix: one row per segment. Target: 1 for travel, 0 otherwise.
X = np.stack(shuffled_df["embedding"].values, axis=0)
y = shuffled_df["is_travel"].map({"travel": 1, "not_travel": 0}).values

# Stratify on the label so train and test keep the same travel / not_travel
# ratio; "travel" is only one of several categories, so an unstratified split
# can leave the test set with very few positives.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [21:28<00:00,  1.29s/it]


In [71]:
from keras import backend as K

def f1(y_true, y_pred):
    """F1 score built from Keras backend ops, usable as a `metrics=` entry.

    Both inputs are clipped to [0, 1] and rounded before counting, so
    `y_pred` is effectively thresholded at 0.5. `K.epsilon()` is added to
    each denominator to avoid division by zero.

    NOTE(review): when used as a training/evaluation metric this is
    presumably computed per batch and then averaged, which is not the same
    as F1 over the whole dataset -- confirm before reporting the number.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted scores.

    Returns:
        Scalar tensor: 2 * precision * recall / (precision + recall + eps).
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_value = hits / (flagged_positives + K.epsilon())
    recall_value = hits / (actual_positives + K.epsilon())

    harmonic_denominator = precision_value + recall_value + K.epsilon()
    return 2 * ((precision_value * recall_value) / harmonic_denominator)
    

In [76]:
# Small feed-forward binary classifier over the 1024-dimensional embeddings:
# one hidden ReLU layer, one sigmoid output unit.
model = Sequential(
    [
        Dense(10, input_dim=1024, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy", f1],
)
# Trailing semicolon keeps the History repr out of the cell output.
model.fit(X_train, y_train, epochs=100, batch_size=10);


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [75]:
# `evaluate` returns the loss followed by the compiled metrics (accuracy, f1).
test_loss, accuracy, f1_score = model.evaluate(X_test, y_test)
accuracy_percent = accuracy * 100
print("Accuracy: %.2f" % accuracy_percent)
print("F1: %.2f" % f1_score)


Accuracy: 82.33
F1: 0.35
