![./img/sp_tiny.png](./img/sp_tiny.png)
<h1><center>Accent Detection: <br>
Signal Processing  + CNN</center></h1>
<h3><center>Part 1: Getting the dataset</center></h3>
<center>Group 2: Katerina Bosko and Victor Wilm</center>
<center>Northeastern University, CS6140</center>

In [1]:
import os
import random
import time
from urllib.parse import urljoin
from urllib.request import urlretrieve

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pydub import AudioSegment

In this notebook, we scrape the [Speech Accent Archive](http://accent.gmu.edu/), compiled by George Mason University, to get data for two language sets:
1) English, Russian, German, Mandarin <br>
2) English, Russian, French, Arabic

The archive serves each recording as a compressed mp3 file; we convert every recording to wav format as we download it because most audio processing libraries support wav better.

Note that we do not provide the datasets because they are very large (language set 1: 2 GB, language set 2: 1.65 GB).


In [2]:
def get_languages() -> list:
    """
    Fetch the names of all languages listed on the Speech Accent Archive.

    The names are the text of every <li> element inside the element with
    id "maincontent" on http://accent.gmu.edu/browse_language.php.

    Returns:
        list: language names, in the order they appear on the page.

    Raises:
        requests.exceptions.RequestException: the request failed, timed out,
            or the server answered with an HTTP error status.
        ValueError: the page does not contain the "maincontent" element.
    """
    url = "http://accent.gmu.edu/browse_language.php"
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail here on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    main_content = soup.find(id="maincontent")
    if main_content is None:
        raise ValueError(f'No element with id "maincontent" found at {url}')
    return [item.text for item in main_content.find_all("li")]

In [3]:
def get_language_urls(language: str) -> dict:
    """
    Find the detail-page URLs of all samples recorded for a given language.

    Args:
        language: language name as listed by the archive (e.g. "english").

    Returns:
        dict: maps a sample label (the paragraph text with ", " replaced by
        "_") to the absolute URL of that sample's page.

    Raises:
        requests.exceptions.RequestException: the request failed, timed out,
            or the server answered with an HTTP error status.
        ValueError: the page does not contain the expected sample listing.
    """
    # Passing the query via `params` lets requests URL-encode the language
    # name, so names containing spaces or special characters work too.
    response = requests.get(
        "http://accent.gmu.edu/browse_language.php",
        params={"function": "find", "language": language},
        timeout=30,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    main_content = soup.find(id="maincontent")
    content = main_content.find(class_="content") if main_content is not None else None
    if content is None:
        raise ValueError(f"No sample listing found for language {language!r}")
    result = {}
    for sample in content.find_all("p"):
        link = sample.find("a")
        if link is None or not link.get("href"):
            # A paragraph without a link is not a sample entry; skip it
            # instead of failing on the missing href.
            continue
        # urljoin copes with relative, root-relative and absolute hrefs alike.
        result[sample.text.replace(", ", "_")] = urljoin("http://accent.gmu.edu/", link["href"])
    return result

In [4]:
def get_audio(data_path: str, url: str, folder: str, name: str) -> None:
    """
    Download the recording linked from a sample page and save it as wav.

    The audio location is read from the first <source> tag on the page; the
    file is loaded as mp3 and written to "{data_path}/{folder}/{name}.wav".

    Args:
        data_path: root directory of the dataset.
        url: URL of the sample's page.
        folder: sub-directory of data_path to write into (created if missing).
        name: file name of the recording, without extension.

    Raises:
        requests.exceptions.RequestException: the page request failed, timed
            out, or the server answered with an HTTP error status.
        ValueError: the page has no <source> tag with a src attribute.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    source_tag = soup.find("source")
    if source_tag is None or not source_tag.get("src"):
        raise ValueError(f"No audio <source> found on page {url}")
    # urljoin copes with relative, root-relative and absolute src values alike.
    audio_url = urljoin("http://accent.gmu.edu", source_tag.get("src"))
    # urlretrieve stores the download in a temporary file that it does not
    # delete; remove it once decoded so temp files don't pile up per sample.
    tmp_path, _headers = urlretrieve(audio_url)
    try:
        audio = AudioSegment.from_mp3(tmp_path)
    finally:
        os.remove(tmp_path)
    os.makedirs(f"{data_path}/{folder}/", exist_ok=True)
    audio.export(f"{data_path}/{folder}/{name}.wav", format="wav")

In [5]:
def download_data(data_path: str, languages_chosen: list, skip_existing: bool = False) -> None:
    """
    Download every sample of the chosen languages into data_path.

    Each language gets its own sub-directory. Languages are processed in the
    order the archive lists them, and a random 1-5 second pause precedes each
    sample download.

    Args:
        data_path: root directory of the dataset (created if missing).
        languages_chosen: language names to download, spelled as the archive
            lists them.
        skip_existing: when True, samples whose wav file is already present
            under data_path are not downloaded again, so an interrupted run
            can be resumed. Defaults to False (always download).
    """
    os.makedirs(data_path, exist_ok=True)
    languages = get_languages()

    # A misspelled or unavailable language would otherwise be dropped silently.
    missing = [language for language in languages_chosen if language not in languages]
    if missing:
        print(f"Warning: not listed in the archive, nothing to download for: {missing}")

    for language in languages:
        if language not in languages_chosen:
            continue
        print(f"Downloading {language}...")
        urls = get_language_urls(language)
        for name, url in urls.items():
            # Same path that get_audio writes the converted recording to.
            if skip_existing and os.path.exists(f"{data_path}/{language}/{name}.wav"):
                continue
            # Random pause between requests.
            time.sleep(random.randint(1, 5))
            get_audio(data_path, url, language, name)

In [6]:
# Language set 1: target directory and the languages to fetch.
DATA_PATH_1 = "./data/lang_set_1"
LANGUAGES_CHOSEN_1 = ["english", "russian", "german", "mandarin"]
# Scrapes every sample of the chosen languages; network-bound, and
# download_data pauses 1-5 s before each sample.
download_data(DATA_PATH_1, LANGUAGES_CHOSEN_1)

Downloading english...
Downloading german...
Downloading mandarin...
Downloading russian...


In [8]:
# Language set 2 (about 1.65 GB): uncomment the lines below to download it.
# DATA_PATH_2 = "./data/lang_set_2"
# LANGUAGES_CHOSEN_2 = ["english", "russian", "arabic", "french"]
# download_data(DATA_PATH_2, LANGUAGES_CHOSEN_2)