In [22]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pydub import AudioSegment
from urllib.request import urlretrieve
import os
import random
import time

In [None]:
# Stray expression "requests.url" removed: the requests module exposes no
# attribute named "url", so evaluating it raised AttributeError on Run All.

In [20]:
def get_languages()->list:
  """
  Fetch the language index at http://accent.gmu.edu/browse_language.php and
  return the text of every <li> found inside the element with id "maincontent".

  Returns:
    list of str, one entry per list item, in page order.

  Raises:
    requests.HTTPError: the server answered with a 4xx/5xx status.
    requests.Timeout: the server did not respond within 30 seconds.
    ValueError: the page contains no element with id "maincontent".
  """
  url = "http://accent.gmu.edu/browse_language.php"
  # requests has no default timeout; without one a stalled connection hangs forever.
  response = requests.get(url, timeout=30)
  # Fail loudly on an error page instead of silently parsing it.
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'html.parser')
  main_content = soup.find(id="maincontent")
  if main_content is None:
    raise ValueError(f"No element with id 'maincontent' found at {url}")
  return [language.text for language in main_content.find_all('li')]

def get_language_urls(language:str)->dict:
  """
  Find the speaker detail pages listed for a given language.

  Args:
    language: language name as listed on the browse page (may contain
      spaces or commas, e.g. "azerbaijani, south").

  Returns:
    dict mapping each sample's paragraph text (with ", " replaced by "_")
    to the absolute URL of the linked detail page.

  Raises:
    requests.HTTPError: the server answered with a 4xx/5xx status.
    requests.Timeout: the server did not respond within 30 seconds.
    ValueError: the expected "maincontent" / "content" containers are missing.
  """
  url = "http://accent.gmu.edu/browse_language.php"
  # Let requests build the query string so spaces, commas, '&', etc. in the
  # language name are percent-encoded instead of corrupting the URL.
  response = requests.get(
    url,
    params={"function": "find", "language": language},
    timeout=30,  # requests has no default timeout
  )
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'html.parser')
  main_content = soup.find(id="maincontent")
  content = main_content.find(class_='content') if main_content is not None else None
  if content is None:
    raise ValueError(f"Unexpected page layout for language {language!r}")
  result = {}
  for sample in content.find_all('p'):
    link = sample.find('a')
    # Paragraphs without a link are not speaker samples; skip them rather
    # than crash on a None lookup.
    if link is None or not link.get('href'):
      continue
    result[sample.text.replace(', ', "_")] = "http://accent.gmu.edu/" + link['href']
  return result

def get_audio(url:str, folder:str, name:str)->None:
  """
  Download the audio sample referenced by a speaker detail page and save it
  as data/<folder>/<name>.mp3.

  Args:
    url: URL of the speaker detail page; its first <source> tag supplies
      the audio path.
    folder: sub-directory of "data/" to write into (created if missing).
    name: output file name without the ".mp3" extension.

  Raises:
    requests.HTTPError: the detail page answered with a 4xx/5xx status.
    requests.Timeout: the detail page did not respond within 30 seconds.
    ValueError: the page has no <source> tag with a src attribute.
  """
  # requests has no default timeout; without one a stalled connection hangs forever.
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'html.parser')
  source = soup.find("source")
  if source is None or not source.get("src"):
    raise ValueError(f"No <source> tag with a src attribute found at {url}")
  audio_url = "http://accent.gmu.edu" + source.get("src")
  # With no filename, urlretrieve writes to a temporary file that is not
  # removed automatically, so it is deleted explicitly once decoded.
  (tmp_path, headers) = urlretrieve(audio_url)
  try:
    audio = AudioSegment.from_mp3(tmp_path)
    # exist_ok avoids the check-then-create race and the duplicate path test.
    os.makedirs(f"data/{folder}/", exist_ok=True)
    audio.export(f"data/{folder}/{name}.mp3", format="mp3")
  finally:
    os.remove(tmp_path)

def download_data(languages=('english', 'russian', 'german', 'mandarin')):
  """
  Download every audio sample for the selected languages into data/<language>/.

  Args:
    languages: iterable of language names to download. Only names that also
      appear in the list returned by get_languages() are processed.
      Defaults to the four languages this notebook originally used.
  """
  wanted = set(languages)
  for language in get_languages():
    if language not in wanted:
      continue
    # Report only the languages that are actually downloaded.
    print(f"Downloading {language}...")
    urls = get_language_urls(language)
    for name, url in urls.items():
      # Sleep a random time to possibly avoid triggering a bot detection
      time.sleep(random.randint(1, 5))
      get_audio(url, language, name)

In [3]:
# Fetch one speaker detail page so its markup can be inspected interactively.
# requests has no default timeout, so set one and fail loudly on HTTP errors.
r = requests.get(
  "http://accent.gmu.edu/browse_language.php?function=detail&speakerid=61",
  timeout=30,
)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [9]:
# Locate the first <source> tag in the parsed speaker page (`soup`) and
# display its src attribute, i.e. the site-relative path of the audio file.
audio = soup.find("source")
audio.get("src")

'/soundtracks/english1.mp3'

In [18]:
# Smoke-test the downloader on a single speaker page.
get_audio(
  url="http://accent.gmu.edu/browse_language.php?function=detail&speakerid=61",
  folder="english",
  name="english_1",
)

In [23]:
# Run the full scrape. download_data() sleeps a random 1-5 seconds before each
# sample, so this cell is long-running; the recorded output below shows the
# run was interrupted manually (KeyboardInterrupt).
download_data()

KeyboardInterrupt: 

In [11]:
# Show the sample-name -> detail-page-URL mapping for English.
# (This cell previously called get_language_audio, a name that is not defined
# in this notebook; get_language_urls is the function that returns this dict.)
get_language_urls("english")

{'english1_male_pittsburgh_pennsylvania_usa': 'http://accent.gmu.edu/browse_language.php?function=detail&speakerid=61',
 'english2_female_birmingham_uk': 'http://accent.gmu.edu/browse_language.php?function=detail&speakerid=77',
 'english3_female_brisbane_australia': 'http://accent.gmu.edu/browse_language.php?function=detail&speakerid=88',
 "english4_female_saint anne's bay_jamaica": 'http://accent.gmu.edu/browse_language.php?function=detail&speakerid=99',
 'english5_male_fairfax_virginia_usa': 'http://accent.gmu.edu/browse_language.php?function=detail&speakerid=110',
 'english6_female_brooklyn_new york_usa': 'http://accent.gmu.edu/browse_language.php?function=detail&speakerid=121',
 'english7_male_macon_mississippi_usa': 'http://accent.gmu.edu/browse_language.php?function=detail&speakerid=132',
 'english8_female_perth_australia': 'http://accent.gmu.edu/browse_language.php?function=detail&speakerid=143',
 'english9_female_carthage_texas_usa': 'http://accent.gmu.edu/browse_language.php?f

In [9]:
# Display every language name scraped from the browse page (the list is long).
get_languages()

['aceh',
 'afrikaans',
 'agni',
 'agny',
 'akan',
 'albanian',
 'amazigh',
 'american sign language',
 'amharic',
 'ancient greek',
 'antigua and barbuda creole english',
 'anyin',
 'appolo',
 'arabic',
 'aramaic',
 'armenian',
 'aromanian',
 'ashanti',
 'asl',
 'azerbaijani',
 'azerbaijani, south',
 'azeri turk',
 'babur',
 'bafang',
 'baga',
 'bahasa indonesia',
 'bahasa melayu',
 'bai',
 'balant',
 'balanta ganja',
 'bamanankan',
 'bambara',
 'bamun',
 'banganthe',
 'bangla',
 'baoule',
 'bari',
 'basque',
 'bassa',
 'bavarian',
 'belarusan',
 'bengali',
 'bikol',
 'bisayan',
 'bislama',
 'bosnian',
 'bulgarian',
 'burmese',
 'cameroon creole english',
 'cantonese',
 'carolinian',
 'castellano',
 'catalan',
 'cebuano',
 'chaam',
 'chagga',
 'chaldean',
 'chaldean neo aramaic',
 'chamorro',
 'charapa-spanish',
 'chichewa',
 'chin, mizo',
 'chinese',
 'chittagonian',
 'chuukese',
 'classical greek',
 'cotocoli',
 'creole',
 'creole french',
 'crioulo',
 'croatian',
 'czech',
 'danish'