### Imports

In [39]:
import requests
import pandas as pd
import json
import time
import os

### Function for API Call to Pandas df

In [26]:
def query(query: str = "", country: str = "cnt:singapore", page: int = 1) -> pd.DataFrame:
    """
    This function takes in the query(species), country with cnt: at the start,
    page as page number. This will output a dataframe from the reqeust of the 
    xeno canto api
    ---
    Args: query:str,country:str, page:str
    ---
    Returns: Dataframe consisting of the data retrieved from the api
    """
    try:
        response = requests.get(
            f"https://xeno-canto.org/api/2/recordings?query={query}{country}&page={page}",
            timeout=5
        )
        if response.status_code != 200:
            print(f"Request failed with status code {response.status_code}")
            return None
        data = response.json()
        if 'recordings' not in data:
            print("No recordings found")
            return None
        recordings = data['recordings']
        df = pd.json_normalize(recordings)
        time.sleep(1)
        
        json_obj = json.loads(response.text)
        num_recordings = json_obj['numRecordings']
        num_species = json_obj['numSpecies']
        page = json_obj['page']
        num_pages = json_obj['numPages']

        print("Number of recordings: ", num_recordings),
        print("Number of species: ", num_species)
        print("Page: ", page)
        print("Number of pages: ", num_pages)
        return df
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [30]:
page = [1,2,3,4,5]
df_sg = pd.DataFrame()

for i  in page:
    df_temp = query(country="cnt:singapore",page=i)
    df_sg = df_sg.append(df_temp)

Number of recordings:  2118
Number of species:  230
Page:  1
Number of pages:  5
Number of recordings:  2118
Number of species:  230
Page:  2
Number of pages:  5
Number of recordings:  2118
Number of species:  230
Page:  3
Number of pages:  5
Number of recordings:  2118
Number of species:  230
Page:  4
Number of pages:  5
Number of recordings:  2118
Number of species:  230
Page:  5
Number of pages:  5


### Meta Data

In [87]:
df_sg['sp'].value_counts()

zeylanicus     62
paradiseus     56
chinensis      51
zanthopygia    34
saularis       34
               ..
olor            1
mongolus        1
sacra           1
maculatus       1
glareola        1
Name: sp, Length: 203, dtype: int64

In [71]:
df_sg['en'].value_counts()

Straw-headed Bulbul             62
Greater Racket-tailed Drongo    56
Black-naped Oriole              40
Arctic Warbler                  34
Oriental Magpie-Robin           34
                                ..
Bali Myna                        1
Great Egret                      1
Mute Swan                        1
Blue Whistling Thrush            1
Red-necked Stint                 1
Name: en, Length: 230, dtype: int64

In [83]:
df_sg[['file','en']].value_counts()

file                                    en                     
                                        Straw-headed Bulbul        62
                                        Black-naped Oriole         40
                                        Oriental Magpie-Robin      34
                                        Common Hill Myna           24
                                        Asian Fairy-bluebird       18
                                                                   ..
https://xeno-canto.org/576712/download  Brown-throated Sunbird      1
https://xeno-canto.org/576711/download  Brown-throated Sunbird      1
https://xeno-canto.org/576703/download  Red-breasted Parakeet       1
https://xeno-canto.org/576702/download  Long-tailed Parakeet        1
https://xeno-canto.org/97698/download   Copper-throated Sunbird     1
Length: 1876, dtype: int64

In [37]:
df_sg.columns

Index(['id', 'gen', 'sp', 'ssp', 'group', 'en', 'rec', 'cnt', 'loc', 'lat',
       'lng', 'alt', 'type', 'sex', 'stage', 'method', 'url', 'file',
       'file-name', 'lic', 'q', 'length', 'time', 'date', 'uploaded', 'also',
       'rmk', 'bird-seen', 'animal-seen', 'playback-used', 'temp', 'regnr',
       'auto', 'dvc', 'mic', 'smp', 'sono.small', 'sono.med', 'sono.large',
       'sono.full', 'osci.small', 'osci.med', 'osci.large'],
      dtype='object')

   
This is the schema obtained from the website, which is missing ['sono.small', 'sono.med', 'sono.large','sono.full', 'osci.small', 'osci.med', 'osci.large']

* id: the catalogue number of the recording on xeno-canto
* gen: the generic name of the species
* sp: the specific name (epithet) of the species
* ssp: the subspecies name (subspecific epithet)
* group: the group to which the species belongs (birds, grasshoppers)
* en: the English name of the species
* rec: the name of the recordist
* cnt: the country where the recording was made
* loc: the name of the locality
* lat: the latitude of the recording in decimal coordinates
* lng: the longitude of the recording in decimal coordinates
* type: the sound type of the recording (combining both predefined terms such as 'call' or 'song' and additional free text options)
* sex: the sex of the animal
* stage: the life stage of the animal (adult, juvenile, etc.)
* method: the recording method (field recording, in the hand, etc.)
* url: the URL specifying the details of this recording
* file: the URL to the audio file
* file-name: the original file name of the audio file
* sono: an object with the urls to the four versions of sonograms
* osci: an object with the urls to the three versions of oscillograms
* lic: the URL describing the license of this recording
* q: the current quality rating for the recording
* length: the length of the recording in minutes
* time: the time of day that the recording was made
* date: the date that the recording was made
* uploaded: the date that the recording was uploaded to xeno-canto
* also: an array with the identified background species in the recording
* rmk: additional remarks by the recordist
* bird-seen: despite the field name (which was kept to ensure backwards compatibility), this field indicates whether the recorded animal was seen
* animal-seen: was the recorded animal seen?
* playback-used: was playback used to lure the animal?
* temperature: temperature during recording (applicable to specific groups only)
* regnr: registration number of specimen (when collected)
* auto: automatic (non-supervised) recording?
* dvc: recording device used
* mic: microphone used
* smp: sample rate

In [46]:
os.chdir('..')
os.getcwd()

'c:\\Users\\Justin\\Desktop\\AI CV\\Sound Duck\\SoundDuck'

In [112]:
birds_of_interest = ['zeylanicus','paradiseus','chinensis']
df = df_sg[(df_sg['sp'] == birds_of_interest[0]) | (df_sg['sp'] == birds_of_interest[1]) | (df_sg['sp'] == birds_of_interest[2])]

In [116]:
df.drop(df[df['file'] == ''].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [118]:
df['file']

0      https://xeno-canto.org/578938/download
162    https://xeno-canto.org/773797/download
163    https://xeno-canto.org/756521/download
164    https://xeno-canto.org/756425/download
165     https://xeno-canto.org/40042/download
                        ...                  
488    https://xeno-canto.org/614160/download
489    https://xeno-canto.org/610725/download
490    https://xeno-canto.org/583368/download
491    https://xeno-canto.org/352037/download
492    https://xeno-canto.org/335907/download
Name: file, Length: 63, dtype: object

### Function to download Audio Files from a Pandas DataFrame

In [47]:
def download_files(df):
    """
    Downloads audio files from URLs specified in a Pandas DataFrame,
    saves them to a local 'data' folder, and appends the file path as
    a new column to the DataFrame.
    ---
    Args:
        df: Pandas DataFrame with 'file' and 'file-name' columns specifying
            the URLs and file names of the audio files to be downloaded.
    ---
    Returns:
        df: Pandas DataFrame with an additional 'file-path' column that
            contains the local file paths of the downloaded audio files.
    """
    for i, row in df.iterrows():
        url = row['file']
        file_name = row['file-name']
        response = requests.get(url)
        file_path = os.path.join('data', file_name)
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"File '{file_name}' downloaded to 'data' folder.")
        df.loc[i, 'file-path'] = file_path
        
    return df

In [69]:
download_files(df_sg)

File 'XC578938-Quail, King_2019-12-28_0757_SG_Harvest-Link.mp3' downloaded to 'data' folder.
File 'XC728917-433785121.mp3' downloaded to 'data' folder.
File 'XC689265-Red Junglefowl_Macritche_DLY.mp3' downloaded to 'data' folder.
File 'XC677303-C1M.mp3' downloaded to 'data' folder.
File 'XC550149-Red Junglefowl_CCNR_DLY.mp3' downloaded to 'data' folder.
File 'XC352425-Red Junglefowl Chick.mp3' downloaded to 'data' folder.
File 'XC341564-Red Junglefowl @ Botanic Garden_161104_0297.mp3' downloaded to 'data' folder.
File 'XC665890-Red Junglefowl @ Ubin 210724_1180.mp3' downloaded to 'data' folder.
File 'XC610745-Junglefowl, Red_2020-12-14_0910_SG_DFNP.mp3' downloaded to 'data' folder.
File 'XC609632-Red Junglefowl_BL_DLY.mp3' downloaded to 'data' folder.
File 'XC603625-Red Junglefowl.mp3' downloaded to 'data' folder.
File 'XC599043-BBNP_rooster2_20201031.mp3' downloaded to 'data' folder.
File 'XC563290-BBNP_rooster_20190528.mp3' downloaded to 'data' folder.
File 'XC498034-rjf_ubin_call.mp

OSError: [Errno 22] Invalid argument: 'data\\XC605669-??.mp3'

In [None]:
df_sg['file-path']

In [None]:
df_sg.to_csv('/data/csv/my_data.csv', index=False)

In [None]:
df = pd.read_csv('/data/csv/my_data.csv')
df