---

# Data Preprocessing

> This notebook contains code for preprocessing the corpus for training.
> This includes splitting geotags into latitudes and longitudes, normalizing them,
> and padding or cropping each audio clip to a fixed length.
> The processed data is saved to `mappings.csv` and `audio.npy`.

---

In [1]:
import os
import json
import numpy as np
import pandas as pd
import librosa

## Load mappings.json

In [2]:
# Read the path -> geotag mapping. JSON is UTF-8 by specification, so decode
# it explicitly instead of relying on the platform's default text encoding.
with open('../corpus/mappings.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Split the mapping into two parallel lists: its keys (audio paths) and its
# values (geotags). A single pass over the items keeps them index-aligned.
paths, geotags = [], []
for key, value in data.items():
    paths.append(key)
    geotags.append(value)

**Get full file paths**

In [4]:
# Prefix every path from the mapping with the audio directory.
fullpaths = [os.path.join('../corpus/audio', path) for path in paths]

## Get latitudes and longitudes
- Split latitudes and longitudes
- Normalize them

In [5]:
# Each geotag is a whitespace-separated pair of numbers: latitude first,
# longitude second. Parse both to float before scaling.
coords = [
    (float(lat_text), float(long_text))
    for lat_text, long_text in (geotag.split() for geotag in geotags)
]

# Scale latitude by 1/90 and longitude by 1/180
# (presumably mapping degrees into [-1, 1] -- confirm against the corpus).
lats = [lat / 90.0 for lat, _ in coords]
longs = [long / 180.0 for _, long in coords]

## Write to csv
- Create mappings dataframe with full file paths, latitudes and longitudes
- Write to mappings.csv

In [6]:
# One row per audio file: its full path plus the scaled latitude/longitude.
df = pd.DataFrame(dict(path=fullpaths, lat=lats, long=longs))

In [7]:
# Preview the first rows to sanity-check the paths and the scaled coordinates.
df.head()

Unnamed: 0,path,lat,long
0,../corpus/audio/169884.mp3,0.501047,0.041804
1,../corpus/audio/169885.mp3,0.501052,0.041806
2,../corpus/audio/697381.mp3,0.187898,0.534158
3,../corpus/audio/187893.mp3,0.459852,0.012109
4,../corpus/audio/788102.mp3,0.236748,-0.876991


In [8]:
# Persist the mapping table; index=False keeps the row index out of the CSV.
df.to_csv('../corpus/mappings.csv', index=False)

## Process audio

In [9]:
# Every clip is forced to exactly CLIP_SAMPLES samples so that all clips have
# the same length when they are collected into one array.
# NOTE(review): librosa.load resamples to 22050 Hz unless `sr` is passed, so
# 44100 samples is 2 s of audio here, not 1 s -- confirm that is intended.
CLIP_SAMPLES = 44100
# librosa's default hop length, made explicit because rms() and
# frames_to_samples() must agree on it.
HOP_LENGTH = 512

audio = list()

for path in df['path']:
    y, sr = librosa.load(path)

    # Pad with trailing zeros if short. A clip of exactly CLIP_SAMPLES samples
    # also takes this branch (zero-width padding, i.e. unchanged): in the other
    # branch y[:-CLIP_SAMPLES] would be empty, leaving nothing to analyse.
    if len(y) <= CLIP_SAMPLES:
        y = np.pad(y, (0, CLIP_SAMPLES - len(y)), mode="constant")

    # Pick the loudest part if long
    else:
        # Drop the tail so every candidate start leaves room for a full window.
        rms = librosa.feature.rms(y=y[:-CLIP_SAMPLES], hop_length=HOP_LENGTH)
        # Map the loudest frame straight to a sample index; converting to
        # seconds and back through floats can truncate one sample early.
        start = int(librosa.frames_to_samples(np.argmax(rms), hop_length=HOP_LENGTH))
        y = y[start: start + CLIP_SAMPLES]

    audio.append(y)

### Write to numpy file

In [10]:
# Collect the processed clips into a single array and persist it.
y = np.array(audio)
np.save('../corpus/audio.npy', y)

# Keep the shape as the cell's final expression so the notebook displays it;
# placed mid-cell it was evaluated and silently discarded.
y.shape