# Initialize the Nomic API and Processor

In [1]:
!pip install datasets evaluate transformers[sentencepiece] &> /dev/null
!pip install nomic &> /dev/null

In [2]:
!nomic login nk-txKrSg8mGV7UtWdxNJ8YZ6H3auobZpzTBYWQHo3pzEw

In [3]:
import os

In [4]:
!pip install datasets[audio] librosa transformers
import librosa
import pandas as pd
from datasets import load_dataset
from transformers import ClapAudioModelWithProjection, ClapProcessor
from nomic import atlas
import numpy as np

model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Data Acquisition

## ESC50: ClapAudioWithProjection


In [5]:
import numpy as np

In [15]:
dataset = load_dataset("ashraq/esc50")



In [7]:
# Get the embeddings
if (os.path.exists('/content/audio_embeddings.npy')):
    audio_embeddings = np.load('/content/audio_embeddings.npy')
else:
    # Initialize an empty array to store the concatenated arrays
    audio_embeddings = np.empty((0, 512))

    # Iterate over the dictionaries to concatenate the arrays
    for split in dataset.values():
        split_audio = split['audio']
        for item in split_audio:
            audio_sample = item["array"]
            inputs = processor(audios=audio_sample, return_tensors="pt")
            outputs = model(**inputs)
            audio_embeds = outputs.audio_embeds
            total_array = np.concatenate((total_array, audio_embeds.detach().numpy().reshape(1, -1)), axis=0)

    # Check the shape of the concatenated array
    print("Shape of the concatenated array:", audio_embeddings.shape)

In [22]:
# Get the categories
if (os.path.exists('/content/categories.txt')):
    # Open the text file in read mode
    with open("/content/categories.txt", "r") as file:
        # Read all lines from the file and store them in a list
        categories = file.readlines()
else:
    categories = [dataset['train']['category'][i] for i in range(len(dataset['train']['category']))]

In [13]:
categories = [line.replace("_", " ") for line in categories]

## Common Language dataset

In [9]:
dataset = load_dataset("common_language", streaming=True)

In [10]:
langlist = [
    "Arabic",
    "Basque",
    "Breton",
    "Catalan",
    "Chinese_China",
    "Chinese_Hongkong",
    "Chinese_Taiwan",
    "Chuvash",
    "Czech",
    "Dhivehi",
    "Dutch",
    "English",
    "Esperanto",
    "Estonian",
    "French",
    "Frisian",
    "Georgian",
    "German",
    "Greek",
    "Hakha_Chin",
    "Indonesian",
    "Interlingua",
    "Italian",
    "Japanese",
    "Kabyle",
    "Kinyarwanda",
    "Kyrgyz",
    "Latvian",
    "Maltese",
    "Mangolian",
    "Persian",
    "Polish",
    "Portuguese",
    "Romanian",
    "Romansh_Sursilvan",
    "Russian",
    "Sakha",
    "Slovenian",
    "Spanish",
    "Swedish",
    "Tamil",
    "Tatar",
    "Turkish",
    "Ukranian",
    "Welsh",
]

In [11]:
# Get Common Language Embedding and Dataset dataframe
embed_cl = [] # list of dictionaries
embed_cl_arr = [] # list of embeddings
categories_cl = [] # list of categories/labels

for i in dataset['train']:
    tmp_embed_cl = {}
    tmp_embed_cl['client_id'] = i['client_id']
    tmp_embed_cl['mp3'] = f'''<audio src="https://drive.google.com/file/d/1KuBoKtof94dYq97s1Aya2JwzAwSYxJmn/view?usp=drive_link" type="audio/wav" controls autoplay> </audio>'''
    tmp_embed_cl['client_id'] = i['sentence']
    tmp_embed_cl['age'] = i['age']
    tmp_embed_cl['gender'] = i['gender']
    tmp_embed_cl['language'] = langlist[i['language']]
    categories_cl.append(langlist[i['language']])

    # embeddings
    audio_sample = i['audio']["array"]
    inputs = processor(audios=audio_sample, return_tensors="pt", sampling_rate=i['audio']['sampling_rate'])
    outputs = model(**inputs)
    audio_embeds = outputs.audio_embeds
    embed_cl_arr.append(audio_embeds.detach().numpy())

    # append the dict to the master list
    embed_cl.append(tmp_embed_cl)
    del tmp_embed_cl, audio_sample, inputs, outputs, audio_embeds


In [None]:
embed_cl_df = pd.DataFrame(embed_cl)

# Nomic Atlas Map Generation

## Map for ESC-50

In [18]:
categories

["['dog', 'chirping birds', 'vacuum cleaner', 'vacuum cleaner', 'thunderstorm', 'thunderstorm', 'door wood knock', 'can opening', 'crow', 'door wood knock', 'door wood knock', 'clapping', 'clapping', 'clapping', 'dog', 'clapping', 'thunderstorm', 'fireworks', 'fireworks', 'fireworks', 'fireworks', 'clapping', 'clapping', 'clapping', 'chainsaw', 'airplane', 'mouse click', 'pouring water', 'train', 'sheep', 'water drops', 'water drops', 'water drops', 'church bells', 'church bells', 'clock alarm', 'keyboard typing', 'wind', 'clock alarm', 'footsteps', 'footsteps', 'footsteps', 'footsteps', 'footsteps', 'footsteps', 'frog', 'frog', 'fireworks', 'fireworks', 'cow', 'water drops', 'brushing teeth', 'brushing teeth', 'car horn', 'crackling fire', 'helicopter', 'helicopter', 'helicopter', 'helicopter', 'helicopter', 'helicopter', 'drinking sipping', 'rain', 'crackling fire', 'insects', 'crackling fire', 'crackling fire', 'crackling fire', 'laughing', 'frog', 'hen', 'hen', 'helicopter', 'helic

In [16]:
dataset_for_atlas = dataset['train']['audio'].copy()
url_s3 = 'https://static.nomic.ai/penngenai/audio/' # hosted on s3
for i in range(len(dataset_for_atlas)):
    dataset_for_atlas[i]['label'] = categories[i]
    tmp_url = url_s3 + dataset['train']['filename'][i]
    dataset_for_atlas[i]['mp3'] = f'''<audio src="{tmp_url}" controls> </audio>'''
    dataset_for_atlas[i].pop('path') # path = None ==> Error
    dataset_for_atlas[i].pop('array') # array is np.array ==> Error
    dataset_for_atlas[i].pop('sampling_rate') # all are same at 44.1 kHz

IndexError: list index out of range

In [None]:
dataset_for_atlas[1]

In [None]:
dataset_atlas = atlas.map_data(data= dataset_for_atlas,
                         embeddings = audio_embeddings,
                         identifier='Environment sound',
                         description='Sound from environments',
                         topic_model={'topic_label_field': 'label'})

In [None]:
# Access your Atlas map and download your embeddings
map = dataset_for_atlas.maps[0]

projected_embeddings = map.embeddings.projected
latent_embeddings = map.embeddings.latent

In [None]:
print(projected_embeddings)

In [None]:
# Access your Atlas map
map = dataset_for_atlas.maps[0]

# Access a pandas DataFrame associating each datum on your map to their topics at each topic depth.
topic_df = map.topics.df

print(map.topics.df)


In [17]:
# Load map and perform vector search for the five nearest neighbors of datum with id "my_query_point"
map = dataset_for_atlas.maps[0]

with dataset_for_atlas.wait_for_dataset_lock():
  neighbors, _ = map.embeddings.vector_search(ids=['my_query_point'], k=5)

# Return similar data points
similar_datapoints = dataset_for_atlas.get_data(ids=neighbors[0])

print(similar_datapoints)

AttributeError: 'list' object has no attribute 'maps'

## Map for Common Language dataset

In [None]:
# get the embeddings to be in array
embedding_np_array = np.array(embed_cl_arr).squeeze()
embedding_np_array.shape

(25, 512)

In [None]:
embed_cl_df = embed_cl_df.to_dict('records')

[{'client_id': 'عليك أن تفي بوعدك.',
  'mp3': '<audio src="https://drive.google.com/file/d/1KuBoKtof94dYq97s1Aya2JwzAwSYxJmn/view?usp=drive_link" type="audio/wav" controls autoplay> </audio>',
  'age': 'twenties',
  'gender': 'male',
  'language': 'Arabic'},
 {'client_id': 'يشبه أباه.',
  'mp3': '<audio src="https://drive.google.com/file/d/1KuBoKtof94dYq97s1Aya2JwzAwSYxJmn/view?usp=drive_link" type="audio/wav" controls autoplay> </audio>',
  'age': 'twenties',
  'gender': 'female',
  'language': 'Arabic'},
 {'client_id': 'لن يُغَيِّرَ ذلك شيئًا.',
  'mp3': '<audio src="https://drive.google.com/file/d/1KuBoKtof94dYq97s1Aya2JwzAwSYxJmn/view?usp=drive_link" type="audio/wav" controls autoplay> </audio>',
  'age': 'fourties',
  'gender': 'male',
  'language': 'Arabic'},
 {'client_id': 'كيف حال الجميع ؟',
  'mp3': '<audio src="https://drive.google.com/file/d/1KuBoKtof94dYq97s1Aya2JwzAwSYxJmn/view?usp=drive_link" type="audio/wav" controls autoplay> </audio>',
  'age': 'not_defined',
  'gender

In [None]:
dataset_atlas_cl = atlas.map_data(data=embed_cl_df,
                         embeddings = embedding_np_array,
                         identifier='Common Language',
                         description='Common Language Dataset',
                         topic_model={'topic_label_field': 'language'})

[32m2024-02-17 20:48:43.973[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_create_project[0m:[36m861[0m - [1mCreating dataset `common-language-12`[0m
[32m2024-02-17 20:48:44.351[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m108[0m - [1mUploading data to Atlas.[0m
1it [00:00,  1.49it/s]
[32m2024-02-17 20:48:45.038[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_add_data[0m:[36m1529[0m - [1mUpload succeeded.[0m
[32m2024-02-17 20:48:45.043[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m123[0m - [1m`tpremrud/common-language-12`: Data upload succeeded to dataset`[0m
[32m2024-02-17 20:48:46.460[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36mcreate_index[0m:[36m1238[0m - [1mCreated map `Common Language` in dataset `tpremrud/common-language-12`: https://atlas.nomic.ai/data/tpremrud/common-language-12/map[0m


# Nomic Atlas Example: News_dataset

In [None]:
import pandas
news_articles = pandas.read_csv('https://raw.githubusercontent.com/nomic-ai/maps/main/data/ag_news_25k.csv').to_dict('records')

# project = atlas.map_data(data=news_articles,
#                          indexed_field='text',
#                          id_field='id',
#                          identifier='News Articles 25k',
#                          description='25k News articles.',
#                          topic_model={'topic-label-field': [news_articles[i]['label'] for i in range(len(news_articles))]})