### Authentication and other code admin

* Importing Python libraries
* Setting up Google Cloud Platform (GCP) services
    * Service account impersonation authentication and secrets retrieval 
    * Cloud storage access and file retrieval
* Getting files organized

#### Importing Python libraries

In [5]:
#data org/manipulation
import pandas as tian
tian.set_option('display.max_columns', None)
from io import StringIO
import builtins
from tqdm import tqdm
import pickle
import os


#gcp
from google.auth import default, impersonated_credentials
from google.cloud import secretmanager, storage

#llm libraries
from transformers import AutoTokenizer
from langchain import document_loaders
from langchain.text_splitter import RecursiveCharacterTextSplitter
import logging
import random


#### Setting up Google Cloud Platform (GCP) services

##### Service account impersonation authentication and secrets retrieval 

You will need to change this to your preferred setup for accessing files and secrets if you want to follow along. My first start is authenticating into GCP from the command line:

```
gcloud auth application-default login
```


In [3]:
# Default credentials from the environment
credentials, project_id = default()
# service account impersonation setup
service_account_email = "gcp-music-service@thoughts-on-music.iam.gserviceaccount.com"
target_scopes = ['https://www.googleapis.com/auth/cloud-platform']

impersonated_credentials = impersonated_credentials.Credentials(
    source_credentials=credentials,
    target_principal=service_account_email,
    target_scopes=target_scopes
)

# secrets client
secret_client = secretmanager.SecretManagerServiceClient()
project_id = "thoughts-on-music"
secret_id = "HUGGING_FACE_READ_TOKEN_LLAMA"
version_id = "latest" 
secrets_name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"

# retrieve HF API key from secrets manager
response = secret_client.access_secret_version(name=secrets_name)
HUGGING_FACE_READ_TOKEN_LLAMA = response.payload.data.decode("UTF-8")


##### Cloud storage access and file admin

I've defined a couple functions for used as needed:

* gcp_download takes a defined GPC bucket name, folder location, and file name and returns a dataframe of that file
    * It's designed so other bucket and folder names can be introduced without wrecking the code
* df_deets takes either a single dataframe name or a list of dataframe name and prints out details I like:
    * The name of the dataframe
    * The first few rows
    * The column names
    * The data types for each field
    * Sum of duplicates and NAs for each column 

In [None]:
# cloud storage client setup
storage_client = storage.Client(credentials=credentials)
love_uwsthoughts = "love-uwsthoughts"
bp_csvs_folder = 'bp_csvs'
bp_artist_bios_folder = 'bp_artist_bios'
uwsthoughts_bucket = storage_client.bucket(love_uwsthoughts)

#function to import files 
def gcp_download(bucket_name, gcp_folder, file_name):
    bucket = storage_client.bucket(bucket_name)
    blob_name = f"{gcp_folder}/{file_name}".strip("/")
    file_blob = bucket.blob(blob_name)
    file_data = file_blob.download_as_text() 
    gcp_df = tian.read_csv(StringIO(file_data))
    
    return gcp_df

def df_deets(df_or_names):
    def print_info(df, df_name):
        print(f"{df_name}:")
        if hasattr(builtins, "display"):
            display(df)
        else:
            print(df)
        print("\n")
        
        # Summary of data types, duplicates, NAs, and distinct values
        summary_df = tian.DataFrame({
            "Data types": df.dtypes,
            "Duplicates": [df.duplicated(subset=[col]).sum() for col in df.columns],
            "NAs": df.isna().sum().values,
            "Distinct values": df.nunique().values
        }).set_index(df.columns)
        
        print(f"{df_name} summary:")
        if hasattr(builtins, "display"):
            display(summary_df)
        else:
            print(summary_df)
        print("\n")
        print(df.columns)
        print("\n")
        total_duplicates = df.duplicated().sum()
        print(f"Total number of duplicate rows in {df_name}: {total_duplicates}\n")

    if isinstance(df_or_names, list): 
        for item in df_or_names:
            if isinstance(item, str):
                df = globals()[item] 
                print_info(df, item)
            else:
                print_info(item, "dataframe")
    else: 
        if isinstance(df_or_names, str):
            df = globals()[df_or_names]  
            print_info(df, df_or_names)
        else:
            print_info(df_or_names, "dataframe")


##### Getting files organized 

This may seem beautifully consolidated now but trust me when I say that there was a lot of chaos I went through to get to this point. I keep like with like to make it easier for me to remember what I was thinking.

Running list of different files I need at various points:

In [None]:
fact_bp_track_audio_features_csv = 'fact_bp_track_audio_features.csv' 
fact_bpmeta_audio_csv = 'fact_bpmeta_audio.csv'
fact_label_bpmeta_audio_csv = 'fact_label_bpmeta_audio.csv'
bp_artist_release_csv = 'bp_artist_release.csv'
bp_artist_csv = 'bp_artist.csv'
bp_artist_track_csv = 'bp_artist_track.csv'
bp_label_csv = 'bp_label.csv'
bp_genre_csv = 'bp_genre.csv'
bp_key_csv = 'bp_key.csv'
bp_label_artist_csv = 'bp_label_artist.csv'

I only had to download from GCP once and then I cached the ones I need for later use. If I make a change that I want to be permanent, I update the cache with the changes. From time to time I'll send a changed file up to GCP if I have a long term need for it. 

In [None]:
bp_artist_df = gcp_download(love_uwsthoughts, bp_csvs_folder, bp_artist_csv)
bp_artist_release_df =  gcp_download(love_uwsthoughts, bp_csvs_folder, bp_artist_release_csv)
bp_artist_track_df = gcp_download(love_uwsthoughts, bp_csvs_folder, bp_artist_track_csv)

bp_label_df = gcp_download(love_uwsthoughts, bp_csvs_folder, bp_label_csv)
bp_label_artist_df = gcp_download(love_uwsthoughts, bp_csvs_folder, bp_label_artist_csv)

bp_genre_df = gcp_download(love_uwsthoughts, bp_csvs_folder, bp_genre_csv)
bp_key_df = gcp_download(love_uwsthoughts, bp_csvs_folder, bp_key_csv)
fact_label_bpmeta_audio_df = gcp_download(love_uwsthoughts, bp_csvs_folder, fact_label_bpmeta_audio_csv)

fact_bp_track_audio_features_df = gcp_download(love_uwsthoughts, bp_csvs_folder, fact_bp_track_audio_features_csv)
fact_bpmeta_audio_df = gcp_download(love_uwsthoughts, bp_csvs_folder, fact_bpmeta_audio_csv)

bp_artist_df.to_parquet('bp_artist_df_cache.parquet', index=False)
bp_artist_release_df.to_parquet('bp_artist_release_df_cache.parquet', index=False)
bp_artist_track_df.to_parquet('bp_artist_track_df_cache.parquet', index=False)

bp_label_df.to_parquet('bp_label_df_cache.parquet', index=False)
bp_label_artist_df.to_parquet('bp_label_artist_df_cache.parquet', index=False)

bp_genre_df.to_parquet('bp_genre_df_cache.parquet', index=False)
bp_key_df.to_parquet('bp_key_df_cache.parquet', index=False)



A consolidated list of the cached dataframes I need:

In [2]:
# bpmeta_audio_df = tian.read_parquet('/Users/uwsthoughts/Desktop/dolly_shield_local/bpmeta_audio_df_cache.parquet') # removed fact from name
# bpmeta_audio_shield_df = tian.read_parquet('/Users/uwsthoughts/Desktop/dolly_shield_local/bpmeta_audio_shield_df_cache.parquet')

# bp_artist_df = tian.read_parquet('/Users/uwsthoughts/Desktop/dolly_shield_local/bp_artist_df_cache.parquet')
# bp_artist_release_df = tian.read_parquet('/Users/uwsthoughts/Desktop/dolly_shield_local/bp_artist_release_df_cache.parquet')
# bp_artist_track_df = tian.read_parquet('/Users/uwsthoughts/Desktop/dolly_shield_local/bp_artist_track_df_cache.parquet')

# bp_artist_label_names_df = tian.read_parquet('/Users/uwsthoughts/Desktop/dolly_shield_local/bp_artist_label_names_df_cache.parquet')
# artist_bios_df = tian.read_parquet('/Users/uwsthoughts/Desktop/dolly_shield_local/artist_bios_df_cache.parquet')

# bp_label_df = tian.read_parquet('/Users/uwsthoughts/Desktop/dolly_shield_local/bp_label_df_cache.parquet')
# bp_label_artist_df = tian.read_parquet('/Users/uwsthoughts/Desktop/dolly_shield_local/bp_label_artist_df_cache.parquet')

# bp_genre_df = tian.read_parquet('/Users/uwsthoughts/Desktop/dolly_shield_local/bp_genre_df_cache.parquet')
# bp_key_df = tian.read_parquet('/Users/uwsthoughts/Desktop/dolly_shield_local/bp_key_df_cache.parquet')

bp_text_values_df = tian.read_parquet('/Users/uwsthoughts/Desktop/dolly_shield_local/bp_text_values_df_cache.parquet')



### Data here, there, and everywhere

This first section is about futzing around with files I made when I was much younger, maybe early summer 2024? I'm going to sweep through and see which ones are actually valuable and which ones are the result of me wrapping myself in a massive vortext of delusion.

In [None]:
fact_bp_track_audio_features_df = gcp_download(love_uwsthoughts, bp_csvs_folder, fact_bp_track_audio_features_csv)
fact_bpmeta_audio_df = gcp_download(love_uwsthoughts, bp_csvs_folder, fact_bpmeta_audio_csv)

In [None]:
dataframes_list = ['fact_bp_track_audio_features_df', 'fact_bpmeta_audio_df']

df_deets(dataframes_list)

Based on the above, it looks like I intended for 'fact_bpmeta_audio_df' to be the cleaned up version of 'fact_bp_track_audio_features_df'. There were ~13,000 rows without Beatport track_id's and, given that this heavily revolves around Beatport data, I decided that it wasn't worth keeping records without that ID. While I do want to get the show on the road, I do have to return to  'mix' having 477,121 distinct values. The table above already has all the cleaned up values in place but below is how I previously did it. I've also added some data to show that we are dealing with the longest of longest tails for this one.

In [None]:
bpmeta_audio_df = tian.merge(
    fact_bpmeta_audio_df,
    fact_bp_track_audio_features_df[['track_id', 'title', 'track_url']],
    on='track_id',
    how='left'
)

bpmeta_audio_df

In [None]:
mix_replacements = {
    "continuous": "Set Mixed",
    "live": "Set Mixed",
    "remastered": "Remastered Mix",
    "orginal": "Original Mix",
    "ambient": "Ambient Mix",
    "chill": "Ambient Mix",
    "lounge": "Ambient Mix",
    "rework": "Remastered Mix",
    "remix": "Remix",
    "original": "Original Mix",
    "club": "Club Mix",
    "dub": "Dub Mix",
    "extended": "Extended Mix",
    "instrumental": "Instrumental Mix",
    "radio": "Radio Mix",
    "vip": "Remix",
    "album": "Album Mix",
    "Continuous DJ Mix": "Mixed",
    "Mix Cut": "Set Mixed",
    "Mixed": "Set Mixed",
    "Intro Mix": "Set Mixed",
    "Edit": "Radio Edit",
    "Main Mix": "Original Mix",
    "Album Version": "Album Mix",
    "Deep Mix": "Remix",
    "House Mix": "Remix",
    "Tribal Mix": "Remix",
    "Intro": "Set Mixed",
    "Edit Mix": "Radio Mix",
    "Bonus Track": "Album Mix"
}

# Function to clean up mix values
def clean_mix_values(mix):
    if isinstance(mix, str):
        for key, value in mix_replacements.items():
            if key.lower() in mix.lower():
                return value
    return mix


bpmeta_audio_df['mix'] = bpmeta_audio_df['mix'].apply(clean_mix_values)
total_rows = len(bpmeta_audio_df)
top_20 = bpmeta_audio_df['mix'].value_counts().head(20)
top_20_df = top_20.reset_index(name='count').rename(columns={'index': 'mix'})
top_20_df['percent_of_total'] = (top_20_df['count'] / total_rows) * 100

print("\nTop 20 mix values after cleanup:")
print(top_20_df)


I made the cache for bpmeta_audio_df down here and then moved the read cache part near the top with the rest. 

In [None]:
bpmeta_audio_df.to_parquet('bpmeta_audio_df_cache.parquet', index=False)

### A suspicious label data table

I had an inkling that this old table I made was a botched attempt at joining label data to the other metadata. 

In [None]:
df_deets(fact_label_bpmeta_audio_df)

Based on the above, that's exactly what happened. I'm willing to bet I can just append label_name onto fact_bpmeta_audio_df and have that be that.

In [None]:
bpmeta_audio_shield_df = tian.merge(bpmeta_audio_df, fact_label_bpmeta_audio_df[['track_id', 'label_name']],
                     on='track_id', how='left')

bpmeta_audio_shield_df 

In [None]:
bpmeta_audio_shield_df.to_parquet('bpmeta_audio_shield_df_cache.parquet', index=False)

df_deets(bpmeta_audio_shield_df)

### Artist bios integration

I scraped some aritst bios from Beatport in batches of 1,000 that were initially stored in separate files. This is where I bring them all together and have one list.

In [None]:
#sort folder by created date and then grab
def gcp_folder_sort(bucket, folder):
    blobs = list(bucket.list_blobs(prefix=folder))
    sorted_blobs = sorted(blobs, key=lambda x: x.time_created)
    return sorted_blobs

#combine csv files into one df
def artists_united(bucket, folder):
    sorted_blobs = gcp_folder_sort(bucket, folder)
    artists_united_df = tian.DataFrame()
    for blob in sorted_blobs:
        if blob.name.endswith('.csv'):
            the_drop = gcp_download(bucket.name, folder, blob.name.split('/')[-1])
            the_drop = the_drop[['beatport_artist_id', 'artist_name', 'beatport_bio']]
            artists_united_df = tian.concat([artists_united_df, the_drop], ignore_index=True)
    artists_united_df = artists_united_df.drop_duplicates(ignore_index=True)        
    return artists_united_df

#save df back to gcp
def paranoid_guard(df, bucket_name, gcp_folder, file_name):
    bucket = storage_client.bucket(bucket_name)
    csv_data = df.to_csv(index=False)
    blob_name = f"{gcp_folder}/{file_name}".strip("/")
    blob = bucket.blob(blob_name)
    blob.upload_from_string(csv_data, content_type='text/csv')


In [None]:
bppoints_artist_bios_csv = "bppoints_artist_bios.csv"

bppoints_artist_bios_df = artists_united(uwsthoughts_bucket, bp_artist_bios_folder)

paranoid_guard(bppoints_artist_bios_df, love_uwsthoughts, bp_artist_bios_folder, bppoints_artist_bios_csv)

I decided I was only interested in the artists that had a bio so of the ~55,000 I started with, I ended up with ~48,000. 

In [None]:
artist_bios_df = bppoints_artist_bios_df.drop_duplicates(subset='beatport_bio', keep='first')

df_deets(artist_bios_df)

In [None]:
artist_bios_df.to_csv('/Users/uwsthoughts/Desktop/bp_spotify_raw_data/csv_data/artist_bios_df.csv', index=False)
artist_bios_df.to_parquet('artist_bios_df_cache.parquet', index=False)

### A big ol' effort to make human readable text

A lot of my previous work has been premised on using different IDs to establish realtionships between data, with text values only coming in at the very end to help identify outputs. Now that I'm shifting into trying to use a large language model, I want text values instead of IDs so the model can tokenize and understand what it means. My thinking is that these IDs probably won't mean too much to it but it's probably been trained on enough data that it can make relationships with the addition data I give it. 

Below are a a few pieces that take me from ID mappings to human readable. I've never worked directly with an LLM before but my thinking is this: the human brain is essentially machine learning, perfected. In most languages, text is read left to right, top to bottom. If I create tables where the data can be read and understood by a human in a left to right fasion, an LLM should be able to do the same.

That means I need to take a bunch of data I have in different palces and bring it all together into a few, human readable tables.

#### Data sample reference

This is just a holding pen for samples of all the data I'm working with in this section. I used it as a reference point to confirm I was taking the right values from the right table. You can skip to 'The data in play' to see the actual work.

In [None]:
df_list = [bpmeta_audio_shield_df, bp_artist_df, bp_artist_release_df, bp_artist_track_df, bp_label_df, bp_label_artist_df, bp_genre_df, bp_key_df]
df_deets(df_list)

#### Miami = synergy

While each dataframe has it's column names printed as part of df_deets() above, for this specific part I needed all the column names neatly stacked on top of each other so I could figure out what needed to go where. I've been working with this data for a few months now so all these tables and fields mean something to me, with the samples above used as needed. The only one that isn't an original dataset is```bpmeta_audio_shield_df```.

In [None]:
df_names = ["bpmeta_audio_shield_df", "bp_artist_df", "bp_artist_release_df", "bp_artist_track_df", "bp_label_df", "bp_label_artist_df", "bp_genre_df", "bp_key_df"]

for name, df in zip(df_names, df_list):
    print(f"{name}:\n {df.columns}\n")

After looking at the above, I decided the best approach was to use ```bpmeta_audio_shield_df``` as the starting point and add in text values where it made sense. I did this with a couple of goals in mind:
* Make copies of tables I want to change so that I can preserve original tables in their original structure
* Combine data that can realistically go together, like the data saying which artists are on which labels

#### From the underground to the world 

This first part is straight forward: there were separate tables for artist and label metadata and I wanted to see a label and its artists together. I thought of it has "Anjunadeep has Marsh and Eli & Fur as artists." I kept the IDs and urls but tucked them in at the end. My spider sense says having those urls available later is going to be useful.

In [None]:
label_artist_temp = tian.merge(bp_label_artist_df, bp_label_df, on='label_id', suffixes=('', '_label'))
bp_artist_label_names_df = tian.merge(label_artist_temp, bp_artist_df, on='artist_id', suffixes=('', '_artist'))
bp_artist_label_names_df = bp_artist_label_names_df[[ 'label_name', 'artist_name', 'label_id', 'label_url', 'artist_id', 'artist_url']]


In [None]:
df_deets(bp_artist_label_names_df)

In [None]:
bp_artist_label_names_df.to_parquet('bp_artist_label_names_df_cache.parquet', index=False)


### Four to the floor

Take a deep breath with me. From the above, I wanted to be able to say something like the following:

> On August 18th, 2023, Mira released 'Celo' on Kiosk ID. Her first release in 2023, it was also her first foray into melodic house & techno. It brought a newer and edgier side to her growing repertoire.

That led me to a table design that look like this:

- release_date
- artist_name
- title
- label_name
- genre_name
- bpm
- key_id
- mix
- is_remixed
- is_remixer
- mode
- valence
- danceability
- energy
- speechiness
- loudness
- liveness
- instrumentalness
- acousticness
- isrc
- artist_id
- artist_url
- track_id
- track_url
- label_id
- label_url
- genre_id
- genre_url

All of the text values are one after another on the left, with IDs and urls at the end. I have text value descriptions for all these Spotift audio metrics so it's easy to see how those go from continuous 0-1 values to something a human can understand. But,as they say, that's a problem for a different day.

To do this, I simply joined on the different IDs I needed, with track_id being the center of gravity. For now, I decided to have one track ID for each artist, which means multiple track IDs for each artist. In future work, I want to put all the artists together since the human brain doesn't separate out artists on a track like that. 


In [None]:
bp_track_artist_merge = tian.merge(bpmeta_audio_shield_df, bp_artist_track_df, on='track_id', suffixes=('', '_artist'))
bp_track_artist_label_merge = tian.merge(bp_track_artist_merge, bp_artist_df, on='artist_id', suffixes=('', '_artist_info'))
bp_track_artist_label_merge = tian.merge(bp_track_artist_label_merge, bp_label_df, on='label_id', suffixes=('', '_label'))
bp_track_artist_label_merge = tian.merge(bp_track_artist_label_merge, bp_genre_df, on='genre_id', suffixes=('', '_genre'))
bp_text_values_df = bp_track_artist_label_merge[[
    'release_date', 'artist_name', 'title',  'label_name', 'duration', 'genre_name', 'bpm', 'key_id', 'mix', 'is_remixed', 'is_remixer',  
    'mode', 'valence', 'danceability', 'energy', 'speechiness', 'loudness', 'liveness', 'instrumentalness', 'acousticness', 'isrc', 'artist_id', 'artist_url', 
    'track_id', 'track_url', 'label_id', 'label_url', 'genre_id', 'genre_url'
]]

This brought me this beatifully massive table. On spec, it looks how I wanted. I need to handle things like ```is_remixed``` and ```is_remixer``` since they're telling the same story from different perspectives.

In [None]:
df_deets(bp_text_values_df)

All of that work means I can also find artists I'm looking for much easier. Text values as IDs are, generally speaking, not great because it's easy to have overlaps and duplicates. For example, ```Mira (Berlin)``` has (Berlin) after her name because 1) she's from Berlin so it clearly identifies her and there's another ```Mira``` on Beatport and so someone has to change a bit. This is the divergence between the relational database world I've lived in and the more unstructured world of LLMs. I also didn't deal with ```bp_key_df``` meaningfully because it's a whole separate chunk of work to make it ready to add to this one. The key_id was preserved so I can easily return to it when ready. 

In [None]:
# bp_text_values_df[(bp_text_values_df['artist_name'] == 'Fideles') & (bp_text_values_df['title'] == 'Away With Me')].sort_values(by='release_date', ascending=False)
bp_text_values_df[bp_text_values_df['artist_name'] == 'Mira (Berlin)'].sort_values(by='release_date', ascending=False).head(10)


In [None]:
bp_text_values_df.to_parquet('bp_text_values_df_cache.parquet', index=False)


## Tokenizing Work

Initial work using Hugging Face and Langchain to access and use the folllowing open source Large Language Models (LLMs):

* [Meta's Llama 3.1-8B](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1#llama-3.1)
* [Meta's Llama-3.2-11B-Vision](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-vision-models-(11b/90b)-)

First, I'm gonna bring in the files I cached. If you're following along at home, this notebook is coming together over several different sessions and I need a mechanism to bring the data back. I could redownload from GCP but that seems excessive.

### If we say tokens enough times, they appear

In [6]:
llama_318b = "meta-llama/Meta-Llama-3.1-8B"
llama_318b_tokenizer = AutoTokenizer.from_pretrained(llama_318b, token="HUGGING_FACE_READ_TOKEN_LLAMA")

llama_3211b = "meta-llama/Llama-3.2-11B-Vision"
llama_3211b_tokenizer = AutoTokenizer.from_pretrained(llama_3211b)

logging.basicConfig(filename='/Users/uwsthoughts/Desktop/dolly_shield_local/tokenization_logs.txt', level=logging.INFO, format='%(asctime)s - %(message)s')

def melodies(row):
    text = (
        f"Track ID: {row['track_id']}, Title: {row['title']}, "
        f"Artist: {row['artist_name']}, Artist ID: {row['artist_id']}, "
        f"Genre: {row['genre_name']}, Genre ID: {row['genre_id']}, "
        f"Label: {row['label_name']}, Label ID: {row['label_id']}, "
        f"Release Date: {row['release_date']}, Track URL: {row['track_url']}, "
        f"Mix: {row['mix']}, Remix: {'Yes' if row['is_remixed'] else 'No'}, "
        f"Remixer: {'Yes' if row['is_remixer'] else 'No'}, Duration: {row['duration']} minutes, "
        f"BPM: {row['bpm']}, Key ID: {row['key_id']}, "
        f"Mode: {row['mode']}, Valence: {row['valence']}, Danceability: {row['danceability']}, "
        f"Energy: {row['energy']}, Speechiness: {row['speechiness']}, "
        f"Loudness: {row['loudness']}, Liveness: {row['liveness']}, "
        f"Instrumentalness: {row['instrumentalness']}, Acousticness: {row['acousticness']}, "
        f"ISRC: {row['isrc']}, Artist URL: {row['artist_url']}, Label URL: {row['label_url']}, "
        f"Genre URL: {row['genre_url']}."
    )
    return text

def find_your_flow(dataframe, farm_trips):
    hay_barrels = (len(dataframe) // farm_trips) + 1
    full_shed = []

    for i in range(0, hay_barrels, 20):
        last_barrel = min(i + 20, hay_barrels)
        for barrel in tqdm(range(i, last_barrel), desc=f"Processing batches {i+1}-{last_barrel}"):
            fresh_cut = barrel * farm_trips
            full_up = min((barrel + 1) * farm_trips, len(dataframe))
            batch_df = dataframe.iloc[fresh_cut:full_up]
            
            full_barrel = [melodies(row) for _, row in tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Processing batch {barrel+1}/{hay_barrels}")]
            full_shed.extend(full_barrel)
    
    return full_shed


def dolly_tunes(texts, farm_trips):
    hay_barrels = (len(texts) // farm_trips) + 1
    dolly_grazing = []

    #sample of 100 random rows 
    sample_indices = random.sample(range(len(texts)), min(100, len(texts)))

    for i in range(0, hay_barrels, 20):
        last_barrel = min(i + 20, hay_barrels)
        for barrel in tqdm(range(i, last_barrel), desc=f"Tokenizing batches {i+1}-{last_barrel}"):
            fresh_cut = barrel * farm_trips
            full_up = min((barrel + 1) * farm_trips, len(texts))
            full_barrel = texts[fresh_cut:full_up]
            
            #Tokenization of batch
            full_truck = llama_3211b_tokenizer(
                full_barrel,
                return_tensors="tf",
                truncation=True,
                padding=True
            )

            for idx, text in enumerate(full_barrel):
                tokens = llama_3211b_tokenizer.tokenize(text)
                token_ids = llama_3211b_tokenizer.convert_tokens_to_ids(tokens)

                #log all tokens into file
                logging.info(f"Text: {text}\nTokens: {tokens}\nToken IDs: {token_ids}")

                #print out match rows of random sample using global index
                global_idx = fresh_cut + idx  
                if global_idx in sample_indices:
                    print("Tokens:", tokens)
                    print("Token IDs:", token_ids)

            dolly_grazing.append(full_truck)

    return dolly_grazing

# def dolly_tunes(texts, farm_trips):
#     hay_barrels = (len(texts) // farm_trips) + 1
#     dolly_grazing = []

#     for i in range(0, hay_barrels, 20):
#         last_barrel = min(i + 20, hay_barrels)
#         for barrel in tqdm(range(i, last_barrel), desc=f"Tokenizing batches {i+1}-{last_barrel}"):
#             fresh_cut = barrel * farm_trips
#             full_up = min((barrel + 1) * farm_trips, len(texts))
#             full_barrel = texts[fresh_cut:full_up]
            
#             full_truck = llama_3211b_tokenizer(full_barrel, return_tensors="tf", truncation=True, padding=True)
#             dolly_grazing.append(full_truck)
    
#     return dolly_grazing



In [None]:
# versions of above that have standard names for fields so I can underdstand

# def find_your_flow(dataframe, batch_size):
#     num_batches = (len(dataframe) // batch_size) + 1
#     all_texts = []

#     for i in range(0, num_batches, 20):
#         end_batch = min(i + 20, num_batches)
#         for batch_idx in tqdm(range(i, end_batch), desc=f"Processing batches {i+1}-{end_batch}"):
#             start_idx = batch_idx * batch_size
#             end_idx = min((batch_idx + 1) * batch_size, len(dataframe))
#             batch_df = dataframe.iloc[start_idx:end_idx]
            
#             batch_texts = [melodies(row) for _, row in tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Processing batch {batch_idx+1}/{num_batches}")]
#             all_texts.extend(batch_texts)
    
#     return all_texts

# def dolly_tunes(texts, batch_size):
#     num_batches = (len(texts) // batch_size) + 1
#     tokenized_batches = []

#     for i in range(0, num_batches, 20):
#         end_batch = min(i + 20, num_batches)
#         for batch_idx in tqdm(range(i, end_batch), desc=f"Tokenizing batches {i+1}-{end_batch}"):
#             start_idx = batch_idx * batch_size
#             end_idx = min((batch_idx + 1) * batch_size, len(texts))
#             batch_texts = texts[start_idx:end_idx]
            
#             large_token = llama_3211b_tokenizer(batch_texts, return_tensors="tf", truncation=True, padding=True)
#             tokenized_batches.append(large_token)
    
#     return tokenized_batches


Dolly tunes that supports visualizations

In [9]:
import random
import logging
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Set up logging configuration
logging.basicConfig(filename='tokenization_log.txt', level=logging.INFO, format='%(asctime)s - %(message)s')

def dolly_tunes(texts, farm_trips):
    hay_barrels = (len(texts) // farm_trips) + 1
    dolly_grazing = []

    # Variables for analysis
    all_tokens = []
    token_lengths = []
    vocabulary_growth = []
    sample_indices = random.sample(range(len(texts)), min(100, len(texts)))

    for i in range(0, hay_barrels, 20):
        last_barrel = min(i + 20, hay_barrels)
        for barrel in tqdm(range(i, last_barrel), desc=f"Tokenizing batches {i+1}-{last_barrel}"):
            fresh_cut = barrel * farm_trips
            full_up = min((barrel + 1) * farm_trips, len(texts))
            full_barrel = texts[fresh_cut:full_up]
            
            # Tokenization of batch
            full_truck = llama_3211b_tokenizer(
                full_barrel,
                return_tensors="tf",
                truncation=True,
                padding=True
            )

            for idx, text in enumerate(full_barrel):
                tokens = llama_3211b_tokenizer.tokenize(text)
                token_ids = llama_3211b_tokenizer.convert_tokens_to_ids(tokens)

                # Collect tokens for analysis
                all_tokens.extend(tokens)
                token_lengths.extend([len(token) for token in tokens])
                
                # Log all tokens into file
                logging.info(f"Text: {text}\nTokens: {tokens}\nToken IDs: {token_ids}")

                # Print out matching rows of random sample using global index
                global_idx = fresh_cut + idx  
                if global_idx in sample_indices:
                    print("Tokens:", tokens)
                    print("Token IDs:", token_ids)

            dolly_grazing.append(full_truck)
            vocabulary_growth.append(len(set(all_tokens)))

    # Summarize Token Statistics
    summarize_token_statistics(all_tokens, token_lengths)

    # Token Frequency Analysis
    token_frequency_analysis(all_tokens)

    # Visualize Aggregated Data
    visualize_vocabulary_growth(vocabulary_growth)
    visualize_token_length_distribution(token_lengths)
    visualize_token_frequency_heatmap(all_tokens)

    return dolly_grazing

# Function to summarize token statistics
def summarize_token_statistics(all_tokens, token_lengths):
    avg_length = sum(token_lengths) / len(token_lengths) if len(token_lengths) > 0 else 0
    print(f"Total number of tokens: {len(all_tokens)}")
    print(f"Vocabulary size: {len(set(all_tokens))}")
    print(f"Average token length: {avg_length}")
    print(f"Minimum token length: {min(token_lengths)}")
    print(f"Maximum token length: {max(token_lengths)}")

# Function for Token Frequency Analysis
def token_frequency_analysis(all_tokens):
    token_counts = Counter(all_tokens)
    most_common_tokens = token_counts.most_common(20)
    print("\nMost Common Tokens:")
    for token, count in most_common_tokens:
        print(f"{token}: {count}")

# Function to visualize vocabulary growth
def visualize_vocabulary_growth(vocabulary_growth):
    plt.figure(figsize=(10, 6))
    plt.plot(vocabulary_growth)
    plt.xlabel('Batch Number')
    plt.ylabel('Vocabulary Size')
    plt.title('Vocabulary Growth Over Time')
    plt.show()

# Function to visualize token length distribution
def visualize_token_length_distribution(token_lengths):
    plt.figure(figsize=(10, 6))
    plt.hist(token_lengths, bins=20, color='skyblue', edgecolor='black')
    plt.xlabel('Token Length')
    plt.ylabel('Frequency')
    plt.title('Token Length Distribution')
    plt.show()

# Function to visualize token frequency heatmap
def visualize_token_frequency_heatmap(all_tokens):
    token_counts = Counter(all_tokens)
    top_tokens = [token for token, _ in token_counts.most_common(20)]
    top_token_counts = [token_counts[token] for token in top_tokens]

    plt.figure(figsize=(10, 6))
    sns.heatmap([top_token_counts], annot=True, cmap='YlGnBu', xticklabels=top_tokens)
   


In [7]:
farm_trips = 50000
bp_text_values = find_your_flow(bp_text_values_df, farm_trips)

Processing batch 1/191: 100%|██████████| 50000/50000 [00:01<00:00, 32801.32it/s]
Processing batch 2/191: 100%|██████████| 50000/50000 [00:01<00:00, 32199.59it/s]
Processing batch 3/191: 100%|██████████| 50000/50000 [00:01<00:00, 32956.93it/s]
Processing batch 4/191: 100%|██████████| 50000/50000 [00:01<00:00, 31053.80it/s]
Processing batch 5/191: 100%|██████████| 50000/50000 [00:01<00:00, 33235.40it/s]
Processing batch 6/191: 100%|██████████| 50000/50000 [00:01<00:00, 32753.55it/s]
Processing batch 7/191: 100%|██████████| 50000/50000 [00:01<00:00, 33094.96it/s]
Processing batch 8/191: 100%|██████████| 50000/50000 [00:01<00:00, 32097.25it/s]
Processing batch 9/191: 100%|██████████| 50000/50000 [00:01<00:00, 31640.97it/s]
Processing batch 10/191: 100%|██████████| 50000/50000 [00:01<00:00, 31172.07it/s]
Processing batch 11/191: 100%|██████████| 50000/50000 [00:01<00:00, 31462.69it/s]
Processing batch 12/191: 100%|██████████| 50000/50000 [00:01<00:00, 31430.33it/s]
Processing batch 13/191: 

In [10]:
bp_text_tokens = dolly_tunes(bp_text_values, farm_trips)

Tokenizing batches 1-20:  15%|█▌        | 3/20 [01:17<07:18, 25.81s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '178', '056', '49', ',', 'ĠTitle', ':', 'ĠBang', 's', 'ĠIn', 'ĠThe', 'ĠHead', ',', 'ĠArtist', ':', 'ĠVal', 'eri', 'Ã¸', 'ĠInn', 'Ã¸r', 'ta', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '647', '817', ',', 'ĠGenre', ':', 'ĠHard', 'ĠTechn', 'o', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '2', ',', 'ĠLabel', ':', 'ĠCar', 'bone', 'ĠRecords', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '591', '59', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '202', '3', '-', '06', '-', '30', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/b', 'angs', '-in', '-the', '-head', '/', '178', '056', '49', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '5', ':', '20', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '80', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '3', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '527', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '686', ',', 'ĠEnergy', ':', '

Tokenizing batches 1-20:  25%|██▌       | 5/20 [02:11<06:37, 26.47s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '153', '921', '98', ',', 'ĠTitle', ':', 'ĠMy', 'ĠLittle', 'ĠFantasy', ',', 'ĠArtist', ':', 'ĠMad', 'agas', 'ca', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '744', '949', ',', 'ĠGenre', ':', 'ĠHard', 'ĠDance', 'Ġ/', 'ĠHardcore', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '8', ',', 'ĠLabel', ':', 'ĠChe', 'ek', 'y', 'ĠTracks', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '182', '33', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '202', '1', '-', '07', '-', '16', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/my', '-l', 'ittle', '-f', 'antasy', '/', '153', '921', '98', ',', 'ĠMix', ':', 'ĠRemix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '5', ':', '39', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '150', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '19', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '522', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '602', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '991', 

Tokenizing batches 1-20:  30%|███       | 6/20 [02:38<06:12, 26.64s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '162', '376', '90', ',', 'ĠTitle', ':', 'ĠFree', 'fall', 'ing', ',', 'ĠArtist', ':', 'ĠTechn', 'ikal', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '823', '2', ',', 'ĠGenre', ':', 'ĠHard', 'ĠDance', 'Ġ/', 'ĠHardcore', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '8', ',', 'ĠLabel', ':', 'ĠT', 'idy', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '101', '417', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '9', '-', '10', '-', '16', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/free', 'fall', 'ing', '/', '162', '376', '90', ',', 'ĠMix', ':', 'ĠRob', 'ĠT', 'isser', 'a', "'s", 'ĠEpic', 'ĠRe', '-R', 'ub', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '7', ':', '11', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '150', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '10', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '199', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '572', ',', 'ĠEnergy', ':', 'Ġ', '0', '.

Tokenizing batches 1-20:  40%|████      | 8/20 [03:33<05:25, 27.10s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '153', '785', '03', ',', 'ĠTitle', ':', 'ĠTrust', 'ĠIssue', ',', 'ĠArtist', ':', 'ĠDub', 'iosity', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '272', '568', ',', 'ĠGenre', ':', 'ĠTechn', 'o', 'Ġ(', 'Peak', 'ĠTime', 'Ġ/', 'ĠDriving', '),', 'ĠGenre', 'ĠID', ':', 'Ġ', '6', ',', 'ĠLabel', ':', 'ĠLux', 'ĠRec', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '945', '86', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '8', '-', '05', '-', '23', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/tr', 'ust', '-issue', '/', '153', '785', '03', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '6', ':', '36', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '127', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '22', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '044', '4', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '607', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '836', ',', 'Ġ

Tokenizing batches 1-20:  75%|███████▌  | 15/20 [06:55<02:25, 29.01s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '110', '535', '16', ',', 'ĠTitle', ':', 'ĠImportant', 'ĠRole', ',', 'ĠArtist', ':', 'ĠMaster', 'ĠMaster', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '104', '926', ',', 'ĠGenre', ':', 'ĠTechn', 'o', 'Ġ(', 'Peak', 'ĠTime', 'Ġ/', 'ĠDriving', '),', 'ĠGenre', 'ĠID', ':', 'Ġ', '6', ',', 'ĠLabel', ':', 'ĠBerlin', 'ĠAfter', 'ĠDark', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '707', '86', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '8', '-', '10', '-', '21', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/', 'important', '-role', '/', '110', '535', '16', ',', 'ĠMix', ':', 'ĠRemix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '7', ':', '07', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '128', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '19', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '349', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '71', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '534', 

Tokenizing batches 1-20:  80%|████████  | 16/20 [07:25<01:57, 29.39s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '587', '258', '0', ',', 'ĠTitle', ':', 'ĠEarth', 'ĠStation', ',', 'ĠArtist', ':', 'ĠSyn', 'ac', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '200', '963', ',', 'ĠGenre', ':', 'ĠTechn', 'o', 'Ġ(', 'Peak', 'ĠTime', 'Ġ/', 'ĠDriving', '),', 'ĠGenre', 'ĠID', ':', 'Ġ', '6', ',', 'ĠLabel', ':', 'ĠD', '-L', 'ab', 'ĠRecords', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '166', '19', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '4', '-', '10', '-', '01', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/', 'earth', '-st', 'ation', '/', '587', '258', '0', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '7', ':', '28', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '128', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '20', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '202', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '826', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '34

Tokenizing batches 1-20: 100%|██████████| 20/20 [09:26<00:00, 28.31s/it]
Tokenizing batches 21-40:   0%|          | 0/20 [00:00<?, ?it/s]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '112', '692', '02', ',', 'ĠTitle', ':', 'Ġ', '47', 'ĠRon', 'in', ',', 'ĠArtist', ':', 'ĠVital', 'iy', 'ĠBlack', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '344', '998', ',', 'ĠGenre', ':', 'ĠTechn', 'o', 'Ġ(', 'Peak', 'ĠTime', 'Ġ/', 'ĠDriving', '),', 'ĠGenre', 'ĠID', ':', 'Ġ', '6', ',', 'ĠLabel', ':', 'ĠBlack', 'ĠDrop', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '574', '45', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '8', '-', '11', '-', '19', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/', '47', '-', 'ron', 'in', '/', '112', '692', '02', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '5', ':', '15', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '65', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '13', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '057', '7', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '733', ',', 'ĠEnergy', ':', 'Ġ', '0',

Tokenizing batches 21-40:  15%|█▌        | 3/20 [01:33<08:50, 31.21s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '165', '972', '7', ',', 'ĠTitle', ':', 'ĠBlack', 'ĠMoon', 'ĠPart', 'Ġ', '2', ',', 'ĠArtist', ':', 'ĠSv', 'art', '1', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '172', '609', ',', 'ĠGenre', ':', 'ĠTechn', 'o', 'Ġ(', 'Peak', 'ĠTime', 'Ġ/', 'ĠDriving', '),', 'ĠGenre', 'ĠID', ':', 'Ġ', '6', ',', 'ĠLabel', ':', 'ĠMono', 'ĠRecords', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '137', '71', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '1', '-', '01', '-', '28', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/black', '-m', 'oon', '-part', '-', '2', '/', '165', '972', '7', ',', 'ĠMix', ':', 'ĠRemix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '6', ':', '43', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '126', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '17', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '033', '9', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '616', ',', 'ĠEnergy', ':', 

Tokenizing batches 21-40:  25%|██▌       | 5/20 [02:39<08:01, 32.07s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '100', '137', '13', ',', 'ĠTitle', ':', 'ĠAd', 'renal', 'ine', ',', 'ĠArtist', ':', 'ĠAlias', 'ĠUK', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '647', '119', ',', 'ĠGenre', ':', 'ĠTechn', 'o', 'Ġ(', 'Peak', 'ĠTime', 'Ġ/', 'ĠDriving', '),', 'ĠGenre', 'ĠID', ':', 'Ġ', '6', ',', 'ĠLabel', ':', 'ĠA', 'ILA', 'ĠRECORD', 'S', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '513', '02', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '7', '-', '12', '-', '29', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/ad', 'renal', 'ine', '/', '100', '137', '13', ',', 'ĠMix', ':', 'ĠRemix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '5', ':', '37', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '125', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '7', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '044', '3', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '713', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '458', ',', 

Tokenizing batches 21-40:  35%|███▌      | 7/20 [03:43<06:55, 31.93s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '275', '807', '2', ',', 'ĠTitle', ':', 'ĠRelease', 'ĠThe', 'ĠPressure', ',', 'ĠArtist', ':', 'ĠTs', 'T', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '486', '03', ',', 'ĠGenre', ':', 'ĠTechn', 'o', 'Ġ(', 'Peak', 'ĠTime', 'Ġ/', 'ĠDriving', '),', 'ĠGenre', 'ĠID', ':', 'Ġ', '6', ',', 'ĠLabel', ':', 'Ġ', '24', '/', '7', 'ĠHardcore', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '220', '96', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '1', '-', '10', '-', '12', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/release', '-the', '-pressure', '/', '275', '807', '2', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '5', ':', '29', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '86', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '34', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '228', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '399', ',', 'ĠEnergy', ':', 'Ġ',

Tokenizing batches 21-40:  40%|████      | 8/20 [04:14<06:20, 31.72s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '106', '341', '53', ',', 'ĠTitle', ':', 'ĠEl', 'ĠVer', 'ano', ',', 'ĠArtist', ':', 'ĠPill', 'ĠColl', 'inz', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '473', '263', ',', 'ĠGenre', ':', 'ĠFunk', 'y', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '81', ',', 'ĠLabel', ':', 'ĠInfluence', 'ĠRecord', 'ings', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '957', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '8', '-', '07', '-', '05', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/el', '-ver', 'ano', '/', '106', '341', '53', ',', 'ĠMix', ':', 'ĠExtended', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '4', ':', '42', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '125', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '6', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '781', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '772', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '964', ',', 'ĠSpeech', 'iness', ':

Tokenizing batches 21-40:  45%|████▌     | 9/20 [04:47<05:51, 31.97s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '921', '101', '9', ',', 'ĠTitle', ':', 'ĠFl', 'ute', ',', 'ĠArtist', ':', 'ĠMarc', 'io', 'ĠPer', 'on', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '472', '408', ',', 'ĠGenre', ':', 'ĠFunk', 'y', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '81', ',', 'ĠLabel', ':', 'Ġ', '1', 'T', 'rib', 'al', 'ĠRecords', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '409', '27', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '7', '-', '05', '-', '09', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/fl', 'ute', '/', '921', '101', '9', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '5', ':', '39', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '130', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '21', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '654', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '73', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '984', ',', 'ĠSpeech', 'iness', ':'

Tokenizing batches 21-40:  80%|████████  | 16/20 [08:39<02:12, 33.25s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '105', '310', '38', ',', 'ĠTitle', ':', 'ĠWhat', 'ĠYou', 'ĠDo', ',', 'ĠArtist', ':', 'ĠFreak', 'a', 'Tr', 'on', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '676', '351', ',', 'ĠGenre', ':', 'ĠDance', 'Ġ/', 'ĠElectro', 'ĠPop', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '39', ',', 'ĠLabel', ':', 'ĠLAND', 'R', ',', 'ĠSelf', '-', 'Released', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '676', '26', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '8', '-', '04', '-', '23', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/', 'what', '-you', '-do', '/', '105', '310', '38', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '4', ':', '15', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '70', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '22', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '273', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '421', ',', 'ĠEnergy', ':', 'Ġ', '0',

Tokenizing batches 21-40:  95%|█████████▌| 19/20 [10:24<00:34, 34.43s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '104', '329', '84', ',', 'ĠTitle', ':', 'ĠDem', 'ons', ',', 'ĠArtist', ':', 'ĠM', 'anna', '-C', 'roup', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '172', '194', ',', 'ĠGenre', ':', 'ĠTech', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '11', ',', 'ĠLabel', ':', 'ĠMusic', 'ĠIs', 'ĠMy', 'ĠReligion', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '659', '53', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '8', '-', '04', '-', '30', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/d', 'emons', '/', '104', '329', '84', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '8', ':', '33', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '120', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '33', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '576', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '735', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '687', ',', 'ĠSpeech', 'iness', ':', 'Ġ

Tokenizing batches 21-40: 100%|██████████| 20/20 [10:57<00:00, 32.87s/it]
Tokenizing batches 41-60:   5%|▌         | 1/20 [00:32<10:11, 32.16s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '312', '896', '3', ',', 'ĠTitle', ':', 'ĠWell', 'ĠWater', ',', 'ĠArtist', ':', 'ĠSt', 'as', 'ĠMiller', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '211', '869', ',', 'ĠGenre', ':', 'ĠTech', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '11', ',', 'ĠLabel', ':', 'Ġ', '9', 'ĠS', 'ides', 'ĠRecords', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '175', '33', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '1', '-', '11', '-', '19', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/w', 'ell', '-water', '/', '312', '896', '3', ',', 'ĠMix', ':', 'ĠRemix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '5', ':', '37', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '128', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '12', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '083', '4', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '844', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '649', ',', 'ĠSpeech', 'iness', ':', 'Ġ'

Tokenizing batches 41-60:  35%|███▌      | 7/20 [04:08<07:53, 36.41s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '114', '865', '13', ',', 'ĠTitle', ':', 'ĠBL', 'K', 'SW', 'N', 'Ġfeat', '.', 'ĠAlex', 'ĠDon', 'ati', ',', 'ĠArtist', ':', 'ĠJeff', 'ĠEvel', 'ine', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '123', '699', ',', 'ĠGenre', ':', 'ĠTech', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '11', ',', 'ĠLabel', ':', 'ĠMad', 'zone', 'generation', 'ĠRecords', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '228', '33', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '9', '-', '01', '-', '14', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/bl', 'ks', 'wn', '-fe', 'at', '-', 'alex', '-d', 'on', 'ati', '/', '114', '865', '13', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '7', ':', '00', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '120', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '8', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '314', ',', 'ĠDance', 'ability', ':', '

Tokenizing batches 41-60:  60%|██████    | 12/20 [07:09<04:51, 36.41s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '983', '273', '8', ',', 'ĠTitle', ':', 'ĠLogical', 'ĠMovement', ',', 'ĠArtist', ':', 'ĠMiss', 'ĠVogue', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '408', '631', ',', 'ĠGenre', ':', 'ĠTech', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '11', ',', 'ĠLabel', ':', 'ĠExt', 'acy', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '345', '28', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '7', '-', '10', '-', '27', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/log', 'ical', '-m', 'ovement', '/', '983', '273', '8', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '5', ':', '26', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '128', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '7', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '534', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '807', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '695', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0'

Tokenizing batches 41-60:  70%|███████   | 14/20 [08:28<03:47, 37.97s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '138', '532', '64', ',', 'ĠTitle', ':', 'ĠSmoke', 'ĠUp', '!,', 'ĠArtist', ':', 'ĠY', 'ell', 'ine', 'ck', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '250', '897', ',', 'ĠGenre', ':', 'ĠMinimal', 'Ġ/', 'ĠDeep', 'ĠTech', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '14', ',', 'ĠLabel', ':', 'ĠSphere', 'ĠRecords', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '287', '40', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '202', '0', '-', '07', '-', '11', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/sm', 'oke', '-up', '/', '138', '532', '64', ',', 'ĠMix', ':', 'ĠRemix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '6', ':', '50', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '128', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '7', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '386', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '799', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '858', ',', 'ĠSpeech', 'iness', ':', 'Ġ',

Tokenizing batches 41-60:  80%|████████  | 16/20 [09:47<02:34, 38.56s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '101', '180', '85', ',', 'ĠTitle', ':', 'ĠSeven', 'ĠSteps', ',', 'ĠArtist', ':', 'ĠRico', 'ĠMartinez', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '104', '454', ',', 'ĠGenre', ':', 'ĠMinimal', 'Ġ/', 'ĠDeep', 'ĠTech', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '14', ',', 'ĠLabel', ':', 'ĠDat', 'ag', 'ro', 'ove', 'ĠMusic', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '156', '41', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '8', '-', '01', '-', '18', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/se', 'ven', '-st', 'eps', '/', '101', '180', '85', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '5', ':', '48', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '122', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '12', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '48', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '814', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '484', ',',

Tokenizing batches 41-60:  85%|████████▌ | 17/20 [10:28<01:58, 39.50s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '514', '572', '7', ',', 'ĠTitle', ':', 'ĠDark', 'ĠV', 'isions', ',', 'ĠArtist', ':', 'ĠF', '-L', 'AME', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '256', '506', ',', 'ĠGenre', ':', 'ĠMinimal', 'Ġ/', 'ĠDeep', 'ĠTech', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '14', ',', 'ĠLabel', ':', 'ĠChocolate', 'ĠDealer', 'ĠRecords', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '245', '68', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '4', '-', '02', '-', '17', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/d', 'ark', '-', 'visions', '/', '514', '572', '7', ',', 'ĠMix', ':', 'ĠRemix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '6', ':', '01', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '128', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '21', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '09', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '795', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '755', ',', 'ĠSpeech'

Tokenizing batches 41-60:  95%|█████████▌| 19/20 [12:22<00:47, 47.39s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '144', '545', '83', ',', 'ĠTitle', ':', 'ĠTake', 'ĠThis', 'ĠOut', ',', 'ĠArtist', ':', 'ĠMAT', 'I', 'ĠRiv', 'aday', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '642', '997', ',', 'ĠGenre', ':', 'ĠMinimal', 'Ġ/', 'ĠDeep', 'ĠTech', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '14', ',', 'ĠLabel', ':', 'ĠVariety', 'ĠMusic', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '168', '59', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '202', '0', '-', '11', '-', '06', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/t', 'ake', '-this', '-out', '/', '144', '545', '83', ',', 'ĠMix', ':', 'ĠRemix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '6', ':', '32', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '125', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '8', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '768', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '809', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '957', ',', 'ĠSpeech',

Tokenizing batches 41-60: 100%|██████████| 20/20 [13:10<00:00, 39.53s/it]
Tokenizing batches 61-80:   5%|▌         | 1/20 [01:18<24:48, 78.34s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '771', '884', '3', ',', 'ĠTitle', ':', 'ĠSwe', 'ven', ',', 'ĠArtist', ':', 'ĠDimit', 'ri', 'ĠM', 'one', 'v', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '179', '061', ',', 'ĠGenre', ':', 'ĠMinimal', 'Ġ/', 'ĠDeep', 'ĠTech', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '14', ',', 'ĠLabel', ':', 'ĠMetro', 'line', 'ĠLimited', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '472', '3', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '6', '-', '03', '-', '18', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/s', 'we', 'ven', '/', '771', '884', '3', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '9', ':', '45', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '125', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '1', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '6', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '811', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '888', ',', 'ĠSpeech', 'i

Tokenizing batches 61-80:  10%|█         | 2/20 [02:36<23:31, 78.41s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '162', '712', '52', ',', 'ĠTitle', ':', 'ĠRead', 'ym', 'ade', ',', 'ĠArtist', ':', 'ĠInterface', 'ĠPalm', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '711', '791', ',', 'ĠGenre', ':', 'ĠDeep', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '12', ',', 'ĠLabel', ':', 'ĠBroken', 'ĠDistrict', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '703', '79', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '202', '2', '-', '04', '-', '15', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/read', 'ym', 'ade', '/', '162', '712', '52', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '4', ':', '22', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '118', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '6', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '872', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '529', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '502', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0

Tokenizing batches 61-80:  25%|██▌       | 5/20 [05:49<16:21, 65.46s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '332', '863', '3', ',', 'ĠTitle', ':', 'ĠB', 'ree', 'zin', ',', 'ĠArtist', ':', 'ĠVincent', 'ĠKw', 'ok', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '593', '8', ',', 'ĠGenre', ':', 'ĠDeep', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '12', ',', 'ĠLabel', ':', 'ĠK', 'apa', 'ĠMusic', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '102', '71', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '2', '-', '04', '-', '17', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/b', 'ree', 'zin', '/', '332', '863', '3', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '6', ':', '01', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '125', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '9', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '965', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '809', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '802', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0', '.',

Tokenizing batches 61-80:  50%|█████     | 10/20 [12:13<12:42, 76.27s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '167', '794', '46', ',', 'ĠTitle', ':', 'ĠOrg', 'asm', ',', 'ĠArtist', ':', 'ĠQ', '-G', 'reen', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '657', '157', ',', 'ĠGenre', ':', 'ĠDeep', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '12', ',', 'ĠLabel', ':', 'ĠHouse', 'ĠFreedom', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '600', '76', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '202', '2', '-', '08', '-', '31', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/org', 'asm', '/', '167', '794', '46', ',', 'ĠMix', ':', 'ĠDub', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '7', ':', '28', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '120', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '6', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '691', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '762', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '683', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0', '.', '055', '7', ',',

Tokenizing batches 61-80:  85%|████████▌ | 17/20 [21:19<04:02, 80.76s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '433', '668', '2', ',', 'ĠTitle', ':', 'ĠBlack', 'ĠGirl', ',', 'ĠArtist', ':', 'ĠLady', 'ĠBlack', 'tron', 'ika', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '182', '172', ',', 'ĠGenre', ':', 'ĠDeep', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '12', ',', 'ĠLabel', ':', 'ĠSound', 'ĠBlack', 'ĠRecord', 'ings', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '321', '87', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '3', '-', '04', '-', '22', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/black', '-girl', '/', '433', '668', '2', ',', 'ĠMix', ':', 'ĠSl', 'omo', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '7', ':', '57', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '101', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '9', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '315', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '644', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '922', ',', 'ĠSpeech', 'i

Tokenizing batches 61-80: 100%|██████████| 20/20 [25:34<00:00, 76.72s/it]
Tokenizing batches 81-100:   0%|          | 0/20 [00:00<?, ?it/s]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '858', '527', '5', ',', 'ĠTitle', ':', 'ĠGet', 'ĠTw', 'isted', ',', 'ĠArtist', ':', 'ĠBaba', 'Ġ(', 'Italy', '),', 'ĠArtist', 'ĠID', ':', 'Ġ', '239', '369', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'ĠKeep', 'ĠCal', 'm', 'ĠRecords', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '519', '32', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '6', '-', '11', '-', '21', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/get', '-tw', 'isted', '/', '858', '527', '5', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '7', ':', '00', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '125', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '34', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '608', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '81', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '65', ',', 'ĠSpeech', 'iness', ':', 'Ġ',

Tokenizing batches 81-100:  10%|█         | 2/20 [02:51<26:39, 88.86s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '617', '403', '6', ',', 'ĠTitle', ':', 'ĠI', 'ĠLove', 'ĠYou', ',', 'ĠArtist', ':', 'ĠAng', 'y', 'ĠBaxter', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '454', '654', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'ĠNEWS', 'ĠPROM', 'OTION', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '452', '79', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '0', '-', '05', '-', '16', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/i', '-lo', 've', '-you', '/', '617', '403', '6', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '3', ':', '45', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '125', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '19', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '251', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '805', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '885', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0'

Tokenizing batches 81-100:  25%|██▌       | 5/20 [07:21<22:32, 90.17s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '134', '615', '44', ',', 'ĠTitle', ':', 'ĠX', 'ĠWings', ',', 'ĠArtist', ':', 'ĠG', 'REG', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '102', '56', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'ĠKal', 'amb', 'ur', 'ĠPublishing', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '313', '11', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '202', '0', '-', '06', '-', '24', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/x', '-w', 'ings', '/', '134', '615', '44', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '2', ':', '15', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '128', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '6', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '972', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '807', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '968', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0', '.', '101', ',

Tokenizing batches 81-100:  35%|███▌      | 7/20 [10:51<21:21, 98.56s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '592', '498', '1', ',', 'ĠTitle', ':', 'ĠWhat', 'ĠU', 'ĠSay', ',', 'ĠArtist', ':', 'ĠCarlo', 'ĠCal', 'dar', 'eri', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '147', '415', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'ĠSim', 'ma', 'ĠBlack', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '318', '07', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '4', '-', '11', '-', '03', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/', 'what', '-u', '-s', 'ay', '/', '592', '498', '1', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '6', ':', '26', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '125', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '6', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '804', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '803', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '974', ',', 'ĠSpeech', 'iness', ':', 

Tokenizing batches 81-100:  45%|████▌     | 9/20 [13:21<15:47, 86.11s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '164', '601', '81', ',', 'ĠTitle', ':', 'ĠFinal', 'ĠCall', ',', 'ĠArtist', ':', 'ĠNi', 'els', 'ĠKirk', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '105', '203', '6', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'ĠStereo', 'ĠRoyal', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '878', '61', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '202', '2', '-', '05', '-', '20', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/f', 'inal', '-call', '/', '164', '601', '81', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '3', ':', '19', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '100', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '5', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '041', '2', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '632', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '35', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0', '.'

Tokenizing batches 81-100:  50%|█████     | 10/20 [15:09<15:29, 92.90s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '132', '940', '48', ',', 'ĠTitle', ':', 'ĠGive', ',', 'ĠArtist', ':', 'ĠSilver', 'filter', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '181', '798', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'ĠDee', 'pl', 'ife', 'ĠRecords', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '114', '6', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '202', '0', '-', '04', '-', '03', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/g', 'ive', '/', '132', '940', '48', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '7', ':', '09', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '123', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '25', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '403', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '811', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '626', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0', '.', '043', '1', ',', 

Tokenizing batches 81-100:  55%|█████▌    | 11/20 [16:59<14:41, 97.99s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '116', '962', '21', ',', 'ĠTitle', ':', 'ĠR', 'aining', ',', 'ĠArtist', ':', 'ĠSant', 'ory', 'u', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '711', '933', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'Ġ', '305', 'ĠV', 'ibe', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '764', '07', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '9', '-', '02', '-', '06', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/r', 'aining', '/', '116', '962', '21', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '4', ':', '50', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '127', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '28', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '734', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '804', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '545', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0', '.', '064', '1', ',

Tokenizing batches 81-100:  60%|██████    | 12/20 [18:16<12:11, 91.46s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '691', '336', '2', ',', 'ĠTitle', ':', 'ĠBaby', 'ĠStop', ',', 'ĠArtist', ':', 'ĠMart', 'a', 'ĠAdam', 'ch', 'uk', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '436', '175', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'ĠKing', 'ĠStreet', 'ĠSounds', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '266', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '5', '-', '08', '-', '24', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/b', 'aby', '-stop', '/', '691', '336', '2', ',', 'ĠMix', ':', 'ĠRemix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '4', ':', '28', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '120', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '5', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '476', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '881', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '322', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0', '.', '044',

Tokenizing batches 81-100:  65%|██████▌   | 13/20 [20:07<11:22, 97.49s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '496', '091', '0', ',', 'ĠTitle', ':', 'ĠMed', 'usa', ',', 'ĠArtist', ':', 'ĠH', 'azz', 'aro', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '284', '551', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'ĠRH', '2', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '159', '31', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '3', '-', '12', '-', '25', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/', 'med', 'usa', '/', '496', '091', '0', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '7', ':', '00', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '126', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '3', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '633', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '735', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '948', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0', '.', '045', '2', ',', 'ĠLoud', 'ne

Tokenizing batches 81-100:  75%|███████▌  | 15/20 [23:54<08:48, 105.76s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '130', '906', '20', ',', 'ĠTitle', ':', 'ĠCl', 'ap', 'Ġto', 'Ġthe', 'ĠBeat', ',', 'ĠArtist', ':', 'ĠJ', 'ules', 'ĠHe', 'ff', 'ner', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '322', '969', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'ĠRH', '2', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '159', '31', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '202', '0', '-', '01', '-', '31', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/cl', 'ap', '-to', '-the', '-be', 'at', '/', '130', '906', '20', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '6', ':', '21', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '121', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '7', '.', '0', ',', 'ĠMode', ':', 'Ġ', '1', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '429', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '88', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '645', ',', 'ĠSpeech',

Tokenizing batches 81-100:  80%|████████  | 16/20 [25:13<06:31, 97.86s/it] 

Tokens: ['Track', 'ĠID', ':', 'Ġ', '593', '184', '4', ',', 'ĠTitle', ':', 'ĠOne', 'ĠDesire', ',', 'ĠArtist', ':', 'ĠT', 'ucc', 'illo', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '553', '72', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'ĠStreet', 'ĠKing', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '126', '44', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '4', '-', '11', '-', '17', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/', 'one', '-des', 'ire', '/', '593', '184', '4', ',', 'ĠMix', ':', 'ĠAli', 'x', 'ĠAlvarez', 'ĠF', '1', 'Ġseries', 'ĠVox', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '7', ':', '42', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '126', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '9', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '7', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '804', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '877', ',', 'ĠSpeech', 'iness

Tokenizing batches 81-100:  85%|████████▌ | 17/20 [26:32<04:36, 92.14s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '119', '689', '01', ',', 'ĠTitle', ':', 'ĠHe', 'ĠMakes', 'ĠMe', 'ĠSay', ',', 'ĠArtist', ':', 'ĠKe', 'isha', 'ĠHall', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '633', '694', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'ĠLW', 'ĠRecord', 'ings', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '783', '9', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '9', '-', '05', '-', '24', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/he', '-m', 'akes', '-me', '-s', 'ay', '/', '119', '689', '01', ',', 'ĠMix', ':', 'ĠDub', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '7', ':', '44', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '124', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '8', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '671', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '804', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '586', ',', 'ĠSpeech', 'iness

Tokenizing batches 81-100: 100%|██████████| 20/20 [31:49<00:00, 95.48s/it] 
Tokenizing batches 101-120:   0%|          | 0/20 [00:00<?, ?it/s]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '164', '509', '5', ',', 'ĠTitle', ':', 'ĠDual', 'ĠBand', ',', 'ĠArtist', ':', 'ĠRico', 'ĠBon', 'etti', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '947', '72', ',', 'ĠGenre', ':', 'ĠHouse', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '5', ',', 'ĠLabel', ':', 'ĠP', 'ino', 'ĠMusic', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '857', '5', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '201', '1', '-', '01', '-', '14', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/d', 'ual', '-band', '/', '164', '509', '5', ',', 'ĠMix', ':', 'ĠOriginal', 'ĠMix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '11', ':', '38', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '151', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '6', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '391', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '806', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '899', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0', '.', '060', '

Tokenizing batches 101-120:  10%|█         | 2/20 [04:00<36:06, 120.35s/it]

Tokens: ['Track', 'ĠID', ':', 'Ġ', '178', '783', '09', ',', 'ĠTitle', ':', 'ĠBack', ',', 'ĠArtist', ':', 'ĠW', 'add', ',', 'ĠArtist', 'ĠID', ':', 'Ġ', '624', '782', ',', 'ĠGenre', ':', 'ĠOrganic', 'ĠHouse', 'Ġ/', 'ĠD', 'ownt', 'empo', ',', 'ĠGenre', 'ĠID', ':', 'Ġ', '93', ',', 'ĠLabel', ':', 'ĠTibet', 'ania', 'ĠRecords', ',', 'ĠLabel', 'ĠID', ':', 'Ġ', '914', '79', ',', 'ĠRelease', 'ĠDate', ':', 'Ġ', '202', '3', '-', '07', '-', '31', ',', 'ĠTrack', 'ĠURL', ':', 'Ġbeat', 'port', '.com', '/', 'track', '/back', '/', '178', '783', '09', ',', 'ĠMix', ':', 'ĠRemix', ',', 'ĠRemix', ':', 'ĠYes', ',', 'ĠRem', 'ixer', ':', 'ĠYes', ',', 'ĠDuration', ':', 'Ġ', '8', ':', '24', 'Ġminutes', ',', 'ĠBPM', ':', 'Ġ', '105', '.', '0', ',', 'ĠKey', 'ĠID', ':', 'Ġ', '9', '.', '0', ',', 'ĠMode', ':', 'Ġ', '0', '.', '0', ',', 'ĠVal', 'ence', ':', 'Ġ', '0', '.', '059', '4', ',', 'ĠDance', 'ability', ':', 'Ġ', '0', '.', '756', ',', 'ĠEnergy', ':', 'Ġ', '0', '.', '487', ',', 'ĠSpeech', 'iness', ':', 'Ġ', '0', '.

Tokenizing batches 101-120:  40%|████      | 8/20 [14:28<20:59, 105.00s/it]

### Cache tokenized data

In [None]:
def pickled_cache(obj, filename_prefix, pickles):
    os.makedirs('cache_parts', exist_ok=True)
    for i in range(0, len(obj), pickles):
        with open(f'cache_parts/{filename_prefix}_part_{i // pickles}.pkl', 'wb') as part_file:
            pickle.dump(obj[i:i + pickles], part_file)

def pickle_jar(filename_prefix):
    parts = []
    i = 0
    while os.path.exists(f'cache_parts/{filename_prefix}_part_{i}.pkl'):
        with open(f'cache_parts/{filename_prefix}_part_{i}.pkl', 'rb') as part_file:
            parts.extend(pickle.load(part_file))
        i += 1
    return parts

In [None]:
pickle_count = 50000
pickled_cache(bp_text_tokens, 'bp_text_tokens_cache')

In [None]:

bp_text_tokens_cached = pickle_jar('bp_text_tokens_cache')

print("Tokenized output: ", bp_text_tokens_cached)

### Token Visualizations

#### Summarize Token Statistics

In [None]:
#Token Length Statistics
from statistics import mean, median

all_token_lengths = [len(token) for tokens_batch in dolly_grazing for token in tokens_batch]

print("Mean Token Length:", mean(all_token_lengths))
print("Median Token Length:", median(all_token_lengths))
print("Minimum Token Length:", min(all_token_lengths))
print("Maximum Token Length:", max(all_token_lengths))

In [None]:
#Vocabulary Size
unique_tokens = set([token for tokens_batch in dolly_grazing for token in tokens_batch])
print("Vocabulary Size:", len(unique_tokens))


#### Token Frequency Analysis

In [None]:
from collections import Counter

# Count token frequencies
all_tokens = [token for tokens_batch in dolly_grazing for token in tokens_batch]
token_counts = Counter(all_tokens)

# Print the top 20 most common tokens
most_common_tokens = token_counts.most_common(20)
for token, count in most_common_tokens:
    print(f"Token: {token}, Frequency: {count}")

#### Visualize Aggregated Data

 Instead of focusing on individual tokens, visualize aggregated statistics. For instance:

Vocabulary Growth Curve: Plot the growth of unique tokens as you tokenize more data to understand how the vocabulary evolves with more input.
Token Length Distribution: Create a histogram or boxplot of token lengths.
Frequency Heatmaps: Use a heatmap to visualize how often specific tokens appear in different parts of the dataset

In [None]:
import matplotlib.pyplot as plt
from collections import defaultdict

# Accumulate token frequencies over batches
token_frequency = defaultdict(int)

for batch in dolly_grazing:
    for tokens in batch['input_ids']:
        for token in tokens:
            token_frequency[token] += 1

# Get the top 20 most common tokens by frequency
sorted_tokens = sorted(token_frequency.items(), key=lambda x: x[1], reverse=True)[:20]

# Plot the token frequencies
tokens, frequencies = zip(*sorted_tokens)
plt.figure(figsize=(10, 6))
plt.bar(tokens, frequencies)
plt.xticks(rotation=45)
plt.xlabel('Token IDs')
plt.ylabel('Frequency')
plt.title('Top 20 Most Common Tokens')
plt.show()

### processing HTML into text tokens 

In [None]:
def dolly_crawl(html_location):
    seeds = document_loaders.HTMLLoader(html_location)
    pellets = seeds.load()
    snack = pellets[0].page_content
    clean_cut = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    chunks = clean_cut.split_text(snack)
    tokenized_chunks = [llama_3211b_tokenizer(chunk, return_tensors="tf", truncation=True, padding=True) for chunk in chunks]

    return tokenized_chunks

radiance = "I need to add an html file here at some point"
tokens = dolly_crawl(radiance)
print(f"Tokenized output: {tokens}")
