In [1]:
import pandas as pd
import numpy as np
import os
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
from sklearn.cluster import KMeans

# Load the input tables. `pd.read_csv` already returns a DataFrame, so no extra
# `pd.DataFrame(...)` wrapper is needed.
# Later cells read the 'word', 'timestamp' and 'duration' columns of `words_data`
# and the 'timestamp' column of `pitch_data`.
pitch_data = pd.read_csv('jazmine-pitch.csv')
words_data = pd.read_csv('jazmine-words.csv')

In [2]:
# Giving repeated words unique labels
def unique_word_labels(data):
    """Relabel repeated words so every separate occurrence has its own name.

    Rows are walked in order. A run of consecutive rows carrying the same word
    (a "clump") shares a single label. The first clump of a word keeps the plain
    word; the n-th clump (n >= 2) of that word is labelled ``"<word>-<n>"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a 'word' column.

    Returns
    -------
    pandas.DataFrame
        A new frame rebuilt from the rows of `data`, with 'word' replaced by the
        unique label. `data` itself is not modified.
    """
    seen = {}          # word -> number of clumps encountered so far
    relabelled = []    # one dict per input row, with the new label
    previous = None    # raw word of the preceding row

    for _, row in data.iterrows():
        current = row['word']

        if current == previous:
            # Still inside the same clump: reuse the label of the previous row.
            label = relabelled[-1]['word']
        else:
            # A new clump starts here; bump this word's clump counter.
            occurrence = seen.get(current, 0) + 1
            seen[current] = occurrence
            label = current if occurrence == 1 else f"{current}-{occurrence}"

        # Copy the row's fields, overriding only the 'word' entry.
        relabelled.append({**row.to_dict(), 'word': label})
        previous = current

    return pd.DataFrame(relabelled)

# Rebind `words_data` to the relabelled frame so that later cells can select one
# occurrence of a word by its unique label. The raw labels are no longer
# reachable through this name afterwards.
words_data = unique_word_labels(words_data)

In [3]:
def get_word_data(word, words_data, pitch_data):
    """Collect the note rows and pitch samples that belong to one word.

    The word's time window runs from its earliest note onset to its latest note
    end (``timestamp + duration``). Both bounds are computed over all of the
    word's rows, so the result does not depend on the row order of `words_data`.

    Parameters
    ----------
    word : str
        Label to look up in ``words_data['word']``.
    words_data : pandas.DataFrame
        Note table with 'word', 'timestamp' and 'duration' columns.
    pitch_data : pandas.DataFrame
        Pitch samples with a 'timestamp' column.

    Returns
    -------
    dict
        ``'word'``: the requested label;
        ``'note_data'``: copy of the word's note rows plus 'adjusted_timestamp';
        ``'pitch_data'``: copy of the pitch rows inside the window (bounds
        inclusive) plus 'adjusted_timestamp'.
        'adjusted_timestamp' is 'timestamp' minus the word's start time.

    Raises
    ------
    IndexError
        If `word` does not occur in `words_data`. The exception type matches
        what the positional lookup used previously raised for this case.
    """
    word_notes = words_data[words_data['word'] == word]
    if len(word_notes) == 0:
        raise IndexError(f"word {word!r} not found in words_data")

    start_time = word_notes['timestamp'].min()
    # Use the latest note end rather than the end of the positionally last row,
    # so unsorted or overlapping notes cannot truncate the window.
    end_time = (word_notes['timestamp'] + word_notes['duration']).max()

    # Adjusting the timestamps relative to the word's start
    adjusted_word_notes = word_notes.copy()
    adjusted_word_notes['adjusted_timestamp'] = adjusted_word_notes['timestamp'] - start_time

    # Filtering and adjusting pitch data
    in_window = (pitch_data['timestamp'] >= start_time) & (pitch_data['timestamp'] <= end_time)
    filtered_pitch_data = pitch_data[in_window].copy()
    filtered_pitch_data['adjusted_timestamp'] = filtered_pitch_data['timestamp'] - start_time

    return {
        'word': word,
        'note_data': adjusted_word_notes,
        'pitch_data': filtered_pitch_data
    }

def chaos_score(pitch_data1, pitch_data2):
    """Return the approximate DTW distance between two sequences.

    Both arguments are handed to `fastdtw` unchanged, with SciPy's `euclidean`
    as the point-wise distance. Only the distance is returned; the warping path
    that `fastdtw` also produces is discarded.

    NOTE(review): callers in this notebook pass whole DataFrames, so presumably
    every column (including the absolute 'timestamp') contributes to the
    distance - confirm that this is intended.
    """
    # fastdtw returns (distance, path); index 0 is the distance.
    return fastdtw(pitch_data1, pitch_data2, dist=euclidean)[0]

In [4]:
# Create distance matrix
# One "shape" (note rows + pitch samples) per unique word label; row/column i of
# the matrix corresponds to unique_words[i].
unique_words = words_data['word'].unique()
shapes = [get_word_data(word, words_data, pitch_data) for word in unique_words]
num_shapes = len(shapes)
filename = "distance_matrix.npy"

# Reuse the cached matrix only when it matches the current number of shapes.
# A stale cache from different input data would otherwise be misaligned with
# `unique_words` in the cells that follow.
distance_matrix = None
if os.path.exists(filename):
    cached_matrix = np.load(filename)
    if cached_matrix.shape == (num_shapes, num_shapes):
        distance_matrix = cached_matrix
        print("Loaded distance matrix from file.")
    else:
        print(
            f"Cached distance matrix has shape {cached_matrix.shape}, "
            f"expected {(num_shapes, num_shapes)}; recomputing."
        )

if distance_matrix is None:
    rows = []
    for i, shape1 in enumerate(shapes):
        # The distance of a shape to itself is 0 by definition, so skip the
        # DTW call on the diagonal. Every off-diagonal pair is computed
        # explicitly; no symmetry of the approximate distance is assumed.
        rows.append([
            0 if i == j else chaos_score(shape1['pitch_data'], shape2['pitch_data'])
            for j, shape2 in enumerate(shapes)
        ])

        # Print progress
        print(f"Processed shape {i+1} of {num_shapes}")

    # Convert to numpy array for easy saving
    distance_matrix = np.array(rows)

    # Save the distance matrix to a file
    np.save(filename, distance_matrix)
    print("Distance matrix computation complete and saved to file.")

Processed shape 1 of 81
Processed shape 2 of 81
Processed shape 3 of 81
Processed shape 4 of 81
Processed shape 5 of 81
Processed shape 6 of 81
Processed shape 7 of 81
Processed shape 8 of 81
Processed shape 9 of 81
Processed shape 10 of 81
Processed shape 11 of 81
Processed shape 12 of 81
Processed shape 13 of 81
Processed shape 14 of 81
Processed shape 15 of 81
Processed shape 16 of 81
Processed shape 17 of 81
Processed shape 18 of 81
Processed shape 19 of 81
Processed shape 20 of 81
Processed shape 21 of 81
Processed shape 22 of 81
Processed shape 23 of 81
Processed shape 24 of 81
Processed shape 25 of 81
Processed shape 26 of 81
Processed shape 27 of 81
Processed shape 28 of 81
Processed shape 29 of 81
Processed shape 30 of 81
Processed shape 31 of 81
Processed shape 32 of 81
Processed shape 33 of 81
Processed shape 34 of 81
Processed shape 35 of 81
Processed shape 36 of 81
Processed shape 37 of 81
Processed shape 38 of 81
Processed shape 39 of 81
Processed shape 40 of 81
Processed

In [None]:
k = 10  # Or any other number
# Fix the seed: k-means starts from random centroids, so without `random_state`
# the cluster assignments change from run to run.
# NOTE(review): each row of `distance_matrix` is used as that word's feature
# vector here (KMeans does not treat the input as precomputed distances) -
# confirm that this is the intended use.
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(distance_matrix)


In [None]:
# Pair every unique word label with the cluster index assigned to it;
# `clusters[i]` belongs to `unique_words[i]`.
clustered_words = pd.DataFrame(dict(word=unique_words, cluster=clusters))