In [None]:
%pip install scikit-learn
%pip install matplotlib
%pip install umap-learn

In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import  StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import os
import datetime
import csv
import json
import glob
from sklearn.decomposition import PCA
from umap import UMAP
import numpy as np
from numpy import dot 
from numpy.linalg import norm 

In [None]:
import pprint

from numpy import source

def cos_sim(A, B):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        A: First vector (any array-like accepted by ``numpy.dot``).
        B: Second vector, same length as ``A``.

    Returns:
        ``dot(A, B) / (norm(A) * norm(B))``. When either vector has zero
        norm the similarity is mathematically undefined; ``0.0`` is returned
        in that case instead of ``nan`` so that callers can safely compare
        and sort the result.
    """
    denominator = norm(A) * norm(B)
    # Guard against 0/0: NaN never satisfies a threshold comparison and
    # makes sorting order unreliable.
    if denominator == 0:
        return 0.0
    return dot(A, B) / denominator

def complexity_reduce_tsne(data, dim):
    """Reduce ``data`` to ``dim`` dimensions with t-SNE (seeded for reproducibility).

    Args:
        data: 2-D array-like passed straight to ``TSNE.fit_transform``.
        dim: Number of output dimensions.

    Returns:
        The embedded data as returned by ``TSNE.fit_transform``.
    """
    # scikit-learn's default Barnes-Hut solver only supports fewer than 4
    # output dimensions and raises ValueError otherwise; fall back to the
    # exact solver for higher-dimensional embeddings.
    method = "barnes_hut" if dim < 4 else "exact"
    tsne = TSNE(n_components=dim, method=method, random_state=0)
    return tsne.fit_transform(data)

def complexity_reduce_umap(data, dim):
    """Project ``data`` onto ``dim`` dimensions using UMAP with a fixed seed.

    Args:
        data: Input handed unchanged to ``UMAP.fit_transform``.
        dim: Number of components of the embedding.

    Returns:
        The result of ``UMAP.fit_transform`` on ``data``.
    """
    reducer = UMAP(random_state=0, n_components=dim)
    return reducer.fit_transform(data)

def complexity_reduce_pca(data, dim):
    """Project ``data`` onto its first ``dim`` principal components.

    Args:
        data: Input handed unchanged to ``PCA.fit_transform``.
        dim: Number of principal components to keep.

    Returns:
        The result of ``PCA.fit_transform`` on ``data``.
    """
    reducer = PCA(random_state=0, n_components=dim)
    return reducer.fit_transform(data)

def similarity_from_complexity(
    data_path="./data/complexity/complexity_status.json",
    save_dir_path="./data/sims/complexity",
    n_components=10,
    threshold=0.95,
):
    """Write pairwise complexity similarities between entries to a CSV file.

    Pipeline: load the JSON records, sort them by ``"id"``, right-pad every
    ``"status_by_measure"`` list with zeros to a common length, standardize,
    reduce with PCA, then compute the cosine similarity of every pair
    ``(i, j)`` with ``j > i``. For each source ``i`` the pairs whose
    similarity exceeds ``threshold`` are kept; if none qualifies, the single
    best pair is kept instead. Every kept pair is written in both directions
    to ``<save_dir_path>/similarity.csv`` under the header
    ``source,target,similarity``.

    Args:
        data_path: Path (or glob pattern; the first match is used) of the
            JSON file. It must contain a list of objects with the keys
            ``"id"`` and ``"status_by_measure"``.
            NOTE(review): ``status_by_measure`` is assumed to be a flat list
            of numbers, since it is padded with scalar zeros -- confirm.
        save_dir_path: Directory for the output CSV; created if missing.
        n_components: Requested PCA dimensionality. Clamped to what the data
            supports (number of entries / padded feature length).
        threshold: Similarity a pair must exceed to be kept unconditionally.

    Raises:
        FileNotFoundError: If nothing matches ``data_path``.
    """
    os.makedirs(save_dir_path, exist_ok=True)

    matches = glob.glob(data_path)
    if not matches:
        raise FileNotFoundError(f"no complexity file matches {data_path!r}")
    with open(matches[0]) as f:
        data = json.load(f)

    if not data:
        print("complexity data is None or empty")
        return

    records = sorted(data, key=lambda record: record["id"])
    ids = [record["id"] for record in records]
    max_len = max(len(record["status_by_measure"]) for record in records)
    # Build fresh padded rows instead of extending the loaded lists in place.
    padded = [
        list(record["status_by_measure"])
        + [0] * (max_len - len(record["status_by_measure"]))
        for record in records
    ]

    rows = []
    # Pairs only exist with at least two entries, and the scaler/PCA need at
    # least one feature column.
    if len(records) >= 2 and max_len > 0:
        standard_data = StandardScaler().fit_transform(padded)
        # PCA cannot produce more components than min(n_samples, n_features).
        dim = min(n_components, len(records), max_len)
        reduced = complexity_reduce_pca(standard_data, dim)

        for i, source_id in enumerate(ids):
            sims = []
            for j in range(i + 1, len(ids)):
                similarity = cos_sim(reduced[i], reduced[j])
                # A zero vector yields an undefined (NaN) cosine; treat it as
                # "no similarity" so filtering and max() stay well-defined.
                if np.isnan(similarity):
                    similarity = 0.0
                sims.append((ids[j], similarity))

            selected = [pair for pair in sims if pair[1] > threshold]
            if not selected and sims:
                # Nothing passed the threshold: keep the best later entry.
                selected = [max(sims, key=lambda pair: pair[1])]

            for target_id, similarity in selected:
                rows.append([source_id, target_id, similarity])
                rows.append([target_id, source_id, similarity])

    save_file_path = os.path.join(save_dir_path, "similarity.csv")
    # newline="" is required by the csv module to avoid blank lines on Windows.
    with open(save_file_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["source", "target", "similarity"])
        writer.writerows(rows)
    

In [None]:
# Run the pipeline: reads the complexity JSON and writes
# ./data/sims/complexity/similarity.csv.
similarity_from_complexity()
# Manual sanity check of the cosine formula (identical vectors should give 1.0):
# print(dot([1, 2, 3], [1, 2, 3]) / (norm([1, 2, 3]) * norm([1, 2, 3])))