In [None]:
%pip install scikit-learn
%pip install matplotlib
%pip install umap-learn

In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import  StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import os
import datetime
import csv
import json
import glob
from sklearn.decomposition import PCA
from umap import UMAP
import numpy as np
from numpy import dot 
from numpy.linalg import norm 

In [None]:
class UnionFind:
    """Disjoint-set forest over the integers ``0 .. n-1``.

    Uses union by rank and path compression, and tracks the number of
    elements in every set.
    """

    def __init__(self, n):
        """Start with ``n`` singleton sets."""
        # A parent of -1 marks an element that is the root of its own set.
        self.par = [-1] * n
        self.rank = [0] * n
        self.siz = [1] * n

    def root(self, x):
        """Return the representative of the set containing ``x``.

        Every element visited on the way up is re-attached directly to the
        representative (path compression).
        """
        # First pass: walk up until the root is reached.
        top = x
        while self.par[top] != -1:
            top = self.par[top]
        # Second pass: point every element on the walked path at the root.
        node = x
        while node != top:
            parent = self.par[node]
            self.par[node] = top
            node = parent
        return top

    def issame(self, x, y):
        """Return True when ``x`` and ``y`` belong to the same set."""
        root_x = self.root(x)
        root_y = self.root(y)
        return root_x == root_y

    def unite(self, x, y):
        """Merge the sets containing ``x`` and ``y``.

        Returns False when both already share a set, True when a merge
        actually happened.
        """
        keep, absorb = self.root(x), self.root(y)
        if keep == absorb:
            return False
        # Always hang the lower-rank tree below the higher-rank one.
        if self.rank[keep] < self.rank[absorb]:
            keep, absorb = absorb, keep
        self.par[absorb] = keep
        if self.rank[keep] == self.rank[absorb]:
            self.rank[keep] += 1
        self.siz[keep] += self.siz[absorb]
        return True

    def size(self, x):
        """Return the number of elements in the set containing ``x``."""
        representative = self.root(x)
        return self.siz[representative]

In [None]:
def cos_sim(A, B):
    """Return the cosine similarity of the vectors ``A`` and ``B``.

    Args:
        A: First vector (any sequence accepted by ``numpy.dot``).
        B: Second vector, same length as ``A``.

    Returns:
        ``dot(A, B) / (norm(A) * norm(B))``. When either vector has zero
        norm the similarity is undefined; 0.0 is returned in that case
        instead of ``nan`` so that results stay comparable and sortable.
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        # Dividing here would yield nan (plus a RuntimeWarning).
        return 0.0
    return dot(A, B) / denominator

def complexity_reduce_tsne(data, dim):
    """Embed ``data`` into ``dim`` dimensions with t-SNE.

    The estimator is seeded with ``random_state=0``; the result of
    ``fit_transform`` is returned unchanged.
    """
    return TSNE(n_components=dim, random_state=0).fit_transform(data)

def complexity_reduce_umap(data, dim):
    """Embed ``data`` into ``dim`` dimensions with UMAP.

    The estimator is seeded with ``random_state=0``; the result of
    ``fit_transform`` is returned unchanged.
    """
    return UMAP(n_components=dim, random_state=0).fit_transform(data)

def complexity_reduce_pca(data, dim):
    """Project ``data`` onto its first ``dim`` principal components.

    The estimator is seeded with ``random_state=0``; the result of
    ``fit_transform`` is returned unchanged.
    """
    return PCA(n_components=dim, random_state=0).fit_transform(data)

def get_reduce_standard_complexity(
    data_path: str = "./data/complexity/complexity_status.json",
    n_components: int = 10,
):
    """Load the complexity records and reduce them with PCA.

    The JSON document is expected to be a list of records, each holding an
    ``"id"`` and a ``"status_by_measure"`` list. Records are sorted by id,
    every ``status_by_measure`` is right-padded with zeros to the longest
    length, the resulting matrix is standardized with ``StandardScaler`` and
    then reduced with ``complexity_reduce_pca``.

    The loaded records are not modified: padding is applied to copies.

    Args:
        data_path: Path (or glob pattern) of the JSON file; the first match
            is used.
        n_components: Number of principal components to keep.

    Returns:
        A tuple ``(reduced, data, ids)`` where ``reduced`` holds one reduced
        row per record, ``data`` is the list of records sorted by id and
        ``ids`` is the list of ids in the same order. Returns ``None`` (after
        printing a message) when the JSON document is ``null``.

    Raises:
        FileNotFoundError: If nothing matches ``data_path``.
    """
    # This directory is not used by this function; creating it is retained
    # so the function keeps its existing side effect.
    save_dir_path = "./data/sims/complexity"
    os.makedirs(save_dir_path, exist_ok=True)

    matches = glob.glob(data_path)
    if not matches:
        raise FileNotFoundError(f"no complexity file matches {data_path!r}")
    with open(matches[0]) as f:
        data = json.load(f)

    if data is None:
        print("complexity data is None")
        return None

    data = sorted(data, key=lambda record: record["id"])
    ids: list[int] = []
    rows: list[list] = []
    for record in data:
        rows.append(record["status_by_measure"])
        ids.append(record["id"])

    # Pad copies so the returned records keep their original lengths.
    max_len = max((len(row) for row in rows), default=0)
    complexity_status = [list(row) + [0] * (max_len - len(row)) for row in rows]

    standard_data = StandardScaler().fit_transform(complexity_status)
    reduce_standard_complexity = complexity_reduce_pca(standard_data, n_components)
    return reduce_standard_complexity, data, ids


def _write_similarity_csv(path, header, rows):
    """Write ``header`` followed by ``rows`` to ``path`` as CSV."""
    # newline="" hands line-ending control to the csv module; without it
    # every row is followed by an empty line on Windows.
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)


def similarity_from_complexity():
    """Compute pairwise complexity similarities and write them as CSV files.

    The similarity of two items is their cosine similarity on the reduced
    complexity vectors, rescaled from [-1, 1] to [0, 1] via ``(cos + 1) / 2``.

    Files written:
        * ``./data/sims/each/similarities_<id>.csv`` - for every item, its
          similarity to every other item (columns source, target, similarity).
        * ``./data/sims/complexity/union_similarity_98.csv`` - an edge list
          (columns source, target, weight) holding the edges accepted by a
          greedy union-find pass over the pairs in descending similarity
          (which links all items together), followed by every further pair
          whose similarity exceeds 0.98. Rows appear in that order.

    Returns:
        None. Returns early, writing nothing, when the loader returns None.
    """
    save_dir_path = "./data/sims/complexity"
    os.makedirs(save_dir_path, exist_ok=True)

    loaded = get_reduce_standard_complexity()
    if loaded is None:
        # The loader returns None when it has no data; nothing to compare.
        return
    reduce_standard_complexity, data, ids = loaded

    os.makedirs("./data/sims/each", exist_ok=True)

    # Compute every unordered pair once; cosine similarity is symmetric, so
    # the value is stored for both (i, j) and (j, i). The diagonal is unused.
    count = len(data)
    pair_similarity = [[None] * count for _ in range(count)]
    similarity_all = []
    for i in range(count):
        source_complexity = reduce_standard_complexity[i]
        for j in range(i + 1, count):
            target_complexity = reduce_standard_complexity[j]
            similarity = (cos_sim(source_complexity, target_complexity) + 1) / 2
            pair_similarity[i][j] = similarity
            pair_similarity[j][i] = similarity
            similarity_all.append((ids[i], ids[j], similarity))

    for i in range(count):
        source_id = ids[i]
        sims = [
            (source_id, ids[j], pair_similarity[i][j])
            for j in range(count)
            if j != i
        ]
        _write_similarity_csv(
            f"./data/sims/each/similarities_{source_id}.csv",
            ["source", "target", "similarity"],
            sims,
        )

    order_similarity_all = sorted(similarity_all, key=lambda edge: edge[2], reverse=True)
    id_to_index = {item_id: index for index, item_id in enumerate(ids)}
    uf = UnionFind(len(ids))
    remaining_components = len(ids)
    # A dict is used as an insertion-ordered set so the output row order is
    # reproducible and duplicate edges are written only once.
    save_items = {}

    # Greedily accept the most similar edge that joins two separate groups,
    # stopping as soon as everything is connected.
    for edge in order_similarity_all:
        if remaining_components == 1:
            break
        source_id, target_id, _ = edge
        if uf.unite(id_to_index[source_id], id_to_index[target_id]):
            save_items[edge] = None
            remaining_components -= 1

    # Additionally keep every pair above the similarity threshold.
    similarity_threshold = 0.98
    for edge in order_similarity_all:
        if edge[2] > similarity_threshold:
            save_items[edge] = None

    save_file_path = os.path.join(save_dir_path, "union_similarity_98.csv")
    _write_similarity_csv(save_file_path, ["source", "target", "weight"], list(save_items))
        
    
    
    
    

In [None]:
# Run the pipeline: writes one CSV per item under ./data/sims/each and the
# combined edge list ./data/sims/complexity/union_similarity_98.csv.
similarity_from_complexity()
# Manual sanity check (identical vectors should give a cosine similarity of ~1):
# print(dot([1, 2, 3], [1, 2, 3]) / (norm([1, 2, 3]) * norm([1, 2, 3])))