In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Number of tracks randomly drawn from the full metadata table for this run.
sample_size = 7_500

In [3]:
# Fixed seed so the sampled subset (and everything derived from it further
# down) is the same on every Restart & Run All.
sample_seed = 42

# Load the cleaned track metadata, indexed by track_id.
df = pd.read_csv("clean_fma_track_md.csv", index_col='track_id')
# NOTE(review): "Unnamed: 0" is presumably a leftover positional index column
# written by an earlier to_csv call -- confirm against the file that produced it.
df = df.drop("Unnamed: 0", axis='columns')
# Cap the request at the number of available rows: DataFrame.sample raises a
# ValueError when n exceeds the population and replace=False.
df = df.sample(n=min(sample_size, len(df)), random_state=sample_seed)


In [4]:
# Split the sampled frame: the 'string_md' column becomes the text corpus for
# the vectorizers, every other column stays in the metadata table.
corpus = df['string_md']
working_df = df.drop(columns=['string_md'])

# Missing metadata values are replaced with 0.
result_df = working_df.fillna(0)

# Show (n_rows, n_columns) of the metadata table.
print(result_df.shape)
# result_df.to_csv("./processed_data/sample_fma_track_md.csv")

(7500, 8)


In [None]:
# TF-IDF features for the corpus, computed with scikit-learn's TfidfVectorizer.
tiv = TfidfVectorizer()
tfidf_vec = tiv.fit_transform(corpus)
# Dense copy of the sparse result: one row per document, one column per term.
tfidf_counts = tfidf_vec.toarray()

n_tfidf_terms = tfidf_counts.shape[1]
print(f"TFIDF_VEC LEN: {n_tfidf_terms}")
# Positional column names: tfidf0, tfidf1, ...
labels = [f"tfidf{col}" for col in range(n_tfidf_terms)]

print("ABOUT TO DO PANDAS WORK")
# The frame gets a default 0..n-1 row index, not the track_id index of working_df.
tfidf_df = pd.DataFrame(tfidf_counts, columns=labels)
print(tfidf_df.head())

# result_df = pd.concat([working_df, tfidf_df], axis='columns')
# The metadata table is rebuilt without the TF-IDF columns.
result_df = working_df.fillna(0)

In [5]:
# Raw term counts for the corpus, computed with scikit-learn's CountVectorizer.
cv = CountVectorizer()
count_vec = cv.fit_transform(corpus)
# Dense copy of the sparse result: one row per document, one column per term.
word_counts = count_vec.toarray()

n_count_terms = word_counts.shape[1]
print(f"COUNT_VEC LEN: {n_count_terms}")
# Positional column names: word0, word1, ...
labels = [f"word{col}" for col in range(n_count_terms)]

print("ABOUT TO DO PANDAS WORK")
# The frame gets a default 0..n-1 row index, not the track_id index of working_df.
wc_df = pd.DataFrame(word_counts, columns=labels)
print(wc_df.head())

# result_df = pd.concat([working_df, wc_df], axis='columns')
# The metadata table is rebuilt without the word-count columns.
result_df = working_df.fillna(0)

COUNT_VEC LEN: 49077
ABOUT TO DO PANDAS WORK
   word0  word1  word2  word3  ...  word49073  word49074  word49075  word49076
0      0      0      0      0  ...          0          0          0          0
1      2      0      0      0  ...          0          0          0          0
2      0      0      0      0  ...          0          0          0          0
3      0      0      0      0  ...          0          0          0          0
4      0      0      0      0  ...          0          0          0          0

[5 rows x 49077 columns]


In [6]:
# The pairwise functions are reached through a module alias so this cell can be
# re-run. The result below is stored under the name `cosine_similarity`, which
# hides the function of the same name imported at the top of the notebook; the
# original `cosine_similarity = cosine_similarity(...)` therefore failed on a
# second execution because the name then referred to an array, not a function.
from sklearn.metrics import pairwise as pairwise_metrics

print(result_df.head())
print("ABOUT TO CALCULATE SIMILARITY")

# Cosine similarity between the word-count rows of every pair of documents.
# `count_vec` is the sparse matrix that `wc_df` was built from, so it carries the
# same counts without the cost of converting a very wide dense DataFrame; the
# returned matrix is still a dense ndarray.
# The result keeps the name `cosine_similarity` so later cells that read the
# matrix under that name continue to work.
cosine_similarity = pairwise_metrics.cosine_similarity(count_vec)

# Euclidean distance between the metadata rows of every pair of tracks.
# NOTE(review): the columns are not scaled. In the displayed head,
# track_bit_rate is on the order of 1e5 while the other visible columns are
# small integers, so bit rate likely dominates this distance -- consider
# standardising the columns first.
euclidean_dist = pairwise_metrics.euclidean_distances(result_df)

print(cosine_similarity[:5])
print(euclidean_dist[:5])

          track_comments  track_disc_number  ...  track_number  track_bit_rate
track_id                                     ...                              
147790                 0                  1  ...             9        128000.0
130240                 0                  1  ...            28        320000.0
114380                 0                  1  ...             2        256000.0
154499                 0                  1  ...             6        320000.0
16429                  0                  1  ...             0        128000.0

[5 rows x 8 columns]
ABOUT TO CALCULATE SIMILARITY
[[1.         0.45726459 0.3737002  ... 0.38622992 0.41137668 0.40045429]
 [0.45726459 1.         0.40862577 ... 0.44455422 0.42937693 0.41702883]
 [0.3737002  0.40862577 1.         ... 0.46178947 0.4045567  0.45382535]
 [0.34324653 0.39617739 0.4125685  ... 0.53300179 0.37262066 0.4       ]
 [0.21873932 0.23918244 0.28394911 ... 0.26901379 0.26995276 0.22941573]]
[[0.00000000e+00 1.92001531e+