# Preparing Data Folder

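Download the archive from https://drive.google.com/uc?export=download&id=1HDyNIUUmOGuEXynoMhL-Qm3ExMWDsQza and save it as `data.zip` in the code folder. The setup cell below reads `root_path`, so run the cell that defines `root_path` before running it.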

In [None]:
%%bash -s "$root_path"

# Stop at the first failing step so that a missing or corrupt data.zip does
# not cascade into confusing "mv: cannot stat" errors further down.
set -e

# $1 is the project root passed in through `%%bash -s`; quote it so a path
# containing spaces still works.
cd "$1"

mkdir -p data
mkdir -p models
mkdir -p plots

# The archive unpacks `features` and `taste-profile-subset` into the current
# directory; both are then moved under data/.
unzip data.zip
mv features data/features
mv taste-profile-subset data/taste-profile-subset
mkdir -p data/taste-profile-subset/clusters

# Collaborative Filtering

In [None]:
%matplotlib inline

import matplotlib as mpl
from matplotlib import pyplot as plt

In [None]:
root_path = "/home/fat-fighter/Documents/projects/machine-learning/cs771-project/code/"

## Finding Optimal Number of Track Clusters (Based on Tracks' MFCC Features)

In [None]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
local_path = root_path + "data/"

n_jobs = -1
max_iter = 500
algorithm = "full"
n_init = 5

In [None]:
tracks_data = pd.read_csv(local_path + "features/tracks-mfcc.csv", sep="\t")

cols = tracks_data.columns.tolist()[1:]
tracks_features = tracks_data[cols]

In [None]:
estimators = [
    (n_clusters, KMeans(n_clusters=n_clusters, random_state=0, n_jobs=n_jobs, max_iter=max_iter, algorithm=algorithm, n_init=n_init))
    for n_clusters in range(5, 16, 1)
]

In [None]:
for n_clusters, estimator in estimators:
    estimator.fit(tracks_features)

In [None]:
with open(local_path + "features/tracks-clustering-kmeans-inertias.csv", "w") as f:
    cluster_inertias = []
    
    for n_clusters, estimator in estimators:
        cluster_inertias.append([n_clusters, estimator.inertia_])
        
    f.write("\n".join([str(n_clusters) + "\t" + str(inertia) for n_clusters, inertia in cluster_inertias]))

### Inertia Plot

In [None]:
with open(local_path + "features/tracks-clustering-kmeans-inertias.csv") as f:
    cluster_inertias = [line.strip(" \t\n\r").split("\t") for line in f.readlines()]
    
cluster_inertias = [[int(cluster), float(inertia)] for cluster, inertia in cluster_inertias]
cluster_inertias = np.array(cluster_inertias)

In [None]:
plt.scatter(cluster_inertias[:, 0], cluster_inertias[:, 1], s=6)

plt.title("Tracks Clustering: Inertia for K-Means")
plt.xlabel("Number of Clusters")
plt.ylabel("Variance")

plt.savefig(root_path + "plots/tracks-clustering-kmeans-inertia.png")
plt.show()

### PCA Plot of Tracks MFCC (for 10 Clusters)

In [None]:
decomposed_tracks_features = PCA(n_components=2).fit(tracks_features).transform(tracks_features)

In [None]:
plt.clf()
plt.scatter(decomposed_tracks_features[:, 0], decomposed_tracks_features[:, 1], alpha=.8, s=0.7)
    
plt.title("Tracks MFCC: PCA Plot")
plt.savefig(root_path + "plots/tracks-mfcc-pca.png")
plt.show()

## Clustering Tracks using GMM

In [None]:
import pandas as pd

from sklearn.externals import joblib
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
local_path = root_path + "data/"

n_clusters = 10
max_iter = 5000
covariance_type = "diag"
n_init = 3

In [None]:
tracks_data = pd.read_csv(local_path + "features/tracks-mfcc.csv", sep="\t")

cols = tracks_data.columns[1:]
tracks_mfcc = tracks_data[cols]

In [None]:
estimator = GaussianMixture(n_components=n_clusters, covariance_type=covariance_type, max_iter=max_iter, random_state=0, n_init=n_init)
estimator.fit(tracks_mfcc)
joblib.dump(estimator, root_path + "models/tracks-clustering-gmm-model.pkl")

In [None]:
# Reload the model from the location it was dumped to (root_path + "models/").
# local_path points at the data folder in this section, so building the path
# from it would look in data//models/, which is never created.
estimator = joblib.load(root_path + "models/tracks-clustering-gmm-model.pkl")

In [None]:
probs = estimator.predict_proba(tracks_mfcc)
cluster_assignments = estimator.predict(tracks_mfcc)

In [None]:
with open(local_path + "features/tracks-cluster-probabilities.csv", "w") as f:
    for i, song_id in enumerate(tracks_data["track_id"]):
        params = [song_id] + list(probs[i]) + [cluster_assignments[i]]

        params = [str(param) for param in params]

        f.write("\t".join(params) + "\n")

### LDA Plot of Tracks MFCC

In [None]:
decomposed_tracks_mfcc = LinearDiscriminantAnalysis(n_components=2).fit(tracks_mfcc, cluster_assignments).transform(tracks_mfcc)

In [None]:
for i in range(n_clusters):
    plt.scatter(decomposed_tracks_mfcc[cluster_assignments == i, 0], decomposed_tracks_mfcc[cluster_assignments == i, 1], alpha=.8, s=0.7)
    
plt.gca().set_xlim([-16, 6])
plt.gca().set_ylim([-5, 5])
plt.title("Tracks MFCC: LDA Plot (After GMM)")
    
plt.savefig(root_path + "plots/tracks-mfcc-gmm-clustering-pca.png", dpi=250)
plt.show()

## Mapping Users to Tracks

In [None]:
local_path = root_path + "data/taste-profile-subset/"

In [None]:
# Map every song id to the list of track ids that follow it on the same line
# of songs-to-tracks.txt (whitespace separated).
songs_to_tracks = dict()
with open(local_path + "songs-to-tracks.txt", "r") as f:
    # Stream the file instead of materialising it with readlines().
    for line in f:
        fields = line.strip(" \t\n\r").split()
        # A line holding only a song id has no track to map to; skip it.
        if len(fields) > 1:
            songs_to_tracks[fields[0]] = fields[1:]

In [None]:
outfile = open(local_path + "user-track-counts-raw.txt", "w")

In [None]:
# Rewrite each (user, song, count) row as one (user, track, count) row per
# track mapped to that song; rows with an unknown song or a wrong number of
# fields are dropped.
with open(local_path + "user-song-counts.txt", "r") as f:
    for record in f:
        fields = record.strip(" \t\n\r").split()
        if len(fields) != 3 or fields[1] not in songs_to_tracks:
            continue

        user_id, song_id, play_count = fields
        for track in songs_to_tracks[song_id]:
            outfile.write("\t".join([user_id, track, play_count]) + "\n")

In [None]:
outfile.close()

## Splitting Users into Training and Evaluation Sets

In [None]:
import random

In [None]:
local_path = root_path + "data/"

In [None]:
%%bash -s "$local_path"

cd $1/taste-profile-subset

cut -f1 user-track-counts-raw.txt | sort | uniq -c > user-counts.txt
cat user-counts.txt | sed 's/^ *\([0-9]*\) /\1\t/g' | awk '($1 > 49)' > t; mv t user-counts.txt

In [None]:
%%bash -s "$local_path"

cd $1/taste-profile-subset/

awk 'BEGIN {
    FS = OFS = "\t"
}
NR == FNR {
    f[$2] = $0
    next
}
$1 in f {
    print $0
}' user-counts.txt user-track-counts-raw.txt > t

rm user-track-counts-raw.txt

In [None]:
%%bash -s "$local_path"

cd $1

awk 'BEGIN {
    FS = OFS = "\t"
}
NR == FNR {
    f[$1] = 1
    next
}
$2 in f {
    print $0
}' features/tracks-cluster-probabilities.csv taste-profile-subset/t > taste-profile-subset/user-track-counts.txt

In [None]:
%%bash -s "$local_path"

cd $1/taste-profile-subset/

# Shuffle the user ids (second column of user-counts.txt).
cut -f2 -d$'\t' user-counts.txt | sort --random-sort > t

size=`cat user-counts.txt | wc -l`
vsize=$(( $size / 10 ))

# The first tenth of the shuffled users becomes the validation set and the
# remainder the training set. `tail -n +K` starts printing AT line K, so the
# training split has to begin at line vsize + 1; starting at vsize would put
# the last validation user into both files.
head -n $vsize t > users-validation.txt
tail -n +$(( $vsize + 1 )) t > users-train.txt

rm t

In [None]:
%%bash -s "$local_path"

cd $1/taste-profile-subset/

awk 'BEGIN {
    FS = OFS = "\t"
}
NR == FNR {
    f[$1] = 1
    next
}
$1 in f {
    print $0
}' users-train.txt user-track-counts.txt > user-track-counts-train.txt

In [None]:
%%bash -s "$local_path"

cd $1/taste-profile-subset/

awk 'BEGIN {
    FS = OFS = "\t"
}
NR == FNR {
    f[$1] = 1
    next
}
$1 in f {
    print $0
}' users-validation.txt user-track-counts.txt > user-track-counts-validation.txt

## Computing User Features (Based on Tracks' Cluster Probabilities)

In [None]:
import numpy as np

In [None]:
local_path = root_path + "data/"

n_clusters = 10

In [None]:
# Load the per-track cluster probabilities, keyed by track id.
# Each row is: track id, one probability per cluster, hard cluster assignment.
tracks_mfcc = dict()
with open(local_path + "features/tracks-cluster-probabilities.csv", "r") as f:
    # Iterate the file directly. Reading one line before the loop and another
    # at the top of each iteration silently discarded the first record, and
    # this file is written without a header row.
    for line in f:
        fields = line.strip(" \t\n\r").split()
        # id + n_clusters probabilities + assignment
        if len(fields) == n_clusters + 2:
            tracks_mfcc[fields[0]] = np.array([float(field) for field in fields[1:-1]])

In [None]:
with open(local_path + "taste-profile-subset/users-train.txt") as f:
    users_train = [user.strip(" \n\r") for user in f.readlines()]
    
with open(local_path + "taste-profile-subset/users-validation.txt") as f:
    users_validation = [user.strip(" \n\r") for user in f.readlines()]

In [None]:
user_features = dict()
user_track_counts = dict()

In [None]:
with open(local_path + "taste-profile-subset/user-track-counts.txt", "r") as f:
    for line in f:
        line = line.strip(" \t\n\r").split()
        if len(line) == 3 and line[1] in tracks_mfcc:
            if line[0] not in user_track_counts:
                user_features[line[0]] = np.zeros(n_clusters)
                user_track_counts[line[0]] = 0
                
            user_features[line[0]] += tracks_mfcc[line[1]]
            user_track_counts[line[0]] += 1

In [None]:
outfile_train = local_path + "features/user-features-train.csv"
outfile_validation = local_path + "features/user-features-validation.csv"

In [None]:
with open(outfile_train, "w") as f:
    for user in users_train:
        f.write("\t".join([user] + [str(field) for field in (user_features[user] / float(user_track_counts[user]))]) + "\n")
        
with open(outfile_validation, "w") as f:
    for user in users_validation:
        f.write("\t".join([user] + [str(field) for field in (user_features[user] / float(user_track_counts[user]))]) + "\n")

## Finding Optimal Number of Users Clusters (Based on Users' Computed Features)

In [None]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
local_path = root_path + "data/"

In [None]:
n_jobs = -1
max_iter = 500
algorithm = "full"
n_init = 5

In [None]:
user_data = pd.read_csv(local_path + "features/user-features-train.csv", sep="\t", header=None)

cols = user_data.columns.tolist()[1:]
user_features = user_data[cols]

In [None]:
estimators = [
    (n_clusters, KMeans(n_clusters=n_clusters, random_state=0, n_jobs=n_jobs, max_iter=max_iter, algorithm=algorithm, n_init=n_init))
    for n_clusters in range(10, 30, 1)
]

In [None]:
for _, estimator in estimators:
    estimator.fit(user_features)

In [None]:
cluster_inertias = [[n_clusters, estimator.inertia_] for n_clusters, estimator in estimators]
with open(local_path + "features/users-clustering-kmeans-inertias.csv", "w") as f:
    f.write("\n".join([str(n_clusters) + "\t" + str(inertia) for n_clusters, inertia in cluster_inertias]))
    
cluster_inertias = np.array(cluster_inertias)

### Inertia Plot

In [None]:
with open(local_path + "features/users-clustering-kmeans-inertias.csv") as f:
    cluster_inertias = [line.strip(" \t\n\r").split("\t") for line in f.readlines()]
    
cluster_inertias = [[int(n_clusters), float(inertia)] for n_clusters, inertia in cluster_inertias]
cluster_inertias = np.array(cluster_inertias)

In [None]:
plt.scatter(cluster_inertias[:, 0].astype(int), cluster_inertias[:, 1], s=6)

plt.title("Users Clustering: Inertia for K-Means")
plt.xlabel("Number of Clusters")
plt.ylabel("Variance")

plt.savefig(root_path + "plots/users-clustering-kmeans-inertia.png", dpi=250)
plt.show()

### PCA Plot of User Features (for 20 Clusters)

In [None]:
decomposed_user_features = PCA(n_components=2).fit(user_features).transform(user_features)

In [None]:
plt.scatter(decomposed_user_features[:, 0], decomposed_user_features[:, 1], alpha=.8, s=0.7)
    
plt.title("User Features: PCA Plot")
    
plt.savefig(root_path + "plots/user-features-pca.png", dpi=250)
plt.show()

## Clustering Users using GMM

In [None]:
import pandas as pd

from sklearn.externals import joblib
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
local_path = root_path + "data/"

n_clusters = 20
max_iter = 5000
covariance_type = "diag"
n_init = 3

In [None]:
user_data = pd.read_csv(local_path + "/features/user-features-train.csv", sep="\t", header=None)

cols = user_data.columns[1:]
user_features = user_data[cols]

In [None]:
estimator = GaussianMixture(n_components=n_clusters, covariance_type=covariance_type, max_iter=max_iter, random_state=0, n_init=n_init)
estimator.fit(user_features)
joblib.dump(estimator, root_path + "/models/users-clustering-gmm-model.pkl")

In [None]:
estimator = joblib.load(root_path + "/models/users-clustering-gmm-model.pkl")

In [None]:
probs = estimator.predict_proba(user_features)
cluster_assignments = estimator.predict(user_features)

In [None]:
for cluster in range(n_clusters):
    with open(local_path + "taste-profile-subset/clusters/user-ids-" + str(cluster + 1) + ".txt", "w") as f:
        f.write("\n".join(user_data[cluster_assignments == cluster][0]))

In [None]:
with open(local_path + "/features/user-cluster-probabilities.csv", "w") as f:
    for i, user_id in enumerate(user_data[user_data.columns[0]]):
        params = [user_id] + list(probs[i]) + [cluster_assignments[i]]

        params = [str(param) for param in params]

        f.write("\t".join(params) + "\n")

### LDA Plot of User Features

In [None]:
decomposed_user_features = LinearDiscriminantAnalysis(n_components=2).fit(user_features, cluster_assignments).transform(user_features)

In [None]:
for i in range(n_clusters):
    plt.scatter(decomposed_user_features[cluster_assignments == i, 0], decomposed_user_features[cluster_assignments == i, 1], alpha=.8, rasterized=True, s=0.7)

plt.gca().set_ylim([-5, 20])
plt.title("User Features: LDA Plot (After GMM)")
    
plt.savefig(root_path + "plots/user-features-gmm-clustering-pca.png", dpi=250)
plt.show()

## Distributing Users by their clusters

In [None]:
local_path = root_path + "data/taste-profile-subset/"

In [None]:
%%bash -s "$local_path"

cd $1/clusters/
for cluster in {1..20}; do
    cat user-ids-$cluster.txt | sed "s/$/\t$cluster/g"
    echo ""
done > user-clusters.txt

In [None]:
n_clusters = 20

In [None]:
cluster_files = [open(local_path + "clusters/user-track-counts-" + str(cluster + 1) + ".txt", "w") for cluster in range(n_clusters)]

In [None]:
# Map each user id to its 1-based cluster number.
user_clusters = dict()
with open(local_path + "clusters/user-clusters.txt") as f:
    for line in f:
        fields = line.strip("\t\n\r").split("\t")
        # The shell loop that builds this file emits `echo ""` after every
        # cluster file, which leaves an empty line whenever a cluster has no
        # users. Such lines carry no (user, cluster) pair, so skip them
        # instead of failing on the missing second field.
        if len(fields) < 2:
            continue
        user_clusters[fields[0]] = int(fields[1])

In [None]:
with open(local_path + "user-track-counts.txt") as f:
    for line in f:
        line = line.strip("\t\n\r").split("\t")
        if line[0] in user_clusters:
            cluster_files[user_clusters[line[0]] - 1].write("\t".join(line) + "\n")

In [None]:
for f in cluster_files:
    f.close()

## Collaborative Filtering On User Clusters

In [None]:
from math import sqrt

import numpy as np
from scipy.sparse import csr_matrix

In [None]:
local_path = root_path + "data/taste-profile-subset/"

n_clusters = 20

In [None]:
user_suggestions_file = open(local_path + "suggestions.csv", "w")

In [None]:
# Map each user id (first field) to the set of track ids on the same line.
user_track_counts = dict()

with open(local_path + "clustered-user-track-counts/cluster-k0.txt") as f:
    for line in f:
        fields = line.strip(" \t\n\r").split("\t")
        # str.split("\t") never returns an empty list (a blank line yields
        # [""]), so test the user id itself to skip blank lines; otherwise an
        # empty-string "user" with no tracks would be registered.
        if fields[0]:
            user_track_counts[fields[0]] = set(fields[1:])

# NOTE: the N x N `similarity` list that used to be allocated here was dropped.
# Its rows were all the same list object ([[0] * N] * N), it was never indexed,
# and the scoring loop rebinds `similarity` to a scalar before reading it.

In [None]:
tracks = set([])
for user in user_track_counts:
    for track in user_track_counts[user]:
        tracks.add(track)
        
tracks = list(tracks)
users = list(user_track_counts)

In [None]:
N, M = (len(users), len(tracks))

In [None]:
# For every user, score each track by the summed similarity of the other users
# who have it, then write the 50 best-scoring tracks as one tab-separated line.
for i, user_i in enumerate(users):
    # Every known track starts with weight zero.
    weights = dict()
    for track in tracks:
        weights[track] = 0
        
    for user_j in user_track_counts:
        if user_i != user_j:
        
            # Similarity = shared tracks / sqrt(|tracks_i| * |tracks_j|).
            similarity = len(user_track_counts[user_i].intersection(user_track_counts[user_j]))
            similarity = similarity / (sqrt(len(user_track_counts[user_i])) * sqrt(len(user_track_counts[user_j])))

            # Only tracks that user_i does not already have receive weight.
            for track in user_track_counts[user_j]:
                if track not in user_track_counts[user_i]:
                    weights[track] += similarity
                    
    # Highest weight first; keep the top 50.
    keys = sorted(list(weights), key=lambda x: -weights[x])[:50]
    user_suggestions_file.write(user_i + "\t" + "\t".join(keys) + "\n")

In [None]:
user_suggestions_file.close()

## Generating Recommendations for Validation Users (User-User Localized Similarity)

In [None]:
import random
from math import sqrt

import numpy as np
from sklearn.externals import joblib

from multiprocessing import Pool

In [None]:
local_path = root_path + "data/"

n_clusters = 20

In [None]:
# Load the validation users' feature vectors, keyed by user id.
user_features = dict()
with open(local_path + "features/user-features-validation.csv") as f:
    for line in f:
        fields = line.strip(" \t\n\r").split()
        if not fields:
            continue
        # Parse the values as floats: the vectors are later handed to the GMM's
        # predict(), which expects numeric input rather than strings.
        user_features[fields[0]] = [float(value) for value in fields[1:]]

In [None]:
users = list(user_features)

In [None]:
# The users GMM is dumped under root_path + "models/"; local_path is the data
# folder in this section, so it must not be used to build the model path.
gmm_clustering_model = joblib.load(root_path + "models/users-clustering-gmm-model.pkl")

In [None]:
clustered_users = dict()
for cluster in range(n_clusters):
    clustered_users[cluster] = []
    
for user in users:
    cluster = gmm_clustering_model.predict([user_features[user]])[0]
    clustered_users[cluster].append(user)

In [None]:
user_tracks = dict()
user_validation_tracks = dict()
for user in users:
    user_tracks[user] = [set([]), 0]
    user_validation_tracks[user] = set([])

with open(local_path + "taste-profile-subset/user-track-counts-validation.txt") as f:
    for line in f:
        line = line.strip(" \n\r").split("\t")
        if random.random() > 0.35:
            user_tracks[line[0]][0].add(line[1])
        else:
            user_validation_tracks[line[0]].add(line[1])

for user in users:
    user_tracks[user][1] = sqrt(len(user_tracks[user][0]))

In [None]:
def get_suggestions_for_cluster(cluster):
    """Write user-user collaborative-filtering suggestions for one cluster.

    Every training user listed in the cluster's ``user-ids`` file votes for the
    tracks it has that the validation user does not.  A vote is weighted by
    ``(shared / (norm_v * norm_t)) ** 6`` where ``shared`` is the number of
    common tracks, ``norm_t`` is ``sqrt`` of the training user's track count
    and ``norm_v`` is the precomputed value stored in ``user_tracks[user][1]``.
    The 500 highest-weighted tracks are written, best first, as one
    tab-separated line per validation user to
    ``taste-profile-subset/suggestions-validation-<cluster>.txt``.

    Reads the module-level ``user_tracks``, ``clustered_users`` and
    ``local_path``.

    cluster -- zero-based cluster index; the cluster input files on disk are
               numbered from 1, hence the ``cluster + 1`` below.
    """
    clusters_dir = local_path + "taste-profile-subset/clusters/"
    cluster_name = str(cluster + 1)

    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print("Starting for cluster %s" % cluster)

    tracks = set()

    # user id -> [set of track ids, sqrt(number of tracks)]
    cluster_user_tracks = dict()
    with open(clusters_dir + "user-ids-" + cluster_name + ".txt") as f:
        for line in f:
            user_id = line.strip(" \n\r")
            # An empty id would later contribute a zero norm; ignore it.
            if user_id:
                cluster_user_tracks[user_id] = [set(), 0]

    with open(clusters_dir + "user-track-counts-" + cluster_name + ".txt") as f:
        for line in f:
            fields = line.strip(" \n\r").split("\t")
            cluster_user_tracks[fields[0]][0].add(fields[1])
            tracks.add(fields[1])

    for entry in cluster_user_tracks.values():
        entry[1] = sqrt(len(entry[0]))

    out_path = local_path + "taste-profile-subset/suggestions-validation-" + str(cluster) + ".txt"

    # `with` guarantees the file is flushed and closed even if a user fails.
    with open(out_path, "w") as outfile:
        # Process every validation user of the cluster.  The previous `[:1]`
        # slice restricted the run to a single user per cluster, which
        # contradicts the every-10-users progress message below.
        for i, user_v in enumerate(clustered_users[cluster]):
            if i % 10 == 0:
                print("\tStarting for user %s" % i)

            heard, heard_norm = user_tracks[user_v]
            track_weights = dict.fromkeys(tracks, 0)

            for other_tracks, other_norm in cluster_user_tracks.values():
                overlap = len(heard.intersection(other_tracks))
                # No shared track means zero similarity and therefore no vote.
                # Skipping here also avoids dividing by a zero norm when either
                # user has an empty track set.
                if overlap == 0:
                    continue

                similarity = pow(overlap / (heard_norm * other_norm), 6)

                for track in other_tracks.difference(heard):
                    track_weights[track] += similarity

            # Ascending sort, reversed, first 500 == the 500 heaviest tracks.
            suggestions = sorted(tracks, key=lambda x: track_weights[x])[::-1][:500]

            outfile.write(user_v + "\t" + "\t".join(suggestions) + "\n")

In [None]:
# Compute the suggestions of all clusters on 4 worker processes.
process_pool = Pool(4)
try:
    process_pool.map(get_suggestions_for_cluster, range(n_clusters))
finally:
    # Release the worker processes once the map has finished (or failed);
    # otherwise they stay alive for the rest of the session.
    process_pool.close()
    process_pool.join()

In [None]:
%%bash -s "$local_path"

cd $1/taste-profile-subset

for cluster in {0..19}; do
    cat suggestions-validation-$cluster.txt
done > suggestions-validation.txt

for cluster in {0..19}; do
    rm suggestions-validation-$cluster.txt
done

In [None]:
with open(local_path + "taste-profile-subset/user-tracks-used-validation.txt", "w") as f:
    for user in user_tracks:
        f.write(user + "\t" + "\t".join(user_tracks[user][0]) + "\n")

## Generating Recommendations for Validation Users (Item-Item Localized Similarity)

In [None]:
import random
from math import sqrt

import numpy as np
from sklearn.externals import joblib

from multiprocessing import Pool

In [None]:
local_path = root_path + "data/"

n_clusters = 20

In [None]:
# Load the validation users' feature vectors, keyed by user id.
user_features = dict()
with open(local_path + "features/user-features-validation.csv") as f:
    for line in f:
        fields = line.strip(" \t\n\r").split()
        if not fields:
            continue
        # Parse the values as floats: the vectors are later handed to the GMM's
        # predict(), which expects numeric input rather than strings.
        user_features[fields[0]] = [float(value) for value in fields[1:]]

In [None]:
users = list(user_features)

In [None]:
gmm_clustering_model = joblib.load(root_path + "models/users-clustering-gmm-model.pkl")

In [None]:
user_tracks = dict()
user_validation_tracks = dict()
for user in users:
    user_tracks[user] = [set([]), 0]
    user_validation_tracks[user] = set([])
    
track_users = dict()

with open(local_path + "taste-profile-subset/user-track-counts-validation.txt") as f:
    for line in f:
        line = line.strip(" \n\r").split("\t")
        if random.random() > 0.35:
            if line[1] not in track_users:
                track_users[line[1]] = [set([]), 0]
                
            track_users[line[1]][0].add(line[0])
            user_tracks[line[0]][0].add(line[1])
        else:
            user_validation_tracks[line[0]].add(line[1])

for user in users:
    user_tracks[user][1] = sqrt(len(user_tracks[user][0]))

for track in track_users:
    track_users[track][1] = sqrt(len(track_users[track][0]))

In [None]:
clustered_users = dict()
clustered_tracks = dict()
for cluster in range(n_clusters):
    clustered_users[cluster] = []
    clustered_tracks[cluster] = set([])
    
for user in users:
    cluster = gmm_clustering_model.predict([user_features[user]])[0]
    
    clustered_users[cluster].append(user)
    clustered_tracks[cluster] = clustered_tracks[cluster].union(user_tracks[user][0])

In [None]:
def get_suggestions_for_cluster(cluster):
    """Write item-item collaborative-filtering suggestions for one cluster.

    Each track found in the cluster's ``user-track-counts`` file is scored for
    a validation user by summing, over the tracks in ``user_tracks[user][0]``,
    ``(shared / (norm_v * norm_t)) ** 3`` where ``shared`` is the number of
    listeners the two tracks have in common, ``norm_t`` is ``sqrt`` of the
    candidate track's listener count and ``norm_v`` is the precomputed value in
    ``track_users[track][1]``.  The 500 best-scoring tracks are written, best
    first, as one tab-separated line per user to
    ``taste-profile-subset/suggestions-validation-<cluster>.txt``.

    Reads the module-level ``track_users``, ``user_tracks``,
    ``clustered_users`` and ``local_path``.

    NOTE(review): ``track_users`` appears to hold validation-user ids while the
    cluster file holds the clustered training users; if those id sets are
    disjoint every overlap is 0 and the ranking degenerates to file order --
    confirm against the data.

    cluster -- zero-based cluster index; the cluster input file on disk is
               numbered from 1, hence the ``cluster + 1`` below.
    """
    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print("Starting for cluster %s" % cluster)

    # track id -> [set of user ids that have the track, sqrt(number of users)]
    cluster_track_users = dict()
    counts_path = local_path + "taste-profile-subset/clusters/user-track-counts-" + str(cluster + 1) + ".txt"
    with open(counts_path) as f:
        for line in f:
            fields = line.strip(" \n\r").split("\t")
            # Skip blank or truncated lines instead of failing on fields[1].
            if len(fields) < 2:
                continue
            if fields[1] not in cluster_track_users:
                cluster_track_users[fields[1]] = [set(), 0]

            cluster_track_users[fields[1]][0].add(fields[0])

    for entry in cluster_track_users.values():
        entry[1] = sqrt(len(entry[0]))

    out_path = local_path + "taste-profile-subset/suggestions-validation-" + str(cluster) + ".txt"

    # `with` guarantees the file is flushed and closed even if a user fails.
    with open(out_path, "w") as outfile:
        for user_v in clustered_users[cluster]:
            # Listener set and norm of every track credited to this user; looked
            # up once per user rather than once per candidate track.
            seeds = [track_users[track_v] for track_v in user_tracks[user_v][0]]

            suggestions = []
            for track_t, (listeners_t, norm_t) in cluster_track_users.items():
                similarity = 0

                for listeners_v, norm_v in seeds:
                    overlap = len(listeners_v.intersection(listeners_t))
                    similarity += pow(overlap / (norm_v * norm_t), 3)

                suggestions.append((track_t, similarity))

            # Stable sort: equally scored tracks keep their file order.
            suggestions.sort(key=lambda x: -x[1])
            suggestions = [suggestion[0] for suggestion in suggestions[:500]]

            outfile.write(user_v + "\t" + "\t".join(suggestions) + "\n")

In [None]:
get_suggestions_for_cluster(0)

In [None]:
# Compute the suggestions of all clusters on 2 worker processes.
process_pool = Pool(2)
try:
    process_pool.map(get_suggestions_for_cluster, range(n_clusters))
finally:
    # Release the worker processes once the map has finished (or failed);
    # otherwise they stay alive for the rest of the session.
    process_pool.close()
    process_pool.join()

In [None]:
%%bash -s "$local_path"

cd $1/taste-profile-subset

# The per-cluster files are written as suggestions-validation-<cluster>.txt
# with the zero-based cluster index (0..19), so iterate over that range.
# Looping over 1..20 skipped cluster 0 and failed on the non-existent file 20.
for cluster in {0..19}; do
    cat suggestions-validation-$cluster.txt
done > suggestions-validation.txt

for cluster in {0..19}; do
    rm suggestions-validation-$cluster.txt
done

In [None]:
with open(local_path + "taste-profile-subset/user-tracks-used-validation.txt", "w") as f:
    for user in user_tracks:
        f.write(user + "\t" + "\t".join(user_tracks[user][0]) + "\n")

## Computing Truncated mAP on the Predicted Recommendations

In [None]:
import numpy as np

In [None]:
local_path = root_path + "data/taste-profile-subset/"

In [None]:
listened_user_tracks = dict()
with open(local_path + "users-validation.txt") as f:
    for line in f:
        line = line.strip(" \n\r")
        listened_user_tracks[line] = set([])
        
with open(local_path + "user-track-counts-validation.txt") as f:
    for line in f:
        line = line.strip(" \n\r").split("\t")
        listened_user_tracks[line[0]].add(line[1])

for user in listened_user_tracks:
    listened_user_tracks[user] = set(listened_user_tracks[user])

In [None]:
with open(local_path + "user-tracks-used-validation.txt") as f:
    for line in f:
        line = line.strip(" \n\r").split("\t")
        listened_user_tracks[line[0]] = listened_user_tracks[line[0]].difference(line[1:])

In [None]:
# Compute the mean, over all suggestion lines, of the average precision of the
# first 500 suggested tracks against listened_user_tracks.
with open(local_path + "suggestions-validation.txt") as f:
    # One average-precision value per non-blank suggestion line.
    aps = list()
    
    for line in f:
        # Blank lines carry no user and are ignored.
        if line.strip() == "":
            continue
            
        line = line.strip(" \t\n\r").split("\t")
        
        # First field is the user id; the remaining fields are the suggested
        # tracks in file order, truncated to 500.
        user = line[0]
        tracks = line[1:]
        tracks = np.array(tracks[:500])
        
        k = 0    # number of suggestions examined so far (rank of the current one)
        l = 0    # number of hits so far
        p = 0.0  # running sum of precision-at-k taken at every hit
        for i, track in enumerate(tracks):
            k += 1
            if track in listened_user_tracks[user]:
                l += 1
                p += float(l) / float(k)
            
        # NOTE(review): the sum is normalised by the number of hits found, not
        # by the number of tracks in listened_user_tracks[user] -- confirm this
        # is the intended definition of truncated average precision.
        if l != 0:
            aps.append(p / l)
        else:
            aps.append(0)
        
    print np.mean(aps)

## LDA Plot of the User Suggestions

In [None]:
import numpy as np

from sklearn.externals import joblib
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
local_path = root_path + "data/"

n_clusters = 10

In [None]:
listened_user_tracks = dict()
with open(local_path + "taste-profile-subset/users-validation.txt") as f:
    for line in f:
        line = line.strip(" \n\r")
        listened_user_tracks[line] = set([])
        
with open(local_path + "taste-profile-subset/user-track-counts-validation.txt") as f:
    for line in f:
        line = line.strip(" \n\r").split("\t")
        listened_user_tracks[line[0]].add(line[1])

for user in listened_user_tracks:
    listened_user_tracks[user] = set(listened_user_tracks[user])

In [None]:
with open(local_path + "taste-profile-subset/user-tracks-used-validation.txt") as f:
    for line in f:
        line = line.strip(" \n\r").split("\t")
        listened_user_tracks[line[0]] = listened_user_tracks[line[0]].difference(line[1:])

In [None]:
# Load each user's suggested tracks, minus that user's entries in
# listened_user_tracks.
user_suggestions = dict()
with open(local_path + "taste-profile-subset/suggestions-validation.txt") as f:
    for line in f:
        fields = line.strip(" \t\n\r").split("\t")
        suggested_user = fields[0]
        # Blank lines carry no user id.
        if not suggested_user:
            continue
        # Subtract the tracks of the user on THIS line. The previous code
        # indexed with `user`, a variable left over from an earlier loop, so
        # every line was filtered against the same unrelated user.
        user_suggestions[suggested_user] = set(fields[1:]).difference(listened_user_tracks[suggested_user])

### Clustering Tracks

In [None]:
# The tracks GMM is dumped under root_path + "models/"; local_path is the data
# folder in this section, so it must not be used to build the model path.
tracks_clustering_model = joblib.load(root_path + "models/tracks-clustering-gmm-model.pkl")

In [None]:
tracks_mfcc = []
with open(local_path + "features/tracks-mfcc.csv") as f:
    f.readline()
    for line in f:
        line = line.strip(" \t\n\r").split()
        tracks_mfcc.append([float(field) for field in line[1:]])

In [None]:
cluster_assignments = tracks_clustering_model.predict(tracks_mfcc)

### Loading User Tracks

In [None]:
user = list(user_suggestions)[0]

In [None]:
user_tracks = listened_user_tracks[user]
user_suggestions = user_suggestions[user]

In [None]:
user_tracks_mfcc = []
user_suggestions_mfcc = []
with open(local_path + "features/tracks-mfcc.csv") as f:
    f.readline()
    for line in f:
        line = line.strip(" \t\n\r").split()
        if line[0] in user_tracks:
            user_tracks_mfcc.append([float(field) for field in line[1:]])
            
        if line[0] in user_suggestions:
            user_suggestions_mfcc.append([float(field) for field in line[1:]])

### LDA Plot of User Tracks and Suggestions

In [None]:
lda_model = LinearDiscriminantAnalysis(n_components=2).fit(tracks_mfcc, cluster_assignments)

In [None]:
decomposed_tracks_mfcc = lda_model.transform(tracks_mfcc)
decomposed_user_tracks_mfcc = lda_model.transform(user_tracks_mfcc)
decomposed_user_suggestions_mfcc = lda_model.transform(user_suggestions_mfcc)

In [None]:
# Background layer: every track in the 2-D LDA space, one scatter call (and
# therefore one colour) per GMM cluster.
for i in range(n_clusters):
    plt.scatter(decomposed_tracks_mfcc[cluster_assignments == i, 0], decomposed_tracks_mfcc[cluster_assignments == i, 1], alpha=.8, rasterized=True, s=0.7)

# Foreground layers: the selected user's tracks (blue) and suggestions (black).
plt.scatter(decomposed_user_tracks_mfcc[:, 0], decomposed_user_tracks_mfcc[:, 1], alpha=1, s=8, c="blue")
plt.scatter(decomposed_user_suggestions_mfcc[:, 0], decomposed_user_suggestions_mfcc[:, 1], alpha=1, s=8, c="black")

plt.gca().set_xlim([-15, 5])
plt.gca().set_ylim([-4, 4.5])
plt.title("Tracks MFCC: LDA Plot (After GMM)")

# Save next to the other figures in root_path + "plots/". local_path is the
# data folder in this section and has no plots/ sub-directory, so saving
# there fails.
plt.savefig(root_path + "plots/tracks-mfcc-lda-exploited-suggestions.png", dpi=250)
plt.show()

# Exploration

## Generating Track Recommendations through Exploration

In [None]:
from math import sqrt

import numpy as np

from sklearn.externals import joblib
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
local_path = root_path + "data/"

n_clusters = 10
n_suggestions = 25

In [None]:
users = set()
tracks = set()
with open(local_path + "taste-profile-subset/user-track-counts.txt") as f:
    for line in f:
        line = line.strip(" \t\n\r").split("\t")
        
        users.add(line[0])
        tracks.add(line[1])

In [None]:
users = list(users)
tracks = list(tracks)

In [None]:
user_indices = dict()
track_indices = dict()

for i, user in enumerate(users):
    user_indices[user] = i

for i, track in enumerate(tracks):
    track_indices[track] = i

In [None]:
user_tracks = dict()
track_features = dict()

for i in range(len(users)):
    user_tracks[i] = set()

for i in range(len(tracks)):
    track_features[i] = [0, -1, -1]

In [None]:
with open(local_path + "taste-profile-subset/user-track-counts.txt") as f:
    for line in f:
        line = line.strip(" \t\n\r").split("\t")
        
        user, track = user_indices[line[0]], track_indices[line[1]]
        user_tracks[user].add(track)
        track_features[track][0] += 1

In [None]:
clustered_tracks = dict()
for cluster in range(n_clusters):
    clustered_tracks[cluster] = []
    
with open(local_path + "features/tracks-cluster-probabilities.csv") as f:
    for track in f:
        track = track.strip(" \t\n\r").split("\t")
        if track[0] in track_indices:
            track[0] = track_indices[track[0]]
            
            clustered_tracks[int(track[-1])].append(track[0])
            track_features[track[0]][1] = int(track[-1])
            track_features[track[0]][2] = float(track[int(track[-1]) + 1])

In [None]:
track_features[list(track_features)[0]]

In [None]:
for cluster in clustered_tracks:
    clustered_tracks[cluster].sort(key=lambda track: -track_features[track][0] * track_features[track][2])

In [None]:
user_tracks_clusters = dict()
for user in user_tracks:
    user_tracks_clusters[user] = []
    for cluster in range(n_clusters):
        user_tracks_clusters[user].append(1)
        
    for track in user_tracks[user]:
        user_tracks_clusters[user][track_features[track][1]] += 1

In [None]:
for user in user_tracks:
    normalization_const = 0
    for cluster in range(n_clusters):
        user_tracks_clusters[user][cluster] = sqrt(len(clustered_tracks[cluster])) / user_tracks_clusters[user][cluster]
        normalization_const += user_tracks_clusters[user][cluster]
    
    for cluster in range(n_clusters):
        user_tracks_clusters[user][cluster] = user_tracks_clusters[user][cluster] / normalization_const

In [None]:
# For every user, sample clusters from the user's exploration distribution and
# take, from each sampled cluster, the best-ranked track the user does not
# already have, until n_suggestions distinct tracks are collected.
user_suggestions = dict()

# A cluster without tracks can never yield a suggestion.
empty_clusters = set(cluster for cluster in range(n_clusters) if not clustered_tracks[cluster])

# `with` guarantees the file is flushed and closed even if a user fails.
with open(local_path + "taste-profile-subset/suggestions-exploration.txt", "w") as outfile:
    for user in user_tracks:
        suggestions = set([])

        # One read cursor per CLUSTER. Sizing this list by n_suggestions only
        # worked while n_suggestions >= n_clusters.
        cluster_indices = [0] * n_clusters

        # Clusters whose ranked list has been fully consumed for this user.
        exhausted = set(empty_clusters)

        while len(suggestions) < n_suggestions and len(exhausted) < n_clusters:
            cluster = int(np.argmax(np.random.multinomial(20, user_tracks_clusters[user], size = 1)))

            ranked = clustered_tracks[cluster]
            position = cluster_indices[cluster]

            # Skip tracks the user already has, without running off the end
            # of the cluster's list.
            while position < len(ranked) and ranked[position] in user_tracks[user]:
                position += 1

            if position >= len(ranked):
                # Nothing left in this cluster for this user; draw again.
                cluster_indices[cluster] = position
                exhausted.add(cluster)
                continue

            suggestions.add(ranked[position])
            cluster_indices[cluster] = position + 1

        user_suggestions[user] = suggestions
        outfile.write(users[user] + "\t" + "\t".join([tracks[track] for track in suggestions]) + "\n")

## Plotting User Suggestions

In [None]:
from sklearn.externals import joblib

In [None]:
local_path = root_path + "data/"

In [None]:
# The tracks GMM is dumped under root_path + "models/"; local_path is the data
# folder in this section, so it must not be used to build the model path.
tracks_clustering_model = joblib.load(root_path + "models/tracks-clustering-gmm-model.pkl")

In [None]:
tracks_mfcc = []
with open(local_path + "features/tracks-mfcc.csv") as f:
    f.readline()
    for line in f:
        line = line.strip(" \t\n\r").split()
        tracks_mfcc.append([float(field) for field in line[1:]])

In [None]:
cluster_assignments = tracks_clustering_model.predict(tracks_mfcc)

### Loading Tracks for First User

In [None]:
# Read the first line of the exploration output: user id, then suggested tracks.
user_suggestions = []
with open(local_path + "taste-profile-subset/suggestions-exploration.txt") as f:
    # Strip the line terminator before splitting; otherwise the last track id
    # keeps its trailing "\n" and never matches an id read from the MFCC file.
    user = f.readline().strip(" \t\n\r").split("\t")
    user_suggestions = user[1:]
    user = user[0]

In [None]:
user_tracks = []
with open(local_path + "taste-profile-subset/user-track-counts.txt") as f:
    for line in f:
        line = line.strip(" \t\n\r").split("\t")
        
        if line[0] == user:
            user_tracks.append(line[1])

In [None]:
user_tracks_mfcc = []
user_suggestions_mfcc = []
with open(local_path + "features/tracks-mfcc.csv") as f:
    f.readline()
    for line in f:
        line = line.strip(" \t\n\r").split()
        if line[0] in user_tracks:
            user_tracks_mfcc.append([float(field) for field in line[1:]])
            
        if line[0] in user_suggestions:
            user_suggestions_mfcc.append([float(field) for field in line[1:]])

### LDA Plot of Tracks

In [None]:
lda_model = LinearDiscriminantAnalysis(n_components=2).fit(tracks_mfcc, cluster_assignments)

In [None]:
decomposed_tracks_mfcc = lda_model.transform(tracks_mfcc)
decomposed_user_tracks_mfcc = lda_model.transform(user_tracks_mfcc)
decomposed_user_suggestions_mfcc = lda_model.transform(user_suggestions_mfcc)

```python
# Background layer: every track in the 2-D LDA space, one scatter call (and
# therefore one colour) per GMM cluster.
for i in range(n_clusters):
    plt.scatter(decomposed_tracks_mfcc[cluster_assignments == i, 0], decomposed_tracks_mfcc[cluster_assignments == i, 1], alpha=.8, rasterized=True, s=0.7)

# Foreground layers: the selected user's tracks (blue) and suggestions (black).
plt.scatter(decomposed_user_tracks_mfcc[:, 0], decomposed_user_tracks_mfcc[:, 1], alpha=1, s=8, c="blue")
plt.scatter(decomposed_user_suggestions_mfcc[:, 0], decomposed_user_suggestions_mfcc[:, 1], alpha=1, s=15, c="black")

plt.gca().set_xlim([-15, 5])
plt.gca().set_ylim([-4, 4.5])
plt.title("Tracks MFCC: LDA Plot (After GMM)")

# Save next to the other figures in root_path + "plots/". local_path is the
# data folder in this section and has no plots/ sub-directory, so saving
# there fails.
plt.savefig(root_path + "plots/tracks-mfcc-lda-explored-suggestions.png", dpi=250)
plt.show()
```