In [1]:
import numpy as np
import json
import os
import re
import collections
from sklearn.datasets.base import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import pickle
from gensim.models.word2vec import LineSentence
from gensim.models.fasttext import FastText
import sys
from __future__ import division
from collections import OrderedDict
import pandas as pd
from sklearn.datasets import load_files
from collections import OrderedDict

In [2]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise.model_selection import KFold
from surprise import accuracy

In [3]:
def normalize_name(name):
    """Return a canonical form of a playlist name.

    The name is lower-cased, every character from a fixed punctuation set is
    replaced by a space, runs of whitespace are collapsed to a single space
    and leading/trailing whitespace is removed.
    """
    lowered = name.lower()
    # Punctuation becomes a space (not removed) so "a_b" ends up as "a b".
    spaced = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', spaced)
    return collapsed.strip()

In [4]:
# Training data location: point `path` at your local copy of the dataset directory.
# `playlist_prob` is cleared and refilled for each challenge playlist in the final cell;
# `mapping` is not read anywhere in the visible notebook.
playlist_prob = OrderedDict()
mapping = dict()
path = '/home/irlab/Documents/recsys/mpd/data'
filenames = os.listdir(path)

In [5]:
#Train FastText Model for finding similarity
def train_playlist_embedding(filenames, corpus_path='/home/irlab/files/data.txt', size=100):
    """Train a FastText embedding on a text corpus and return the model.

    Parameters
    ----------
    filenames : list
        Not used by the training itself; kept so existing calls keep working.
    corpus_path : str
        Text file handed to gensim's ``LineSentence``. Defaults to the
        location the notebook was originally run with; change it for your
        machine.
    size : int
        Dimensionality of the learned vectors.

    Returns
    -------
    FastText
        The trained model.

    NOTE(review): ``size=`` and ``model.iter`` are the names used by the gensim
    release this notebook ran against; newer releases may have renamed them -
    confirm before upgrading.
    """
    data = LineSentence(corpus_path)
    model = FastText(size=size)
    print('Building Vocab....')
    model.build_vocab(data)
    print('Training....')
    # corpus_count is filled in by build_vocab above.
    model.train(data, total_examples=model.corpus_count, epochs=model.iter)
    print(model)
    return model

In [6]:
# Train the embedding once for the whole notebook. `find_similar` (defined in a
# later cell) captures this `model` object as its default argument.
model = train_playlist_embedding(filenames)

Building Vocab....
Training....


  


FastText(vocab=9132, size=100, alpha=0.025)


In [7]:
#Find Similar Playlists with given name
def find_similar(playlist_name,model = model,topn = 20):
    """Return the vocabulary entries most similar to ``playlist_name``.

    Parameters
    ----------
    playlist_name : str
        Query token looked up in the embedding.
    model : FastText
        Embedding to query. The default is bound when this cell is executed,
        so after retraining ``model`` re-run this cell or pass the new model
        explicitly.
    topn : int
        How many neighbours to request (20 matches the original behaviour).

    Returns
    -------
    list
        Neighbour names ordered from most to least similar.

    Raises
    ------
    KeyError
        Propagated from ``similar_by_word`` when the query cannot be embedded;
        ``create_weighted_playlists`` relies on this to retry.
    """
    neighbours = model.wv.similar_by_word(playlist_name,topn)
    # Sort explicitly by score so the ordering does not depend on the order in
    # which the library hands the (name, score) pairs back.
    ranked = sorted(neighbours,key=lambda pair:pair[1],reverse=True)
    return [name for (name,score) in ranked]

In [8]:
#Mapping of playlist name to filename
def load_filemapping(mapping_path='/home/irlab/files/mappings/track_filename.json'):
    """Load the JSON lookup that maps a playlist name to its data filename.

    Parameters
    ----------
    mapping_path : str
        Location of the JSON mapping file. The default is the path the
        notebook was originally run with; change it as per your files folder
        location.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file (used as a dict by the
        rest of the notebook).
    """
    # `with` guarantees the handle is closed even if decoding fails.
    with open(mapping_path) as fp:
        mpd_mapping = json.load(fp)
    return mpd_mapping

In [9]:
# Name -> filename lookup; create_weighted_playlists uses it to locate the data
# file that holds each similar playlist.
mpd_mapping = load_filemapping()

In [10]:
def _load_count_stats(stats_path):
    """Read one JSON stats file and return ``(mapping, total)``.

    ``total`` is the sum of all values in the mapping. The file is closed as
    soon as it has been decoded.
    """
    with open(stats_path) as stats_file:
        counts = json.load(stats_file)
    return counts, sum(counts.values())

#Change these paths as per your files folder location
# Each *_map is keyed by URI (see how tracks are looked up when weights are
# computed); each *_stat is the normaliser used to turn a value into a share.
album_map, album_stat = _load_count_stats('/home/irlab/files/stats/album_stats.json')
artist_map, artist_stat = _load_count_stats('/home/irlab/files/stats/artist_stats.json')
track_map, track_stat = _load_count_stats('/home/irlab/files/stats/tracks_stats.json')

print(album_stat,artist_stat,track_stat)

(66346428, 66346428, 66346428)


In [11]:
#Generate weights for the tracks
def create_weighted_playlists(playlist_name,playlist_prob):
    """Add track weights for playlists similar to ``playlist_name``.

    For every similar name returned by ``find_similar`` that has an entry in
    ``mpd_mapping``, the mapped data file is read and every playlist in it
    whose normalized name is contained in the similar name contributes its
    tracks. A track's weight is the sum of its artist, album and track shares
    divided by ``1 + pos``.

    Parameters
    ----------
    playlist_name : str
        Normalized name of the playlist to find neighbours for.
    playlist_prob : dict
        Mapping ``name -> {track_uri: weight}``. It is updated in place; an
        existing entry for a similar name is replaced.

    Returns
    -------
    dict
        The same ``playlist_prob`` object that was passed in.

    Raises
    ------
    KeyError
        If the lookup fails both for the name as given and for the name with
        all whitespace removed, or if a track's URI is missing from the stats
        maps.
    """
    try:
        similar_playlists = find_similar(playlist_name)
    except KeyError:
        # Second attempt: the same name with its whitespace squeezed out.
        similar_playlists = find_similar(''.join(playlist_name.split()))

    # float() keeps the shares fractional even if true division is not active
    # in the cell that executes this function.
    artist_total = float(artist_stat)
    album_total = float(album_stat)
    track_total = float(track_stat)

    for similar_playlist in similar_playlists:
        # Plain dict membership; `.keys()` would build and scan a list on Python 2.
        if similar_playlist not in mpd_mapping:
            continue
        filename = os.sep.join((path,mpd_mapping[similar_playlist]))
        # Close each data file as soon as it is decoded instead of leaking the handle.
        with open(filename,'r') as fp:
            mpd_data = json.load(fp)['playlists']
        weights = {}
        playlist_prob[similar_playlist] = weights
        for playlist in mpd_data:
            candidate_name = normalize_name(playlist['name'])
            # An empty normalized name is a substring of every string and would
            # otherwise pull unrelated playlists into every neighbour.
            if not candidate_name:
                continue
            # NOTE(review): this is a substring test, not equality - confirm
            # that matching e.g. "rock" against "classicrock" is intended.
            if candidate_name not in similar_playlist:
                continue
            for track in playlist['tracks']:
                p1 = artist_map[track['artist_uri']]/artist_total
                p2 = album_map[track['album_uri']]/album_total
                p3 = track_map[track['track_uri']]/track_total
                # Later positions in a playlist get proportionally less weight.
                weights[track['track_uri']] = (p1+p2+p3)/(1+track['pos'])
    return playlist_prob

In [12]:
#Create Rating matrix
def create_matrix(playlist_prob):
    """Turn ``{playlist: {track: weight}}`` into a playlist-by-track DataFrame.

    Parameters
    ----------
    playlist_prob : dict
        Outer keys become the row index, inner keys become the columns.

    Returns
    -------
    pandas.DataFrame
        One row per playlist and one column per track; combinations that are
        absent from the input are filled with 0.
    """
    # The original also built (and immediately discarded) an int64-typed copy
    # of the frame; that was dead work and can fail on fractional/missing
    # values, so it is gone.
    by_track = pd.DataFrame(data=playlist_prob)
    return by_track.fillna(0).T

In [13]:
#Change this path as per your system
path1 = '/home/irlab/Documents/recsys/challenge'
challenge_file = 'challenge_set.json'
filename = os.sep.join((path1,challenge_file))
# Read the whole challenge set; `with` closes the file once it is decoded.
with open(filename) as fp:
    challenge_set = json.load(fp)
# List of challenge playlist records iterated by the final cell.
playlists = challenge_set['playlists']
dict_playlist = {}

In [14]:
#Load Data in surprise format
def load_data(tracks,playlists,ratings):
    """Wrap parallel lists of ratings in a surprise ``Dataset``.

    ``tracks[i]``, ``playlists[i]`` and ``ratings[i]`` describe one rating:
    the playlist plays the role of the user and the track that of the item.
    The reader is configured with a rating scale of 0 to 1.
    """
    frame = pd.DataFrame({'itemID': tracks,
                          'userID': playlists,
                          'rating': ratings})
    # load_from_df expects the columns in user, item, rating order.
    ordered = frame[['userID','itemID','rating']]
    return Dataset.load_from_df(ordered,Reader(rating_scale=(0,1)))

In [15]:
#Train using SVD and Gradient Descent
def train(data, n_splits=10):
    """Cross-validate an SVD model on ``data`` and return it fitted on all ratings.

    Parameters
    ----------
    data : surprise Dataset
        Ratings as produced by ``load_data``.
    n_splits : int
        Requested number of cross-validation folds. It is capped at the number
        of ratings, and cross-validation is skipped when fewer than two folds
        are possible, so tiny rating sets no longer make ``KFold`` raise.

    Returns
    -------
    SVD
        The model after a final fit on the full training set.
    """
    algo = SVD(n_epochs=35,lr_all=0.007,reg_all=0.065)

    # KFold refuses more folds than there are ratings.
    folds = min(n_splits, len(data.raw_ratings))
    if folds >= 2:
        kf = KFold(n_splits=folds)
        for trainset,testset in kf.split(data):
            algo.fit(trainset)
            prediction = algo.test(testset)
            # verbose=True already prints the RMSE; printing the return value
            # as well showed every score twice.
            accuracy.rmse(prediction,verbose=True)

    # The folds are for evaluation only. Refit on every rating so the returned
    # model has not lost the part that happened to be held out last.
    algo.fit(data.build_full_trainset())
    return algo

In [16]:
#Predict ratings for unknown tracks and filter them out
def test(playlist_name,tracks,algo,seed_tracks):
    tracks_pred = [n for n in tracks if n not in seed_tracks]
    Recommend = {}
    for track in tracks_pred:
        predict = algo.predict(playlist_name,track)
        Recommend[track] = predict[3]
    recommend_list = [key for (key,value) in sorted(Recommend.items(),key=lambda x:x[1],reverse=True)]
    return recommend_list

In [17]:
#If we fall short of 500 tracks pick popular tracks
def random_tracks(recommend_list,seed_tracks,target_size=500,track_counts=None):
    """Pad ``recommend_list`` with the highest-valued tracks until it is long enough.

    Parameters
    ----------
    recommend_list : list
        Recommendations so far. It is extended in place.
    seed_tracks : list
        Tracks already in the playlist; never used as padding.
    target_size : int
        Desired length of ``recommend_list`` (500 matches the original).
    track_counts : dict or None
        ``track -> value`` mapping to draw padding from, highest value first.
        ``None`` means the notebook-level ``track_map``.

    Returns
    -------
    list
        The same ``recommend_list`` object. It is returned unchanged when it
        already has ``target_size`` or more entries, and may stay shorter than
        ``target_size`` if there are not enough other tracks.
    """
    import heapq  # only needed here; keeps the notebook's import cell untouched

    if track_counts is None:
        track_counts = track_map

    missing = target_size - len(recommend_list)
    # A negative count used as a slice end would select almost every track,
    # so bail out early when nothing is missing.
    if missing <= 0:
        return recommend_list

    excluded = set(recommend_list)
    excluded.update(seed_tracks)
    # The top (missing + len(excluded)) entries must contain at least `missing`
    # tracks that are not excluded (when that many exist), so the full mapping
    # never has to be sorted. nlargest orders exactly like
    # sorted(..., reverse=True)[:n].
    ranked = heapq.nlargest(missing + len(excluded),track_counts.items(),key=lambda item:item[1])
    for name,value in ranked:
        if name in excluded:
            continue
        recommend_list.append(name)
        missing -= 1
        if missing == 0:
            break
    return recommend_list

In [18]:
import csv

In [19]:
#string = ['team_info','main','Dark_Horse_1','201711040@daiict.ac.in']

#with open('/home/irlab/Documents/recsys/mpd/codes/output/output1.csv','w') as csvfile:
#        filewriter = csv.writer(csvfile,delimiter = ',')
#        filewriter.writerow(string)

# Build one submission row (pid followed by up to 500 track URIs) for every
# challenge playlist that has a title and exactly five seed tracks. Rows are
# appended to the output file, so delete the file before re-running this cell
# from scratch.
count = 0
for playlist in playlists:
    list_temp = []
    seed_tracks = []
    recommend_list = []
    # .get() so that an entry without a 'name' key is skipped rather than
    # aborting the whole run with a KeyError.
    if not (playlist.get('name') and playlist['num_samples'] == 5):
        continue

    pid = playlist['pid']
    list_temp.append(pid)
    count += 1
    print(count,playlist['name'])

    # Normalize once; the name doubles as the "user" id for the recommender.
    target_name = normalize_name(playlist['name'])
    tracks = playlist['tracks']
    playlist_prob.clear()
    playlist_prob[target_name] = {}
    for track in tracks:
        # float() keeps the shares fractional even without true division.
        p1 = artist_map[track['artist_uri']]/float(artist_stat)
        p2 = album_map[track['album_uri']]/float(album_stat)
        p3 = track_map[track['track_uri']]/float(track_stat)
        seed_tracks.append(track['track_uri'])
        playlist_prob[target_name][track['track_uri']] = (p1+p2+p3)/(1+track['pos'])

    print('Finding Similar Playlists...')
    playlist_prob = create_weighted_playlists(target_name,playlist_prob)

    print('Creating Rating matrix...')
    df = create_matrix(playlist_prob)
    column_names = list(df.columns)
    row_names = list(df.index)
    matrix = df.values
    # Every non-zero cell becomes one (playlist, track, rating) triple.
    row,col = np.nonzero(matrix)
    row = row.tolist()
    col = col.tolist()
    n_tracks = [column_names[c] for c in col]
    # Deliberately NOT called `playlists`: that name is the list this loop is
    # iterating, and rebinding it broke any re-run of the cell.
    rating_playlists = [row_names[r] for r in row]
    ratings = [matrix[r][c] for (r,c) in zip(row,col)]

    print('Loading data for training')
    data = load_data(n_tracks,rating_playlists,ratings)
    print('Training....')
    algo = train(data)
    print('Predicting....')
    recommend_list = test(target_name,n_tracks,algo,seed_tracks)
    print('Playlist Size:',len(recommend_list))
    if len(recommend_list) < 500:
        print('Picking popular tracks...')
        recommend_list = random_tracks(recommend_list,seed_tracks)
        print('New Playlist Size:',len(recommend_list))
    list_temp.extend(recommend_list[0:500])

    print('Writing to file.....')
    #Change this path as per your system
    with open('/home/irlab/Documents/recsys/mpd/codes/output/final_submission.csv','a') as csvfile1:
        filewriter = csv.writer(csvfile1,delimiter = ',')
        filewriter.writerow(list_temp)

(1, u'Party')
Finding Similar Playlists...


KeyboardInterrupt: 