In [21]:
import os
import json
import pandas as pd
import numpy as np
import scipy as sp
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Metrics obtained from an organizer of the Challenge
# https://github.com/plamere/RecsysChallengeTools/blob/master/metrics.py
from metrics import r_precision
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [22]:
playlists = list()
tracks = dict()
map_pl = list()

max_files_for_quick_processing = 1


def process_track(track):
    key = track['track_uri']
    if not key in tracks:
        tk = dict()
        tk['track_artist_name'] = track['artist_name']
        tk['track_artist_uri'] = track['artist_uri']
        tk['track_name'] = track['track_name']
        tk['track_album_uri'] = track['album_uri']
        tk['track_duration_ms'] = track['duration_ms']
        tk['track_album_name'] = track['album_name']
        tracks[track['track_uri']] = tk
    return key


def process_playlist(playlist):
    pl = dict()
    pl['playlist_name'] = playlist['name']
    pl['playlist_collaborative'] = playlist['collaborative']
    pl['playlist_pid'] = playlist['pid']
    pl['playlist_modified_at'] = playlist['modified_at']
    pl['playlist_num_albums'] = playlist['num_albums']
    pl['playlist_num_tracks'] = playlist['num_tracks']
    pl['playlist_num_followers'] = playlist['num_followers']
    pl['playlist_num_edits'] = playlist['num_edits']
    pl['playlist_duration_ms'] = playlist['duration_ms']
    pl['playlist_num_artists'] = playlist['num_artists']
    if 'description' in playlist:
        pl['playlist_description'] = playlist['description']
    else:
        pl['playlist_description'] = ''
    trks = list()
    for track in playlist['tracks']:
        map_pl.append([playlist['pid'], track['track_uri']])
        trks.append(track['track_uri'])
        process_track(track)
    return pl

def process_mpd(path):
    count = 0
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        print(filename)
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            slice = json.loads(js)
            for playlist in slice['playlists']:
                playlists.append(process_playlist(playlist))
            count += 1
            if quick and count > max_files_for_quick_processing:
                break

quick = True
process_mpd('mpd.v1/data')

mpd.slice.0-999.json
mpd.slice.1000-1999.json


In [23]:
print(len(playlists))
print(len(tracks))
print(len(map_pl))

2000
57884
134125


In [24]:
playlist_df = pd.DataFrame(playlists)
playlist_df.head()
# print(playlist_df.describe())

Unnamed: 0,playlist_collaborative,playlist_description,playlist_duration_ms,playlist_modified_at,playlist_name,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks,playlist_pid
0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0
1,False,,11656470,1506556800,Awesome Playlist,23,21,5,1,39,1
2,False,,14039958,1505692800,korean,51,31,18,1,64,2
3,False,,28926058,1501027200,mat,107,86,4,1,126,3
4,False,,4335282,1401667200,90s,16,16,7,2,17,4


In [25]:
tracks_df = pd.DataFrame.from_dict(tracks, orient='index')
tracks_df.head()
# print(tracks_df.describe())

Unnamed: 0,track_artist_name,track_artist_uri,track_name,track_album_uri,track_duration_ms,track_album_name
spotify:track:000mA0etY38nKdvf1N04af,The Coronas,spotify:artist:2tppd6KkhK4ULAd217Ecq1,If I Gave Myself To Someone Else,spotify:album:662PiU3dRsilN0Gp87IiSF,214506,The Long Way
spotify:track:000xQL6tZNLJzIrtIgxqSl,ZAYN,spotify:artist:5ZsFI1h6hIdQRw2ti0hz81,Still Got Time,spotify:album:2kGUeTGnkLOYlinKRJe47G,188490,Still Got Time
spotify:track:002PgfoyfrOGiKch4EW8Wm,Alan Menken,spotify:artist:5sy77gt4bfsLcSQ8GIe4ZZ,Jasmine Runs Away,spotify:album:29EiOQJnxWlX5nVOWQpu3u,46866,Aladdin
spotify:track:004skCQeDn1iLntSom0rRr,Kevin Jonas,spotify:artist:11bdWrCwHE2gCdVARdASuu,Heart and Soul,spotify:album:5iK842b9xnZblgZkRxWCFe,177773,Camp Rock 2: The Final Jam
spotify:track:005CGalYNgMNZcvWMIFeK8,Nelson Y Sus Estrellas,spotify:artist:5SRwYMyavJCNT2AdVMpPgi,LLorándote,spotify:album:2TfoLoZHhsL0oAYBxDi4t2,321480,Orquídea de Plata


In [26]:
playlist_map_df = pd.DataFrame(map_pl, columns=['playlist_pid', 'track_uri'])
playlist_map_df.head()
#print(playlist_map_df.describe())

Unnamed: 0,playlist_pid,track_uri
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H


We set strings to lower case remove all non alphabetic characters and stop-words

In [7]:
import nltk
import string
#nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

def word_cleanup(df_col):
    df_col = df_col.apply(lambda x: x.lower())
    df_col = df_col.str.replace('[^a-z]+', ' ')
    df_col = df_col.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    return df_col

playlist_df.playlist_description = word_cleanup(playlist_df.playlist_description)
playlist_df.playlist_name = word_cleanup(playlist_df.playlist_name)

#playlist_df.playlist_name = playlist_df.playlist_name.apply(lambda x: x.lower())
#playlist_df.playlist_name = playlist_df.playlist_name.str.replace('[^a-z]+', ' ')
#playlist_df.playlist_name = playlist_df.playlist_name.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [8]:
#pip install wordcloud
from wordcloud import WordCloud

def generate_wordcloud(text): # optionally add: stopwords=STOPWORDS and change the arg below
    wordcloud = WordCloud(font_path='/Library/Fonts/Verdana.ttf',
                          relative_scaling = 0.1,
                          ).generate(text)
    plt.figure(figsize=(15,10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

generate_wordcloud(' '.join(playlist_df.playlist_name))

generate_wordcloud(' '.join(playlist_df.playlist_description))

TypeError: sequence item 0: expected str instance, NoneType found

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

cv_description = CountVectorizer(token_pattern=r"(?u)\b\w+\b", stop_words=None, ngram_range=(1,1), analyzer='word')
dt_mat_description = cv_description.fit_transform(playlist_df.playlist_description)
playlist_df['playlist_description_frequency'] = list(dt_mat_description.toarray())

cv_name = CountVectorizer(token_pattern=r"(?u)\b\w+\b", stop_words=None, ngram_range=(1,1), analyzer='word')
dt_mat_name = cv_name.fit_transform(playlist_df.playlist_name)
playlist_df['playlist_name_frequency'] = list(dt_mat_name.toarray())

In [None]:
#pip install yellowbrick
from yellowbrick.text import FreqDistVisualizer

plt.figure(figsize=(15,10))
visualizer = FreqDistVisualizer(features = cv_name.get_feature_names())
visualizer.fit(dt_mat_name)
visualizer.poof()

In [None]:
from yellowbrick.text import FreqDistVisualizer

plt.figure(figsize=(15,10))
visualizer = FreqDistVisualizer(features = cv_description.get_feature_names())
visualizer.fit(dt_mat_description)
visualizer.poof()

In [None]:
tfidf_transformer = TfidfTransformer()

tfidf_mat_description = tfidf_transformer.fit_transform(dt_mat_description)
playlist_df['playlist_description_tfidf_score']=list(tfidf_mat_description.toarray())

tfidf_mat_name = tfidf_transformer.fit_transform(dt_mat_name)
playlist_df['playlist_name_tfidf_score']=list(tfidf_mat_name.toarray())
playlist_df

In [None]:
bigrams_name_frequency = pd.DataFrame(dt_mat_name.todense(), index=playlist_df.index, columns=cv_name.get_feature_names())
bigrams_name_frequency['playlist_pid'] = playlist_df.playlist_pid
print(bigrams_name_frequency)

bigrams_desc_frequency = pd.DataFrame(tfidf_mat_description.todense(), index=playlist_df.index, columns=cv_description.get_feature_names())
bigrams_desc_frequency['playlist_pid'] = playlist_df.playlist_pid
print(bigrams_desc_frequency)

bigrams_name_tfidf_score = pd.DataFrame(dt_mat_name.todense(), index=playlist_df.index, columns=cv_name.get_feature_names())
bigrams_name_tfidf_score['playlist_pid'] = playlist_df.playlist_pid
print(bigrams_name_tfidf_score)

bigrams_desc_tfidf_score = pd.DataFrame(tfidf_mat_description.todense(), index=playlist_df.index, columns=cv_description.get_feature_names())
bigrams_desc_tfidf_score['playlist_pid'] = playlist_df.playlist_pid
print(bigrams_desc_tfidf_score)

In [27]:
merged = pd.merge(
    pd.merge(
        tracks_df, playlist_map_df, left_index=True, right_on='track_uri'),
    playlist_df,
    on='playlist_pid')

In [28]:
merged.head()

Unnamed: 0,track_artist_name,track_artist_uri,track_name,track_album_uri,track_duration_ms,track_album_name,playlist_pid,track_uri,playlist_collaborative,playlist_description,playlist_duration_ms,playlist_modified_at,playlist_name,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks
0,The Coronas,spotify:artist:2tppd6KkhK4ULAd217Ecq1,If I Gave Myself To Someone Else,spotify:album:662PiU3dRsilN0Gp87IiSF,214506,The Long Way,371,spotify:track:000mA0etY38nKdvf1N04af,False,,3675883,1496793600,quiet,15,14,4,1,15
1,Lewis Watson,spotify:artist:40ELTAg7Kg6vbWnlyx2n9R,stones around the sun,spotify:album:4cKBAg2zgjrVF2XefrW4WC,224440,the morning,371,spotify:track:0JGbwcwPV0VfuR4zDcZ9ce,False,,3675883,1496793600,quiet,15,14,4,1,15
2,Jaymes Young,spotify:artist:6QrQ7OrISRYIfS5mtacaw2,We Won't,spotify:album:6MuWCR3WPjwyKhqsTKLZ3z,240586,Feel Something,371,spotify:track:0Zge2Kfo3Yd9JOGnAmVPbb,False,,3675883,1496793600,quiet,15,14,4,1,15
3,John Lucas,spotify:artist:7iEy8zKFtlYIINaxxLIyBk,This Will Be Our Home,spotify:album:2N7sEVVS3jKMJJuJi4v0UF,277160,Promised Land,371,spotify:track:1Sw7fhf7YJCD6GcWW0wETD,False,,3675883,1496793600,quiet,15,14,4,1,15
4,Hozier,spotify:artist:2FXC3k01G6Gw61bmprjgqS,Cherry Wine - Live,spotify:album:36k5aXpxffjVGcNce12GLZ,240147,Hozier,371,spotify:track:1ivHxaGL5ld9VS1zsYc4YN,False,,3675883,1496793600,quiet,15,14,4,1,15


In [29]:
# Create Negative Samples
negative_samples = pd.DataFrame([])
for pid, df in tqdm(merged.groupby(["playlist_pid"])):
    negative_tracks = tracks_df.drop(df.track_uri).sample(df.shape[0])
    negative_playlist_tracks = pd.concat(
        [
            df.drop(list(tracks_df.columns) + ['track_uri'], axis=1).reset_index(
                drop=True), negative_tracks.reset_index()
        ],
        axis=1)
    negative_playlist_tracks.rename(columns={'index': 'track_uri'}, inplace=True)
    negative_samples = negative_samples.append(negative_playlist_tracks)





In [30]:
# Provide labels
negative_samples['match'] = 0
merged['match'] = 1

In [31]:
negative_samples.head()

Unnamed: 0,playlist_pid,playlist_collaborative,playlist_description,playlist_duration_ms,playlist_modified_at,playlist_name,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks,track_uri,track_artist_name,track_artist_uri,track_name,track_album_uri,track_duration_ms,track_album_name,match
0,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,spotify:track:43SGRLrSS4m4b8YfSrwvNZ,JJAMZ,spotify:artist:2Az7s0XmI73RBBVTyCSdIG,Heartbeat,spotify:album:7Jv7nH7PID4qrecmi6WxhS,229266,Suicide Pact,0
1,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,spotify:track:7CnDrJ3mlKBEcAUVAlzUQE,Enrique Iglesias,spotify:artist:7qG3b048QCHVRO5Pv1T5lw,DUELE EL CORAZON - Remix,spotify:album:0TNxy50fxWMAZYt8aEAeun,210546,DUELE EL CORAZON,0
2,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,spotify:track:7t4ICcvjp5OplAxpcavAGc,Johnny Cash,spotify:artist:6kACVPfCOnqzgfEF5ryl0x,"It Ain't Me, Babe",spotify:album:5gHMnKNXhduWcDsQnhm9Yc,183600,Darlin' Companion,0
3,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,spotify:track:3LgtCt7CVhMvNSMGoQD9i1,Arcangel,spotify:artist:4SsVbpTthjScTS7U2hmr1X,Me Llamas,spotify:album:3acSJqw0A0SdZX6j1nfXxI,327999,Me Llamas,0
4,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,spotify:track:6amWzecM9lo2uYpVOQCkEZ,The Piano Guys,spotify:artist:0jW6R8CVyVohuUJVcuweDI,Me and My Cello (Happy Together),spotify:album:5iCNAC5zJYnO90r0dcXq4u,185840,The Piano Guys 2,0


In [20]:
merged.head()

Unnamed: 0,track_artist_name,track_artist_uri,track_name,track_album_uri,track_duration_ms,track_album_name,playlist_pid,track_uri,playlist_collaborative,playlist_description,playlist_duration_ms,playlist_modified_at,playlist_name,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks,match
0,The Coronas,spotify:artist:2tppd6KkhK4ULAd217Ecq1,If I Gave Myself To Someone Else,spotify:album:662PiU3dRsilN0Gp87IiSF,214506,The Long Way,371,spotify:track:000mA0etY38nKdvf1N04af,False,,3675883,1496793600,quiet,15,14,4,1,15,1
1,Lewis Watson,spotify:artist:40ELTAg7Kg6vbWnlyx2n9R,stones around the sun,spotify:album:4cKBAg2zgjrVF2XefrW4WC,224440,the morning,371,spotify:track:0JGbwcwPV0VfuR4zDcZ9ce,False,,3675883,1496793600,quiet,15,14,4,1,15,1
2,Jaymes Young,spotify:artist:6QrQ7OrISRYIfS5mtacaw2,We Won't,spotify:album:6MuWCR3WPjwyKhqsTKLZ3z,240586,Feel Something,371,spotify:track:0Zge2Kfo3Yd9JOGnAmVPbb,False,,3675883,1496793600,quiet,15,14,4,1,15,1
3,John Lucas,spotify:artist:7iEy8zKFtlYIINaxxLIyBk,This Will Be Our Home,spotify:album:2N7sEVVS3jKMJJuJi4v0UF,277160,Promised Land,371,spotify:track:1Sw7fhf7YJCD6GcWW0wETD,False,,3675883,1496793600,quiet,15,14,4,1,15,1
4,Hozier,spotify:artist:2FXC3k01G6Gw61bmprjgqS,Cherry Wine - Live,spotify:album:36k5aXpxffjVGcNce12GLZ,240147,Hozier,371,spotify:track:1ivHxaGL5ld9VS1zsYc4YN,False,,3675883,1496793600,quiet,15,14,4,1,15,1


In [None]:
negative_samples.head()

In [None]:
merged.columns

In [None]:
negative_samples.columns

In [33]:
dataset = merged.append(negative_samples[merged.columns]).sort_values(by=['playlist_pid']).reset_index(drop=True)

In [None]:
dataset.tail(32)

In [68]:
# Include the features in the list below
features = [
    'playlist_duration_ms', 'playlist_num_albums', 'playlist_num_artists',
    'playlist_num_edits', 'playlist_num_followers', 'playlist_num_tracks',
    'playlist_pid', 'track_duration_ms'
]
data_x = dataset[features]
data_y = dataset.match
X_train, X_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=0.1, random_state=42, shuffle=True)

The dataset has been split, which is great. However, in order to better simulate real world situations, we need to build a test set with a lot more negative samples than there are positive samples. In the split above, there are equal number of positive and negative samples, so it'll be easy for the algorithm to perform. Below we build a more robust test set

In [35]:
X_train.columns

Index(['playlist_duration_ms', 'playlist_num_albums', 'playlist_num_artists',
       'playlist_num_edits', 'playlist_num_followers', 'playlist_num_tracks',
       'playlist_pid', 'track_duration_ms'],
      dtype='object')

In [None]:
X_test_refined = pd.DataFrame([])
for pid, df in tqdm(X_test.groupby(['playlist_pid'])):
    labels = y_test.loc[df.index]
    targets = dataset.loc[labels.index].track_duration_ms
    # trained_idx = set(y_train[y_train == 1].index) - set(df.index)
    # Obtain all tracks from dataset which are not in the playlist under consideration
    #     negative_tracks = dataset.loc[X_train[(X_train.playlist_pid != pid)]
    #                                   .index].track_duration_ms
    negative_tracks = dataset.loc[X_test[(X_test.playlist_pid != pid)]
                                  .index].track_duration_ms
    # trained_tracks = X_train.loc[y_train.loc[y_train[y_train==1].index].index].track_uri
    new_df = df.drop('track_duration_ms', axis=1)
    new_df = pd.concat([new_df.head(1)] * len(negative_tracks))
    new_test = negative_tracks.append(targets)
    test_playlist_tracks = pd.concat(
        [new_df.reset_index(drop=True), new_test.reset_index(drop=True)],
        axis=1).set_index(new_test.index)
    X_test_refined = X_test_refined.append(test_playlist_tracks)
#     from IPython.core.debugger import set_trace
#     set_trace()

In [62]:
X_test_refined.head()

Unnamed: 0,playlist_duration_ms,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks,playlist_pid,track_duration_ms
20901,11532414.0,47.0,37.0,6.0,1.0,52.0,0.0,205560.0
219809,,,,,,,,198360.0
119281,,,,,,,,192146.0
10627,11532414.0,47.0,37.0,6.0,1.0,52.0,0.0,240320.0
118291,,,,,,,,155224.0


In [37]:
y_test.head()
# y_test[y_test == 1].head()

20901     0
219809    1
119281    0
10627     1
118291    1
Name: match, dtype: int64

In [38]:
dataset.loc[X_test.head().index]

Unnamed: 0,track_artist_name,track_artist_uri,track_name,track_album_uri,track_duration_ms,track_album_name,playlist_pid,track_uri,playlist_collaborative,playlist_description,playlist_duration_ms,playlist_modified_at,playlist_name,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks,match
20901,Sabrina Carpenter,spotify:artist:74KM79TiuVKeVCqs8QtB0B,Right Now,spotify:album:6r6BJYOMOKJ1B3CnhGHVhf,215226,Eyes Wide Open,156,spotify:track:3pCFKdlgevT02YXBblGdss,False,,49012499,1464998400,Love Music,157,103,57,2,188,0
219809,Plain White T's,spotify:artist:1g1yxsNVPhMUl9GrMjEb2o,"1, 2, 3, 4",spotify:album:5TAmG2iyx5BRYR2z8f9xUd,198360,Big Bad World,1632,spotify:track:5VWmMZCfJ4yVkJw9ZLFXej,False,,10821965,1485388800,HS,44,39,3,1,48,1
119281,Disclosure,spotify:artist:6nS5roXSAGhTGr34W6n7Et,Latch,spotify:album:1ZFGRj11NnZHos8DUbbpF1,255631,Settle,885,spotify:track:1BltsyC5W3SAABdxyrDXwi,False,,14042586,1506643200,SEPTEMBER 2017,54,48,5,1,59,0
10627,Jeremih,spotify:artist:3KV3p5EY4AvKxOlhGHORLg,Planez,spotify:album:7DMyQuDPe8xzjC0UDSDa96,240320,Late Nights: The Album,91,spotify:track:08zJpaUQVi9FrKv2e32Bah,False,,18184816,1428105600,R & B,54,37,9,3,68,1
118291,Delayers,spotify:artist:2oZ9We4LkpN7I062fHkci3,Bass Code,spotify:album:2QWdmviLmAWIVjKG2E4YNL,155224,Bass Code,877,spotify:track:6JzmACom3jAbkxfFxiWzBn,False,,17181746,1508803200,ELECTRONICAS,82,58,24,1,85,1


In [39]:
# 1. Calculate metric only on test set using the positive samples
#     a. Obtain unique playlists in the test set
#     b. For each playlist, obtain the positve songs.
#     c. Set both `target` and `prediction` as lists of the positive songs
#     d. For each playlist, calculate the r_precision. r_precision comes from the metrics.py file in the repo.

# 2. Train the X_train on the classifier. Obtain predictions for X_test

# 3. Repeat 1 above, now setting `predictions` as the list of songs "ranked by probability".

# 4. Calculate the r_precision metric based on your predicitons!

In [40]:
r_precisions = []
for pid, df in tqdm(X_test.groupby(["playlist_pid"])):
    probs = y_test.loc[df.index]  # change y_test to the output probs from clf
    targets = dataset.loc[probs[probs == 1].index].track_uri
    predictions = dataset.loc[probs[probs > 0.5].index].track_uri.unique()
    if len(targets) > 0:
        r_precisions.append(r_precision(targets, predictions))
np.mean(r_precisions)    




1.0

In [69]:
X_train_norm = (X_train-X_train.mean())/X_train.std()
X_test_norm = (X_test-X_test.mean())/X_test.std()

In [70]:
X_train_norm.head()

Unnamed: 0,playlist_duration_ms,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks,playlist_pid,track_duration_ms
66703,-0.376635,-0.02194,0.268983,-0.758536,-0.054956,-0.381463,-0.875817,-0.7177
17756,2.065766,2.490829,2.182972,0.833917,-0.054956,1.9173,-1.513067,0.851458
66340,-1.315739,-1.072359,-0.953843,-0.758536,-0.054956,-1.326161,-0.884546,-0.774263
202746,-1.253443,-1.051763,-0.84751,-0.894063,-0.054956,-1.310416,0.894517,-0.697672
78808,0.921551,0.225218,0.295567,1.342146,0.023153,0.673449,-0.713449,-0.383035


In [None]:
X_test_norm.head()

In [72]:
lr_clf = LogisticRegression()
lr_clf = lr_clf.fit(X_train_norm, y_train)

In [75]:
# y_prob = lr_clf.predict_proba(X_test)
y_prob = pd.DataFrame(lr_clf.predict_proba(X_test_norm), index=y_test.index)

In [76]:
y_prob.head(10)

Unnamed: 0,0,1
20901,0.496,0.504
219809,0.498082,0.501918
119281,0.50065,0.49935
10627,0.498296,0.501704
118291,0.496048,0.503952
159640,0.495531,0.504469
245782,0.50351,0.49649
228828,0.498385,0.501615
139527,0.496306,0.503694
58654,0.498516,0.501484


In [77]:
y_test.head(10)

20901     0
219809    1
119281    0
10627     1
118291    1
159640    1
245782    0
228828    0
139527    0
58654     0
Name: match, dtype: int64

In [32]:
y_pred = y_prob.idxmax(axis=1)

In [33]:
r_precisions = []
for pid, df in tqdm(X_test.groupby(["playlist_pid"])):
    labels = y_test.loc[df.index]  # change y_test to the output probs from clf
    preds = y_pred.loc[df.index]
    targets = dataset.loc[labels[labels == 1].index].track_uri
    predictions = dataset.loc[preds[preds == 1].index].track_uri.unique()
    if len(targets) > 0:
        r_precisions.append(r_precision(targets, predictions))
np.mean(r_precisions)    




0.1378411710638677

In [None]:
X_test_refined = pd.DataFrame([])
r_precisions = []
pbar = tqdm(X_test.groupby(['playlist_pid']))
for pid, df in pbar:
    labels = y_test.loc[df.index]
    targets = dataset.loc[labels.index].track_duration_ms
    positive_targets = dataset.loc[labels[labels == 1].index].index
    # trained_idx = set(y_train[y_train == 1].index) - set(df.index)
    # Obtain all tracks from dataset which are not in the playlist under consideration
    #     negative_tracks = dataset.loc[X_train[(X_train.playlist_pid != pid)]
    #                                   .index].track_duration_ms
    negative_tracks = dataset.loc[X_test[(X_test.playlist_pid != pid)]
                                  .index].track_duration_ms
    # trained_tracks = X_train.loc[y_train.loc[y_train[y_train==1].index].index].track_uri
    new_df = df.drop('track_duration_ms', axis=1)
    new_test = negative_tracks.append(targets)
    new_df = pd.concat([new_df.head(1)] * len(new_test))
    test_playlist_tracks = pd.concat(
        [new_df.reset_index(drop=True), new_test.reset_index(drop=True)],
        axis=1).set_index(new_test.index)
    from IPython.core.debugger import set_trace; set_trace()
    test_playlist_tracks = (test_playlist_tracks-test_playlist_tracks.mean())/(test_playlist_tracks.std()+1e-8)
    X_test_refined = X_test_refined.append(test_playlist_tracks)
    y_prob = pd.DataFrame(
        lr_clf.predict_proba(test_playlist_tracks),
        index=test_playlist_tracks.index)
    y_prob = y_prob.sort_values(by=[1], ascending=False)
    if len(positive_targets) > 0:
        r_precisions.append(r_precision(positive_targets, y_prob.index))
    pbar.set_description("{}".format(np.mean(r_precisions)))
    # from IPython.core.debugger import set_trace; set_trace()

> [0;32m<ipython-input-80-173cda7a6d18>[0m(22)[0;36m<module>[0;34m()[0m
[0;32m     20 [0;31m        axis=1).set_index(new_test.index)
[0m[0;32m     21 [0;31m    [0;32mfrom[0m [0mIPython[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mdebugger[0m [0;32mimport[0m [0mset_trace[0m[0;34m;[0m [0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m---> 22 [0;31m    [0mtest_playlist_tracks[0m [0;34m=[0m [0;34m([0m[0mtest_playlist_tracks[0m[0;34m-[0m[0mtest_playlist_tracks[0m[0;34m.[0m[0mmean[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m/[0m[0mtest_playlist_tracks[0m[0;34m.[0m[0mstd[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     23 [0;31m    [0mX_test_refined[0m [0;34m=[0m [0mX_test_refined[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0mtest_playlist_tracks[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     24 [0;31m    y_prob = pd.DataFrame(
[0m
ipdb> test_playlist_tracks.head()
        playlist_duration_ms  playlist_num_albums  

In [56]:
np.unique(r_precisions, return_counts=True)

(array([0.        , 0.05555556, 0.09090909, 0.11111111]),
 array([1885,    1,    1,    1]))

In [58]:
len(r_precisions)

1888

Question: Do playlist and track duration interact to influence whether a song should belong to a playlist `(match=1)` or whether a song should not belong to a playlist `(match=0)`?

In [None]:
(dataset[dataset.match==1].playlist_duration_ms/dataset[dataset.match==1].playlist_num_tracks)

In [None]:
plt.scatter(dataset[dataset.match == 1].playlist_duration_ms /
            dataset[dataset.match == 1].playlist_num_tracks,
            dataset[dataset.match == 1].track_duration_ms)
plt.scatter(dataset[dataset.match == 0].playlist_duration_ms /
            dataset[dataset.match == 0].playlist_num_tracks,
            dataset[dataset.match == 0].track_duration_ms)
plt.xlabel('Average Track Duration')
plt.ylabel('Track Duration')


A trend which might not be that strong to notice is that as the average track duration increases for a playlist, the propensity to see a track of long duration increases. However, this isn't the case with negative sample playlists, where in playlist of short length very long tracks can still be detected.

In [None]:
plt.hist(dataset[dataset.match==1].playlist_duration_ms)
plt.hist(dataset[dataset.match==0].playlist_duration_ms)