In [5]:
import pandas as pd
import numpy as np

In [6]:
data_dir = '../data/random_data'
tracks = pd.read_csv('{}/tracks.csv'.format(data_dir), index_col=0)
artists = pd.read_csv('{}/artists.csv'.format(data_dir), index_col=0)
albums = pd.read_csv('{}/albums.csv'.format(data_dir), index_col=0)
transactions = pd.read_csv('{}/transactions.csv'.format(data_dir), index_col=0)
playlists = pd.read_csv('{}/playlists.csv'.format(data_dir), index_col=0)

In [4]:
def get_full_dataset(transactions, tracks, playlists):
    full_dataset = pd.merge(transactions, tracks, how='left', on='trackid')
    full_dataset = pd.merge(full_dataset, playlists, how='left', on='pid')
    return full_dataset

In [5]:
full_dataset = get_full_dataset(transactions, tracks, playlists)
full_dataset['rating'] = 1
full_dataset.head()

Unnamed: 0,pid,trackid,popular,artistid,albumid,name,num_followers,rating
0,822032,2283,3384,1151,1618,party time,1,1
1,822032,2288,4339,1152,1622,party time,1,1
2,822032,907,4992,500,687,party time,1,1
3,822032,2289,1822,503,690,party time,1,1
4,822032,910,2160,503,690,party time,1,1


$R$ - матрица плейлист-трек с пропусками
$X$ - матрица плейлист-признак
$C$ - матрица популярность-трек
$U, V$ - матрицы профилей 
$$\min_{U,V} J: = \sum_{(u,i) \in D} (r_{ui} - \mathbf{x}_u^TUV\mathbf{c}_i) + \lambda(||U||^2 + ||V^2||)$$

$\hat{r}_{ui} = \mathbf{x}_u^TUV\mathbf{c}_i$

$J_U = \mathbf{x}_u(\hat{r}_{ui} - r_ui)\mathbf{c}_i^TV^T + \lambda U$

$J_V = U^T\mathbf{x}_u(\hat{r}_{ui} - r_ui)\mathbf{c}_i^T + \lambda V$

In [6]:
def reindex_content(content_data, col, sort=True, inplace=True):
    grouper = content_data.groupby(col, sort=sort).grouper
    new_val = grouper.group_info[1]
    old_val = grouper.levels[0]
    print (new_val.shape, old_val.shape)
    val_transform = pd.DataFrame({'old': old_val, 'new': new_val})
    new_data = grouper.group_info[0]

    if inplace:
        result = val_transform
        content_data.loc[:, col] = new_data
    else:
        result = (new_data, val_transform)
    return result
        
def reindex_content_columns(content_data, columns):
    index_content = {}
    for col in columns:
        index_content[col] = reindex_content(content_data, col)
    return index_content

index_content = reindex_content_columns(full_dataset, ['pid', 'trackid', 'albumid', 'artistid', 'popular'])

(7657,) (7657,)
(8560,) (8560,)
(6824,) (6824,)
(4243,) (4243,)
(1692,) (1692,)


In [7]:
full_dataset.head()

Unnamed: 0,pid,trackid,popular,artistid,albumid,name,num_followers,rating
0,6320,586,1218,655,673,party time,1,1
1,6320,587,1419,656,674,party time,1,1
2,6320,365,1507,378,395,party time,1,1
3,6320,588,519,380,397,party time,1,1
4,6320,367,741,380,397,party time,1,1


1691

In [13]:
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.corpus import stopwords

In [34]:
def get_playlist_artist_album_matrix(data, content_columns, user_col):
    idx_userid = data[user_col].values
    val = np.ones(data[user_col].shape[0])
    
    i = 0
    features = []
    
    for col in content_columns:
        idx_feature = data[col].values
        shp = (idx_userid.max() + 1,
               idx_feature.max() + 1)
        
        features_new = sparse.csr_matrix((val, (idx_userid, idx_feature)), 
                                         shape=shp)
        
        if i == 0:
            features = features_new
        else:
            features = sparse.hstack((features, features_new))
            
        i+=1
    
    return features

def get_playlist_name_matrix(playlists):
    playlists['name'] = [str(playlists['name'].values[i]).lower() 
                         for i in range(playlists.shape[0])]
    names = playlists['name'].values
    for i in range(names.shape[0]):
        names[i] = re.compile('[^a-zA-Z0-9 ]').sub('', names[i])
    playlists['name'] = names
    words_in_names = []
    for i in range(names.shape[0]):
        words_in_names += names[i].split(' ')
    stop_words = stopwords.words('english') + ['music', 'song', 'songs', 'playlist', '', 'good']
    words_in_names = np.array([word for word in words_in_names if word not in stop_words])
    bag_of_words = CountVectorizer()
    bag_of_words.vocabulary_ = words_in_names
    print(words_in_names)
    X = bag_of_words.fit_transform(playlists.name.values)
    return X

def get_popularity_track_matrix(data):
    idx_popular = data.popular.values
    idx_track = data.trackid.values
    val = np.ones(data.shape[0])
    shp = (idx_popular.max() + 1, idx_track.max() + 1)
    C = sparse.csr_matrix((val, (idx_popular, idx_track)), 
                          shape=shp)
    return C

In [35]:
X1 = get_playlist_artist_album_matrix(full_dataset, ['artistid', 'albumid'], 'pid')
X1.shape

(7657, 11067)

In [36]:
C = get_popularity_track_matrix(full_dataset)

(1692, 8560)


In [38]:
X2 = get_playlist_name_matrix(full_dataset.drop_duplicates('pid')[['pid','name']])

['party' 'time' 'gd' ... 'tunes' 'march' 'sleep']


In [39]:
X2.shape

(7657, 2366)

In [41]:
X = sparse.hstack((X1, X2))

In [69]:
def get_recommendations(x, U, V, c):
    return x.dot(U).dot(V.dot(c)).data[0]
def gradients(r, x, U, V, c, lamb):
    r_pred = get_recommendations(x, U, V, c)
    print (r_pred)
    grad_U = (r_pred - r) * x.T.dot(c.T.dot(V.T)) + lamb*U
    grad_V = (r_pred - r) * U.T.dot(x.T).dot(c.T) + lamb*V
    return grad_U, grad_V

In [76]:
U = np.abs(sparse.rand(X.shape[1], 10, 0.99, 'csr', random_state=0))
V = np.abs(sparse.rand(10, C.shape[0], 0.99, 'csr', random_state=0))
lamb = 1.
eta = 0.00001

k = 0

for el in full_dataset.itertuples():
    
    if (k >= 100):
        break
    
    u = el.pid
    i = el.trackid
    r = el.rating
    
    x = X.getrow(u)
    c = C.getcol(i)
    grad_U, grad_V = gradients(r, x, U, V, c, lamb)
    U = U + eta*grad_U
    V = V + eta*grad_V
    
    k += 1

24389.35330582078
1607116.5187258606
190699950.39065304
9041136382.876236
209315160773.1883
12844849745790.041
1362885402338687.5
1.0083406793109245e+17
7.675134867256291e+18
1.787065320462395e+20
2.953536528149795e+21
9.919615966121275e+22
2.2703239073726075e+24
4.954364947835878e+25
1.0989674811675751e+27
2.4472836246220155e+28
9.315538102709813e+29
1.3789475142596641e+31
1.925181498608568e+32
4.720085803292213e+33
2.2248165977151616e+35
1.354329872009109e+37
3.075550590594203e+38
8.177642428107547e+39
1.6786548523013281e+41
4.2574902034951644e+42
1.4987166953859308e+44
2.0779884504179172e+45
3.219320771834935e+46
4.589997043598995e+47
1.3705544728378345e+49
5.7035460527043935e+50
1.7728249685364133e+52
6.59406202526135e+53
1.5060964716305038e+55
4.151041699588206e+56
1.304882235322063e+58
3.87090163700605e+59
1.8930553749030429e+61
4.165393205813071e+61
3.5658294917703224e+62
5.492255429915037e+62
1.2791235233979689e+64
4.783421249619724e+65
1.3105886257439653e+67
2.9588903711734657

  return self._with_data(self.data * other)


1.6123292532785055e+262
3.2229819765412684e+304
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
15942.7597706421
416312.86371024285
12428148.953527953
362674995.639522
7955835884.499022
4773612656.855217


In [72]:
X.getrow(0).shape

(1, 13433)

In [46]:
C.getcol(0).shape

(1692, 1)

In [48]:
full_dataset.itertuples()[:10]

TypeError: 'map' object is not subscriptable

In [51]:
X*10

<7657x13433 sparse matrix of type '<class 'numpy.float64'>'
	with 351555 stored elements in COOrdinate format>

In [68]:
X + 1

NotImplementedError: adding a nonzero scalar to a sparse matrix is not supported

In [1]:
import gensim

In [2]:
from gensim.models import Word2Vec

In [3]:
from gensim.models.word2vec import LineSentence

In [7]:
LineSentence(playlists['name'].values)

<gensim.models.word2vec.LineSentence at 0x7f4d3eb34908>

In [20]:
model = Word2Vec([el.split(' ') for el in playlists['name']],
                 size=200,
                 window=5,
                 min_count=3,
                 workers=8)

In [21]:
model.vocabulary

<gensim.models.word2vec.Word2VecVocab at 0x7f4d3e020128>

In [22]:
model.init_sims(replace=True)

In [27]:
model.most_similar('good')

  """Entry point for launching an IPython kernel.


[('Good', 0.21613922715187073),
 ('Covers', 0.21379691362380981),
 ('Dinner', 0.1914544254541397),
 ('now', 0.1884658932685852),
 ('june', 0.1869111955165863),
 ('Clean', 0.1842919886112213),
 ('Rap', 0.18383505940437317),
 ('English', 0.1767725795507431),
 ('i', 0.17616254091262817),
 ("It's", 0.16625140607357025)]

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/vectors.txt'