In [2]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
from itertools import combinations, islice
import csv
from datetime import datetime, timedelta
import time
import pickle
from itertools import islice

from surprise import Reader, Dataset, SVD, KNNWithMeans, NMF
from surprise.model_selection.validation import cross_validate
svd = SVD()

### EDA

In [3]:
# data source: https://www.kaggle.com/c/ee627a-2019fall/data?select=albumData2.txt
# origin of data: https://webscope.sandbox.yahoo.com/catalog.php?datatype=c&guccounter=1&guce_referrer=aHR0cHM6Ly9naXRodWIuY29tL3NhcmFueWF2c3IvTXVzaWMtUmVjb21tZW5kYXRpb25z&guce_referrer_sig=AQAAADdDVj1NcJ7l9D0AF1OwjrIchcuyq2aDD8kc4qxRk3RP-B1mQTaY0IDliV2wsC-gQw05v-d9k8v70efaNULAbemXR_upER5MDVS8mcDsU_DQJZmtcUF8Sdh7A1holj3I-8UJVcKbI65keJp44o46CL8aGp2kLYhRCUYeTXkwxv9N


data_list = glob.glob('ee627a-2019fall\*')
data_list

[]

- trainItem2.txt - the training set 
- testItem2.txt - the test set 
- sample_ submission.csv - a sample submission file in the correct format 
- trackData2.txt -- Track information formatted as: <'TrackId'>|<'AlbumId'>|<'ArtistId'>|<'Optional GenreId_1'>|...|<'Optional GenreId_k'> 
- albumData2.txt -- Album information formatted as: <'AlbumId'>|<'ArtistId'>|<'Optional GenreId_1'>|...|<'Optional GenreId_k'> 
- artistData2.txt -- Artist listing formatted as: <'ArtistId'>
- genreData2.txt -- Genre listing formatted as: <'GenreId'>



## Process for surprise

In [3]:
# process track data
rows = []
for row in open('ee627a-2019fall/trackData2.txt'):
    cur_set = row.strip('\n').split('|')
    row_dict = {'track': cur_set[0], 
                'album': cur_set[1], 
                'artist': cur_set[2]
               }
    if len(cur_set) > 3:
        row_dict['genres'] = cur_set[3:]
    rows.append(row_dict)

df_tracks = pd.DataFrame(rows)

In [4]:
df_tracks['track'] = df_tracks.track.astype(int)
df_tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224041 entries, 0 to 224040
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   track   224041 non-null  int64 
 1   album   224041 non-null  object
 2   artist  224041 non-null  object
 3   genres  215595 non-null  object
dtypes: int64(1), object(3)
memory usage: 6.8+ MB


In [5]:
df_tracks

Unnamed: 0,track,album,artist,genres
0,1,106710,281667,"[214765, 162234, 155788]"
1,2,280977,233685,"[131552, 173467, 48505]"
2,3,38422,219136,"[61215, 201738, 88853]"
3,4,119529,166863,"[17453, 35389]"
4,5,16742,294690,"[61215, 34486, 274088]"
...,...,...,...,...
224036,296100,166516,33011,"[274088, 199606, 88853]"
224037,296101,,,
224038,296102,153644,289056,"[158282, 139095, 242383]"
224039,296105,68336,6613,[82064]


In [6]:
# process training data
rows = []
for row in open('ee627a-2019fall/trainItem2.txt'):
    if '|' in row:
        cur_user = row.strip('\n').split('|')[0] # pull user ID. don't need song count
        continue # skip to the user's ratings
    row = row.strip('\n').split('\t')
    row_dict = {'user': cur_user, 
                'track': row[0], 
                'rating': int(row[1])
               }
    rows.append(row_dict)

df_train = pd.DataFrame(rows)

df_train['rating'] = df_train['rating'].replace({0:1}) # nmf can't handle 0

In [7]:
df_train['track'] = df_train.track.astype(int)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12403575 entries, 0 to 12403574
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   user    object
 1   track   int64 
 2   rating  int64 
dtypes: int64(2), object(1)
memory usage: 283.9+ MB


In [8]:
df_train

Unnamed: 0,user,track,rating
0,199808,248969,90
1,199808,2663,90
2,199808,28341,90
3,199808,42563,90
4,199808,59092,90
...,...,...,...
12403570,249011,270557,90
12403571,249011,273574,90
12403572,249011,286938,90
12403573,249011,287681,80


In [9]:
reader = Reader(rating_scale=(1, 100))
data = Dataset.load_from_df(df_train, reader)
trainset = data.build_full_trainset()

In [10]:
df_trainWgenres = pd.merge(df_tracks, df_train, on='track', how='left')

In [11]:
df_trainWgenres.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5480302 entries, 0 to 5480301
Data columns (total 6 columns):
 #   Column  Dtype  
---  ------  -----  
 0   track   int64  
 1   album   object 
 2   artist  object 
 3   genres  object 
 4   user    object 
 5   rating  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 292.7+ MB


In [12]:
df_trainWgenres

Unnamed: 0,track,album,artist,genres,user,rating
0,1,106710,281667,"[214765, 162234, 155788]",199872,50.0
1,1,106710,281667,"[214765, 162234, 155788]",213142,90.0
2,1,106710,281667,"[214765, 162234, 155788]",214520,1.0
3,1,106710,281667,"[214765, 162234, 155788]",233636,50.0
4,1,106710,281667,"[214765, 162234, 155788]",237128,1.0
...,...,...,...,...,...,...
5480297,296110,281202,140302,[61215],217723,1.0
5480298,296110,281202,140302,[61215],230188,1.0
5480299,296110,281202,140302,[61215],235883,1.0
5480300,296110,281202,140302,[61215],238641,1.0


In [13]:
dataWgenres = Dataset.load_from_df(df_train, reader)
trainsetWgenres = dataWgenres.build_full_trainset()

## SVD

In [9]:
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    63.0237 63.0137 62.9738 62.9827 63.0187 63.0025 0.0203  
MAE (testset)     50.2529 50.2442 50.1980 50.2079 50.2439 50.2294 0.0221  
Fit time          470.88  431.81  423.76  422.57  423.45  434.49  18.49   
Test time         29.59   25.72   28.56   28.69   26.96   27.90   1.38    


{'test_rmse': array([63.02372132, 63.01368387, 62.97381311, 62.98268164, 63.0187177 ]),
 'test_mae': array([50.25291096, 50.24423725, 50.19802517, 50.20785983, 50.2439458 ]),
 'fit_time': (470.87875986099243,
  431.8085570335388,
  423.76098585128784,
  422.57399702072144,
  423.4520380496979),
 'test_time': (29.590038537979126,
  25.724996089935303,
  28.555996894836426,
  28.688954830169678,
  26.964996576309204)}

In [78]:
%time svd.fit(trainsetWgenres)


CPU times: user 6min 3s, sys: 49.2 ms, total: 6min 3s
Wall time: 6min 3s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f78e4cf31d0>

#### make predictions

In [79]:
svd.predict(str(199810), str(208019)).est

100

In [15]:
df_trainWgenres[df_trainWgenres.track==208019]

Unnamed: 0,track,album,artist,genres,user,rating
3857580,208019,209288,,,199820,1.0
3857581,208019,209288,,,200634,50.0
3857582,208019,209288,,,200999,1.0
3857583,208019,209288,,,201107,90.0
3857584,208019,209288,,,201178,50.0
...,...,...,...,...,...,...
3857699,208019,209288,,,247149,70.0
3857700,208019,209288,,,247693,70.0
3857701,208019,209288,,,248130,1.0
3857702,208019,209288,,,248393,90.0


In [80]:
predictions = []
with open('ee627a-2019fall/testItem2.txt') as f:
    while True:
        next_n_lines = list(islice(f, 7))
        if not next_n_lines:
            break
        ratings = [] # reset for new user's list of songs
        cur_set = [x.strip('\n').split('|') for x in next_n_lines]
        test_user = int(cur_set[0][0]) # pull user ID. don't need song count 
        for track in cur_set[1:]:
            rating = svd.predict(str(test_user), track[0]).est
            rating_tup = (int(track[0]), rating)
            ratings.append(rating_tup)
        
        ratings.sort(key=lambda x:x[1]) # sort 6 songs by rating
        for i, j in enumerate(ratings):
            cur_dict = {}
            cur_dict['TrackID'] = f'{test_user}_{j[0]}'
            cur_dict['rating'] = j[1]
            if i < 3:
                cur_dict['Predictor'] = 0 
            else:
                cur_dict['Predictor'] = 1 
            predictions.append(cur_dict)
        

In [81]:
preditions_df_svd = pd.DataFrame(predictions)
preditions_df_svd

Unnamed: 0,TrackID,rating,Predictor
0,199810_208019,100,0
1,199810_74139,100,0
2,199810_9903,100,0
3,199810_242681,100,1
4,199810_18515,100,1
...,...,...,...
119995,249010_72192,100,0
119996,249010_86104,100,0
119997,249010_186634,100,1
119998,249010_293818,100,1


## KNN

In [28]:
# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)
%time algo.fit(trainset)



Computing the cosine similarity matrix...


MemoryError: Unable to allocate 652. GiB for an array with shape (295799, 295799) and data type float64

In [None]:
# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)
%time algo.fit(trainset)



In [None]:
prediction = algo.predict('E', 2) # E=user, 2=movie

## NMF - no genre

In [9]:
algo = NMF(n_factors = 16)
%time algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f78b4364b10>

In [12]:
# with open('model.pkl') as f: 
#     alg0_NMF = pickle.load(f)   # pickling error: UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80
# test 2 predictions
print(algo.predict('199810', '208019'))
print(algo.predict('199810', '74139'))


user: 199810     item: 208019     r_ui = None   est = 9.39   {'was_impossible': False}
user: 199810     item: 74139      r_ui = None   est = 12.98   {'was_impossible': False}


In [22]:
predictions = []
with open('ee627a-2019fall/testItem2.txt') as f:
    while True:
        next_n_lines = list(islice(f, 7))
        if not next_n_lines:
            break
        ratings = [] # reset for new user's list of songs
        cur_set = [x.strip('\n').split('|') for x in next_n_lines]
        test_user = int(cur_set[0][0]) # pull user ID. don't need song count 
        for track in cur_set[1:]:
            rating = algo.predict(str(test_user), track[0]).est
            rating_tup = (int(track[0]), rating)
            ratings.append(rating_tup)
        
        ratings.sort(key=lambda x:x[1]) # sort 6 songs by rating
        for i, j in enumerate(ratings):
            cur_dict = {}
            cur_dict['TrackID'] = f'{test_user}_{j[0]}'
            cur_dict['rating'] = j[1]
            if i < 3:
                cur_dict['Predictor'] = 0 
            else:
                cur_dict['Predictor'] = 1 
            predictions.append(cur_dict)

In [23]:
preditions_df = pd.DataFrame(predictions)
preditions_df

Unnamed: 0,TrackID,rating,Predictor
0,199810_242681,8.276465,0
1,199810_208019,9.392225,0
2,199810_105760,9.754371,0
3,199810_18515,10.584244,1
4,199810_9903,11.668439,1
...,...,...,...
119995,249010_110470,12.653526,0
119996,249010_86104,13.094187,0
119997,249010_72192,13.353449,1
119998,249010_262811,14.027582,1


In [25]:
preditions_df[['TrackID','Predictor']].to_csv('YahooMusic_predictions-nmf-AJS.csv', index=False)

## NMF - with genre

In [69]:
algo = NMF(n_factors = 16)
%time algo.fit(trainsetWgenres)

CPU times: user 8min 26s, sys: 1.51 s, total: 8min 28s
Wall time: 8min 28s


<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f77db4ed8d0>

In [72]:
# with open('model.pkl') as f: 
#     alg0_NMF = pickle.load(f)   # pickling error: UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80
# test 2 predictions
print(algo.predict('199810', 208019))
print(algo.predict('199810', 74139))


user: 199810     item: 208019     r_ui = None   est = 9.08   {'was_impossible': False}
user: 199810     item: 74139      r_ui = None   est = 11.57   {'was_impossible': False}


In [75]:
predictions = []
with open('ee627a-2019fall/testItem2.txt') as f:
    while True:
        next_n_lines = list(islice(f, 7))
        if not next_n_lines:
            break
        ratings = [] # reset for new user's list of songs
        cur_set = [x.strip('\n').split('|') for x in next_n_lines]
        test_user = int(cur_set[0][0]) # pull user ID. don't need song count 
        for track in cur_set[1:]:
            rating = algo.predict(str(test_user), int(track[0])).est
            rating_tup = (int(track[0]), rating)
            ratings.append(rating_tup)
        
        ratings.sort(key=lambda x:x[1]) # sort 6 songs by rating
        for i, j in enumerate(ratings):
            cur_dict = {}
            cur_dict['TrackID'] = f'{test_user}_{j[0]}'
            cur_dict['rating'] = j[1]
            if i < 3:
                cur_dict['Predictor'] = 0 
            else:
                cur_dict['Predictor'] = 1 
            predictions.append(cur_dict)

In [76]:
preditions_df = pd.DataFrame(predictions)
preditions_df

Unnamed: 0,TrackID,rating,Predictor
0,199810_242681,7.731504,0
1,199810_208019,9.078159,0
2,199810_18515,10.412370,0
3,199810_9903,10.933214,1
4,199810_105760,11.382126,1
...,...,...,...
119995,249010_186634,10.679229,0
119996,249010_293818,12.486086,0
119997,249010_110470,13.602687,1
119998,249010_72192,14.612461,1


In [77]:
preditions_df[['TrackID','Predictor']].to_csv('YahooMusic_predictions-nmf-genre-AJS.csv', index=False)

# Graveyard

## Process for networkx

In [4]:
df_tracks = pd.DataFrame()

# process tracks data file
for row in open('ee627a-2019fall/trackData2.txt'):
    row = row.strip('\n').split('|')
    G.add_node(row[0], attr = {"node_type": "track"}) # add track node
    G.add_edge(row[0], row[1], weight=100) # connect track to album
    G.add_node(row[1], attr = {"node_type": "album"}) # add album node
    G.add_node(row[2], attr = {"node_type": "artist"}) # add artist node
    if len(row) > 3:
        for genre in row[3:]:
            G.add_node(genre, attr = {"node_type": "genre"}) # add genre node
            G.add_edge(row[0], genre, weight=100) # connect each genre to the track

In [5]:
# process albums data file
for row in open('ee627a-2019fall/albumData2.txt'):
    row = row.strip('\n').split('|')
    G.add_edge(row[0], row[1], weight=100) # connect album to artist
    if len(row) > 2:
        for genre in row[2:]:
            G.add_node(genre, attr = {"node_type": "genre"}) # add genre node
            G.add_edge(row[0], genre, weight=100) # connect each genre to the album

In [6]:
# process training user data file
for row in open('ee627a-2019fall/trainItem2.txt'):
    if '|' in row:
        cur_user = row.strip('\n').split('|')[0] # pull user ID. don't need song count
        G.add_node(cur_user, attr = {"node_type": "user"}) # add user node
        continue # skip to the user's ratings
    row = row.strip('\n').split('\t')
    G.add_edge(cur_user, row[0], weight=int(row[1])) # connect user to song with rating as edge weight

In [7]:
print(len(G.nodes))
print(len(G.edges)) # 956820
print(len([1 for cc in nx.connected_components(G)]))


296101
13342506
5


### G2 selects the largest subgraph. appears to just drop 4 disconnected nodes
not currently needed because switched to astar, which is ok with weakly connected graphs  
don't want to process if not necessary because the 2nd graph takes about 1.5GB of memory

### Processing attempt 1. estimated time to completion of 20 days

### fill in missing netowrkx preds

In [None]:
# submit to: https://www.kaggle.com/c/ee627a-2019fall/submit
df_submission = pd.read_csv('ee627a-2019fall\\sample_submission.csv')
df_submission[['user', 'track']] = df_submission['TrackID'].str.split('_', 1, expand=True)

df_submission

In [None]:
### Fillin netowrkX remaining, remove dupes

In [91]:
df_networkx_preds = pd.read_csv('recommendations-combined.csv')
df_networkx_preds

Unnamed: 0,TrackID,Predictor
0,199814_122375,1
1,199814_189043,1
2,199814_122429,1
3,199814_52519,0
4,199814_232332,0
...,...,...
15637,199814_122375,0
15638,199814_232332,0
15639,199814_52519,1
15640,199814_122429,1


In [92]:
df_networkx_preds.drop_duplicates(subset=['TrackID'], keep='first', inplace=True)


In [93]:
df_networkx_preds

Unnamed: 0,TrackID,Predictor
0,199814_122375,1
1,199814_189043,1
2,199814_122429,1
3,199814_52519,0
4,199814_232332,0
...,...,...
15547,206132_108120,1
15548,206132_261653,1
15549,206132_205259,0
15550,206132_144483,0


In [96]:
# exted with other predictions
frames = [df_networkx_preds, preditions_df[['TrackID','Predictor']]]

df_networkx_preds = pd.concat(frames)
df_networkx_preds.drop_duplicates(subset=['TrackID'], keep='first', inplace=True)


In [98]:
df_networkx_preds

Unnamed: 0,TrackID,Predictor
0,199814_122375,1
1,199814_189043,1
2,199814_122429,1
3,199814_52519,0
4,199814_232332,0
...,...,...
119995,249010_186634,0
119996,249010_293818,0
119997,249010_110470,1
119998,249010_72192,1


In [97]:
df_networkx_preds.to_csv('recommendations-combined-dedup.csv', index=False)

## lightfm

In [106]:
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k

# Load the MovieLens 100k dataset. Only five
# star ratings are treated as positive.

data = fetch_movielens(min_rating=5.0)

# Instantiate and train the model
model = LightFM(loss='warp')
model.fit(data['train'], epochs=300, num_threads=8)

# Evaluate the trained model
test_precision = precision_at_k(model, data['test'], k=5).mean()


In [107]:
test_precision

0.045706734