In [54]:
from __future__ import print_function

First we load the functions and libraries necessary for this report.

In [68]:
run analysis_functions.ipynb #import all helper functions

In [16]:
# Read lastfm_9000_users.csv (missing-value detection disabled via na_filter=False)
# and discard the 'Unnamed: 0' column in a single chained expression.
df = pd.read_csv('lastfm_9000_users.csv', na_filter=False).drop(['Unnamed: 0'], axis=1)

The analysis is structured as follows:
1. Objective
2. Prepare data
3. Model Fitting, Tuning, and Evaluation
    - Benchmarks
        - Most Popular
        - ALS Matrix Factorization
    - LightFM
        - Vanilla FM / BPR
        - FM with User/Item Side Information
        - Parameter Tuning
4. Model Exploration
    - Metrics Used (NDCG, Recall, Precision, Coverage)
    - Performance of each Model (Table)
    - NDCG Metric by User Type 
        - Active/Non-Active (Aggregate Plays)
        - Diverse/Non-Diverse (Top 1000 Artists)
        - "Basic"/Non-"Basic" (Popular)
    - Scale:
        - NDCG by Size
        - Training Time / Predict Time
5. Conclusion / Next Steps

# 1. Objective

**TODO:** State the objective of this analysis.

# 2. Prepare Data

### 2a. Create Sparse Matrix from Dataset

In [17]:
# Build the plays matrix from the dataframe, then cast its entries to float.
plays_sparse = create_sparse_matrix(df)
plays_sparse = plays_sparse.astype('float')

# Report how sparse the resulting matrix is.
print('Matrix Sparsity:', calculate_sparsity(plays_sparse))

Creating sparse matrix...




('Matrix Sparsity:', 99.8965986346416)


### 2b. Split Data to Train/Test

Split the data into training and test sets while ensuring that each user still has some interactions left intact.

In [18]:
# Mask part of the plays matrix to form the training and test sets.
# NOTE(review): the meaning of the positional arguments 3 and 10 is defined by
# split_train_test_per_user in analysis_functions.ipynb -- confirm there.
train, test, user_count = split_train_test_per_user(plays_sparse, 3, 10)

masked_fraction = pct_masked(plays_sparse, train)
print("Percentage of original data masked:", masked_fraction)
print("Users masked:", user_count)

HBox(children=(IntProgress(value=0, max=9000), HTML(value=u'')))


('Percentage of original data masked:', 0.06548419166887459)
('Users masked:', 8980)


In [35]:
# train is item-by-user to accommodate the implicit and baseline training routines
train

<47102x9000 sparse matrix of type '<type 'numpy.float64'>'
	with 438337 stored elements in Compressed Sparse Row format>

# 3. Model Fitting, Tuning, and Evaluation

- Note on evaluation: use metrics
- what autotune does

## 3a. Benchmarks

### Baseline

Baseline recommends the most-popular artists to everyone.

In [36]:
# Most-popular baseline; n_recs is presumably the number of recommendations
# produced per user (defined in analysis_functions.ipynb -- confirm there).
model_baseline = Baseline(n_recs=20)
model_baseline.fit(train)

Fitting baseline...
[3.17738e+05 3.48600e+04 6.91220e+04 ... 2.80000e+01 2.70000e+01
 2.40000e+01]


No tuning is necessary because the baseline has no parameters. We then evaluate it on the test set below:

In [46]:
# Evaluate the baseline against the test set and report each metric as a percentage.
coverage, precision, recall, ndcg = evaluate(model_baseline, "baseline", test, plays_sparse)
for label, score in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("Coverage:", coverage),
    ("Average NDCG per User:", ndcg)
):
    print(label, score*100, '%')

Evaluating model...


HBox(children=(IntProgress(value=0, max=8980), HTML(value=u'')))


Precision: 1.1425389755 %
Recall: 7.61692650334 %
Coverage: 0.042461041994 %
Average NDCG per User: 4.533232513791051 %


### Model-Based (ALS)

Here we fit the model-based ALS matrix factorization using the `implicit` package from Homework 2, with the parameters that were found to be optimal in the HW 2 report.

In [40]:
# ALS matrix factorization with 30 latent factors and 0.01 regularization.
model_als = implicit.als.AlternatingLeastSquares(
    factors=30,
    regularization=0.01
)

# Fit on the item-by-user training matrix.
print("Fitting model...")
model_als.fit(train)

  0%|          | 0/15 [00:00<?, ?it/s]

Fitting model...


100%|██████████| 15.0/15 [00:05<00:00,  2.50it/s]


In [47]:
# Evaluate the ALS model against the test set and report each metric as a percentage.
coverage, precision, recall, ndcg = evaluate(model_als, "implicit", test, plays_sparse)
for label, score in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("Coverage:", coverage),
    ("Average NDCG per User:", ndcg)
):
    print(label, score*100, '%')

Evaluating model...


HBox(children=(IntProgress(value=0, max=8980), HTML(value=u'')))


Precision: 3.2065701559 %
Recall: 21.3771343727 %
Coverage: 8.50494671139 %
Average NDCG per User: 11.622170561158734 %


### LightFM (without side information)

Here we fit the LightFM model.

In [69]:
from lightfm import LightFM

# Plain LightFM model (no side information) trained with the BPR loss.
model_fm_vanilla = LightFM(no_components=30, loss='bpr')

print("Fitting model...")
# train is item-by-user, so it is transposed before being handed to LightFM.
interactions = train.T.tocsr()
model_fm_vanilla.fit(interactions, user_features=None, item_features=None)

Fitting model...


<lightfm.lightfm.LightFM at 0x10e5f9850>

Now we try to find the best hyperparameter for this model and use the model with the best hyperparameter to get our results.

In [72]:
run analysis_functions.ipynb #import all helper functions

In [73]:
# Evaluate the vanilla LightFM model against the test set and report each metric as a percentage.
coverage, precision, recall, ndcg = evaluate(model_fm_vanilla, "lightfm", test, plays_sparse)
for label, score in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("Coverage:", coverage),
    ("Average NDCG per User:", ndcg)
):
    print(label, score*100, '%')

Evaluating model...
lightfm


HBox(children=(IntProgress(value=0, max=8980), HTML(value=u'')))

[-0.4830966  -0.53468198 -0.4948459  ... -0.51602066 -0.54535168
 -0.5218792 ]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745 14650
  4846  2991 11656 19278  1888  6606   570  2678]
[-0.46115834 -0.51451099 -0.47216213 ... -0.49439782 -0.52464944
 -0.50129426]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745 19278
  2991  4846 14650 11656  1888  6606  2678   570]
[-0.48459414 -0.53598243 -0.49432719 ... -0.51691115 -0.54671323
 -0.52394909]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  2991
 19278 14650  4846  1888  6606 11656   570  2678]
[-0.47470036 -0.52569991 -0.48519906 ... -0.50750804 -0.53694302
 -0.51421481]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  2991
  4846 19278 11656 14650  1888  6606  4491  5765]
[-0.4756223  -0.526977   -0.48611683 ... -0.50929314 -0.53776604
 -0.51452559]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 11656 19278 14650  1888  6606  26

[   82    55  2473   330 13704   772   148 10103 27255  7745  4958  2991
 19278  4846 14650 11656  6606  1888  5765   570]
[-0.48005024 -0.53233176 -0.48976439 ... -0.5130111  -0.54342037
 -0.52080768]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278 11656  4846 14650  1888  6606   570  2678]
[-0.462881   -0.51459175 -0.47205666 ... -0.49622649 -0.52520913
 -0.50214583]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278  4846 11656 14650  1888  6606  2678  5765]
[-0.29803839 -0.35045317 -0.30733386 ... -0.33144835 -0.36056215
 -0.33859593]
[   82    55  2473   330 13704   772   148 10103 27255  7745  4958  2991
 19278  4846 14650  1888 11656  5765  2678  6606]
[-0.45229122 -0.5043413  -0.46220291 ... -0.48613989 -0.51517737
 -0.49291191]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 19278 14650 11656  1888  2678  6606  5765]
[-0.58013636 -0.63367724 -0.59063673 ... -0.61401826 -0.64274526
 -0.

[-0.44952977 -0.50182015 -0.45994893 ... -0.4814238  -0.51139379
 -0.49038631]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745 19278
  2991  4846 11656 14650  1888  2678   570  5765]
[-0.48166007 -0.53313273 -0.49095261 ... -0.51455438 -0.54326946
 -0.52025515]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  4846
 19278  2991 14650  6606 11656  1888  5765   570]
[-0.48530224 -0.53718686 -0.49643144 ... -0.51891208 -0.54774576
 -0.52498686]
[   82    55  2473   330 13704   772 10103   148  4958 27255  7745 19278
  2991  4846 11656  1888 14650   570  6606  2678]
[-0.3678526  -0.41821957 -0.37682536 ... -0.4000701  -0.428992
 -0.40691206]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278  4846 14650 11656  1888  2678  6606   570]
[-0.52871752 -0.5797267  -0.53843755 ... -0.56088483 -0.59042692
 -0.56769478]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278  4846 14650 11656  1888  6606   570

[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  4846
  2991 19278 14650 11656  1888  6606  5765  2678]
[-0.4674544  -0.5184105  -0.4769772  ... -0.50070029 -0.52969193
 -0.50678498]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 19278 14650 11656  1888  6606  2678   570]
[-0.51234186 -0.5631851  -0.5220173  ... -0.54532027 -0.57498074
 -0.55126631]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 14650 19278 11656  1888  6606  2678  5765]
[-0.46502408 -0.51626372 -0.47625303 ... -0.49833545 -0.52693224
 -0.50513262]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745 19278
  2991  4846 11656 14650  1888  6606  4491  2678]
[-0.59378862 -0.6447041  -0.60273665 ... -0.62554079 -0.65611267
 -0.63376951]
[   82    55  2473   330 13704   772   148 10103 27255  7745  4958  2991
 19278  4846 14650  1888 11656  6606  2678  5765]
[-0.48178884 -0.534172   -0.49185312 ... -0.51483619 -0.54372734
 -0.

[-0.47113973 -0.52256221 -0.4810034  ... -0.50343746 -0.53355169
 -0.51107258]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  4846
  2991 19278 11656 14650  1888  6606   570  5765]
[-0.39718372 -0.44870192 -0.40714896 ... -0.4296895  -0.45858258
 -0.43696705]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 14650 19278  4846 11656  1888  6606   570  5765]
[-0.47819445 -0.52916479 -0.48789433 ... -0.51182359 -0.54080182
 -0.51810563]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 19278  1888 11656 14650  6606  2678  5765]
[-0.4658955  -0.5192582  -0.47693419 ... -0.49953997 -0.52927059
 -0.50618351]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745 19278
  2991 14650  4846 11656  6606  1888  5765   570]
[-0.39531302 -0.44718301 -0.4043968  ... -0.42852446 -0.45792171
 -0.43550476]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  2991
 19278  4846 14650 11656  1888  6606  57

[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 19278 14650 11656  1888  6606  2678  5765]
[-0.45054367 -0.50354004 -0.46166265 ... -0.48372084 -0.51320988
 -0.48921341]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  2991
 19278  4846 14650 11656  6606  1888  5765  2678]
[-0.47233742 -0.52513999 -0.48280177 ... -0.50457644 -0.53485429
 -0.51193911]
[   82    55  2473   330 13704   772   148 10103 27255  7745  4958 14650
  2991  4846 19278 11656  6606  1888  5765   570]
[-0.48047331 -0.53210968 -0.49089792 ... -0.51383466 -0.542382
 -0.51971042]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 14650  4846 11656 19278  1888  6606  2678   570]
[-0.47929567 -0.53243881 -0.48922676 ... -0.51331866 -0.54183692
 -0.51940757]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278  4846 14650 11656  1888  6606  2678  5765]
[-0.63841069 -0.69080567 -0.64869881 ... -0.67097396 -0.70184803
 -0.67

[-0.37320942 -0.42670986 -0.38354066 ... -0.40727195 -0.4360773
 -0.41247994]
[   82    55  2473   330 13704   772   148 10103 27255  7745  4958  2991
 19278  4846 14650 11656  1888  6606  2678  5765]
[-0.43186009 -0.48514223 -0.44095796 ... -0.4660635  -0.49472833
 -0.47228599]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278  4846 14650 11656  6606  1888   570  2678]
[-0.45778465 -0.51051491 -0.46946052 ... -0.49130949 -0.52092022
 -0.49775037]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278 14650 11656  4846  1888   570  6606  5765]
[-0.49162713 -0.54179615 -0.50156254 ... -0.52399766 -0.5533092
 -0.53047246]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  4846
  2991 14650 19278 11656  1888  6606  2678   570]
[-0.51602173 -0.57025445 -0.5261206  ... -0.55010802 -0.57974511
 -0.55615526]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 14650 19278  6606 11656  5765  1888

[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278  4846 14650 11656  1888  6606  2678   570]
[-0.47254321 -0.52306288 -0.48198527 ... -0.50605881 -0.5339945
 -0.51323694]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278  4846  1888 11656 14650  2678  6606   570]
[-0.50774705 -0.5606724  -0.51774192 ... -0.54206312 -0.56951028
 -0.54722995]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  2991
 19278  1888  4846 14650 11656  6606   570  5765]
[-0.4031564  -0.45622882 -0.41318825 ... -0.43738019 -0.4660432
 -0.44489831]
[   82    55  2473   330 13704   772   148 10103 27255  7745  2991  4958
 19278  1888  4846 11656 14650  6606   570  2678]
[-0.49829569 -0.54979199 -0.50828898 ... -0.53100437 -0.56017888
 -0.53821427]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278  4846 14650 11656  1888  6606   570  5765]
[-0.46248403 -0.51464397 -0.47336617 ... -0.4965001  -0.52409464
 -0.50

[-0.45440757 -0.50655657 -0.4646149  ... -0.48798722 -0.51707649
 -0.4937385 ]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  4846
  2991 19278 11656 14650  6606  1888  2678   570]
[-0.5274058  -0.57922262 -0.53730762 ... -0.56118321 -0.58953446
 -0.56685793]
[   82    55  2473   330 13704   772   148 10103 27255  7745  4958  2991
  4846 19278 11656 14650  1888  6606  2678   570]
[-0.5599277  -0.6108399  -0.56979352 ... -0.59202558 -0.62089992
 -0.59899056]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 19278 11656 14650  1888   570  2678  5765]
[-0.30682594 -0.35964826 -0.31804371 ... -0.34143898 -0.36910653
 -0.3470993 ]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  2991
 19278  4846  1888 11656 14650  6606  2678   570]
[-0.48352057 -0.53655642 -0.49398881 ... -0.51679254 -0.54726011
 -0.52409893]
[   82    55  2473   330 13704   772   148 10103 27255  7745  4958  2991
 19278 11656  4846  1888 14650   570  66

[-0.44975415 -0.50275254 -0.46029755 ... -0.48346633 -0.51343626
 -0.49053171]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745 19278
  2991  4846 11656 14650  1888  2678  6606  5765]
[-0.48683634 -0.53843194 -0.49692541 ... -0.51962644 -0.54827482
 -0.52690268]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  4846
  2991 19278 14650 11656  1888  6606  4491  5765]
[-0.50034297 -0.55146617 -0.508744   ... -0.53428483 -0.56243831
 -0.54046196]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278 14650  4846 11656  1888  6606  2678  5765]
[-0.50494868 -0.55537474 -0.51504701 ... -0.53641176 -0.5672555
 -0.54384285]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  4846
  2991 19278 14650 11656  1888  6606  5765  2678]
[-0.48506531 -0.53723007 -0.49500591 ... -0.51815617 -0.54764938
 -0.52521431]
[   82    55  2473   330 13704   772   148 10103 27255  7745  4958  2991
  4846 19278 11656 14650  1888  5765  267

[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 14650 19278 11656  6606  1888  2678  5765]
[-0.55104047 -0.60181689 -0.56162506 ... -0.58441848 -0.61338031
 -0.59146821]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 11656 19278  1888 14650   570  5765  2678]
[-0.28043041 -0.33168191 -0.28907156 ... -0.31470978 -0.34182897
 -0.32033405]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  2991
  4846 19278 14650  1888 11656  6606  2678  5765]
[-0.47496071 -0.52653837 -0.48412097 ... -0.50746739 -0.53750968
 -0.51505846]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278 14650  4846 11656  6606  1888   570  5765]
[-0.54989552 -0.60239065 -0.55932182 ... -0.58353931 -0.61215609
 -0.58933169]
[   82    55  2473   330 13704   772   148 10103 27255  7745  4958  2991
 19278  4846 14650  1888  6606 11656  2678  5765]
[-0.46872857 -0.52143949 -0.47898555 ... -0.5029003  -0.532543
 -0.50

[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278  4846 14650 11656  6606  1888  5765  2678]
[-0.48701319 -0.53959846 -0.49622044 ... -0.51939034 -0.54950726
 -0.52731472]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  2991
  4846 19278 14650 11656  6606  1888  2678  5765]
[-0.48140958 -0.53304231 -0.49170205 ... -0.51481503 -0.54378492
 -0.51970208]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 19278 14650 11656  6606  1888  5765  4491]
[-0.56904602 -0.62048191 -0.58015275 ... -0.6028893  -0.63072592
 -0.60898161]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 19278 14650  1888 11656  6606  2678  4491]
[-0.46329096 -0.51563549 -0.47492748 ... -0.49734494 -0.52691305
 -0.50308245]
[   82    55  2473   330 13704   772 10103   148  4958 27255  7745  2991
 19278  4846 14650 11656  6606  1888   570  2678]
[-0.47137505 -0.5229004  -0.48086098 ... -0.50418651 -0.53360361
 -0.

[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278  4846 14650  1888 11656  6606   570  2678]
[-0.46612531 -0.51856101 -0.47682759 ... -0.49930876 -0.52786332
 -0.50536776]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
 19278  4846 14650  1888 11656  6606  2678   570]
[-0.44513601 -0.49556017 -0.45590588 ... -0.47830895 -0.50721151
 -0.48447236]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  2991
  4846 19278 11656 14650  1888  6606   570  2678]
[-0.20291585 -0.25569391 -0.21465087 ... -0.23682609 -0.26700366
 -0.24238306]
[   82    55  2473   330 13704   772   148 10103 27255  4958  7745  2991
  4846 19278 11656 14650  6606  1888  2678  5765]
[-0.46211839 -0.51447678 -0.47352314 ... -0.49563053 -0.52456522
 -0.50245273]
[   82    55  2473   330 13704   772 10103   148 27255  4958  7745  4846
  2991 19278 14650 11656  1888  6606  4491  2678]
[-0.48215365 -0.53400975 -0.49226531 ... -0.51485097 -0.54478496
 -0.

KeyboardInterrupt: 

### LightFM (with side information)

In [None]:
# TODO: build the user/item side-information matrices here. They are still
# None placeholders, so for now this trains without any side information.
user_features = None
item_features = None

model_fm_features = LightFM(no_components=30, loss='warp')

# Train Model
print("Fitting model...")
# Fit the model created above (the original called `model.fit`, which trains
# whatever unrelated `model` happens to be in the kernel, or fails on a fresh
# one) and pass the feature variables so that filling them in takes effect.
# train is item-by-user, so it is transposed before being handed to LightFM.
model_fm_features.fit(train.T.tocsr(),
                      user_features=user_features,
                      item_features=item_features)

Now we try to find the best hyperparameter for this model and use the model with the best hyperparameter to get our results.

### Summary of Performance Results (Table)

# 4. Model Exploration

Next, we explore these models further. How do they perform across different input sizes and different user populations?

## 4a. Performance by User Type

### Active/Non-Active

### Diverse/Non-Diverse

### Basic/Non-Basic

## 4b. Performance by Input Size 

### Accuracy/NDCG

### Training/Predict Time

# 5. Conclusion / Next Steps

# OLD STUFF: 

# Cross Validation and Parameter Tuning (k-fold)

In this section, we use k-fold cross validation to tune hyperparameters and evaluate our models.

### Splitting into test and training sets

In [16]:
# Cross-validation split: with cross_valid=True the helper returns lists of
# train/test matrices (presumably one pair per fold -- confirm in
# analysis_functions.ipynb).
k = 5
train_list, test_list, user_count = split_train_test_per_user(
    plays_sparse, k, 20, cross_valid=True
)




## Evaluate and tune ALS

Tuning two parameters: number of latent factors and the regularization factor. 

For latent factors, we try values [10,20,30,40,50,60]

For regularization factors, we try [.01,.03,.05,.07]

In [None]:
# Time a grid search over the ALS hyperparameters described above
# (latent factors x regularization factor).
start = time.time()
model = implicit.als.AlternatingLeastSquares
factor_grid = [10, 20, 30, 40, 50, 60]
reg_grid = [.01, .03, .05, .07]
ndcg_list, heatmap_list = auto_tune_parameter(
    4, 20, model, plays_sparse, factor_grid, reg_grid, param3=None
)
stop = time.time()

# Elapsed wall-clock time of the search, in seconds.
total = stop - start

In [None]:
# Heatmap of the tuning results: rows are latent-factor counts, columns are
# regularization factors.
# NOTE(review): heatmap_list[2] is assumed to hold the NDCG grid -- confirm
# against auto_tune_parameter.
reg_labels = ['0.01', '0.03', '0.05', '0.07']
factor_labels = ['10', '20', '30', '40', '50', '60']

sns.set_style("whitegrid")
sns.heatmap(
    heatmap_list[2],
    xticklabels=reg_labels,
    yticklabels=factor_labels,
    cbar_kws={'label': 'NDCG Score'}
)
plt.title("ALS NDCG scores according to Model Parameters")
plt.xlabel("Regularization Factor")
plt.ylabel("Number of Latent Factors")
plt.show()

Here we analyze the scalability of ALS by looking at datasets with 9k, 20k, 60k, and 150k users:

In [None]:
# Load the larger datasets used for the scaling tests; these expanded CSVs are
# not available in the repo. The files are read in the order listed.
files9k, files20k, files60k, files150k = (
    pd.read_csv(csv_path, na_filter=False)
    for csv_path in (
        'lastfm_9000_users.csv',
        'lastfm_20k_users.csv',
        'lastfm_60k_users.csv',
        'lastfm_150k_users.csv'
    )
)

# The 40k-user dataset is a sample drawn from the 150k one.
files40k = get_users(files150k, 40000)

# Datasets in increasing size, as consumed by the scaling loop.
files = [files9k, files20k, files40k, files60k]

In [None]:
# Compute the recall and ndcg using optimal parameters for the ALS model on different dataset sizes
size = [9000, 20000, 40000, 60000]
ndcg_size = []
recall_size = []
for users_df in files:
    model = implicit.als.AlternatingLeastSquares(factors=30, regularization=0.01)

    # Use loop-local names so the notebook-level plays_sparse / train / test
    # (built from the main dataset) are not overwritten by the last, largest
    # dataset -- later cells still pair plays_sparse with the original df.
    sized_plays = create_sparse_matrix(users_df).astype('float')
    print('Matrix Sparsity:', calculate_sparsity(sized_plays))

    sized_train, sized_test, masked_users = split_train_test_per_user(sized_plays, 4, 20)

    # train model
    print("Fitting model...")
    model.fit(sized_train, show_progress=True)

    # evaluate() takes the model type as its second argument and returns four
    # metrics, as in the other evaluation cells of this notebook; the original
    # three-argument call unpacked into two names no longer matches it.
    coverage, precision, recall, ndcg = evaluate(model, "implicit", sized_test, sized_plays)
    print("Recall:",recall*100,'%')
    print("Average NDCG:",ndcg*100,'%')
    recall_size.append(recall)
    ndcg_size.append(ndcg)

In [None]:
# Point plot of NDCG against the number of users for the ALS model.
ndcg_size_df = pd.DataFrame({'N': size, 'NDCG': ndcg_size})
g = sns.pointplot(data=ndcg_size_df, x='N', y='NDCG')
plt.xlabel('Number Of Users')
plt.title('NDCG by Input Size for ALS')
plt.show()

Here we analyze the catalog coverage of the ALS model:

In [None]:
# Calculate catalog coverage of ALS model with optimal parameters:
# the share of all items that appear in at least one user's top-20 list.
model = implicit.als.AlternatingLeastSquares(factors=30, regularization=0.01)
model.fit(plays_sparse)
users = list(df.user_id.unique())

# Transpose once instead of once per user; recommend() is given the
# transposed (user-by-item) matrix.
user_items = plays_sparse.T.tocsr()

# A set gives constant-time de-duplication (the list membership test was
# quadratic in the number of distinct recommended items).
catalog = set()
for user_index in range(len(users)):
    for item_id, _score in model.recommend(user_index, user_items, N=20,
                                           filter_already_liked_items=True):
        catalog.add(item_id)

# The matrix is item-by-user (see the note on `train`), so the number of items
# is shape[0]; the original divided by shape[1], i.e. the number of users.
# float() avoids integer division truncating the ratio to 0 on Python 2.
n_items = plays_sparse.shape[0]
print('Catalog Coverage is', len(catalog) / float(n_items))

# Tune ALS

Below we will tune the hyperparameters of ALS from a given range of hyperparameters.

In [None]:
# Grid search over ALS latent factors x regularization factor.
model = implicit.als.AlternatingLeastSquares
factor_grid = [10, 20, 30, 40, 50, 60]
reg_grid = [.01, .03, .05, .07]
ndcg_list, heatmap_list = auto_tune_parameter(
    4, 20, model, plays_sparse, factor_grid, reg_grid, param3=None
)

# Heatmap of the results: rows are latent-factor counts, columns are
# regularization factors.
# NOTE(review): heatmap_list[2] is assumed to hold the NDCG grid -- confirm
# against auto_tune_parameter.
sns.set_style("whitegrid")
sns.heatmap(
    heatmap_list[2],
    xticklabels=['0.01', '0.03', '0.05', '0.07'],
    yticklabels=['10', '20', '30', '40', '50', '60'],
    cbar_kws={'label': 'NDCG Score'}
)
plt.title("ALS NDCG scores according to Model Parameters")
plt.xlabel("Regularization Factor")
plt.ylabel("Number of Latent Factors")
plt.show()

# Input Size vs Training Time

Below you can run the code using a larger dataset to evaluate training time as a function of input size.

In [None]:
#helper function to grab needed users
def get_users(df, n, seed=None):
    """Return the rows of `df` belonging to `n` randomly chosen users.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``user_id`` column.
    n : int
        Number of distinct users to sample, without replacement.
    seed : int, optional
        Seed for a private ``numpy.random.RandomState`` so the sample can be
        reproduced. When None (the default) the global NumPy random state is
        used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Every row whose ``user_id`` was sampled, in the original row order,
        with the index reset to 0..m-1.

    Raises
    ------
    ValueError
        Raised by NumPy's ``choice`` when `n` exceeds the number of distinct
        users, since sampling is done without replacement.
    """
    # np.random and RandomState expose the same `choice` method.
    rng = np.random if seed is None else np.random.RandomState(seed)

    unique_ids = df["user_id"].unique()
    sampled_ids = rng.choice(unique_ids, size=n, replace=False)

    #grab rows with sample user id
    return df[df["user_id"].isin(sampled_ids)].reset_index(drop=True)

#in order to run this analysis, you need to download the 150k data at: 
#https://www.dropbox.com/s/qd8rnlxsuq0rjll/last_fm_bigger_data.zip?dl=0

# Read the 150k-user dataset and discard the 'Unnamed: 0' column in a single
# chained expression.
df_150 = pd.read_csv('lastfm_150k_users.csv', na_filter=False).drop(['Unnamed: 0'], axis=1)

# ALS

In [None]:
#plot ALS training time vs input size
#calculate training time for different input sizes for ALS
import time

import seaborn as sns  # the original `import sns` does not import seaborn

size = [9000,20000,40000,60000,80000]
train_time = list()

#get train data for each input size
for n_users in size:
    # Loop-local names keep the notebook-level `df`, `plays_sparse` and
    # `users` intact for any cell that is re-run afterwards.
    sample_df = get_users(df_150, n_users)
    # Cast to float like the other cells that build a plays matrix.
    sample_plays = create_sparse_matrix(sample_df).astype('float')

    # ALS with 50 latent factors (this comment used to say K-Nearest Neighbors)
    model = implicit.als.AlternatingLeastSquares(factors=50)

    # train model; only the fit call itself is timed
    print("Fitting model...")
    start = time.time()
    model.fit(sample_plays, show_progress=True)
    stop = time.time()
    total = stop-start
    train_time.append(total)

#plot
timing_df = pd.DataFrame({'n':size,'time':train_time})
sns.pointplot(x="n",y="time",data=timing_df)
# The timed model is ALS, so the title says ALS rather than KNN.
plt.title('Training Time by Input Size for ALS')
plt.xlabel('Number of Users')
plt.ylabel('Time (s)')
plt.show()