In [41]:
import pandas as pd
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k, ranking_metrics_at_k

from src.algorithm.als.sparsedataset import SparseDataset
from model_selection import GridSearchCV, KFold
from src.algorithm.als.model_wrapper import ImplicitModelWrapper

from implicit.nearest_neighbours import CosineRecommender, normalize, BM25Recommender, TFIDFRecommender

import matplotlib.pyplot as plt

plt.rcParams['figure.facecolor'] = 'white'

In [9]:
dataset, test = SparseDataset.from_csv("C:/Users/TS/PycharmProjects/DS1-RecommendationSystems/data.csv",
                                 user="user", item="subreddit", rating='count')

Loading csv.
Creating pivot.


In [78]:
model = AlternatingLeastSquares

parameter = dict(iterations=[10],
                 factors=[64],
                 alpha=[1],
                 regularization=[0.1],
                 )
grid = GridSearchCV(algo=model,
                    param_grid=parameter,
                    cv=5,  # Number of folds in cross validation
                    eval_k=20,  # Number of Top items to check in validation
                    metrics=['map', 'precision'],  # the metrics to use
                    random_state=0)
print("Number of parameters combinations in grid:", len(grid))

Number of parameters combinations in grid: 1


In [77]:
grid.fit(dataset.item_user)
grid.get_result(show=False)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

{'param_set_0': {'params': {'alpha': 1,
   'factors': 64,
   'iterations': 10,
   'regularization': 0.1},
  'Fold 0': {'map': 0.12847238237923747, 'precision': 0.23237576538129906},
  'Fold 1': {'map': 0.12816494833311556, 'precision': 0.23295454545454544},
  'Fold 2': {'map': 0.1311190490954621, 'precision': 0.23469793430963026},
  'Fold 3': {'map': 0.1283675372795771, 'precision': 0.232661375978387},
  'Fold 4': {'map': 0.13022042512699, 'precision': 0.2335801393728223},
  'mean': {'map': 0.12926886844287644, 'precision': 0.2332539520993368},
  'std': {'map': 0.001182713498233923, 'precision': 0.0008250436922770743}}}

In [52]:
inf = grid.get_result(show=False, filter_metric='map')
df = pd.DataFrame(inf)

Unnamed: 0,alpha,factors,iterations,regularization,Fold 0,Fold 1,Fold 2,Fold 3,Fold 4,mean,std
0,1,64,5,0.01,0.12584,0.124978,0.123817,0.126122,0.12597,0.125346,0.000861
1,1,64,5,1.0,0.126453,0.124971,0.124607,0.125668,0.126584,0.125657,0.000783
2,1,64,5,0.001,0.124098,0.120606,0.122643,0.123171,0.122291,0.122562,0.001152
3,1,64,5,0.1,0.12913,0.127581,0.127489,0.128568,0.12825,0.128203,0.000615


In [5]:
df = pd.read_csv("T:/SubredditDataset/Gridsearch_new.csv")
best = df.iloc[df['mean'].argmax()]
print(best)

alpha              1.000000
factors           64.000000
iterations        10.000000
regularization     0.100000
Fold 0             0.130713
Fold 1             0.127383
Fold 2             0.128475
Fold 3             0.129593
Fold 4             0.129868
mean               0.129206
std                0.001159
Name: 213, dtype: float64


In [48]:
# , **grid.get_best()['map'][1]
best_model = AlternatingLeastSquares(regularization=0.01,
                                     iterations=10,
                                     factors=32)
best_model.fit(dataset.item_user)

  0%|          | 0/10 [00:00<?, ?it/s]

In [49]:
reddit = "Python"
sitems = best_model.similar_items(dataset.get_item_id(reddit), N=10)

for i, (idx, dist) in enumerate(sitems):
    print("{0:<3}{1:<20}{2:.3f}".format(i + 1, dataset.get_item(idx), dist))


1  Python              1.000
2  learnpython         0.939
3  learnprogramming    0.906
4  webdev              0.882
5  programming         0.871
6  web_design          0.863
7  programminghorror   0.843
8  javascript          0.831
9  linux4noobs         0.817
10 Wordpress           0.817


In [18]:
user = "-ah"  # -ah IncognitoCumShot
userid = dataset.get_user_id(user)

rec = best_model.recommend(userid, dataset.user_item, N=10, filter_already_liked_items=True)

for i, (idx, dist) in enumerate(rec):
    print("{0:<3}{1:<20}{2:.3f}".format(i + 1, dataset.get_item(idx), dist))
print("-" * 30)
print("True feedback:")
for subreddit, rating in sorted(zip(dataset.user_item.getrow(userid).indices,
                                    dataset.user_item.getrow(userid).data), key=lambda x: x[1],
                                reverse=True):
    print("{0:<23}{1:<3}".format(dataset.get_item(subreddit), rating))

1  britishproblems     0.849
2  UKPersonalFinance   0.723
3  CoronavirusUK       0.704
4  london              0.704
5  Scotland            0.703
6  politics            0.628
7  soccer              0.554
8  LegalAdviceUK       0.541
9  brexit              0.521
10 videos              0.508
------------------------------
True feedback:
unitedkingdom          661
ukpolitics             89 
europe                 46 
worldnews              9  
CasualUK               6  
ModSupport             6  
PoliticalDiscussion    6  
Roadcam                6  
AskUK                  5  
de                     5  
news                   3  
Showerthoughts         2  
aww                    2  
france                 2  
nottheonion            2  
todayilearned          2  
AskReddit              1  
ich_iel                1  
space                  1  


Testing Cosine Neighbours

In [44]:

neighbour_model = TFIDFRecommender

parameter = dict(K=list(range(10, 201, 10)))
grid = GridSearchCV(algo=neighbour_model,
                    param_grid=parameter,
                    cv=5,  # Number of folds in cross validation
                    eval_k=10,  # Number of Top items to check in validation
                    metrics=['map', 'precision'],  # the metrics to use
                    random_state=0)
print("Number of parameters combinations in grid:", len(grid))

Number of parameters combinations in grid: 20


In [45]:
grid.fit(dataset.item_user.astype(float))
grid.get_result(show=False)


  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

{'param_set_0': {'params': {'K': 10},
  'Fold 0': {'map': 0.1018809651713145, 'precision': 0.19100337450852914},
  'Fold 1': {'map': 0.10188809080666979, 'precision': 0.19365173796377202},
  'Fold 2': {'map': 0.10275410269668374, 'precision': 0.19302686270860475},
  'Fold 3': {'map': 0.10203132480131955, 'precision': 0.1926793267262872},
  'Fold 4': {'map': 0.1029855062968817, 'precision': 0.19477654603300856},
  'mean': {'map': 0.10230799795457386, 'precision': 0.19302756958804032},
  'std': {'map': 0.0004676011204863614, 'precision': 0.001238316928469858}},
 'param_set_1': {'params': {'K': 20},
  'Fold 0': {'map': 0.1098415311782906, 'precision': 0.19788861025974427},
  'Fold 1': {'map': 0.11056044964762489, 'precision': 0.2006544088940118},
  'Fold 2': {'map': 0.11086072589406336, 'precision': 0.19965258390719026},
  'Fold 3': {'map': 0.1111675342446831, 'precision': 0.20108284442018828},
  'Fold 4': {'map': 0.11199669515824721, 'precision': 0.20272419964207597},
  'mean': {'map': 0

In [47]:
inf = grid.get_result(show=False, filter_metric='map')
df = pd.DataFrame(inf)
#df.to_csv("nearst_neigh_grid.csv", index=False)
df

Unnamed: 0,K,Fold 0,Fold 1,Fold 2,Fold 3,Fold 4,mean,std
0,10,0.101881,0.101888,0.102754,0.102031,0.102986,0.102308,0.000468
1,20,0.109842,0.11056,0.110861,0.111168,0.111997,0.110885,0.000709
2,30,0.114693,0.114827,0.115791,0.115213,0.116702,0.115445,0.000735
3,40,0.116996,0.117111,0.117527,0.117714,0.119096,0.117689,0.000751
4,50,0.117962,0.118128,0.11879,0.118818,0.120292,0.118798,0.000822
5,60,0.118619,0.119025,0.119577,0.119666,0.121318,0.119641,0.000921
6,70,0.119199,0.119618,0.119685,0.119976,0.121608,0.120017,0.000833
7,80,0.119442,0.119853,0.120007,0.120279,0.121923,0.120301,0.000855
8,90,0.119608,0.120095,0.120352,0.12066,0.122227,0.120588,0.000889
9,100,0.11981,0.120209,0.120646,0.120801,0.12251,0.120795,0.000925


In [42]:
m = TFIDFRecommender(K=200)
m.fit(dataset.item_user.astype(float))

  0%|          | 0/2483 [00:00<?, ?it/s]

In [43]:
reddit = "Python"
sitems = m.similar_items(dataset.get_item_id(reddit), N=10)

for i, (idx, dist) in enumerate(sitems):
    print("{0:<3}{1:<20}{2:.3f}".format(i + 1, dataset.get_item(idx), dist))

1  Python              1.000
2  learnpython         0.338
3  programming         0.173
4  CryptoMarkets       0.154
5  learnprogramming    0.147
6  ProgrammerHumor     0.142
7  linux               0.104
8  javascript          0.073
9  linux4noobs         0.068
10 raspberry_pi        0.066


<3224x34563 sparse matrix of type '<class 'numpy.float64'>'
	with 1284519 stored elements in COOrdinate format>