In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k, ranking_metrics_at_k
from implicit.nearest_neighbours import CosineRecommender, normalize, BM25Recommender, TFIDFRecommender

from model_selection import GridSearchCV, KFold
from src.algorithm.als.model_wrapper import ImplicitModelWrapper
from src.algorithm.als.sparsedataset import SparseDataset

# Notebook-wide plotting default: every figure gets a white face colour.
plt.rcParams.update({'figure.facecolor': 'white'})

In [2]:
# Load the user/subreddit interaction counts from CSV.
# The file location can be overridden with the REDDIT_DATA_CSV environment variable so the
# notebook is not tied to one machine's directory layout; when the variable is unset the
# original path is used unchanged.
DATA_CSV = os.environ.get(
    "REDDIT_DATA_CSV",
    "C:/Users/TS/PycharmProjects/DS1-RecommendationSystems/data.csv",
)
# from_csv returns two objects; only `dataset` is used by the cells visible in this notebook.
# NOTE(review): `test` is presumably a held-out split - confirm against SparseDataset.from_csv.
dataset, test = SparseDataset.from_csv(DATA_CSV, user="user", item="subreddit", rating='count')

Loading csv.
Creating pivot.


In [5]:
# Configure the cross-validated hyper-parameter search for ALS.
# The class object itself (not an instance) is handed to the search.
model = AlternatingLeastSquares

# Candidate values per parameter; two alpha values -> two combinations in total.
parameter = {
    "iterations": [10],
    "factors": [64],
    "alpha": [1, 10],
    "regularization": [0.1],
}

grid = GridSearchCV(
    algo=model,
    param_grid=parameter,
    cv=5,                          # number of cross-validation folds
    eval_k=20,                     # number of top items checked during validation
    metrics=['map', 'precision'],  # metrics to report
    random_state=12,
)
n_combinations = len(grid)
print("Number of parameters combinations in grid:", n_combinations)

Number of parameters combinations in grid: 2


In [6]:
# Run the ALS grid search on the item-user matrix.
train_matrix = dataset.item_user
grid.fit(train_matrix)
# Final expression of the cell: the collected results become the cell output.
grid.get_result(show=False)

Train 894138 float64 Test 223535 float64


  0%|          | 0/10 [00:00<?, ?it/s]

Train 894138 float64 Test 223535 float64


  0%|          | 0/10 [00:00<?, ?it/s]

Train 894138 float64 Test 223535 float64


  0%|          | 0/10 [00:00<?, ?it/s]

Train 894139 float64 Test 223534 float64


  0%|          | 0/10 [00:00<?, ?it/s]

Train 894139 float64 Test 223534 float64


  0%|          | 0/10 [00:00<?, ?it/s]

Train 894138 float64 Test 223535 float64


  0%|          | 0/10 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [59]:
# Tabulate the grid-search results restricted to the 'map' metric.
inf = grid.get_result(filter_metric='map', show=False)
df = pd.DataFrame(data=inf)
df

Unnamed: 0,alpha,factors,iterations,regularization,Fold 0,Fold 1,Fold 2,Fold 3,Fold 4,mean,std
0,1,64,10,0.1,0.126279,0.12436,0.124801,0.125506,0.124731,0.125135,0.000682
1,10,64,10,0.1,0.100637,0.124959,0.124703,0.123934,0.125761,0.119999,0.009699
2,20,64,10,0.1,0.081768,0.124503,0.124307,0.124493,0.125541,0.116123,0.017183
3,30,64,10,0.1,0.072311,0.125041,0.125045,0.124714,0.125305,0.114483,0.021087
4,40,64,10,0.1,0.06271,0.125001,0.124891,0.123978,0.125131,0.112342,0.024819
5,50,64,10,0.1,0.056887,0.124354,0.125255,0.123058,0.124212,0.110753,0.026942


In [5]:
# Load previously stored grid-search results and print the row with the highest mean score.
# The file location can be overridden with the GRIDSEARCH_RESULTS_CSV environment variable;
# when it is unset the original path is used unchanged.
GRIDSEARCH_CSV = os.environ.get(
    "GRIDSEARCH_RESULTS_CSV",
    "T:/SubredditDataset/Gridsearch_new.csv",
)
df = pd.read_csv(GRIDSEARCH_CSV)
# argmax yields the position of the largest 'mean' value; iloc selects that row by position.
best = df.iloc[df['mean'].argmax()]
print(best)

alpha              1.000000
factors           64.000000
iterations        10.000000
regularization     0.100000
Fold 0             0.130713
Fold 1             0.127383
Fold 2             0.128475
Fold 3             0.129593
Fold 4             0.129868
mean               0.129206
std                0.001159
Name: 213, dtype: float64


In [48]:
# Train the ALS model used by the qualitative checks in the following cells.
# NOTE(review): these values (regularization=0.01, factors=32) differ from the best row
# printed from the stored grid-search results (regularization=0.1, factors=64) - confirm
# whether that is intentional. The original cell carried a commented-out hint that
# `grid.get_best()['map'][1]` could be unpacked into the constructor instead (unverified).
als_params = {
    "regularization": 0.01,
    "iterations": 10,
    "factors": 32,
}
best_model = AlternatingLeastSquares(**als_params)
best_model.fit(dataset.item_user)

  0%|          | 0/10 [00:00<?, ?it/s]

In [49]:
# List the ten entries the ALS model returns as most similar to the query subreddit.
reddit = "Python"
query_item = dataset.get_item_id(reddit)
sitems = best_model.similar_items(query_item, N=10)

# Columns: rank (width 3), subreddit name (width 20), score with three decimals.
row_template = "{0:<3}{1:<20}{2:.3f}"
for rank, (idx, dist) in enumerate(sitems, start=1):
    print(row_template.format(rank, dataset.get_item(idx), dist))


1  Python              1.000
2  learnpython         0.939
3  learnprogramming    0.906
4  webdev              0.882
5  programming         0.871
6  web_design          0.863
7  programminghorror   0.843
8  javascript          0.831
9  linux4noobs         0.817
10 Wordpress           0.817


In [18]:
# Compare the ALS top-10 recommendations for one user with that user's recorded feedback.
user = "-ah"  # user name to inspect; other names were tried during exploration
userid = dataset.get_user_id(user)

rec = best_model.recommend(userid, dataset.user_item, N=10, filter_already_liked_items=True)

for rank, (idx, dist) in enumerate(rec, start=1):
    print("{0:<3}{1:<20}{2:.3f}".format(rank, dataset.get_item(idx), dist))
print("-" * 30)
print("True feedback:")
# Fetch the user's row once, then list its stored entries from highest to lowest value.
user_row = dataset.user_item.getrow(userid)
observed = sorted(zip(user_row.indices, user_row.data), key=lambda pair: pair[1], reverse=True)
for subreddit, rating in observed:
    print("{0:<23}{1:<3}".format(dataset.get_item(subreddit), rating))

1  britishproblems     0.849
2  UKPersonalFinance   0.723
3  CoronavirusUK       0.704
4  london              0.704
5  Scotland            0.703
6  politics            0.628
7  soccer              0.554
8  LegalAdviceUK       0.541
9  brexit              0.521
10 videos              0.508
------------------------------
True feedback:
unitedkingdom          661
ukpolitics             89 
europe                 46 
worldnews              9  
CasualUK               6  
ModSupport             6  
PoliticalDiscussion    6  
Roadcam                6  
AskUK                  5  
de                     5  
news                   3  
Showerthoughts         2  
aww                    2  
france                 2  
nottheonion            2  
todayilearned          2  
AskReddit              1  
ich_iel                1  
space                  1  


Testing Cosine Neighbours

In [63]:

# Configure the cross-validated search for the cosine nearest-neighbour recommender.
neighbour_model = CosineRecommender

# range(191, 201, 10) contains only 191, so exactly one K is evaluated here.
parameter = {"K": list(range(191, 201, 10))}

grid = GridSearchCV(
    algo=neighbour_model,
    param_grid=parameter,
    cv=5,                          # number of cross-validation folds
    eval_k=10,                     # number of top items checked during validation
    metrics=['map', 'precision'],  # metrics to report
    random_state=0,
)
n_combinations = len(grid)
print("Number of parameters combinations in grid:", n_combinations)

Number of parameters combinations in grid: 1


In [65]:
# Run the cosine grid search on the item-user matrix cast to float.
float_item_user = dataset.item_user.astype(float)
grid.fit(float_item_user)
# Final expression of the cell: the collected results become the cell output.
grid.get_result(show=False)


{'param_set_0': {'params': {'K': 191},
  'Fold 0': {'map': 0.08446494289922978, 'precision': 0.17294201417912758},
  'Fold 1': {'map': 0.0828053443133263, 'precision': 0.17167078770752384},
  'Fold 2': {'map': 0.08415020210528971, 'precision': 0.1727898753024381},
  'Fold 3': {'map': 0.08344882702446713, 'precision': 0.17319929050743602},
  'Fold 4': {'map': 0.08478858616009882, 'precision': 0.17471788625969378},
  'mean': {'map': 0.08393158050048234, 'precision': 0.17306397079124386},
  'std': {'map': 0.0007166852060654527, 'precision': 0.0009782339862583421}}}

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

  0%|          | 0/2483 [00:00<?, ?it/s]

{'param_set_0': {'params': {'K': 191},
  'Fold 0': {'map': 0.08446494289922978, 'precision': 0.17294201417912758},
  'Fold 1': {'map': 0.0828053443133263, 'precision': 0.17167078770752384},
  'Fold 2': {'map': 0.08415020210528971, 'precision': 0.1727898753024381},
  'Fold 3': {'map': 0.08344882702446713, 'precision': 0.17319929050743602},
  'Fold 4': {'map': 0.08478858616009882, 'precision': 0.17471788625969378},
  'mean': {'map': 0.08393158050048234, 'precision': 0.17306397079124386},
  'std': {'map': 0.0007166852060654527, 'precision': 0.0009782339862583421}}}

In [66]:
# Tabulate the cosine grid-search results restricted to the 'map' metric.
inf = grid.get_result(filter_metric='map', show=False)
df = pd.DataFrame(data=inf)
# Persisting is intentionally disabled; the original cell kept this call commented out:
#   df.to_csv("nearst_neigh_grid.csv", index=False)
df

Unnamed: 0,K,Fold 0,Fold 1,Fold 2,Fold 3,Fold 4,mean,std
0,191,0.084465,0.082805,0.08415,0.083449,0.084789,0.083932,0.000717


In [3]:
# Fit the cosine nearest-neighbour model used by the qualitative checks below.
# NOTE(review): K=180 here, while the grid search shown in this notebook evaluated K=191 -
# confirm which value is intended.
neighbour_k = 180
m = CosineRecommender(K=neighbour_k)
m.fit(dataset.item_user.astype(float))

  0%|          | 0/2483 [00:00<?, ?it/s]

In [43]:
# List the ten entries the cosine model returns as most similar to the query subreddit.
reddit = "Python"
query_item = dataset.get_item_id(reddit)
sitems = m.similar_items(query_item, N=10)

# Columns: rank (width 3), subreddit name (width 20), score with three decimals.
row_template = "{0:<3}{1:<20}{2:.3f}"
for rank, (idx, dist) in enumerate(sitems, start=1):
    print(row_template.format(rank, dataset.get_item(idx), dist))

1  Python              1.000
2  learnpython         0.338
3  programming         0.173
4  CryptoMarkets       0.154
5  learnprogramming    0.147
6  ProgrammerHumor     0.142
7  linux               0.104
8  javascript          0.073
9  linux4noobs         0.068
10 raspberry_pi        0.066


In [15]:
# Print the cosine model's top-10 recommendations for one user as LaTeX table rows.
user = "BigMac3k"  # user name to inspect; other names were tried during exploration
userid = dataset.get_user_id(user)

float_user_item = dataset.user_item.astype(float)
rec = m.recommend(userid, float_user_item, N=10, filter_already_liked_items=True)

# Each row reads "<subreddit> & <score> \\". The rank is still passed as positional
# argument 0, but the template only references arguments 1 and 2.
for rank, (idx, dist) in enumerate(rec, start=1):
    print("{1:<20}& {2:.2f} \\\\".format(rank, dataset.get_item(idx), dist))

CryptoMarkets       & 26.46 \\
binance             & 12.73 \\
btc                 & 11.27 \\
CoinBase            & 10.29 \\
ethtrader           & 10.10 \\
cardano             & 9.96 \\
SatoshiStreetBets   & 8.14 \\
crafts              & 8.13 \\
ethereum            & 7.92 \\
wallstreetbets      & 6.55 \\
