# Notebook de filtro colaborativo


## Setup inicial


In [1]:
import sqlite3
import pandas as pd
import os

THIS_FOLDER = os.path.dirname(os.path.abspath("__file__"))
print(THIS_FOLDER)

# tables
interacciones = 'interactions'
items = 'repositories'
users = 'users'

# datasets to dfs
con = sqlite3.connect(os.path.join(THIS_FOLDER, "data/data.db"))
df_int = pd.read_sql_query(f"SELECT * FROM {interacciones}", con)
df_items = pd.read_sql_query(f"SELECT * FROM {items}", con)
df_users = pd.read_sql_query(f"SELECT * FROM {users}", con)
con.close()

/home/tomas/workspace/uba/sr/sr-tpfinal-gh-web


# Generics


In [62]:
def build_dummies(data, column, sep=";", remove_original_column=False):
    data_dummies = data[column].str.get_dummies(sep=sep)
    data_with_dummies = pd.concat([data, data_dummies], axis=1)
    if remove_original_column:
        data_with_dummies = data_with_dummies.drop(columns=[column], axis=1)
    return data_with_dummies


# Surprise

Enfoque de recomendación con surprise no es posible dado que no se enfoca en recomendaciones binarias

ver: https://github.com/NicolasHug/Surprise/issues/412

# Implicit

Libreria enfocada en recomendación binaria e implicita

In [2]:
import implicit
from scipy.sparse import coo_matrix 
import numpy as np

In [3]:
try:
    df_int = df_int.drop(columns=["index", "date"])#.isnull().sum()
except:
    pass

#list_null_repos = list(df_int[df_int["repository"].isnull()].index)
#print(df_int.shape)
#print(df_int.dropna().shape)
# Eliminar ese usuario e interacción
df_int.dropna(inplace=True)

In [4]:
dfint = df_int.copy()

In [5]:
dfint.isnull().sum()

repository    0
user          0
dtype: int64

In [6]:
#dfint.drop(columns=["index"], inplace=True)

In [7]:
dfint["users"] = dfint.user.astype("category")#.unique()
dfint["repos"] = dfint.repository.astype("category")#.unique()

stars = coo_matrix((np.ones(dfint.shape[0]),
                   (dfint['repos'].cat.codes.copy(),
                    dfint['users'].cat.codes.copy())))

In [8]:
from implicit.nearest_neighbours import bm25_weight
from implicit.als import AlternatingLeastSquares

In [9]:
stars_bm25 = bm25_weight(stars)
stars_by_users = stars_bm25.T.tocsr()
stars_by_users

<12955x298 sparse matrix of type '<class 'numpy.float64'>'
	with 22025 stored elements in Compressed Sparse Row format>

In [10]:
model = AlternatingLeastSquares(factors=64, regularization=0.05, alpha=2.0)
model.fit(stars_by_users)

  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

In [11]:
# make recomendation
userid = 'fly51fly'
codeid = dfint['users'].cat.categories.get_loc(userid)
#items_ids, items_scores = model.recommend(codeid, stars_by_users[codeid], N=10, filter_already_liked_items=False)
items_ids, items_scores = model.recommend(codeid, stars_by_users[codeid], N=10, filter_already_liked_items=True)

In [12]:
pd.DataFrame({"repos": dfint['repos'].cat.categories[items_ids], "scores": items_scores})

Unnamed: 0,repos,scores
0,py-why/dowhy,0.513297
1,pyro-ppl/pyro,0.496485
2,banditml/banditml,0.466755
3,planetlabs/planet-client-python,0.447452
4,huggingface/neuralcoref,0.43003
5,chainer/chainer,0.399927
6,mwydmuch/napkinXC,0.388848
7,lorismichel/drf,0.385448
8,IDSIA/sacred,0.3816
9,pycaret/pycaret,0.364547


In [13]:
# similar items
repoid = 'pytorch/pytorch'
coderepoid = dfint['repos'].cat.categories.get_loc(repoid)

similar_items_ids, similar_items_scores = model.similar_items(coderepoid)
pd.DataFrame({"repos": dfint['repos'].cat.categories[similar_items_ids], "scores": similar_items_scores})

Unnamed: 0,repos,scores
0,pytorch/pytorch,1.0
1,pyro-ppl/pyro,0.425628
2,google/wuffs,0.261818
3,iterative/dvc,0.244086
4,TeamHypersomnia/Hypersomnia,0.238672
5,kornia/kornia,0.23067
6,ray-project/ray,0.208496
7,rushter/MLAlgorithms,0.207611
8,huggingface/neuralcoref,0.203042
9,rasbt/deep-learning-book,0.202782


In [14]:
# similar users
userid = 'fly51fly'
codeuserid = dfint['users'].cat.categories.get_loc(userid)

similar_users_ids, similar_users_scores = model.similar_users(codeuserid)
pd.DataFrame({"users": dfint['users'].cat.categories[similar_users_ids], "scores": similar_users_scores})

Unnamed: 0,users,scores
0,fly51fly,1.0
1,nikitavoloboev,0.555298
2,oudommeas,0.528794
3,hardikudeshi,0.501235
4,romanofficial,0.496216
5,nikolay,0.480198
6,xuanhan863,0.459712
7,usmanakram232,0.458704
8,pushpendrapratap,0.454848
9,julianxhokaxhiu,0.454445


In [16]:
print(df_int.head())
df_int.shape

                 repository           user
0     tensorflow/tensorflow           mrry
1     tensorflow/tensorflow         danbri
2     tensorflow/tensorflow          rockt
3     tensorflow/tensorflow  petro-rudenko
4  huggingface/transformers         kashif


(22025, 2)

# Lightfm



In [28]:
import lightfm as lfm
from lightfm import data, cross_validation, evaluation

## Dataset de LightFM

In [21]:
# genero la codificación de los datasets
ds = lfm.data.Dataset()
ds.fit(users=df_int.user.unique(), items=df_int.repository.unique())
print(ds.interactions_shape())

(12955, 298)


In [24]:
# construyo las interacciones
(lfm_interactions, lfm_weights) = ds.build_interactions(df_int[["user", "repository"]].itertuples(index=False))


## Modelo base


In [26]:
(lfm_train, lfm_test) = lfm.cross_validation.random_train_test_split(lfm_interactions, test_percentage=0.2, random_state=42)
# no aplica porque son likes
#(lfm_train_w, lfm_test_w) = lfm.cross_validation.random_train_test_split(lfm_weights, test_percentage=0.2, random_state=42)

In [35]:
model = lfm.LightFM(no_components=10, k=5, n=10, learning_schedule='adagrad', loss='logistic', learning_rate=0.05, rho=0.95, epsilon=1e-06, item_alpha=0.0, user_alpha=0.0, max_sampled=10, random_state=None)
model.fit(lfm_train,
          #sample_weight=lfm_train_w #no aplica porque son likes
          epochs=64,
          num_threads=8,)

<lightfm.lightfm.LightFM at 0x7f9408f0f490>

evaluación del modelo base

In [36]:
#lfm.evaluation.precision_at_k(model, test_interactions=lfm_test, train_interactions=lfm_train, k=10, preserve_rows=False, num_threads=8, check_intersections=True)
train_precision = lfm.evaluation.precision_at_k(model, lfm_train, k=10, preserve_rows=False, num_threads=8, check_intersections=True)
test_precision = lfm.evaluation.precision_at_k(model, lfm_test, k=10, preserve_rows=False, num_threads=8, check_intersections=True)

print(train_precision.mean())
print(test_precision.mean())

0.027499322
0.020668117


In [39]:
train_recall = lfm.evaluation.recall_at_k(model, lfm_train, k=10, preserve_rows=False, num_threads=8, check_intersections=True)
test_recall = lfm.evaluation.recall_at_k(model, lfm_test, k=10, preserve_rows=False, num_threads=8, check_intersections=True)

print(train_recall.mean())
print(test_recall.mean())

0.1023916568194092
0.15474207426950504


## Item features

Se agregan features de los items

In [150]:
#len(';'.join([i for i in df_items["topics"].values.tolist() if i is not None]).split(';'))
#len(set(';'.join([i for i in df_items["topics"].values.tolist() if i is not None]).split(';')))
item_features = list(set(';'.join([i for i in df_items["language"].values.tolist() if i is not None]).split(';'))) \
                + list(set(';'.join([i for i in df_items["topics"].values.tolist() if i is not None]).split(';'))) \
                + list(df_items.forks.unique()) \
                + list(df_items.stars.unique())

In [151]:
ds_item_features = lfm.data.Dataset()
ds_item_features.fit(users=df_int.user.unique(), items=df_int.repository.unique(), item_features=item_features)
print(ds_item_features.interactions_shape())

(12955, 298)


Necesito ahora armar una estructura donde para cada item, todas sus features:

```
["repo_1", ["python", "nlp", etc...]]
```

In [100]:
def is_flattened(item):
    if type(item) in [list, tuple]:
        return True
    return False

In [115]:
def flatten(collection):
    flatted_collection = []
    for item in collection:
        if is_flattened(item):
            flatted_collection += flatten(item)
        else:
            flatted_collection.append(item)
    return(flatted_collection)

# test
#l = [1, 2, [3, 4], 5, [6, 7, 8], [9, [10, 11, 12], [13, 14, 15], 16], 17]
#l = flatten(l)
#l == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] # True

In [152]:
ifs = [] # la lista base que almacenara todos los items
for idx, row in df_items.iterrows():
    #print((row.id, (row.topics.split(";"), row.language.split(";"))))
    topics_splitted = []
    if row.topics is not None:
        topics_splitted = row.topics.split(";")
    
    language_splitted = []
    if row.language is not None:
        language_splitted = row.language.split(";")
    item_f = (
        row.id,
        tuple(
            flatten(
                (topics_splitted,
                 language_splitted,
                 row.forks,
                 row.stars,
                )
            )
        )
    )
    ifs.append(item_f)

In [153]:
item_features = ds_item_features.build_item_features(ifs)
item_features

<298x1948 sparse matrix of type '<class 'numpy.float32'>'
	with 4720 stored elements in Compressed Sparse Row format>

In [154]:
# construyo las interacciones
(lfm_interactions_ifs, lfm_weights_ifs) = ds_item_features.build_interactions(df_int[["user", "repository"]].itertuples(index=False))


In [155]:
#df_items[["id", "forks", "stars", "watchers", "issues", "subscribers"]]
df_items_lfm = build_dummies(df_items, "language", remove_original_column=True)
df_items_lfm = build_dummies(df_items_lfm, "topics", remove_original_column=True)

df_items_lfm = df_items_lfm.drop(columns=["about", "es_fork", "archived"], axis=1).shape

In [156]:
(lfm_train_ifs, lfm_test_ifs) = lfm.cross_validation.random_train_test_split(lfm_interactions_ifs, test_percentage=0.2, random_state=42)

model_ifs = lfm.LightFM(no_components=10, k=5, n=10, learning_schedule='adagrad', loss='logistic', learning_rate=0.05, rho=0.95, epsilon=1e-06, item_alpha=0.0, user_alpha=0.0, max_sampled=10, random_state=None)
model_ifs.fit(lfm_train_ifs,
          #sample_weight=lfm_train_w #no aplica porque son likes
          item_features=item_features,
          epochs=64,
          num_threads=8,)

<lightfm.lightfm.LightFM at 0x7f9408500250>

In [157]:
train_precision_ifs = lfm.evaluation.precision_at_k(model_ifs, lfm_train_ifs, item_features=item_features, k=10, preserve_rows=False, num_threads=8, check_intersections=True)
test_precision_ifs = lfm.evaluation.precision_at_k(model_ifs, lfm_test_ifs, item_features=item_features, k=10, preserve_rows=False, num_threads=8, check_intersections=True)

print(train_precision_ifs.mean())
print(test_precision_ifs.mean())

0.005391493
0.0034220533


# Evaluación de los modelos

NDCG

https://gist.github.com/jbochi/2e8ddcc5939e70e5368326aa034a144e