In [1]:
import pandas as pd
import numpy as np
from scipy.io import mmwrite
from scipy.io import mmread
from scipy.sparse import csr_matrix
import requests
import implicit
from tqdm import tqdm

import gc
import json
import os

In [2]:
# Location of a text file whose first line holds a GitHub API token.
GITHUB_KEY = "../credentials/github.txt"

# Start with no auth header so `headers` is always defined. Previously a
# missing credentials file left the name unset and the first call to
# repo_meta_api failed with NameError.
headers = {}
if os.path.exists(GITHUB_KEY):
    with open(GITHUB_KEY, 'r') as f:
        api_key = f.readline()
    # Skip the header entirely when the first line is blank, rather than
    # sending an empty bearer token.
    if api_key.strip():
        headers = {"Authorization": "bearer " + api_key.strip()}

def repo_meta_api(item_id, timeout=10):
    """Print the ``full_name`` of the GitHub repository identified by ``item_id``.

    Sends a GET request to ``https://api.github.com/repositories/<item_id>``
    with the notebook-level ``headers`` dict. Only a 200 response produces
    output; any other status code is ignored silently.

    Parameters
    ----------
    item_id
        Repository id, interpolated into the URL with ``%s``.
    timeout
        Seconds handed to ``requests.get``. Without a timeout a stalled
        connection would block the cell indefinitely.

    Returns
    -------
    None

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    GITHUB_URL = "https://api.github.com/repositories/%s"%item_id
    res = requests.get(GITHUB_URL, headers=headers, timeout=timeout)

    if res.status_code == 200:
        print(json.loads(res.text)['full_name'])

### Set problem
1. Split actors into train (90%) and test (10%).
2. For each test actor, count the number of actions without considering the action type.
3. For each test actor, mask a random half of the repos: the masked repos are the answers to predict.
4. The repos that are not masked stay in the test set as that actor's known history, which is the input used for predicting.

**train test splitting(9:1)**

In [3]:
%%time
df_raw = pd.read_pickle('tf_related.pkl')

from sklearn.model_selection import train_test_split
# 9:1 splitby user
actors = df_raw.actor_id.drop_duplicates()
train, test = train_test_split(actors, test_size=0.1, random_state = 85)

train_df = df_raw[df_raw.actor_id.isin(train)]
test_df = df_raw[df_raw.actor_id.isin(test)]

train_df.shape, test_df.shape

CPU times: user 17.4 s, sys: 1.09 s, total: 18.5 s
Wall time: 11.3 s


((17080512, 4), (1898520, 4))

In [4]:
%%time
test_problem = []
test_answer = []
for actor_id, group in tqdm(test_df.groupby('actor_id')):
    # take unique repo ids and shuffle them
    temp_repos = group.repo_id.drop_duplicates().sample(frac=1, random_state=85)
    # half of the repos are answer and rests are problem. if #repo is odd, assign the last one repo to problem
    answers = temp_repos.head(len(temp_repos)//2)
    test_answer.append(group[group.repo_id.isin(answers)])
    test_problem.append(group[~group.repo_id.isin(answers)])
test_df = pd.concat(test_problem)
test_answer = pd.concat(test_answer)

100%|██████████| 105341/105341 [04:09<00:00, 422.23it/s]


CPU times: user 7min 19s, sys: 9.05 s, total: 7min 28s
Wall time: 7min 26s


In [5]:
def answer_generator(test_answer):
    """Build one ranked repo list per actor from the masked answer rows.

    Parameters
    ----------
    test_answer : pandas.DataFrame
        Must provide the columns ``actor_id``, ``repo_id`` and ``count``.

    Returns
    -------
    pandas.DataFrame
        Columns ``actor_id`` and ``repo_answer``; ``repo_answer`` is the list
        of the actor's repo ids ordered by summed ``count``, largest first.
    """
    def _ranked_repos(actor_rows):
        # Total count per repo, then repo ids from most to least active.
        totals = actor_rows.groupby('repo_id')['count'].sum()
        return totals.sort_values(ascending=False).index.tolist()

    ranked_by_actor = {
        actor_id: _ranked_repos(actor_rows)
        for actor_id, actor_rows in tqdm(test_answer.groupby('actor_id'))
    }

    # dict -> Series -> two-column frame (former index becomes actor_id).
    answer_df = pd.Series(ranked_by_actor).reset_index()
    answer_df.columns = ['actor_id', 'repo_answer']

    print("Answer generated")
    return answer_df

In [6]:
answer_df = answer_generator(test_answer)

100%|██████████| 82810/82810 [02:04<00:00, 665.49it/s]


Answer generated


In [7]:
# Persist the three artefacts of the split next to the notebook.
for frame, pickle_path in (
    (train_df, 'train.pkl'),
    (test_df, 'test.pkl'),
    (answer_df, 'answer.pkl'),
):
    frame.to_pickle(pickle_path)

In [10]:
train_df.repo_id.nunique(), train_df.actor_id.nunique()

(43093, 948069)

In [None]:
# A bare `raise` with no active exception fails with RuntimeError, which stops
# "Run All" at this cell.
# NOTE(review): presumably a deliberate stop so the cells below are not executed;
# they use names (model, idx2iid, df, answer_als, top_repos, ...) that no cell
# above defines. Confirm.
raise

In [None]:
# Wrap the model's factor matrices in DataFrames, labelling rows with the
# values of the corresponding index lookup dicts.
user_factors = pd.DataFrame(data=model.user_factors, index=idx2uid.values())
item_factors = pd.DataFrame(data=model.item_factors, index=idx2iid.values())

In [None]:
user_factors

In [None]:
item_factors

In [None]:
# Score every item against repo 2325298 by raw dot product and look up the
# five highest-scoring repos (nlargest defaults to n=5).
query_vector = item_factors.loc[2325298]
dot_scores = item_factors.dot(query_vector)
for i in dot_scores.nlargest().index:
    repo_meta_api(i)

In [None]:
# Unit-length copy of the factor vector of repo 2325298.
target_vector = item_factors.loc[2325298]
target_norm = target_vector / np.linalg.norm(target_vector)

In [None]:
item_factors

In [None]:
# Euclidean norm of each row of item_factors (np.linalg.norm's default for a
# 1-D input); the result is a Series that shares item_factors' index.
item_factor_norms = item_factors.apply(np.linalg.norm, axis=1)

In [None]:
%%time
item_factors_normalized = item_factors.values / item_factor_norms.values.reshape((18232,1))

In [None]:
item_factors_normalized = pd.DataFrame(item_factors_normalized, index=item_factors.index, columns=item_factors.columns)

In [None]:
# Dot product of the row-normalised item factors with the unit target vector,
# keeping the 100 best-scoring items.
similarity_scores = item_factors_normalized.dot(target_norm)
top_100 = similarity_scores.nlargest(100)

# Resolve only the first ten of them through the GitHub API.
for i in top_100.head(10).index:
    repo_meta_api(i)

In [None]:
# Unit-length copy of the factor vector of repo 45717250.
target_vector = item_factors.loc[45717250]
target_norm = target_vector / np.linalg.norm(target_vector)

In [None]:
# Dot product of the row-normalised item factors with the unit target vector,
# keeping the 100 best-scoring items.
similarity_scores = item_factors_normalized.dot(target_norm)
top_100 = similarity_scores.nlargest(100)

# Show the raw id, then the resolved repository name, for the first ten.
for i in top_100.head(10).index:
    print(i)
    repo_meta_api(i)

In [None]:
item_factors_normalized.to_pickle("item_factor_norm.pkl")

In [None]:
item_factors_normalized.shape

In [None]:
item_factors_normalized.dtypes

In [None]:
target = answer_als[answer_als.actor_id == 14805681].iloc[0]

In [None]:
for i in target.repo_seen: repo_meta_api(i)

In [None]:
for i in target.repo_answer: repo_meta_api(i)

In [None]:
for i in target.repo_recommend: repo_meta_api(i)

In [None]:
df.repo_id.nunique()

### ALS scoring (k=1)

In [None]:
%%time
# als_model_generator and als_rec_generator are not defined in this part of the notebook.
# NOTE(review): the second argument (1) is presumably the k from the
# "ALS scoring (k=1)" heading. Confirm against als_model_generator.
train_matrix_, model_, idx2iid_, idx2uid_ = als_model_generator(df, 1)
df_rec_ = als_rec_generator(df, train_matrix_, model_, idx2iid_, idx2uid_)
# Outer merge on actor_id: actors present in only one of the two frames are
# kept, and the columns coming from the other frame are NaN for them.
answer_als_ = answer_df.merge(df_rec_, on='actor_id', how='outer')

In [None]:
%%time
ndcgs_ = []
for _, row in tqdm(answer_als_.iterrows()):
    if sum(np.isnan(row.repo_answer))>0: continue

    if row.repo_recommend != None:
        ndcgs_.append(ndcg_calculator(row.repo_answer, row.repo_recommend))
    else:
        # if no recs, user top as fallback
        ndcgs_.append(ndcg_calculator(row.repo_answer, top_repos[:len(row.repo_answer)]))

In [None]:
print("ALS mean nDCG: %.4f"%(sum(ndcgs_) / len(ndcgs_)))

In [None]:
idx2iid

In [None]:
# Item factor matrix of the k=1 model, with rows labelled by the values of idx2iid_.
# The keyword was misspelled `indeax`, which pandas rejects with TypeError.
item_factors_ = pd.DataFrame(model_.item_factors, index = idx2iid_.values())

In [None]:
item_factors_.to_pickle("als_repo_factor.pkl")

In [None]:
item_factors_.shape

In [None]:
item_factors_.dtypes

In [None]:
# Reverse the index -> id lookups so raw ids can be mapped to model indices.
uid2idx_ = dict(zip(idx2uid_.values(), idx2uid_.keys()))
iid2idx_ = dict(zip(idx2iid_.values(), idx2iid_.keys()))

# NOTE(review): the loop reads element 0 of each of the first 20 entries, so it
# assumes similar_items returns a sequence of (item index, score)-like pairs.
# Confirm for the installed implicit version.
sim_items_ = model_.similar_items(iid2idx_[2325298], 20)
for rank in range(20):
    item_idx = sim_items_[rank][0]
    repo_meta_api(idx2iid_[item_idx])

In [None]:
# Reverse the index -> id lookups so raw ids can be mapped to model indices.
uid2idx_ = dict(zip(idx2uid_.values(), idx2uid_.keys()))
iid2idx_ = dict(zip(idx2iid_.values(), idx2iid_.keys()))

# NOTE(review): the loop reads element 0 of each of the first 20 entries, so it
# assumes similar_items returns a sequence of (item index, score)-like pairs.
# Confirm for the installed implicit version.
sim_items_ = model_.similar_items(iid2idx_[15045751], 20)
for rank in range(20):
    item_idx = sim_items_[rank][0]
    repo_meta_api(idx2iid_[item_idx])

In [None]:
# Position of the first score in ndcgs_ that equals 1; list.index raises
# ValueError when no such score exists.
ndcgs_.index(1)

In [None]:
ndcgs_series = pd.Series(ndcgs_)

In [None]:
ndcgs_series[ndcgs_series==1]

In [None]:
# Inspect a single row of answer_als_, selected by position.
# NOTE(review): 696 is presumably a position found in ndcgs_ (scores equal to 1).
# The scoring loop skips rows without an answer, so a position in ndcgs_ matches
# the same row of answer_als_ only if no earlier row was skipped. Confirm.
target = answer_als_.iloc[696]
target

In [None]:
repo_meta_api(46806184)

In [None]:
repo_meta_api(20580498)

In [None]:
repo_meta_api(32689863)

In [None]:
answer_als.loc[answer_als.repo_recommend.notna(), 'repo_recommend'].apply(len)

### K=1 vs K=5

In [None]:
# Rows of answer_als that have a non-missing repo_recommend.
k_5 = answer_als[answer_als.repo_recommend.notna()]
# NOTE(review): this filters answer_als_ with a mask built from answer_als (no
# trailing underscore); pandas aligns a boolean Series mask on index labels.
# Presumably meant to score both models on the same rows. Confirm, or switch to
# answer_als_.repo_recommend if it is a typo.
k_1 = answer_als_[answer_als.repo_recommend.notna()]

In [None]:
%%time
ndcgs_1 = []
for _, row in tqdm(k_1.iterrows()):
    if sum(np.isnan(row.repo_answer))>0: continue

    if row.repo_recommend != None:
        ndcgs_1.append(ndcg_calculator(row.repo_answer, row.repo_recommend))
    else:
        # if no recs, user top as fallback
        ndcgs_1.append(ndcg_calculator(row.repo_answer, top_repos[:len(row.repo_answer)]))

In [None]:
sum(ndcgs_1) / len(ndcgs_1)

In [None]:
%%time
ndcgs_5 = []
for _, row in tqdm(k_5.iterrows()):
    if sum(np.isnan(row.repo_answer))>0: continue

    if row.repo_recommend != None:
        ndcgs_5.append(ndcg_calculator(row.repo_answer, row.repo_recommend))
    else:
        # if no recs, user top as fallback
        ndcgs_5.append(ndcg_calculator(row.repo_answer, top_repos[:len(row.repo_answer)]))

In [None]:
sum(ndcgs_5) / len(ndcgs_5)