In [1]:
import pandas as pd
import numpy as np
from scipy.io import mmwrite
from scipy.io import mmread
from scipy.sparse import csr_matrix
import requests
import implicit

import gc
import json
import os

In [2]:
GITHUB_KEY = "../credentials/github.txt"
if os.path.exists(GITHUB_KEY):
    with open(GITHUB_KEY, 'r') as f:
        api_key = f.readline()        
        headers = {"Authorization": "bearer " + api_key.strip()}

In [3]:
df = pd.read_pickle('ui_df')

In [4]:
df['type'].value_counts()

WatchEvent                       29036719
ForkEvent                         4257402
IssueCommentEvent                 2683839
IssuesEvent                       1672355
PullRequestEvent                  1627943
PullRequestReviewCommentEvent      453330
PushEvent                          362538
CreateEvent                        315933
DeleteEvent                        230391
CommitCommentEvent                 179467
GollumEvent                         74152
ReleaseEvent                        63995
MemberEvent                         45241
Event                               17126
PublicEvent                          3509
DownloadEvent                         701
ForkApplyEvent                        432
Name: type, dtype: int64

In [88]:
df.actor_id.nunique(), df.repo_id.nunique()

(251341, 98540)

### preprocessing

In [5]:
def ui_matrix_generator(event_type):
    ui_count = df[df['type']==event_type].groupby(['actor_id', 'repo_id'])['count'].sum().reset_index(name='count')
    while True:
        prev = len(ui_count)

        user_count = ui_count.groupby('actor_id')['count'].sum()
        ui_count = ui_count[ui_count.actor_id.isin(user_count[user_count>=5].index)]

        repo_count = ui_count.groupby('repo_id')['count'].sum()
        ui_count = ui_count[ui_count.repo_id.isin(repo_count[repo_count>=5].index)]

        if len(ui_count) == prev:
            break
        else:
            print("%s row deleted"%(prev-len(ui_count)))
    print("#user: %s, #repo: %s"%(ui_count.actor_id.nunique(), ui_count.repo_id.nunique()))
    return ui_count

In [6]:
def scr_generator(ui_count):
    ui_count['count_log'] = (ui_count['count']+1).apply(np.log)
    uid_to_idx = {uid: idx for (idx, uid) in enumerate(ui_count.actor_id.unique().tolist())}
    iid_to_idx = {iid: idx for (idx, iid) in enumerate(ui_count.repo_id.unique().tolist())}
    row, col, dat = ui_count.actor_id.tolist(), ui_count.repo_id.tolist(), ui_count['count_log'].tolist()
    row = [uid_to_idx[r] for r in row]
    col = [iid_to_idx[c] for c in col]
    train_matrix = csr_matrix((dat, (row,col)), shape=(1 + np.max(row), 1 + np.max(col)))
    return uid_to_idx, iid_to_idx, train_matrix

## Buffalo

In [None]:
mmwrite('data/main', train_matrix)

In [None]:
with open("data/uid", "w") as f:
    for uid in uid_to_idx:
        print(uid, file=f)

with open("data/iid", "w") as f:
    for iid in iid_to_idx:
        print(iid, file=f)

### ALS

In [None]:
from buffalo.algo.als import ALS
from buffalo.algo.bpr import BPRMF
from buffalo.misc import aux, log
from buffalo.algo.options import ALSOption, BPRMFOption
import buffalo.data
from buffalo.data.mm import MatrixMarketOptions

In [None]:
!pip install n2

In [None]:
!pip3 install buffalo

### implicit

**Star**

In [89]:
%%time
ui_count = ui_matrix_generator('WatchEvent')
uid_to_idx, iid_to_idx, train_matrix = scr_generator(ui_count)

119148 row deleted
1851 row deleted
24 row deleted
4 row deleted
#user: 161354, #repo: 63671
CPU times: user 1min 34s, sys: 8.33 s, total: 1min 42s
Wall time: 1min 18s


In [10]:
idx2uid = {v:k for k, v in uid_to_idx.items()}
idx2iid = {v:k for k, v in iid_to_idx.items()}

In [11]:
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=20)
# train the model on a sparse matrix of item/user/confidence weights
model.fit(train_matrix.T)



HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [12]:
def repo_meta_api(item_id):
    GITHUB_URL = "https://api.github.com/repositories/%s"%item_id
    res = requests.get(GITHUB_URL, headers=headers)

    if res.status_code == 200:
#         print(json.dumps(json.loads(res.text),indent=2))
        print(json.loads(res.text)['full_name'])

In [13]:
iid_to_idx[ui_count.groupby('repo_id').size().nlargest(1).index.values[0]]

2355

In [14]:
# find related items
related = model.similar_items(iid_to_idx[45717250], ui_count.repo_id.nunique())

In [15]:
# 연관 아이템
for i in range(15):
    repo_meta_api(idx2iid[related[i][0]])

tensorflow/tensorflow
BVLC/caffe
opencv/opencv
tensorflow/models
tesseract-ocr/tesseract
pytorch/pytorch
protocolbuffers/protobuf
keras-team/keras
scikit-learn/scikit-learn
tensorflow/playground
ApolloAuto/apollo
microsoft/CNTK
bitcoin/bitcoin
facebookarchive/caffe2
torch/torch7


In [16]:
# 비연관 아이템
for i in range(1,15):
    repo_meta_api(idx2iid[related[-i][0]])

WeAreGenki/minna-ui
LibCrowds/libcrowds
vuejs-fr/vuejs.org
alphasights/paint
theodi/pathway
camptocamp/puppet-postfix
beaker-project/beaker
travis-ci/travis-migrations
coreyja/devicon-lookup
OnsenUI/onsen.io
ttssdev/appflow
dteviot/WebToEpub
danielyxie/bitburner
osmlab/maproulette3


**Issues**

In [94]:
ui_count = ui_matrix_generator('IssuesEvent')
uid_to_idx, iid_to_idx, train_matrix =scr_generator(ui_count)

86346 row deleted
1029 row deleted
2 row deleted
#user: 143936, #repo: 66542


In [95]:
idx2uid = {v:k for k, v in uid_to_idx.items()}
idx2iid = {v:k for k, v in iid_to_idx.items()}

In [96]:
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=20)
# train the model on a sparse matrix of item/user/confidence weights
model.fit(train_matrix.T)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [25]:
# find related items
related = model.similar_items(iid_to_idx[45717250], ui_count.repo_id.nunique())

- apex는 토치 진영이라 좀 의외
- 컨트리퓨터의 이슈를 좀 날리면 괜찮아질 수 있지 않을까?

In [26]:
# 연관 아이템
for i in range(15):
    repo_meta_api(idx2iid[related[i][0]])

tensorflow/tensorflow
tensorflow/models
NVIDIA/apex
tensorflow/datasets
tensorflow/benchmarks
erlerobot/gym-gazebo
tensorflow/tensor2tensor
aymericdamien/TensorFlow-Examples
kpu/kenlm
facebookresearch/InferSent
tensorflow/addons
horovod/horovod
udacity/CarND-Term1-Starter-Kit
tensorpack/tensorpack
facebookresearch/maskrcnn-benchmark


In [27]:
# 비연관 아이템
for i in range(1,15):
    repo_meta_api(idx2iid[related[-i][0]])

gw-analysis/detector-characterization
LVPlayground/playground
geodesymiami/rsmas_insar
marius300482/meta
huntermcmillian/huntermcmillian
dKvale/aqi-watch
ita-social-projects/GreenCity
JBurlison/Pandaros.Settlers
ECLK/IncidentManagement
i-RIC/prepost-gui
JeffersonLab/halld_recon
youseedk/dna


In [37]:
ui_count.groupby('actor_id').size().nlargest(20)

actor_id
23040076    2795
26384082    1271
27856297     807
660477       644
271906       578
29139614     536
3709715      481
2480569      443
5316         396
827205       394
297678       389
33569        388
11966684     371
5268928      352
11999859     291
240344       289
952007       288
973543       281
278153       276
413772       267
dtype: int64

In [38]:
# find related items
related = model.similar_users(uid_to_idx[11999859], ui_count.repo_id.nunique())

In [58]:
related

[(117416, 0.96739495),
 (131471, 0.87764853),
 (87544, 0.87512547),
 (98568, 0.87493706),
 (81774, 0.87131417),
 (114136, 0.85445523),
 (133062, 0.85371554),
 (10875, 0.85227454),
 (74119, 0.8508545),
 (67843, 0.84867686),
 (47742, 0.84784913),
 (62893, 0.8465018),
 (5262, 0.84399325),
 (71291, 0.8422783),
 (96062, 0.8419598),
 (5560, 0.84155893),
 (98883, 0.84091145),
 (107083, 0.8407303),
 (115387, 0.83944225),
 (40103, 0.8383808),
 (40445, 0.83799624),
 (87436, 0.8376895),
 (45784, 0.83723736),
 (140305, 0.8363416),
 (138369, 0.83622617),
 (140075, 0.83622146),
 (133045, 0.83444655),
 (100129, 0.8338103),
 (68130, 0.8337493),
 (78169, 0.8335605),
 (6384, 0.83340013),
 (142470, 0.83268625),
 (133760, 0.8316619),
 (134814, 0.8312304),
 (27427, 0.8306577),
 (84377, 0.830545),
 (53174, 0.82961094),
 (108429, 0.8271523),
 (137158, 0.82646966),
 (44626, 0.82640874),
 (9185, 0.8258388),
 (69124, 0.82296836),
 (123026, 0.82223386),
 (38276, 0.8221086),
 (123721, 0.82173216),
 (63797, 0.8211

In [39]:
for i in range(15):
    print(idx2uid[related[i][0]])

11999859
19420983
5570343
7288621
4736643
10949969
20677314
123932
3658706
2821085
1269330
2222855
47590
3264971
6826348


In [48]:
target = ui_count[ui_count.actor_id==11999859].sort_values('count', ascending=False).head()
target

Unnamed: 0,actor_id,repo_id,count,count_log
1458232,11999859,26836182,91,4.521789
1458305,11999859,44264925,16,2.833213
1458243,11999859,29329884,14,2.70805
1458215,11999859,23096959,11,2.484907
1458192,11999859,16930617,10,2.397895


In [49]:
for i in target.repo_id:
    repo_meta_api(i)

therecipe/qt
microsoft/vscode-go
minio/mc
golang/go
andlabs/ui


In [50]:
rel = ui_count[ui_count.actor_id==19420983].sort_values('count', ascending=False).head()
rel

Unnamed: 0,actor_id,repo_id,count,count_log
1578005,19420983,51199322,2,1.098612
1578004,19420983,20904437,1,0.693147
1578007,19420983,93152223,1,0.693147
1578008,19420983,93505869,1,0.693147
1578009,19420983,98540924,1,0.693147


In [51]:
for i in rel.repo_id:
    repo_meta_api(i)

lipangit/JiaoZiVideoPlayer
gin-gonic/gin
scwang90/SmartRefreshLayout
sunfusheng/GlideImageView
shadowsocksrr/shadowsocksr-android


In [54]:
rel = ui_count[ui_count.actor_id==5570343].sort_values('count', ascending=False).head()
display(rel)
for i in rel.repo_id:
    repo_meta_api(i)

Unnamed: 0,actor_id,repo_id,count,count_log
1161049,5570343,50709152,2,1.098612
1161050,5570343,65002012,2,1.098612
1161042,5570343,1713774,1,0.693147
1161043,5570343,9117329,1,0.693147
1161044,5570343,15257213,1,0.693147


kataras/iris
iamduo/workq
TTTAttributedLabel/TTTAttributedLabel
oneinstack/lnmp
ldcsaa/HP-Socket


In [55]:
rel = ui_count[ui_count.actor_id==7288621].sort_values('count', ascending=False).head()
display(rel)
for i in rel.repo_id:
    repo_meta_api(i)

Unnamed: 0,actor_id,repo_id,count,count_log
1275625,7288621,23803846,2,1.098612
1275632,7288621,203587744,2,1.098612
1275624,7288621,353561,1,0.693147
1275626,7288621,59257993,1,0.693147
1275627,7288621,62952373,1,0.693147


iceman1001/proxmark3
postwoman-io/postwoman
vakata/jstree
hwdsl2/docker-ipsec-vpn-server
unidoc/unidoc


In [56]:
rel = ui_count[ui_count.actor_id==20677314].sort_values('count', ascending=False).head()
display(rel)
for i in rel.repo_id:
    repo_meta_api(i)

Unnamed: 0,actor_id,repo_id,count,count_log
1590840,20677314,145340771,16,2.833213
1590689,20677314,31792824,16,2.833213
1590834,20677314,140326958,10,2.397895
1590809,20677314,121454193,9,2.302585
1590676,20677314,25594973,7,2.079442


go-flutter-desktop/go-flutter
flutter/flutter
memspace/zefyr
google/flutter-desktop-embedding
gobuffalo/buffalo


In [57]:
rel = ui_count[ui_count.actor_id==123932].sort_values('count', ascending=False).head()
display(rel)
for i in rel.repo_id:
    repo_meta_api(i)

Unnamed: 0,actor_id,repo_id,count,count_log
177313,123932,32113089,3012,8.010692
177294,123932,2945088,74,4.317488
177300,123932,13855476,51,3.951244
177293,123932,2793788,9,2.302585
177305,123932,19370814,9,2.302585


revel/revel
go-gorm/gorm
htacg/tidy-html5
postfixadmin/postfixadmin


**Fork**

In [59]:
ui_count = ui_matrix_generator('ForkEvent')
uid_to_idx, iid_to_idx, train_matrix =scr_generator(ui_count)
idx2uid = {v:k for k, v in uid_to_idx.items()}
idx2iid = {v:k for k, v in iid_to_idx.items()}
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=20)
# train the model on a sparse matrix of item/user/confidence weights
model.fit(train_matrix.T)
# find related items
related = model.similar_items(iid_to_idx[45717250], ui_count.repo_id.nunique())

175399 row deleted
4853 row deleted
124 row deleted
12 row deleted
#user: 163103, #repo: 58592


HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [61]:
# 연관 아이템
for i in range(15):
    repo_meta_api(idx2iid[related[i][0]])

tensorflow/tensorflow
BVLC/caffe
lammps/lammps
keras-team/keras
PaddlePaddle/FluidDoc
ros/ros
FacultadInformatica-LinkedData/Curso2016-2017
tensorflow/models
fkanehiro/hrpsys-base
thoth-station/amun-api
MouseLand/suite2p
equinor/gordo
opentargets/data_pipeline
thoth-station/cve-update-job
nest/nestml


In [62]:
# 비연관 아이템
for i in range(1,15):
    repo_meta_api(idx2iid[related[-i][0]])

nerds-and-company/schematic
ezsystems/DemoBundle
Teleopti/styleguide
symfony/swiftmailer-bundle
ezsystems/LegacyBridge
ImpactDevelopment/ImpactIssues
webdevops/TYPO3-metaseo
railt/railt
contao/installation-bundle
illuminate/auth
codefog/contao-haste
Mangopay/mangopay2-php-sdk


In [69]:
ui_count[ui_count.repo_id.isin([idx2iid[i[0]] for i in related[:15]])].groupby('repo_id')['count'].sum()

repo_id
7789923        86
12007030       65
12791642     1570
18689903       16
33015583     2142
41415230        5
45717250     6432
49882925        5
51117837     2246
67412339        5
136450404      60
141175642       5
143154587       5
150013302       6
152386408       5
Name: count, dtype: int64

In [70]:
ui_count[ui_count.repo_id.isin([idx2iid[i[0]] for i in related[-15:]])].groupby('repo_id')['count'].sum()

repo_id
1007121     11
2641408     51
2994528     11
4365438     10
8177076     11
11449909    14
13468135     6
23513497     7
29585452    24
31259653    11
36852343    16
39899230     8
44344253     7
76078736     9
91753282     6
Name: count, dtype: int64

In [73]:
GITHUB_URL = "https://api.github.com"

In [80]:
res = requests.get("/".join((
    GITHUB_URL, 'repos', 'benfred','implicit')))

In [81]:
json.loads(res.text)

{'id': 56417681,
 'node_id': 'MDEwOlJlcG9zaXRvcnk1NjQxNzY4MQ==',
 'name': 'implicit',
 'full_name': 'benfred/implicit',
 'private': False,
 'owner': {'login': 'benfred',
  'id': 69536,
  'node_id': 'MDQ6VXNlcjY5NTM2',
  'avatar_url': 'https://avatars2.githubusercontent.com/u/69536?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/benfred',
  'html_url': 'https://github.com/benfred',
  'followers_url': 'https://api.github.com/users/benfred/followers',
  'following_url': 'https://api.github.com/users/benfred/following{/other_user}',
  'gists_url': 'https://api.github.com/users/benfred/gists{/gist_id}',
  'starred_url': 'https://api.github.com/users/benfred/starred{/owner}{/repo}',
  'subscriptions_url': 'https://api.github.com/users/benfred/subscriptions',
  'organizations_url': 'https://api.github.com/users/benfred/orgs',
  'repos_url': 'https://api.github.com/users/benfred/repos',
  'events_url': 'https://api.github.com/users/benfred/events{/privacy}',
  'received_event

In [82]:
df[df.repo_id==56417681]

Unnamed: 0,repo_id,actor_id,type,count
110450,56417681,7275205,ForkEvent,1
221617,56417681,22462463,ForkEvent,1
512856,56417681,6124706,WatchEvent,1
539322,56417681,2549270,WatchEvent,1
683495,56417681,876666,WatchEvent,1
...,...,...,...,...
40238574,56417681,10204855,WatchEvent,1
40305702,56417681,860719,WatchEvent,1
40918920,56417681,357835,IssueCommentEvent,6
40920198,56417681,630936,IssueCommentEvent,2


In [97]:
related = model.similar_items(iid_to_idx[56417681], ui_count.repo_id.nunique())

In [98]:
# 연관 아이템
for i in range(15):
    repo_meta_api(idx2iid[related[i][0]])

benfred/implicit
RaRe-Technologies/gensim
automl/auto-sklearn
EpistasisLab/tpot
imagej/imagej
machinalis/iepy
zenogantner/MyMediaLite
CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers
DEAP/deap
abhiTronix/vidgear
mariusmuja/flann
lyst/lightfm
ugeneunipro/ugene
scijava/scijava-common
noamraph/tqdm


In [99]:
# 비연관 아이템
for i in range(1,15):
    repo_meta_api(idx2iid[related[-i][0]])

biud436/MV
lzim/teampsd
toconnell/kdm-manager
CougsInSpace/CougSat1-Hardware
ideals/Ideal-CMS
vm6502q/qrack
bbeck13/RhythmRunner
NeuTrix/redux-crud-todo-cli
e18rtest/test
NCAR/METplus
LucaDiStasio/thinPlyMechanics


### 인사이트 정리
- 액션에 따라 추천 모델이 서빙하는 결과가 크게 다르다 ex. commit vs star
  - 멀티액션 모델?
- 유저 단위 추천에서, 유저의 action intesity가 반영되지 않는 것 같다
  - ALS의 특성 상 confidence를 계산하기 때문에 그럴 수 있다
  - 유저 count를 임베딩 벡터에 곱해줄까?
- Organization과 repository가 같은 경우
- tf에서 토치 진영 쪽(apex) 나오는 건 어떻게 해결할까?