In [1]:
import os
import sys

# Directory put on sys.path before the albedo_toolkit imports in the following cells
# (presumably where that package lives). Set the ALBEDO_DEPS_PATH environment
# variable to run the notebook on another machine; the default is the original
# hard-coded location, so existing setups behave exactly as before.
DEPS_PATH = os.environ.get(
    'ALBEDO_DEPS_PATH',
    '/Users/vinta/Projects/albedo/src/main/python/deps/',
)

# Insert at index 1 as before, but only once: re-running this cell must not
# keep stacking duplicate entries onto sys.path.
if DEPS_PATH not in sys.path:
    sys.path.insert(1, DEPS_PATH)

In [2]:
from albedo_toolkit.common import loadRawData

# Load the raw dataset through the project helper (its data source and schema
# are not visible in this notebook).
rawDF = loadRawData()

# Inspect how many partitions back the raw DataFrame; the recorded run reported 1.
rawDF.rdd.getNumPartitions()

1

In [3]:
from albedo_toolkit.transformers import RatingBuilder

# Turn the raw data into rating rows with the project's RatingBuilder transformer.
# The recorded output shows the resulting columns: user, item, rating, starred_at.
ratingBuilder = RatingBuilder()
ratingDF = ratingBuilder.transform(rawDF)
# Mark ratingDF for caching: it is reused by the statistics and cleaning cells below.
ratingDF.cache()

# Partition count after the transform; the recorded run reported 200.
ratingDF.rdd.getNumPartitions()

DataFrame[user: int, item: int, rating: int, starred_at: timestamp]

200

In [4]:
import pyspark.sql.functions as F

# Overall size of the rating data: rating rows, distinct users, distinct items.
ratingDF.agg(
    F.count('rating'),
    F.countDistinct('user'),
    F.countDistinct('item'),
).show()

# Number of rating rows per item, largest counts first.
stargazersCountDF = (
    ratingDF
    .groupBy('item')
    .agg(F.count('user').alias('stargazers_count'))
    .orderBy(F.desc('stargazers_count'))
)
stargazersCountDF.show()

# Number of rating rows per user, largest counts first.
starredCountDF = (
    ratingDF
    .groupBy('user')
    .agg(F.count('item').alias('starred_count'))
    .orderBy(F.desc('starred_count'))
)
starredCountDF.show()

+-------------+--------------------+--------------------+
|count(rating)|count(DISTINCT user)|count(DISTINCT item)|
+-------------+--------------------+--------------------+
|      3121629|               10483|              551216|
+-------------+--------------------+--------------------+

+--------+----------------+
|    item|stargazers_count|
+--------+----------------+
| 2126244|            2211|
|10270250|            1683|
|  943149|            1605|
|  291137|            1567|
|13491895|            1526|
| 9384267|            1480|
| 3544424|            1468|
| 7691631|            1441|
|29028775|            1427|
| 1334369|            1399|
| 6498492|            1395|
|21737465|            1375|
|  211666|            1343|
| 3470471|            1318|
| 3100121|            1263|
| 1062897|            1251|
|45717250|            1250|
| 1861458|            1213|
|14440270|            1210|
|  460078|            1200|
+--------+----------------+
only showing top 20 rows

+-------+--

In [5]:
from albedo_toolkit.transformers import DataCleaner

# Thresholds handed to DataCleaner. Presumably items/users whose counts fall
# outside these bounds are filtered out -- confirm against DataCleaner itself.
cleanerParams = dict(
    minItemStargazersCount=2,
    maxItemStargazersCount=4000,
    minUserStarredCount=2,
    maxUserStarredCount=5000,
)
dataCleaner = DataCleaner(**cleanerParams)
cleanDF = dataCleaner.transform(ratingDF)

In [6]:
# Same summary as for ratingDF, now on the cleaned data, to see how much was removed.
(
    cleanDF
    .agg(
        F.count('rating'),
        F.countDistinct('user'),
        F.countDistinct('item'),
    )
    .show()
)

+-------------+--------------------+--------------------+
|count(rating)|count(DISTINCT user)|count(DISTINCT item)|
+-------------+--------------------+--------------------+
|      2761118|               10472|              245626|
+-------------+--------------------+--------------------+



In [7]:
from pyspark.ml.recommendation import ALS

# Fit on the complete cleaned dataset; no train/test split is made in this cell.
wholeDF = cleanDF
wholeDF.cache()

# ALS in implicit-preference mode with a fixed seed; the remaining
# hyperparameters are applied through the chained setters.
als = (
    ALS(implicitPrefs=True, seed=42)
    .setRank(50)
    .setRegParam(0.5)
    .setAlpha(40)
    .setMaxIter(22)
)

alsModel = als.fit(wholeDF)

DataFrame[user: int, item: int, rating: int, starred_at: timestamp]

In [8]:
from albedo_toolkit.evaluators import RankingEvaluator
from albedo_toolkit.transformers import PredictionProcessor

# Cutoff passed to the ranking evaluator; the recommendation cell below reuses it as topN.
k = 30

# Score the same DataFrame the model was fitted on, then run the project's
# PredictionProcessor over the predictions (presumably to shape them for the
# evaluator -- confirm against its implementation).
predictedDF = alsModel.transform(wholeDF)
predictionProcessor = PredictionProcessor()
predictionDF = predictionProcessor.transform(predictedDF)

# Evaluate and report the score, labelled NDCG.
rankingEvaluator = RankingEvaluator(k=k)
ndcg = rankingEvaluator.evaluate(predictionDF)
print('NDCG', ndcg)

NDCG 0.33227421748627317


In [9]:
from albedo_toolkit.common import recommendItems

# Ask the fitted model for the top-k repositories for one user; known items are
# deliberately not excluded (excludeKnownItems=False).
username = 'vinta'
recommendedItemsDF = recommendItems(
    rawDF,
    alsModel,
    username,
    topN=k,
    excludeKnownItems=False,
)

# Print one line per recommendation: repo URL, predicted score, language, star count.
for item in recommendedItemsDF.collect():
    repoUrl = 'https://github.com/{0}'.format(item['repo_full_name'])
    print(repoUrl, item['prediction'], item['repo_language'], item['stargazers_count'])

https://github.com/squeaky-pl/japronto 1.303621768951416 C 4271
https://github.com/codelucas/newspaper 1.2780914306640625 Python 4616
https://github.com/locustio/locust 1.276933193206787 Python 5246
https://github.com/faif/python-patterns 1.231217384338379 Python 11445
https://github.com/MagicStack/uvloop 1.2143199443817139 Python 3533
https://github.com/dbader/schedule 1.211169958114624 Python 3081
https://github.com/django/channels 1.2050305604934692 Python 2097
https://github.com/crsmithdev/arrow 1.203598976135254 Python 4236
https://github.com/pennersr/django-allauth 1.1907161474227905 Python 3031
https://github.com/scrapinghub/portia 1.1849253177642822 Python 5084
https://github.com/PyMySQL/PyMySQL 1.1842286586761475 Python 2727
https://github.com/channelcat/sanic 1.1837971210479736 Python 5808
https://github.com/rosarior/awesome-django 1.1822545528411865  5498
https://github.com/getsentry/sentry 1.1799488067626953 Python 12651
https://github.com/tqdm/tqdm 1.175230860710144 Python