In [18]:
from sourced.engine import Engine
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder\
.master("local[*]")\
.appName("Examples")\
.config('spark.executor.memory', '2g')\
.config('spark.executor.cores', '2')\
.config('spark.cores.max', '2')\
.config('spark.driver.memory','2g')\
.getOrCreate()

engine = Engine(spark, "/repositories", "siva")

print("%d repositories successfully loaded" % (engine.repositories.count()))


7 repositories successfully loaded


In [19]:
seed=1234

## Repositories schema

In [20]:
engine.repositories.printSchema()
# engine.repositories.select()

root
 |-- id: string (nullable = false)
 |-- urls: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- is_fork: boolean (nullable = true)
 |-- repository_path: string (nullable = true)



## References schema

In [21]:
engine.repositories.references.printSchema()

root
 |-- repository_id: string (nullable = false)
 |-- name: string (nullable = false)
 |-- hash: string (nullable = false)
 |-- is_remote: boolean (nullable = false)



## Commits schema

In [22]:
engine.repositories.references.commits.printSchema()

root
 |-- repository_id: string (nullable = false)
 |-- reference_name: string (nullable = false)
 |-- index: integer (nullable = false)
 |-- hash: string (nullable = false)
 |-- message: string (nullable = false)
 |-- parents: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- parents_count: integer (nullable = false)
 |-- author_email: string (nullable = true)
 |-- author_name: string (nullable = true)
 |-- author_date: timestamp (nullable = true)
 |-- committer_email: string (nullable = true)
 |-- committer_name: string (nullable = true)
 |-- committer_date: timestamp (nullable = true)



In [23]:
engine.repositories.references.filter("is_remote=true")\
                              .filter("NOT name LIKE 'refs/heads/HEAD' AND name LIKE 'refs/heads/%'")\
                              .commits.printSchema()

root
 |-- repository_id: string (nullable = false)
 |-- reference_name: string (nullable = false)
 |-- index: integer (nullable = false)
 |-- hash: string (nullable = false)
 |-- message: string (nullable = false)
 |-- parents: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- parents_count: integer (nullable = false)
 |-- author_email: string (nullable = true)
 |-- author_name: string (nullable = true)
 |-- author_date: timestamp (nullable = true)
 |-- committer_email: string (nullable = true)
 |-- committer_name: string (nullable = true)
 |-- committer_date: timestamp (nullable = true)



In [24]:
df = engine.repositories.references.filter("is_remote=true")\
                              .filter("NOT name LIKE 'refs/heads/HEAD' AND name LIKE 'refs/heads/%'")\
                              .commits
df.select("repository_id", "reference_name", "committer_date").show(10, False)

+--------------------------------+-------------------+-------------------+
|repository_id                   |reference_name     |committer_date     |
+--------------------------------+-------------------+-------------------+
|github.com/gruns/furl           |refs/heads/asdict  |2017-09-02 11:13:48|
|github.com/gruns/furl           |refs/heads/asdict  |2017-09-02 11:13:48|
|github.com/gruns/furl           |refs/heads/master  |2018-01-11 03:56:53|
|github.com/gruns/furl           |refs/heads/master  |2018-01-11 03:56:53|
|github.com/gpjt/webgl-lessons   |refs/heads/master  |2013-04-10 18:01:08|
|github.com/gpjt/webgl-lessons   |refs/heads/master  |2013-04-10 18:01:08|
|github.com/tparisi/webgl-lessons|refs/heads/master  |2015-11-26 22:20:49|
|github.com/simongog/sdsl        |refs/heads/gh-pages|2012-08-13 05:33:49|
|github.com/simongog/sdsl        |refs/heads/gh-pages|2012-08-13 05:33:49|
|github.com/PyCQA/pycodestyle    |refs/heads/gh-502  |2016-06-26 04:00:15|
+------------------------

In [25]:
# Get refs with date = latest date

In [26]:
df = engine.repositories.references.filter("is_remote=true")\
                              .filter("NOT name LIKE 'refs/heads/HEAD' AND name LIKE 'refs/heads/%'")\
                              .commits
df.groupBy("repository_id").agg(F.max("committer_date")).show()
repo_id_to_latest_commit = df.groupBy("repository_id").agg(F.max("committer_date"))\
                             .withColumnRenamed('repository_id', 'repo_id')

+--------------------+-------------------+
|       repository_id|max(committer_date)|
+--------------------+-------------------+
|github.com/gruns/...|2018-01-11 03:56:53|
|github.com/eomaho...|2016-09-16 15:31:50|
|github.com/PyCQA/...|2018-01-24 17:48:24|
|github.com/github...|2017-12-12 01:35:04|
|github.com/simong...|2012-08-13 05:33:49|
|github.com/gpjt/w...|2013-04-10 18:01:08|
|github.com/tparis...|2015-11-26 22:20:49|
+--------------------+-------------------+



 ## Get branch with latest commit for each repository

In [27]:
conditions = []
conditions.append(repo_id_to_latest_commit.repo_id == df.repository_id)
conditions.append(repo_id_to_latest_commit["max(committer_date)"] == df.committer_date)
branch_with_latest_commit = df.join(repo_id_to_latest_commit, conditions, "inner")\
                              .distinct()\
                              .select('repository_id', 'reference_name', "committer_date")
branch_with_latest_commit.show(10, False)

+--------------------------------+--------------------------------+-------------------+
|repository_id                   |reference_name                  |committer_date     |
+--------------------------------+--------------------------------+-------------------+
|github.com/tparisi/webgl-lessons|refs/heads/master               |2015-11-26 22:20:49|
|github.com/simongog/sdsl        |refs/heads/gh-pages             |2012-08-13 05:33:49|
|github.com/eomahony/Numberjack  |refs/heads/master               |2016-09-16 15:31:50|
|github.com/gruns/furl           |refs/heads/master               |2018-01-11 03:56:53|
|github.com/github/markup        |refs/heads/kivikakk/pass-symlink|2017-12-12 01:35:04|
|github.com/gpjt/webgl-lessons   |refs/heads/master               |2013-04-10 18:01:08|
+--------------------------------+--------------------------------+-------------------+



## Splitting train and test sets

We can use randomSplit over the repositories DataFrame to get the train and test sets. The same can be done later to get the train and validation sets depending on the specific cross-validation approach used. 

In [28]:
data = branch_with_latest_commit\
        .withColumnRenamed('repository_id', 'repo_id')\
        .withColumnRenamed('reference_name', 'ref_name')\

[train, test] = data.randomSplit([0.8, 0.2], seed)

print("Total count %d || Train count %d || Test count %d" % (data.count(), train.count(), test.count()))

Total count 3 || Train count 4 || Test count 2


## Prepare train dataset with Python UASTs

First, get Python blobs with UASTs:

In [31]:
# Get repo_ids and ref_names to filter and avoid extracting UASTs on all blobs
repo_ids, ref_names = set(), set()

# Get python list of Row objects contain repository_id and reference_name
repo_ref_list = branch_with_latest_commit.select("repository_id", "reference_name").collect()
for row in repo_ref_list: 
    repo_ids.add(row.repository_id)
    ref_names.add(row.reference_name)

languages = ["Python"]
# Get blobs with UASTs
blobs = df.blobs
filtered_blobls = blobs\
                    .repartition(2)\
                    .filter(blobs.repository_id.isin(repo_ids))\
                    .filter(blobs.reference_name.isin(ref_names))\
                    .classify_languages()\
                    .filter("is_binary = false")\
                    .filter(F.col("lang").isin(languages))\
                    .dropDuplicates(['blob_id'])\
                    .cache()

In [33]:
blobs.count()

122

In [43]:
python_uasts = python_blobs\
    .repartition(32)\
    .extract_uasts()\
    .drop("content")\
    .cache()

In [44]:
python_uasts.count()

342

Join python_blobs dataframe with the train dataframe to get UASTS we want to use for training:

In [45]:
train_python = train\
    .join(python_uasts, 
       (train.repo_id == python_uasts.repository_id) 
        & (train.ref_name == python_uasts.reference_name))\
    .cache()

In [46]:
train_python.count()

314

In [47]:
train_python.printSchema()

root
 |-- repo_id: string (nullable = false)
 |-- ref_name: string (nullable = false)
 |-- index: integer (nullable = false)
 |-- hash: string (nullable = false)
 |-- message: string (nullable = false)
 |-- parents: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- parents_count: integer (nullable = false)
 |-- author_email: string (nullable = true)
 |-- author_name: string (nullable = true)
 |-- author_date: timestamp (nullable = true)
 |-- committer_email: string (nullable = true)
 |-- committer_name: string (nullable = true)
 |-- committer_date: timestamp (nullable = true)
 |-- blob_id: string (nullable = true)
 |-- commit_hash: string (nullable = true)
 |-- repository_id: string (nullable = true)
 |-- reference_name: string (nullable = true)
 |-- is_binary: boolean (nullable = false)
 |-- path: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- uast: array (nullable = false)
 |    |-- element: binary (containsNull = true)

