From 1d4f2e4f216d43f808d2681d21cbc3d29c152669 Mon Sep 17 00:00:00 2001
From: Liubov Yaronskaya <lyaronskaya@sourcerer.io>
Date: Tue, 23 Oct 2018 14:12:00 +0300
Subject: [PATCH 1/5] feat: colleagues(fast).

---
 build.gradle                                  |   1 +
 src/main/kotlin/app/api/Api.kt                |  10 +-
 src/main/kotlin/app/api/MockApi.kt            |  17 +--
 src/main/kotlin/app/api/ServerApi.kt          |  24 ++--
 src/main/kotlin/app/hashers/Colleagues.kt     |  63 ++++++++++
 src/main/kotlin/app/hashers/CommitCrawler.kt  | 112 ++++++++++++++++++
 src/main/kotlin/app/hashers/RepoHasher.kt     |   8 +-
 src/main/kotlin/app/model/AuthorDistance.kt   |  38 ++++++
 .../kotlin/app/model/AuthorDistanceGroup.kt   |  33 ++++++
 src/main/proto/sourcerer.proto                |  11 ++
 .../tests/hashers/AuthorDistanceHasherTest.kt |  86 ++++++++++++++
 11 files changed, 377 insertions(+), 26 deletions(-)
 create mode 100644 src/main/kotlin/app/hashers/Colleagues.kt
 create mode 100644 src/main/kotlin/app/model/AuthorDistance.kt
 create mode 100644 src/main/kotlin/app/model/AuthorDistanceGroup.kt
 create mode 100644 src/test/kotlin/test/tests/hashers/AuthorDistanceHasherTest.kt

diff --git a/build.gradle b/build.gradle
index dfe250a8..7fc9d45a 100644
--- a/build.gradle
+++ b/build.gradle
@@ -61,6 +61,7 @@ buildConfig {
     buildConfigField 'boolean', 'LONGEVITY_ENABLED', project.hasProperty('longevity-enabled') ? project.property('longevity-enabled').toString() : 'false'
     buildConfigField 'long', 'HEARTBEAT_RATE', project.hasProperty('heartbeat-rate') ? project.property('heartbeat-rate').toString() : '60000'
     buildConfigField 'boolean', 'META_HASHER_ENABLED', project.hasProperty('meta-hasher-enabled') ? project.property('meta-hasher-enabled').toString() : 'true'
+    buildConfigField 'boolean', 'DISTANCES_ENABLED', project.hasProperty('distances-enabled') ? project.property('distances-enabled').toString() : 'true'
 
     buildConfig
 }
diff --git a/src/main/kotlin/app/api/Api.kt b/src/main/kotlin/app/api/Api.kt
index 85161b3f..140945ee 100644
--- a/src/main/kotlin/app/api/Api.kt
+++ b/src/main/kotlin/app/api/Api.kt
@@ -3,13 +3,7 @@
 
 package app.api
 
-import app.model.Author
-import app.model.Commit
-import app.model.Fact
-import app.model.Process
-import app.model.ProcessEntry
-import app.model.Repo
-import app.model.User
+import app.model.*
 
 interface Api {
     companion object {
@@ -36,4 +30,6 @@ interface Api {
     fun postAuthors(authorsList: List<Author>): Result<Unit>
     fun postProcessCreate(requestNumEntries: Int): Result<Process>
     fun postProcess(processEntries: List<ProcessEntry>): Result<Unit>
+    fun postAuthorDistances(authorDistanceList: List<AuthorDistance>):
+            Result<Unit>
 }
diff --git a/src/main/kotlin/app/api/MockApi.kt b/src/main/kotlin/app/api/MockApi.kt
index 0a65a9b0..13d86a8a 100644
--- a/src/main/kotlin/app/api/MockApi.kt
+++ b/src/main/kotlin/app/api/MockApi.kt
@@ -4,13 +4,7 @@
 package app.api
 
 import app.Logger
-import app.model.Author
-import app.model.Commit
-import app.model.Repo
-import app.model.Fact
-import app.model.Process
-import app.model.ProcessEntry
-import app.model.User
+import app.model.*
 
 class MockApi(  // GET requests.
     var mockUser: User = User(),
@@ -25,6 +19,7 @@ class MockApi(  // GET requests.
     var receivedUsers: MutableList<User> = mutableListOf()
     var receivedProcessCreate: MutableList<Process> = mutableListOf()
     var receivedProcess: MutableList<Process> = mutableListOf()
+    var receivedDistances: MutableList<AuthorDistance> = mutableListOf()
 
     // DELETE requests.
     var receivedDeletedCommits: MutableList<Commit> = mutableListOf()
@@ -93,4 +88,12 @@ class MockApi(  // GET requests.
         receivedProcess.add(Process(entries = processEntries))
         return Result()
     }
+
+    override fun postAuthorDistances(
+            authorDistanceList: List<AuthorDistance>): Result<Unit> {
+        Logger.debug { "MockApi: postAuthorDistances request " +
+                "(${authorDistanceList.size} distances)" }
+        receivedDistances.addAll(authorDistanceList)  // Kept for inspection.
+        return Result()
+    }
 }
diff --git a/src/main/kotlin/app/api/ServerApi.kt b/src/main/kotlin/app/api/ServerApi.kt
index ae4cbb40..f1c72cb3 100644
--- a/src/main/kotlin/app/api/ServerApi.kt
+++ b/src/main/kotlin/app/api/ServerApi.kt
@@ -6,16 +6,7 @@ package app.api
 import app.BuildConfig
 import app.Logger
 import app.config.Configurator
-import app.model.Author
-import app.model.AuthorGroup
-import app.model.Commit
-import app.model.CommitGroup
-import app.model.Fact
-import app.model.FactGroup
-import app.model.Process
-import app.model.ProcessEntry
-import app.model.Repo
-import app.model.User
+import app.model.*
 import com.github.kittinunf.fuel.core.FuelManager
 import com.github.kittinunf.fuel.core.Method
 import com.github.kittinunf.fuel.core.Request
@@ -125,6 +116,13 @@ class ServerApi (private val configurator: Configurator) : Api {
                                .body(process.serialize())
     }
 
+    /** Builds the POST "/distances" request carrying the serialized group. */
+    private fun createRequestPostAuthorDistances(
+            distances: AuthorDistanceGroup): Request {
+        val request = post("/distances").header(getContentTypeHeader())
+        return request.body(distances.serialize())
+    }
+
     private fun <T> makeRequest(request: Request,
                                 requestName: String,
                                 parser: (ByteArray) -> T): Result<T> {
@@ -214,4 +212,10 @@ class ServerApi (private val configurator: Configurator) : Api {
         return makeRequest(createRequestPostProcess(process), "postProcess", {})
     }
 
+    override fun postAuthorDistances(authorDistanceList: List<AuthorDistance>):
+            Result<Unit> {
+        val distances = AuthorDistanceGroup(authorDistanceList)  // One payload.
+        return makeRequest(createRequestPostAuthorDistances(distances),
+                "postAuthorDistances", {})  // Label matches the method name.
+    }
 }
diff --git a/src/main/kotlin/app/hashers/Colleagues.kt b/src/main/kotlin/app/hashers/Colleagues.kt
new file mode 100644
index 00000000..cf94420b
--- /dev/null
+++ b/src/main/kotlin/app/hashers/Colleagues.kt
@@ -0,0 +1,63 @@
+// Copyright 2018 Sourcerer Inc. All Rights Reserved.
+// Author: Liubov Yaronskaya (lyaronskaya@sourcerer.io)
+
+package app.hashers
+
+import app.FactCodes
+import app.api.Api
+import app.model.Author
+import app.model.Fact
+import app.model.Repo
+import io.reactivex.Observable
+import java.util.concurrent.TimeUnit
+
+class AuthorDistanceHasher(
+        private val serverRepo: Repo,
+        private val api: Api,
+        private val emails: HashSet<String>,
+        private val userEmails: HashSet<String>) {
+    fun updateFromObservable(observable: Observable<Triple<String,
+            List<String>, Long>>, onError: (Throwable) -> Unit) {
+        val authorScores = hashMapOf<String, Double>()
+        emails.forEach { authorScores[it] = 0.0 }
+
+        // Store the time of the earliest commit for a path by user.
+        val authorPathLastContribution = hashMapOf<String, Long>()
+
+        observable.subscribe({ (email, paths, time) ->
+            if (email in userEmails) {
+                paths.forEach { path ->
+                    authorPathLastContribution[path] = time
+                }
+            }
+            else {
+                val score = paths
+                     .filter { path -> path in authorPathLastContribution }
+                     .filter { path ->
+                        val authorTime = authorPathLastContribution[path]!!
+                        val timeDelta = TimeUnit.DAYS.convert(
+                                authorTime - time, TimeUnit.SECONDS)
+                         timeDelta < 365
+                     }.size
+                authorScores[email] = authorScores[email]!! + score
+            }
+        }, onError, {
+            val stats = mutableListOf<Fact>()
+            val author = Author(email = userEmails.toList()[0])
+            authorScores.forEach { email, value ->
+                if (email !in userEmails) {
+                    stats.add(Fact(serverRepo, FactCodes.COLLEAGUES, value =
+                    email, value2 = value.toString(), author = author))
+                }
+            }
+
+            postDistancesToServer(stats)
+        })
+    }
+
+    private fun postDistancesToServer(stats: List<Fact>) {
+        if (stats.isNotEmpty()) {
+            api.postFacts(stats).onErrorThrow()
+        }
+    }
+}
diff --git a/src/main/kotlin/app/hashers/CommitCrawler.kt b/src/main/kotlin/app/hashers/CommitCrawler.kt
index f784e904..3d1e7b1f 100644
--- a/src/main/kotlin/app/hashers/CommitCrawler.kt
+++ b/src/main/kotlin/app/hashers/CommitCrawler.kt
@@ -251,6 +251,118 @@ object CommitCrawler {
         })
     }
 
+    fun getJGitPathsObservable(git: Git,
+                               tail : RevCommit? = null) :
+            Observable<Triple<String, List<String>, Long>> = Observable.create {
+        subscriber ->
+        val repo: Repository = git.repository
+        val revWalk = RevWalk(repo)
+        val head: RevCommit =
+                try { revWalk.parseCommit(getDefaultBranchHead(git)) }
+                catch(e: Exception) { throw Exception("No head was found!") }
+
+        val df = DiffFormatter(DisabledOutputStream.INSTANCE)
+        df.setRepository(repo)
+        df.isDetectRenames = true
+
+        val confTreeWalk = TreeWalk(repo)
+        confTreeWalk.addTree(head.tree)
+        confTreeWalk.filter = PathFilter.create(CONF_FILE_PATH)
+
+        var ignoredPaths =
+                if (confTreeWalk.next()) {
+                    getIgnoredPaths(repo, confTreeWalk.getObjectId(0))
+                }
+                else {
+                    listOf()
+                }
+
+        var commitCount = 0
+        revWalk.markStart(head)
+        var commit: RevCommit? = revWalk.next()  // Move the walker to the head.
+        while (commit != null && commit != tail) {
+            commitCount++
+            val parentCommit: RevCommit? = revWalk.next()
+
+            // Smart casts are not yet supported for a mutable variable captured
+            // in an inline lambda, see
+            // https://youtrack.jetbrains.com/issue/KT-7186.
+            if (Logger.isTrace) {
+                val commitName = commit.name
+                val commitMsg = commit.shortMessage
+                Logger.trace { "commit: $commitName; '$commitMsg'" }
+                if (parentCommit != null) {
+                    val parentCommitName = parentCommit.name
+                    val parentCommitMsg = parentCommit.shortMessage
+                    Logger.trace { "parent commit: $parentCommitName; " +
+                            "'$parentCommitMsg'" }
+                }
+                else {
+                    Logger.trace { "parent commit: null" }
+                }
+            }
+
+            val email = commit.authorIdent.emailAddress.toLowerCase()
+
+            val diffEntries = df.scan(parentCommit, commit)
+            val paths = diffEntries
+                    .filter { diff ->
+                        diff.changeType != DiffEntry.ChangeType.COPY
+                    }
+                    .filter { diff ->
+                        val path = diff.newPath
+                        for (cnv in VendorConventions) {
+                            if (cnv.containsMatchIn(path)) {
+                                return@filter false
+                            }
+                        }
+
+                        val fileId =
+                                if (path != DiffEntry.DEV_NULL) {
+                                    diff.newId.toObjectId()
+                                } else {
+                                    diff.oldId.toObjectId()
+                                }
+                        val stream = try {
+                            repo.open(fileId).openStream()
+                        } catch (e: Exception) {
+                            null
+                        }
+                        stream != null && !RawText.isBinary(stream)
+                    }
+                    .mapNotNull { diff ->
+                        val filePath =
+                                if (diff.getNewPath() != DiffEntry.DEV_NULL) {
+                                    diff.getNewPath()
+                                } else {
+                                    diff.getOldPath()
+                                }
+
+                        // Update ignored paths list. The config file has retroactive
+                        // force, i.e. if it was added at this commit, then we presume
+                        // it is applied to all commits, preceding this commit.
+                        if (diff.oldPath == CONF_FILE_PATH) {
+                            ignoredPaths =
+                                    getIgnoredPaths(repo, diff.newId.toObjectId())
+                        }
+
+                        if (!ignoredPaths.any { path ->
+                            if (path.endsWith("/")) {
+                                filePath.startsWith(path)
+                            }
+                            else {
+                                path == filePath
+                            }
+                        }) {filePath} else null
+                    }
+            val date = commit.authorIdent.getWhen().time / 1000
+            subscriber.onNext(Triple(email, paths, date))
+            commit = parentCommit
+        }
+
+        subscriber.onComplete()
+    }
+
     private fun getDiffFiles(jgitRepo: Repository,
                              jgitDiffs: List<JgitDiff>) : List<DiffFile> {
         return jgitDiffs
diff --git a/src/main/kotlin/app/hashers/RepoHasher.kt b/src/main/kotlin/app/hashers/RepoHasher.kt
index da0ae0c4..a428e7e5 100644
--- a/src/main/kotlin/app/hashers/RepoHasher.kt
+++ b/src/main/kotlin/app/hashers/RepoHasher.kt
@@ -7,8 +7,6 @@ import app.BuildConfig
 import app.Logger
 import app.api.Api
 import app.config.Configurator
-import app.extractors.Extractor
-import app.extractors.Heuristics
 import app.model.Author
 import app.model.LocalRepo
 import app.model.ProcessEntry
@@ -98,6 +96,12 @@ class RepoHasher(private val api: Api,
                                            commitsCount = commitsCount,
                                            userEmails = userEmail)
             }
+            if (BuildConfig.DISTANCES_ENABLED) {
+                val userEmails = configurator.getUser().emails.map { it.email }.toHashSet()
+                val pathsObservable = CommitCrawler.getJGitPathsObservable(git)
+                AuthorDistanceHasher(serverRepo, api, emails, userEmails)
+                        .updateFromObservable(pathsObservable, onError)
+            }
 
             // Start and synchronously wait until all subscribers complete.
             Logger.print("Stats computation. May take a while...")
diff --git a/src/main/kotlin/app/model/AuthorDistance.kt b/src/main/kotlin/app/model/AuthorDistance.kt
new file mode 100644
index 00000000..9fb959a8
--- /dev/null
+++ b/src/main/kotlin/app/model/AuthorDistance.kt
@@ -0,0 +1,38 @@
+// Copyright 2018 Sourcerer Inc. All Rights Reserved.
+// Author: Liubov Yaronskaya (lyaronskaya@sourcerer.io)
+
+package app.model
+
+import app.Protos
+import com.google.protobuf.InvalidProtocolBufferException
+import java.security.InvalidParameterException
+
+/** Per-repo score of an author e-mail; mirrors Protos.AuthorDistance. */
+data class AuthorDistance(
+        var repo: Repo = Repo(),
+        var email: String = "",
+        var score: Double = 0.0
+) {
+    @Throws(InvalidParameterException::class)
+    constructor(proto: Protos.AuthorDistance) : this(
+            repo = Repo(rehash = proto.repoRehash),
+            email = proto.email,
+            score = proto.score
+    )
+
+    @Throws(InvalidProtocolBufferException::class)
+    constructor(bytes: ByteArray) : this(Protos.AuthorDistance.parseFrom(bytes))
+
+    constructor(serialized: String) : this(serialized.toByteArray())
+
+    /** Builds the protobuf message; only the repo's rehash is carried over. */
+    fun getProto(): Protos.AuthorDistance =
+            Protos.AuthorDistance.newBuilder()
+                    .setRepoRehash(repo.rehash)
+                    .setEmail(email)
+                    .setScore(score)
+                    .build()
+
+    /** Returns the protobuf wire bytes of [getProto]. */
+    fun serialize(): ByteArray = getProto().toByteArray()
+}
diff --git a/src/main/kotlin/app/model/AuthorDistanceGroup.kt b/src/main/kotlin/app/model/AuthorDistanceGroup.kt
new file mode 100644
index 00000000..b6aee1f7
--- /dev/null
+++ b/src/main/kotlin/app/model/AuthorDistanceGroup.kt
@@ -0,0 +1,33 @@
+// Copyright 2018 Sourcerer Inc. All Rights Reserved.
+// Author: Liubov Yaronskaya (lyaronskaya@sourcerer.io)
+
+package app.model
+
+import app.Protos
+import com.google.protobuf.InvalidProtocolBufferException
+import java.security.InvalidParameterException
+
+/** Batch of [AuthorDistance] entries; mirrors Protos.AuthorDistanceGroup. */
+data class AuthorDistanceGroup(
+        var stats: List<AuthorDistance> = listOf()
+) {
+    @Throws(InvalidParameterException::class)
+    constructor(proto: Protos.AuthorDistanceGroup) : this(
+            stats = proto.authorDistancesList.map { AuthorDistance(it) }
+    )
+
+    @Throws(InvalidProtocolBufferException::class)
+    constructor(bytes: ByteArray) :
+            this(Protos.AuthorDistanceGroup.parseFrom(bytes))
+
+    constructor(serialized: String) : this(serialized.toByteArray())
+
+    /** Builds the protobuf message holding every entry of [stats]. */
+    fun getProto(): Protos.AuthorDistanceGroup =
+            Protos.AuthorDistanceGroup.newBuilder()
+                    .addAllAuthorDistances(stats.map { it.getProto() })
+                    .build()
+
+    /** Returns the protobuf wire bytes of [getProto]. */
+    fun serialize(): ByteArray = getProto().toByteArray()
+}
diff --git a/src/main/proto/sourcerer.proto b/src/main/proto/sourcerer.proto
index c63d7f65..3aaa2e65 100644
--- a/src/main/proto/sourcerer.proto
+++ b/src/main/proto/sourcerer.proto
@@ -1,5 +1,6 @@
 // Copyright 2017 Sourcerer, Inc. All Rights Reserved.
 // Author: Anatoly Kislov (anatoly@sourcerer.io)
+// Author: Liubov Yaronskaya (lyaronskaya@sourcerer.io)
 
 syntax = "proto3";
 
@@ -155,3 +156,13 @@ message ProcessEntry {
     uint32 status = 2;
     uint32 error_code = 3;
 }
+
+message AuthorDistance {
+    string email = 1;  // Author e-mail the score is attached to.
+    double score = 2;
+    string repo_rehash = 3;  // Filled from Repo.rehash by the app.
+}
+
+message AuthorDistanceGroup {
+    repeated AuthorDistance author_distances = 1;  // Sent as one POST body.
+}
diff --git a/src/test/kotlin/test/tests/hashers/AuthorDistanceHasherTest.kt b/src/test/kotlin/test/tests/hashers/AuthorDistanceHasherTest.kt
new file mode 100644
index 00000000..f06bf9c1
--- /dev/null
+++ b/src/test/kotlin/test/tests/hashers/AuthorDistanceHasherTest.kt
@@ -0,0 +1,86 @@
+// Copyright 2018 Sourcerer Inc. All Rights Reserved.
+// Author: Liubov Yaronskaya (lyaronskaya@sourcerer.io)
+
+package test.tests.hashers
+
+import app.FactCodes
+import app.api.MockApi
+import app.hashers.AuthorDistanceHasher
+import app.hashers.CommitCrawler
+import app.model.Author
+import app.model.Fact
+import app.model.Repo
+import org.eclipse.jgit.api.Git
+import org.jetbrains.spek.api.Spek
+import org.jetbrains.spek.api.dsl.given
+import org.jetbrains.spek.api.dsl.it
+import test.utils.TestRepo
+import java.io.File
+import java.util.*
+import kotlin.test.assertTrue
+import kotlin.test.fail
+
+class AuthorDistanceHasherTest : Spek({
+    given("repo with a file") {
+        val testRepoPath = "../author_distance_hasher"
+        val testRepo = TestRepo(testRepoPath)
+        val serverRepo = Repo(rehash = "test_repo_rehash")
+        val api = MockApi(mockRepo = serverRepo)
+        val fileName = "test1.txt"
+        val author1 = Author("First Author", "first.author@gmail.com")
+        val author2 = Author("Second Author", "second.author@gmail.com")
+        val author3 = Author("Third Author", "third.author@gmail.com")
+        val emails = hashSetOf(author1.email, author2.email, author3.email)
+
+        testRepo.createFile(fileName, listOf("line1", "line2"))
+        testRepo.commit(message = "initial commit",
+                author = author1,
+                date = Calendar.Builder().setDate(2017, 1, 1).setTimeOfDay
+                (0, 0, 0).build().time)
+
+        testRepo.deleteLines(fileName, 1, 1)
+        testRepo.commit(message = "delete second line",
+                author = author2,
+                date = Calendar.Builder().setDate(2017, 1, 1).setTimeOfDay
+                (0, 1, 0).build().time)
+
+        testRepo.deleteLines(fileName, 0, 0)
+        testRepo.commit(message = "delete first line",
+                author = author1,
+                date = Calendar.Builder().setDate(2018, 1, 1).setTimeOfDay
+                (0, 1, 0).build().time)
+        testRepo.insertLines(fileName, 0, listOf("line1"))
+        testRepo.commit(message = "add first line",
+                author = author3,
+                date = Calendar.Builder().setDate(2019, 1, 1).setTimeOfDay
+                (0, 1, 0).build().time)
+
+        val gitHasher = Git.open(File(testRepoPath))
+        it("extracts colleagues") {
+            val observable = CommitCrawler.getJGitPathsObservable(gitHasher)
+            AuthorDistanceHasher(serverRepo, api, emails,
+                    hashSetOf(author2.email)).updateFromObservable(observable,
+                    onError = { _ -> fail("exception") })
+
+            assertTrue(api.receivedFacts.contains(
+                    Fact(repo = serverRepo,
+                            code = FactCodes.COLLEAGUES,
+                            author = author2,
+                            value = author1.email,
+                            value2 = (1.0).toString())
+            ))
+
+            assertTrue(api.receivedFacts.contains(
+                    Fact(repo = serverRepo,
+                            code = FactCodes.COLLEAGUES,
+                            author = author2,
+                            value = author3.email,
+                            value2 = (0.0).toString())
+            ))
+        }
+
+        afterGroup {
+            testRepo.destroy()
+        }
+    }
+})

From 5c23dbd23fdb6f7b6013e84c1ab7ef2eccb51acc Mon Sep 17 00:00:00 2001
From: Liubov Yaronskaya <lyaronskaya@sourcerer.io>
Date: Tue, 23 Oct 2018 14:24:43 +0300
Subject: [PATCH 2/5] wip: post distances.

---
 src/main/kotlin/app/hashers/Colleagues.kt     | 11 ++++-----
 .../tests/hashers/AuthorDistanceHasherTest.kt | 23 ++++++++-----------
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/src/main/kotlin/app/hashers/Colleagues.kt b/src/main/kotlin/app/hashers/Colleagues.kt
index cf94420b..7e7ff513 100644
--- a/src/main/kotlin/app/hashers/Colleagues.kt
+++ b/src/main/kotlin/app/hashers/Colleagues.kt
@@ -6,6 +6,7 @@ package app.hashers
 import app.FactCodes
 import app.api.Api
 import app.model.Author
+import app.model.AuthorDistance
 import app.model.Fact
 import app.model.Repo
 import io.reactivex.Observable
@@ -42,12 +43,10 @@ class AuthorDistanceHasher(
                 authorScores[email] = authorScores[email]!! + score
             }
         }, onError, {
-            val stats = mutableListOf<Fact>()
-            val author = Author(email = userEmails.toList()[0])
+            val stats = mutableListOf<AuthorDistance>()
             authorScores.forEach { email, value ->
                 if (email !in userEmails) {
-                    stats.add(Fact(serverRepo, FactCodes.COLLEAGUES, value =
-                    email, value2 = value.toString(), author = author))
+                    stats.add(AuthorDistance(serverRepo, email, value))
                 }
             }
 
@@ -55,9 +54,9 @@ class AuthorDistanceHasher(
         })
     }
 
-    private fun postDistancesToServer(stats: List<Fact>) {
+    private fun postDistancesToServer(stats: List<AuthorDistance>) {
         if (stats.isNotEmpty()) {
-            api.postFacts(stats).onErrorThrow()
+            api.postAuthorDistances(stats).onErrorThrow()
         }
     }
 }
diff --git a/src/test/kotlin/test/tests/hashers/AuthorDistanceHasherTest.kt b/src/test/kotlin/test/tests/hashers/AuthorDistanceHasherTest.kt
index f06bf9c1..e0be1b2e 100644
--- a/src/test/kotlin/test/tests/hashers/AuthorDistanceHasherTest.kt
+++ b/src/test/kotlin/test/tests/hashers/AuthorDistanceHasherTest.kt
@@ -8,6 +8,7 @@ import app.api.MockApi
 import app.hashers.AuthorDistanceHasher
 import app.hashers.CommitCrawler
 import app.model.Author
+import app.model.AuthorDistance
 import app.model.Fact
 import app.model.Repo
 import org.eclipse.jgit.api.Git
@@ -62,21 +63,15 @@ class AuthorDistanceHasherTest : Spek({
                     hashSetOf(author2.email)).updateFromObservable(observable,
                     onError = { _ -> fail("exception") })
 
-            assertTrue(api.receivedFacts.contains(
-                    Fact(repo = serverRepo,
-                            code = FactCodes.COLLEAGUES,
-                            author = author2,
-                            value = author1.email,
-                            value2 = (1.0).toString())
-            ))
+            assertTrue(api.receivedDistances.contains(
+                    AuthorDistance(repo = serverRepo,
+                                   email = author1.email,
+                                   score = 1.0)))
 
-            assertTrue(api.receivedFacts.contains(
-                    Fact(repo = serverRepo,
-                            code = FactCodes.COLLEAGUES,
-                            author = author2,
-                            value = author3.email,
-                            value2 = (0.0).toString())
-            ))
+            assertTrue(api.receivedDistances.contains(
+                    AuthorDistance(repo = serverRepo,
+                                   email = author3.email,
+                                   score = 0.0)))
         }
 
         afterGroup {

From cfca26921ecb62ffa67d7c2f0208aa33496c89bc Mon Sep 17 00:00:00 2001
From: Liubov Yaronskaya <lyaronskaya@sourcerer.io>
Date: Thu, 25 Oct 2018 16:53:28 +0300
Subject: [PATCH 3/5] wip: crawler to extract different data.

---
 src/main/kotlin/app/hashers/CodeLongevity.kt  |  12 +-
 src/main/kotlin/app/hashers/Colleagues.kt     |  13 +-
 src/main/kotlin/app/hashers/CommitCrawler.kt  | 175 +++++-------------
 src/main/kotlin/app/hashers/RepoHasher.kt     |   7 +-
 .../tests/hashers/AuthorDistanceHasherTest.kt |   5 +-
 5 files changed, 69 insertions(+), 143 deletions(-)

diff --git a/src/main/kotlin/app/hashers/CodeLongevity.kt b/src/main/kotlin/app/hashers/CodeLongevity.kt
index 950a2214..ceda54b1 100644
--- a/src/main/kotlin/app/hashers/CodeLongevity.kt
+++ b/src/main/kotlin/app/hashers/CodeLongevity.kt
@@ -320,7 +320,7 @@ class CodeLongevity(
     /**
      * Scans the repo to extract code line ages.
      */
-    fun updateFromObservable(diffObservable: Observable<JgitPair> =
+    fun updateFromObservable(diffObservable: Observable<JgitData> =
                                 CommitCrawler.getJGitObservable(git),
                              onError: (Throwable) -> Unit = {},
                              api: Api,
@@ -396,7 +396,7 @@ class CodeLongevity(
      * the revisions of the repo.
      */
     fun getLinesList(tail : RevCommit? = null,
-                     diffObservable: Observable<JgitPair> =
+                     diffObservable: Observable<JgitData> =
                         CommitCrawler.getJGitObservable(git),
                      onError: (Throwable) -> Unit = {}) : List<CodeLine> {
         val codeLines: MutableList<CodeLine> = mutableListOf()
@@ -411,7 +411,7 @@ class CodeLongevity(
      * the revisions of the repo.
      */
     fun getLinesObservable(tail : RevCommit? = null,
-                           diffObservable: Observable<JgitPair>,
+                           diffObservable: Observable<JgitData>,
                            onError: (Throwable) -> Unit)
         : Observable<CodeLine> =
         Observable.create { subscriber ->
@@ -448,7 +448,7 @@ class CodeLongevity(
             // to the diff. Traverse the diffs backwards to handle double
             // renames properly.
             // TODO(alex): cover file renames by tests (see APP-132 issue).
-            for ((diff, editList) in diffs.asReversed()) {
+            for ((diff, editList) in diffs!!.asReversed()) {
                 val oldPath = diff.oldPath
                 val oldId = diff.oldId.toObjectId()
                 val newPath = diff.newPath
@@ -483,7 +483,7 @@ class CodeLongevity(
                         Logger.trace { "ins ($insStart, $insEnd)" }
 
                         for (idx in insStart until insEnd) {
-                            val from = RevCommitLine(commit, newId,
+                            val from = RevCommitLine(commit!!, newId,
                                                      newPath, idx, false)
                             try {
                                 val to = lines[idx]
@@ -514,7 +514,7 @@ class CodeLongevity(
 
                         val tmpLines = ArrayList<RevCommitLine>(delCount)
                         for (idx in delStart until delEnd) {
-                            tmpLines.add(RevCommitLine(commit, oldId,
+                            tmpLines.add(RevCommitLine(commit!!, oldId,
                                                        oldPath, idx, true))
                         }
                         lines.addAll(delStart, tmpLines)
diff --git a/src/main/kotlin/app/hashers/Colleagues.kt b/src/main/kotlin/app/hashers/Colleagues.kt
index 7e7ff513..644d1bde 100644
--- a/src/main/kotlin/app/hashers/Colleagues.kt
+++ b/src/main/kotlin/app/hashers/Colleagues.kt
@@ -3,11 +3,8 @@
 
 package app.hashers
 
-import app.FactCodes
 import app.api.Api
-import app.model.Author
 import app.model.AuthorDistance
-import app.model.Fact
 import app.model.Repo
 import io.reactivex.Observable
 import java.util.concurrent.TimeUnit
@@ -17,15 +14,18 @@ class AuthorDistanceHasher(
         private val api: Api,
         private val emails: HashSet<String>,
         private val userEmails: HashSet<String>) {
-    fun updateFromObservable(observable: Observable<Triple<String,
-            List<String>, Long>>, onError: (Throwable) -> Unit) {
+    fun updateFromObservable(observable: Observable<JgitData>, onError: (Throwable)
+    -> Unit) {
         val authorScores = hashMapOf<String, Double>()
         emails.forEach { authorScores[it] = 0.0 }
 
         // Store the time of the earliest commit for a path by user.
         val authorPathLastContribution = hashMapOf<String, Long>()
 
-        observable.subscribe({ (email, paths, time) ->
+        observable.subscribe({
+            val email =  it.email!!
+            val paths = it.paths!!
+            val time = it.date!!
             if (email in userEmails) {
                 paths.forEach { path ->
                     authorPathLastContribution[path] = time
@@ -49,7 +49,6 @@ class AuthorDistanceHasher(
                     stats.add(AuthorDistance(serverRepo, email, value))
                 }
             }
-
             postDistancesToServer(stats)
         })
     }
diff --git a/src/main/kotlin/app/hashers/CommitCrawler.kt b/src/main/kotlin/app/hashers/CommitCrawler.kt
index 3d1e7b1f..0a5be12f 100644
--- a/src/main/kotlin/app/hashers/CommitCrawler.kt
+++ b/src/main/kotlin/app/hashers/CommitCrawler.kt
@@ -1,5 +1,6 @@
 // Copyright 2017 Sourcerer Inc. All Rights Reserved.
 // Author: Anatoly Kislov (anatoly@sourcerer.io)
+// Author: Liubov Yaronskaya (lyaronskaya@sourcerer.io)
 
 package app.hashers
 
@@ -29,7 +30,12 @@ import org.eclipse.jgit.treewalk.TreeWalk
 import org.eclipse.jgit.util.io.DisabledOutputStream
 import java.util.LinkedList
 
-data class JgitPair(val commit: RevCommit, val list: List<JgitDiff>)
+data class JgitData(var commit: RevCommit? = null,
+                    var list: List<JgitDiff>? = null,
+                    var paths: List<String>? = null,
+                    var date: Long? = null,
+                    var email: String? = null)
+
 data class JgitDiff(val diffEntry: DiffEntry, val editList: EditList)
 
 /**
@@ -96,9 +102,14 @@ object CommitCrawler {
 
     fun getJGitObservable(git: Git,
                           totalCommitCount: Int = 0,
+                          extractCommit: Boolean = true,
+                          extractDiffs: Boolean = true,
+                          extractPaths: Boolean = false,
+                          extractDate: Boolean = false,
+                          extractEmail: Boolean = false,
                           filteredEmails: HashSet<String>? = null,
                           tail : RevCommit? = null) :
-        Observable<JgitPair> = Observable.create { subscriber ->
+        Observable<JgitData> = Observable.create { subscriber ->
         val repo: Repository = git.repository
         val revWalk = RevWalk(repo)
         val head: RevCommit =
@@ -156,9 +167,9 @@ object CommitCrawler {
                 commit = parentCommit
                 continue
             }
+            val paths = mutableListOf<String>()
 
             val diffEntries = df.scan(parentCommit, commit)
-            val diffEdits = diffEntries
             .filter { diff ->
                 diff.changeType != DiffEntry.ChangeType.COPY
             }
@@ -199,24 +210,46 @@ object CommitCrawler {
                         getIgnoredPaths(repo, diff.getNewId().toObjectId())
                 }
 
-                !ignoredPaths.any { path ->
+                if (!ignoredPaths.any { path ->
                     if (path.endsWith("/")) {
                         filePath.startsWith(path)
                     }
                     else {
                         path == filePath
                     }
+                }) {
+                    paths.add(filePath)
+                    true
+                } else false
+            }
+
+            val jgitData = JgitData()
+            if (extractCommit) {
+                jgitData.commit = commit
+            }
+            if (extractDiffs) {
+                val diffEdits = diffEntries
+                .map { diff ->
+                    JgitDiff(diff, df.toFileHeader(diff).toEditList())
                 }
+                .filter { diff ->
+                    diff.editList.fold(0) { acc, edit ->
+                        acc + edit.lengthA + edit.lengthB
+                    } < MAX_DIFF_SIZE
+                }
+                jgitData.list = diffEdits
             }
-            .map { diff ->
-                JgitDiff(diff, df.toFileHeader(diff).toEditList())
+            if (extractPaths) {
+                jgitData.paths = paths
             }
-            .filter { diff ->
-                diff.editList.fold(0) { acc, edit ->
-                    acc + edit.lengthA + edit.lengthB
-                } < MAX_DIFF_SIZE
+            if (extractDate) {
+                jgitData.date = commit.authorIdent.getWhen().time / 1000
             }
-            subscriber.onNext(JgitPair(commit, diffEdits))
+            if (extractEmail) {
+                jgitData.email = email
+            }
+
+            subscriber.onNext(jgitData)
             commit = parentCommit
         }
 
@@ -229,12 +262,12 @@ object CommitCrawler {
     }
 
     fun getObservable(git: Git,
-                      jgitObservable: Observable<JgitPair>,
+                      jgitObservable: Observable<JgitData>,
                       repo: Repo): Observable<Commit> {
-        return jgitObservable.map( { (jgitCommit, jgitDiffs) ->
+        return jgitObservable.map( { (jgitCommit, jgitDiffs, _) ->
             // Mapping and stats extraction.
-            val commit = Commit(jgitCommit)
-            commit.diffs = getDiffFiles(git.repository, jgitDiffs)
+            val commit = Commit(jgitCommit!!)
+            commit.diffs = getDiffFiles(git.repository, jgitDiffs!!)
 
             // Count lines on all non-binary files. This is additional
             // statistics to CommitStats because not all file extensions
@@ -251,118 +284,6 @@ object CommitCrawler {
         })
     }
 
-    fun getJGitPathsObservable(git: Git,
-                               tail : RevCommit? = null) :
-            Observable<Triple<String, List<String>, Long>> = Observable.create {
-        subscriber ->
-        val repo: Repository = git.repository
-        val revWalk = RevWalk(repo)
-        val head: RevCommit =
-                try { revWalk.parseCommit(getDefaultBranchHead(git)) }
-                catch(e: Exception) { throw Exception("No head was found!") }
-
-        val df = DiffFormatter(DisabledOutputStream.INSTANCE)
-        df.setRepository(repo)
-        df.isDetectRenames = true
-
-        val confTreeWalk = TreeWalk(repo)
-        confTreeWalk.addTree(head.tree)
-        confTreeWalk.filter = PathFilter.create(CONF_FILE_PATH)
-
-        var ignoredPaths =
-                if (confTreeWalk.next()) {
-                    getIgnoredPaths(repo, confTreeWalk.getObjectId(0))
-                }
-                else {
-                    listOf()
-                }
-
-        var commitCount = 0
-        revWalk.markStart(head)
-        var commit: RevCommit? = revWalk.next()  // Move the walker to the head.
-        while (commit != null && commit != tail) {
-            commitCount++
-            val parentCommit: RevCommit? = revWalk.next()
-
-            // Smart casts are not yet supported for a mutable variable captured
-            // in an inline lambda, see
-            // https://youtrack.jetbrains.com/issue/KT-7186.
-            if (Logger.isTrace) {
-                val commitName = commit.name
-                val commitMsg = commit.shortMessage
-                Logger.trace { "commit: $commitName; '$commitMsg'" }
-                if (parentCommit != null) {
-                    val parentCommitName = parentCommit.name
-                    val parentCommitMsg = parentCommit.shortMessage
-                    Logger.trace { "parent commit: $parentCommitName; " +
-                            "'$parentCommitMsg'" }
-                }
-                else {
-                    Logger.trace { "parent commit: null" }
-                }
-            }
-
-            val email = commit.authorIdent.emailAddress.toLowerCase()
-
-            val diffEntries = df.scan(parentCommit, commit)
-            val paths = diffEntries
-                    .filter { diff ->
-                        diff.changeType != DiffEntry.ChangeType.COPY
-                    }
-                    .filter { diff ->
-                        val path = diff.newPath
-                        for (cnv in VendorConventions) {
-                            if (cnv.containsMatchIn(path)) {
-                                return@filter false
-                            }
-                        }
-
-                        val fileId =
-                                if (path != DiffEntry.DEV_NULL) {
-                                    diff.newId.toObjectId()
-                                } else {
-                                    diff.oldId.toObjectId()
-                                }
-                        val stream = try {
-                            repo.open(fileId).openStream()
-                        } catch (e: Exception) {
-                            null
-                        }
-                        stream != null && !RawText.isBinary(stream)
-                    }
-                    .mapNotNull { diff ->
-                        val filePath =
-                                if (diff.getNewPath() != DiffEntry.DEV_NULL) {
-                                    diff.getNewPath()
-                                } else {
-                                    diff.getOldPath()
-                                }
-
-                        // Update ignored paths list. The config file has retroactive
-                        // force, i.e. if it was added at this commit, then we presume
-                        // it is applied to all commits, preceding this commit.
-                        if (diff.oldPath == CONF_FILE_PATH) {
-                            ignoredPaths =
-                                    getIgnoredPaths(repo, diff.newId.toObjectId())
-                        }
-
-                        if (!ignoredPaths.any { path ->
-                            if (path.endsWith("/")) {
-                                filePath.startsWith(path)
-                            }
-                            else {
-                                path == filePath
-                            }
-                        }) {filePath} else null
-                    }
-            val date = commit.authorIdent.getWhen().time / 1000
-            subscriber.onNext(Triple(email, paths, date))
-            commit = parentCommit
-        }
-
-        subscriber.onComplete()
-    }
-
     private fun getDiffFiles(jgitRepo: Repository,
                              jgitDiffs: List<JgitDiff>) : List<DiffFile> {
         return jgitDiffs
diff --git a/src/main/kotlin/app/hashers/RepoHasher.kt b/src/main/kotlin/app/hashers/RepoHasher.kt
index a428e7e5..f1ba7ba6 100644
--- a/src/main/kotlin/app/hashers/RepoHasher.kt
+++ b/src/main/kotlin/app/hashers/RepoHasher.kt
@@ -71,7 +71,7 @@ class RepoHasher(private val api: Api,
                 filteredEmails
             } else null
             val jgitObservable = CommitCrawler.getJGitObservable(git,
-                rehashes.size, crawlerEmails
+                rehashes.size, filteredEmails = crawlerEmails
             ).publish()
             val observable = CommitCrawler.getObservable(git,
                 jgitObservable, serverRepo)
@@ -98,7 +98,10 @@ class RepoHasher(private val api: Api,
             }
             if (BuildConfig.DISTANCES_ENABLED) {
                 val userEmails = configurator.getUser().emails.map { it.email }.toHashSet()
-                val pathsObservable = CommitCrawler.getJGitPathsObservable(git)
+                val pathsObservable = CommitCrawler.getJGitObservable(git,
+                        extractCommit = false, extractDate = true,
+                        extractDiffs = false, extractEmail = true,
+                        extractPaths = true)
                 AuthorDistanceHasher(serverRepo, api, emails, userEmails)
                         .updateFromObservable(pathsObservable, onError)
             }
diff --git a/src/test/kotlin/test/tests/hashers/AuthorDistanceHasherTest.kt b/src/test/kotlin/test/tests/hashers/AuthorDistanceHasherTest.kt
index e0be1b2e..7b1d0cd0 100644
--- a/src/test/kotlin/test/tests/hashers/AuthorDistanceHasherTest.kt
+++ b/src/test/kotlin/test/tests/hashers/AuthorDistanceHasherTest.kt
@@ -58,7 +58,10 @@ class AuthorDistanceHasherTest : Spek({
 
         val gitHasher = Git.open(File(testRepoPath))
         it("extracts colleagues") {
-            val observable = CommitCrawler.getJGitPathsObservable(gitHasher)
+            val observable = CommitCrawler.getJGitObservable(gitHasher,
+                extractCommit = false, extractDate = true,
+                extractDiffs = false, extractEmail = true,
+                extractPaths = true)
             AuthorDistanceHasher(serverRepo, api, emails,
                     hashSetOf(author2.email)).updateFromObservable(observable,
                     onError = { _ -> fail("exception") })

From 4729d06af0010e88e086eeca92df57ffa4e1edd8 Mon Sep 17 00:00:00 2001
From: Anatoly Stansler <anatoly@sourcerer.io>
Date: Wed, 31 Oct 2018 13:47:49 +0300
Subject: [PATCH 4/5] chore: style

---
 src/main/kotlin/app/api/ServerApi.kt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/main/kotlin/app/api/ServerApi.kt b/src/main/kotlin/app/api/ServerApi.kt
index f1c72cb3..891a9909 100644
--- a/src/main/kotlin/app/api/ServerApi.kt
+++ b/src/main/kotlin/app/api/ServerApi.kt
@@ -117,8 +117,7 @@ class ServerApi (private val configurator: Configurator) : Api {
     }
 
     private fun createRequestPostAuthorDistances(distances:
-                                                 AuthorDistanceGroup):
-            Request {
+        AuthorDistanceGroup): Request {
         return post("/distances").header(getContentTypeHeader())
                 .body(distances.serialize())
     }

From 483bd54daf2d14760377b876cbe46f306af5ae2a Mon Sep 17 00:00:00 2001
From: Anatoly Stansler <anatoly@sourcerer.io>
Date: Mon, 12 Nov 2018 18:15:06 +0300
Subject: [PATCH 5/5] chore: style changes

---
 .../app/hashers/{Colleagues.kt => AuthorDistanceHasher.kt}    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename src/main/kotlin/app/hashers/{Colleagues.kt => AuthorDistanceHasher.kt} (94%)

diff --git a/src/main/kotlin/app/hashers/Colleagues.kt b/src/main/kotlin/app/hashers/AuthorDistanceHasher.kt
similarity index 94%
rename from src/main/kotlin/app/hashers/Colleagues.kt
rename to src/main/kotlin/app/hashers/AuthorDistanceHasher.kt
index 644d1bde..db213de4 100644
--- a/src/main/kotlin/app/hashers/Colleagues.kt
+++ b/src/main/kotlin/app/hashers/AuthorDistanceHasher.kt
@@ -14,8 +14,8 @@ class AuthorDistanceHasher(
         private val api: Api,
         private val emails: HashSet<String>,
         private val userEmails: HashSet<String>) {
-    fun updateFromObservable(observable: Observable<JgitData>, onError: (Throwable)
-    -> Unit) {
+    fun updateFromObservable(observable: Observable<JgitData>,
+                             onError: (Throwable) -> Unit) {
         val authorScores = hashMapOf<String, Double>()
         emails.forEach { authorScores[it] = 0.0 }