generated from JetBrains/intellij-platform-plugin-template
/
SimilarChunksWithPaths.kt
148 lines (126 loc) · 6.23 KB
/
SimilarChunksWithPaths.kt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
// Copyright 2000-2021 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license that can be found in the LICENSE file.
package com.intellij.temporary.similar.chunks
import com.intellij.openapi.application.runReadAction
import com.intellij.openapi.diagnostic.logger
import com.intellij.openapi.fileEditor.impl.EditorHistoryManager
import com.intellij.openapi.fileTypes.FileType
import com.intellij.openapi.roots.ProjectFileIndex
import com.intellij.openapi.roots.ProjectRootManager
import com.intellij.openapi.vfs.VfsUtilCore
import com.intellij.openapi.vfs.VirtualFile
import com.intellij.psi.PsiElement
import com.intellij.psi.PsiManager
import java.io.File
/**
 * Builds a "similar chunks" prompt fragment: for each recently opened file of the same type as the
 * current one, it picks the chunk whose tokens overlap most with the current file (Jaccard similarity)
 * and pairs it with the file's relative path.
 *
 * The logic is modelled on GitHub Copilot (compare with [https://github.com/mengjian-github/copilot-analysis]).
 * JetBrains has no plan to use it, and it is known to have had bugs; it is kept here so it can be
 * re-implemented in the future.
 *
 * @param snippetLength number of lines that form one chunk.
 * @param maxRelevantFiles maximum number of recent files that are inspected.
 */
class SimilarChunksWithPaths(private val snippetLength: Int = 60, private val maxRelevantFiles: Int = 20) {

    /**
     * Selects, for every recent file, the chunk most similar to the current file.
     *
     * Files that yield no chunk (no PSI, or nothing left after filtering) and files whose relative
     * path cannot be resolved are skipped, so `paths` and `chunks` in the result always line up.
     */
    private fun similarChunksWithPaths(element: PsiElement): SimilarChunkContext {
        val recentFiles = getMostRecentFiles(element)
        // Both lists below are index-aligned with recentFiles.
        val chunksPerFile = extractChunks(element, recentFiles)
        val scoresPerFile = tokenLevelJaccardSimilarity(chunksPerFile, element)

        val paths = mutableListOf<String>()
        val bestChunks = mutableListOf<String>()
        for ((fileIndex, file) in recentFiles.withIndex()) {
            val scores = scoresPerFile[fileIndex]
            // First index holding the highest score; null when the file produced no chunks.
            val bestIndex = scores.indices.maxByOrNull { scores[it] } ?: continue
            val path = relativePathTo(file, element) ?: continue
            paths.add(path)
            bestChunks.add(chunksPerFile[fileIndex][bestIndex])
        }
        return SimilarChunkContext(element.language, paths, bestChunks)
    }

    /**
     * Scores every chunk against the token set of the file containing [element].
     *
     * @return one score list per entry of [chunks], in the same order.
     */
    private fun tokenLevelJaccardSimilarity(chunks: List<List<String>>, element: PsiElement): List<List<Double>> {
        val currentFileTokens: Set<String> = tokenize(element.containingFile?.text.orEmpty()).toSet()
        return chunks.map { fileChunks ->
            fileChunks.map { chunk -> similarityScore(currentFileTokens, tokenize(chunk).toSet()) }
        }
    }

    /**
     * Resolves [relativeFile] relative to its content root, falling back to its class root.
     *
     * @return the relative path using the platform file separator, or `null` when the file belongs
     * to neither kind of root.
     */
    private fun relativePathTo(relativeFile: VirtualFile, element: PsiElement): String? {
        val fileIndex: ProjectFileIndex = ProjectRootManager.getInstance(element.project).fileIndex
        val root: VirtualFile? = runReadAction { fileIndex.getContentRootForFile(relativeFile) }
            ?: fileIndex.getClassRootForFile(relativeFile)
        return root?.let { VfsUtilCore.getRelativePath(relativeFile, it, File.separatorChar) }
    }

    /**
     * Cheap tokenizer: splits on every non-alphanumeric ASCII character and drops empty tokens.
     *
     * A real tokenizer is slow, so this follows the same shortcut as Copilot:
     * https://github.com/mengjian-github/copilot-analysis#promptelement%E4%B8%BB%E8%A6%81%E5%86%85%E5%AE%B9
     */
    private fun tokenize(chunk: String): List<String> =
        chunk.split(TOKEN_SEPARATOR).filter { it.isNotBlank() }

    /**
     * Jaccard similarity of two token sets: `|set1 ∩ set2| / |set1 ∪ set2|`.
     *
     * @return a value in `0.0..1.0`; `0.0` when both sets are empty (instead of the NaN that 0/0 would give).
     */
    private fun similarityScore(set1: Set<String>, set2: Set<String>): Double {
        val unionSize = (set1 union set2).size
        if (unionSize == 0) return 0.0
        val intersectionSize = (set1 intersect set2).size
        return intersectionSize.toDouble() / unionSize.toDouble()
    }

    /**
     * Splits each file into chunks of [snippetLength] lines, ignoring `import` and `package` lines.
     *
     * @return one chunk list per entry of [mostRecentFiles], in the same order; a file without PSI
     * contributes an empty list so that indices stay aligned with the input.
     */
    private fun extractChunks(element: PsiElement, mostRecentFiles: List<VirtualFile>): List<List<String>> {
        val psiManager: PsiManager = PsiManager.getInstance(element.project)
        return mostRecentFiles.map { file ->
            psiManager.findFile(file)?.text
                ?.lines()
                ?.filterNot { line ->
                    val trimmed = line.trimStart()
                    trimmed.startsWith("import ") || trimmed.startsWith("package ")
                }
                // Each group of snippetLength lines becomes one multi-line chunk.
                ?.chunked(snippetLength) { group -> group.joinToString("\n") }
                .orEmpty()
        }
    }

    /**
     * Returns up to [maxRelevantFiles] valid files from the editor history that share the file type
     * of [element]'s file, excluding that file itself.
     *
     * Entries are taken from the tail of the history list.
     * NOTE(review): assumes `EditorHistoryManager.fileList` is ordered least-recent first — confirm.
     */
    private fun getMostRecentFiles(element: PsiElement): List<VirtualFile> {
        val containingFile = element.containingFile ?: return emptyList()
        val fileType: FileType = containingFile.fileType
        val currentFile: VirtualFile? = containingFile.virtualFile
        return EditorHistoryManager.getInstance(element.project).fileList
            .filter { file -> file.isValid && file.fileType == fileType && file != currentFile }
            .takeLast(maxRelevantFiles.coerceAtLeast(0))
    }

    companion object {
        /** Queries shorter than this are considered useless and discarded. */
        private const val MIN_QUERY_LENGTH = 10

        /** Queries longer than this are truncated. */
        private const val MAX_QUERY_LENGTH = 1024

        /** Token delimiter, compiled once instead of on every tokenize call. */
        private val TOKEN_SEPARATOR = Regex("[^a-zA-Z0-9]")

        val INSTANCE: SimilarChunksWithPaths = SimilarChunksWithPaths()

        /**
         * Creates the similar-chunks query for [element] inside a read action.
         *
         * @param element the PSI element whose containing file is compared with recent files.
         * @param chunkSize number of lines per chunk.
         * @return the formatted query, cut to 1024 characters if longer; `null` for Markdown
         * elements, when nothing similar was found, when the query is shorter than 10 characters,
         * or when building it failed.
         */
        fun createQuery(element: PsiElement, chunkSize: Int = 60): String? {
            if (element.language.displayName.lowercase() == "markdown") {
                return null
            }
            return runReadAction {
                try {
                    buildQuery(element, chunkSize)
                } catch (e: Exception) {
                    // Best effort: a failure here must not break the caller, but it should be visible.
                    // NOTE(review): this also catches ProcessCanceledException; consider rethrowing it.
                    logger<SimilarChunksWithPaths>().warn("Failed to build similar chunks query", e)
                    null
                }
            }
        }

        /** Computes and size-checks the query; may throw, the caller handles failures. */
        private fun buildQuery(element: PsiElement, chunkSize: Int): String? {
            val context = SimilarChunksWithPaths(chunkSize).similarChunksWithPaths(element)
            if (context.paths?.isEmpty() == true || context.chunks?.isEmpty() == true) {
                return null
            }
            // todo: change to count query by size
            val query = context.format()
            return when {
                query.length < MIN_QUERY_LENGTH -> null
                query.length > MAX_QUERY_LENGTH -> {
                    logger<SimilarChunksWithPaths>().warn("Query size is too large: ${query.length}")
                    query.substring(0, MAX_QUERY_LENGTH)
                }
                else -> query
            }
        }
    }
}