# Mining through Jira issues

Superlinked combines vector search - which finds issues similar to a query text or to a reference issue - with powerful filtering, enabling a versatile search over Jira issues.

In [1]:
%pip install superlinked==37.4.0

In [2]:
import os
from typing import cast

import pandas as pd

from superlinked import framework as sl

# Let pandas render up to 60 columns so the wide issue table is not truncated on display.
pd.options.display.max_columns = 60

## Read and check data

In [3]:
# Public location of the issue dump that this notebook reads.
DATA_PATH: str = "https://storage.googleapis.com/superlinked-notebook-jira-issue-discovery/issue_data.json"

# The file is read in JSON-lines mode (lines=True).
issue_data = pd.read_json(path_or_buf=DATA_PATH, lines=True)

# Peek at the first rows to check the columns.
issue_data.head()

Unnamed: 0,id,type,status,priority,resolution,popularity,company,severity,comments,participants,has_attachment,versions_affected,duplicate_of,components,labels,referenced_by,references,blocks,blocked_by,causes,caused_by,fix_version,resolution_time,text
0,SRCTREEWIN-12334,Bug,Needs Triage,Low,,0,symphonyai.com,2.0,,[eb89d03d1ea4(JIRAUSER4541977)],False,[3.2.2],[SRCTREEWIN-2409],[Git],[],[],[],[],[],[],[],[],,Code scroll not proper in horizontal way !imag...
1,SRCTREEWIN-12336,Suggestion,Gathering Interest,,,0,hotmail.com,,,[45603a355f99(jdemeyer1930161690)],False,[],[],[UX],[],[],[],[],[],[],[],[],,Names for patches When you create two patches ...
2,SRCTREEWIN-12337,Bug,Needs Triage,Low,,0,efi.com,2.0,,[2931c1386587(aloke.bordia1371316332)],False,[3.2.4],[],[General],[],[],[],[],[],[],[],[],,Cherry-pick does not provide an option to push...
3,SRCTREEWIN-12338,Bug,Needs Triage,Low,,0,bytecommerce.nl,2.0,26/Sep/2019 5:43 AM;157e9a1ee4cf;Wanted to ask...,[157e9a1ee4cf(roy.van.den.ekker)],False,[3.2.4],[],[Beta],[],[],[],[],[],[],[],[],,Git flow not automatically branching from corr...
4,SRCTREEWIN-12341,Bug,Needs Triage,Low,,0,wakeone.co,3.0,,[93f49685815c(JIRAUSER4546844)],False,[3.2.2],[],[UX],[],[],[],[],[],[],[],[],,The adjustable border line between staged and ...


## Superlinked config

In [4]:
class Issue(sl.Schema):
    """Schema of a single Jira issue; field names match the columns of `issue_data`.

    Fields typed `... | None` are declared optional. List-valued columns are
    declared as `sl.StringList`.
    """
    id: sl.IdField
    type: sl.String
    status: sl.String
    priority: sl.String | None
    resolution: sl.String | None
    popularity: sl.Integer
    company: sl.String
    severity: sl.Float | None
    # NOTE(review): the sample rows show issues without comments, yet this field is
    # not declared optional like priority/resolution — confirm this is intended.
    comments: sl.String
    participants: sl.StringList
    # The column holds booleans in the data preview; kept as a string on purpose.
    has_attachment: sl.String  # could be Boolean if NLQ supported it
    versions_affected: sl.StringList
    duplicate_of: sl.StringList
    components: sl.StringList
    labels: sl.StringList
    referenced_by: sl.StringList
    references: sl.StringList
    blocks: sl.StringList
    blocked_by: sl.StringList
    causes: sl.StringList
    caused_by: sl.StringList
    fix_version: sl.StringList
    resolution_time: sl.Float | None
    # Free text of the issue; presumably title and description joined — TODO confirm.
    text: sl.String


# Schema instance; its attributes (issue.text, issue.severity, ...) are what the
# spaces, the index and the queries below refer to.
issue = Issue()

In [5]:
# Distinct, non-null values of each categorical column, in order of first appearance.
# They are used below as the `categories` of the CategoricalSimilaritySpaces.
#
# Nulls must be dropped BEFORE casting to str: astype(str) turns NaN/None into the
# literal strings "nan"/"None", after which dropna() has nothing left to remove and
# the placeholder would end up as a bogus category.
unique_categories: dict[str, list[str]] = {
    categorical_column_name: cast(list[str], issue_data[categorical_column_name].dropna().astype(str).unique().tolist())
    for categorical_column_name in ["status", "resolution"]
}

In [6]:
# Sentence-transformers model shared by both text similarity spaces.
MODELNAME: str = "sentence-transformers/all-mpnet-base-v2"

# One text space per free-text field, so comments and issue text can be weighted separately.
comment_space = sl.TextSimilaritySpace(issue.comments, model=MODELNAME)
text_space = sl.TextSimilaritySpace(issue.text, model=MODELNAME)

# Numeric spaces with hard-coded value ranges.
# NOTE(review): the bounds (50, 3.0, 474) are presumably taken from the observed data
# ranges — confirm against the dataset if it changes.
popularity_space = sl.NumberSpace(issue.popularity, min_value=0.0, max_value=50, mode=sl.Mode.SIMILAR)
severity_space = sl.NumberSpace(issue.severity, min_value=1.0, max_value=3.0, mode=sl.Mode.SIMILAR)
resolution_time_space = sl.NumberSpace(issue.resolution_time, min_value=0.0, max_value=474, mode=sl.Mode.SIMILAR)

# Categorical spaces; the category vocabularies come from the values found in the data.
status_space = sl.CategoricalSimilaritySpace(issue.status, categories=unique_categories["status"])
resolution_space = sl.CategoricalSimilaritySpace(issue.resolution, categories=unique_categories["resolution"])

# The index bundles all spaces. `fields` lists additional schema fields; every field
# used in a .filter() clause later in this notebook (priority, severity,
# versions_affected, status) is part of this list.
issue_index = sl.Index(
    spaces=[
        comment_space,
        text_space,
        popularity_space,
        severity_space,
        resolution_time_space,
        status_space,
        resolution_space,
    ],
    fields=[
        issue.status,
        issue.priority,
        issue.resolution,
        issue.company,
        issue.severity,
        issue.participants,
        issue.has_attachment,
        issue.versions_affected,
        issue.duplicate_of,
        issue.components,
        issue.labels,
        issue.referenced_by,
        issue.references,
        issue.blocks,
        issue.blocked_by,
        issue.causes,
        issue.caused_by,
        issue.fix_version,
    ],
)

11:08:26 superlinked.framework.dsl.index.index INFO   initialized index


In [7]:
# Parser that reads Issue records from a pandas DataFrame.
df_parser: sl.DataFrameParser = sl.DataFrameParser(issue)
# In-memory source for the Issue schema; data is pushed into it with .put() later.
issue_source: sl.InMemorySource = sl.InMemorySource(issue, parser=df_parser)
# Wire the source and the index together in an in-memory executor and start the app.
executor: sl.InMemoryExecutor = sl.InMemoryExecutor(sources=[issue_source], indices=[issue_index])
app: sl.InMemoryApp = executor.run()

11:08:27 superlinked.framework.query.query_dag_evaluator INFO   initialized query dag
11:08:27 superlinked.framework.online.online_dag_evaluator INFO   initialized entity dag
11:08:27 superlinked.framework.dsl.executor.interactive.interactive_executor INFO   started in-memory app


### Ingest data

In [8]:
# Push the whole DataFrame into the source. The sentence-transformers model is loaded
# and the batches are processed during this call (see the log output of this cell).
issue_source.put(issue_data)

11:08:34 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

11:08:48 superlinked.framework.online.online_dag_evaluator INFO   evaluated entities
11:08:48 superlinked.framework.online.source.online_data_processor INFO   stored input data


## Run queries

### Simple filtering

In [9]:
# Pure filter query: keep only issues whose priority equals the `filter_priority` parameter.
# There is no .similar() / .with_vector() clause, so nothing is ranked by similarity.
filter_priority_query = (
    sl.Query(issue_index)
    .find(issue)
    .filter(issue.priority == sl.Param("filter_priority"))
    .select(fields=[issue.priority])  # the result table additionally shows id and similarity_score
    .limit(3)
)

# Parameter values are bound by name when the query is executed.
res_filter_priority = app.query(filter_priority_query, filter_priority="Medium")
sl.PandasConverter.to_pandas(res_filter_priority)  # 0.0 similarity as these are filter queries only

11:08:57 superlinked.framework.query.query_dag_evaluator INFO   evaluated query
11:08:57 superlinked.framework.dsl.executor.query.query_executor INFO   executed query


Unnamed: 0,priority,id,similarity_score
0,Medium,SRCTREEWIN-13259,0.0
1,Medium,SRCTREEWIN-13675,0.0
2,Medium,SRCTREEWIN-13776,0.0


In [10]:
# Numeric fields can be filtered with comparisons as well: here severity must be
# greater than or equal to the `filter_severity` parameter.
filter_severity_query = (
    sl.Query(issue_index)
    .find(issue)
    .filter(issue.severity >= sl.Param("filter_severity"))
    .select(fields=[issue.severity])
    .limit(3)
)

# With filter_severity=3.0 only issues of severity 3.0 or above are returned.
res_filter_severity = app.query(filter_severity_query, filter_severity=3.0)
sl.PandasConverter.to_pandas(res_filter_severity)

11:08:58 superlinked.framework.query.query_dag_evaluator INFO   evaluated query
11:08:58 superlinked.framework.dsl.executor.query.query_executor INFO   executed query


Unnamed: 0,severity,id,similarity_score
0,3.0,SRCTREEWIN-12341,0.0
1,3.0,SRCTREEWIN-12345,0.0
2,3.0,SRCTREEWIN-12353,0.0


### Vector search for similar issues with filters

In [11]:
# Combine a hard filter with vector search: restrict to issues whose severity is
# strictly greater than `filter_severity`, then rank them by how similar their text
# is to `query_text`.
severe_auth_failure_query = (
    sl.Query(issue_index)
    .find(issue)
    .filter(issue.severity > sl.Param("filter_severity"))
    .similar(text_space, sl.Param("query_text"))
    .select(fields=[issue.severity, issue.text])
    .limit(3)
)

# severity > 2.0 and text close to "authentication failures"; similarity scores are now non-zero.
res_severe_auth_failure = app.query(
    severe_auth_failure_query, filter_severity=2.0, query_text="authentication failures"
)
sl.PandasConverter.to_pandas(res_severe_auth_failure)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

11:09:00 superlinked.framework.query.query_dag_evaluator INFO   evaluated query
11:09:00 superlinked.framework.dsl.executor.query.query_executor INFO   executed query


Unnamed: 0,severity,text,id,similarity_score
0,3.0,Authentication failure - Keeps attempting to a...,SRCTREEWIN-13084,0.627935
1,3.0,"Failed Authentication after account setup Hi,W...",SRCTREEWIN-13109,0.456589
2,3.0,Github basic login with PAT fails with unhelpf...,SRCTREEWIN-13321,0.351087


### Search with natural language

In [12]:
# OpenAI client settings for natural-language queries. The key is read from the
# OPEN_AI_API_KEY environment variable; a missing variable raises KeyError here.
openai_config = sl.OpenAIClientConfig(api_key=os.environ["OPEN_AI_API_KEY"], model="gpt-4o")

# The query declares the `filter_severity` and `query_text` parameters as usual, but
# .with_natural_query() lets a free-text request drive them: only `natural_query` is
# passed to app.query() below, the other parameters are not supplied by hand.
nlq_severe_auth_failure_query = (
    sl.Query(issue_index)
    .find(issue)
    .filter(issue.severity >= sl.Param("filter_severity"))
    .similar(text_space, sl.Param("query_text"))
    .with_natural_query(sl.Param("natural_query"), openai_config)
    .select(fields=[issue.severity, issue.text])
    .limit(3)
)

res_nlq_severe_auth_failure = app.query(
    nlq_severe_auth_failure_query, natural_query="high severity authentication failures"
)
sl.PandasConverter.to_pandas(res_nlq_severe_auth_failure)

11:07:29 httpx INFO   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
11:07:29 superlinked.framework.query.query_dag_evaluator INFO   evaluated query
11:07:29 superlinked.framework.dsl.executor.query.query_executor INFO   executed query


Unnamed: 0,severity,text,id,similarity_score
0,3.0,Authentication failure - Keeps attempting to a...,SRCTREEWIN-13084,0.627935
1,3.0,"Failed Authentication after account setup Hi,W...",SRCTREEWIN-13109,0.456589
2,3.0,Github basic login with PAT fails with unhelpf...,SRCTREEWIN-13321,0.351087


In [13]:
# show the relevant search params that were extracted from the natural query
relevant_params = ["filter_severity", "query_text"]
dict(
    (param_name, param_value)
    for param_name, param_value in res_nlq_severe_auth_failure.metadata.search_params.items()
    if param_name in relevant_params
)

{'filter_severity': 3.0, 'query_text': 'authentication failures'}

In [14]:
# A richer natural-language query: text similarity plus three filters
# (minimum severity, affected version contained in the list field, exact status).
# All four parameters are left to be filled from the natural-language request.
severity_version_status_natural_query = (
    sl.Query(issue_index)
    .find(issue)
    .similar(text_space, sl.Param("query_text"))
    .filter(issue.severity >= sl.Param("filter_severity"))
    .filter(issue.versions_affected.contains(sl.Param("filter_version")))
    .filter(issue.status == sl.Param("filter_status"))
    .with_natural_query(sl.Param("natural_query"), openai_config)
    .select(fields=[issue.severity, issue.text, issue.versions_affected, issue.status])
    .limit(10)
)

# Free-text request that mentions a topic, a severity level, a version and a status.
sophisticated_natural_query: str = """
    high severity, git, Bitbucket, or version control related
    issues that affect version 3.3.9 and triage is needed"""

# Only the natural-language text is passed in.
res_severity_version_status_natural = app.query(
    severity_version_status_natural_query,
    natural_query=sophisticated_natural_query,
)
sl.PandasConverter.to_pandas(res_severity_version_status_natural)

11:07:33 httpx INFO   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

11:07:33 superlinked.framework.query.query_dag_evaluator INFO   evaluated query
11:07:33 superlinked.framework.dsl.executor.query.query_executor INFO   executed query


Unnamed: 0,severity,text,versions_affected,status,id,similarity_score
0,3.0,"Sourctree closes after ""Checkout in Sourctree""...",[3.3.9],Needs Triage,SRCTREEWIN-13125,0.479753
1,3.0,"Git - Enabling ""Push all tags to remotes"" will...",[3.3.9],Needs Triage,SRCTREEWIN-13363,0.478067
2,3.0,Open remote - URL customization In our company...,[3.3.9],Needs Triage,SRCTREEWIN-13254,0.462704
3,3.0,"""Check out in Sourctree"" always tries to clone...",[3.3.9],Needs Triage,SRCTREEWIN-13126,0.422688
4,3.0,Different branch folders are merged anyway Hi!...,[3.3.9],Needs Triage,SRCTREEWIN-13438,0.413081
5,3.0,Github basic login with PAT fails with unhelpf...,[3.3.9],Needs Triage,SRCTREEWIN-13321,0.394501
6,3.0,Subtree Dissappears from Stash after Reopening...,[3.3.9],Needs Triage,SRCTREEWIN-13288,0.390878
7,3.0,Issues when trtying to Authenticate. Everytime...,[3.3.9],Needs Triage,SRCTREEWIN-13720,0.385265
8,3.0,Same name of repos If you have same names of r...,[3.3.9],Needs Triage,SRCTREEWIN-13329,0.334843
9,3.0,Resolve Conflicts -> Mine/Theirs when popping ...,[3.3.9],Needs Triage,SRCTREEWIN-13799,0.326965


In [15]:
# show the relevant search params that were extracted from the natural query
relevant_params = ["filter_severity", "query_text", "filter_version", "filter_status"]
search_params = res_severity_version_status_natural.metadata.search_params
{name: search_params[name] for name in search_params if name in relevant_params}

{'query_text': 'git, Bitbucket, or version control related issues',
 'filter_severity': 3.0,
 'filter_version': ['3.3.9'],
 'filter_status': 'Needs Triage'}

### Finding issues similar to a specific issue

In [16]:
# Find issues similar to a reference issue, using that issue's stored vector as the
# search vector. Query-time weights decide which spaces matter: text, popularity and
# severity count equally, while comments, resolution time, status and resolution are
# switched off with a weight of 0.0.
similar_issues_query = (
    sl.Query(
        issue_index,
        weights={
            text_space: 1.0,
            comment_space: 0.0,
            popularity_space: 1.0,
            severity_space: 1.0,
            resolution_time_space: 0.0,
            status_space: 0.0,
            resolution_space: 0.0,
        },
    )
    .find(issue)
    .with_vector(issue, sl.Param("issue_id"))
    .select(
        # each field is listed once (severity was previously listed twice)
        fields=[
            issue.severity,
            issue.text,
            issue.popularity,
            issue.resolution_time,
            issue.status,
            issue.resolution,
        ]
    )
    .limit(10)
)

# The reference issue itself comes back as the top hit with a similarity of 1.0.
res_similar_issues = app.query(similar_issues_query, issue_id="SRCTREEWIN-12334")
sl.PandasConverter.to_pandas(res_similar_issues)

11:07:33 superlinked.framework.query.query_dag_evaluator INFO   evaluated query
11:07:33 superlinked.framework.dsl.executor.query.query_executor INFO   executed query


Unnamed: 0,severity,text,popularity,status,id,similarity_score,resolution_time,resolution
0,2.0,Code scroll not proper in horizontal way !imag...,0,Needs Triage,SRCTREEWIN-12334,1.0,,
1,2.0,"Hidden scrollbar on the History tab Greetings,...",4,Closed,SRCTREEWIN-12356,0.861396,387.0,Cannot Reproduce
2,2.0,Subwindow drap and drop can not be dropped at ...,0,Needs Triage,SRCTREEWIN-13482,0.81876,,
3,2.0,Scrolling to bottom of file content pane freez...,1,Closed,SRCTREEWIN-13225,0.804471,127.0,Fixed
4,2.0,Sourcetree hangs when I mousewheel on the diff...,0,Needs Triage,SRCTREEWIN-13318,0.789222,,
5,2.0,Nonenglish encoding display error There is a p...,0,Gathering Impact,SRCTREEWIN-13872,0.786665,,
6,2.0,SourceTree not showing the diff just says 'sel...,2,Short Term Backlog,SRCTREEWIN-13959,0.784653,,
7,2.0,File list displays in reverse alphabetical ord...,0,Needs Triage,SRCTREEWIN-14153,0.782129,,
8,2.0,Issue with local repositories and tabs at star...,0,Needs Triage,SRCTREEWIN-13717,0.775956,,
9,2.0,Issue with local repositories and tabs at star...,0,Needs Triage,SRCTREEWIN-13719,0.775956,,


In [17]:
# Similar-issue search combined with a status filter and natural-language input.
# The weights are the same as in the previous similar-issue query: text, popularity
# and severity are active, the remaining spaces are weighted 0.0.
similar_issues_filtered_query = (
    sl.Query(
        issue_index,
        weights={
            text_space: 1.0,
            comment_space: 0.0,
            popularity_space: 1.0,
            severity_space: 1.0,
            resolution_time_space: 0.0,
            status_space: 0.0,
            resolution_space: 0.0,
        },
    )
    .find(issue)
    .with_vector(issue, sl.Param("issue_id"))
    .filter(issue.status == sl.Param("filter_status"))
    .with_natural_query(sl.Param("natural_query"), openai_config)
    .select(fields=[issue.text, issue.status, issue.severity])
    .limit(10)
)

# Neither `issue_id` nor `filter_status` is passed explicitly; only the natural-language
# request is given (presumably both are extracted from it — the results are all
# "Needs Triage" and led by SRCTREEWIN-12334).
res_similar_issues_filtered = app.query(
    similar_issues_filtered_query, natural_query="similar issues to SRCTREEWIN-12334 that need triage"
)
sl.PandasConverter.to_pandas(res_similar_issues_filtered)

11:07:36 httpx INFO   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
11:07:38 httpx INFO   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
11:07:38 superlinked.framework.query.query_dag_evaluator INFO   evaluated query
11:07:38 superlinked.framework.dsl.executor.query.query_executor INFO   executed query


Unnamed: 0,text,status,severity,id,similarity_score
0,Code scroll not proper in horizontal way !imag...,Needs Triage,2.0,SRCTREEWIN-12334,1.0
1,Subwindow drap and drop can not be dropped at ...,Needs Triage,2.0,SRCTREEWIN-13482,0.81876
2,Sourcetree hangs when I mousewheel on the diff...,Needs Triage,2.0,SRCTREEWIN-13318,0.789222
3,File list displays in reverse alphabetical ord...,Needs Triage,2.0,SRCTREEWIN-14153,0.782129
4,Issue with local repositories and tabs at star...,Needs Triage,2.0,SRCTREEWIN-13717,0.775956
5,Issue with local repositories and tabs at star...,Needs Triage,2.0,SRCTREEWIN-13719,0.775956
6,Diff not working on UTF-16 LE files without BO...,Needs Triage,2.0,SRCTREEWIN-12976,0.771097
7,Maximized SourceTree window covers taskbar whe...,Needs Triage,2.0,SRCTREEWIN-13976,0.770757
8,Cannot unstage hunk While viewing in-app diff ...,Needs Triage,2.0,SRCTREEWIN-13540,0.768646
9,Support CR as a line break in diff views Some ...,Needs Triage,2.0,SRCTREEWIN-13878,0.765955
