# Text Similarity

In [1]:
import pandas as pd



In [2]:
# read the cleaned (intermediate) PR data and keep only the first 10,000 rows
pr_df = pd.read_parquet(
    "data/intermediate_data/pr_df.parquet", engine="pyarrow"
).head(10000)

# quick sanity check: dimensions first, then the column names
print(pr_df.shape)
print(pr_df.columns)


(10000, 10)
Index(['repo', 'parent_repo', 'child_repo', 'issue_id', 'issue_number',
       'issue', 'text_size', 'usernames', 'users', 'mock_number'],
      dtype='object')


In [3]:
# peek at the raw text stored in the `issue` column
pr_df.loc[:, "issue"]

0                         Title: WIP - v3\nusername_0: \n
1       Title: [AppBar] Fix swipe to go back gesture f...
2       Title: Add initial support for iOS, tvOS and w...
3       Title: added fix for nav priority links render...
4       Title: coqPackages.CoLoR: 1.4.0 -> 1.6.0\nuser...
                              ...                        
9995    Title: server/zclient: Retry zebra message ver...
9996    Title: Add Reason icon\nusername_0: **Changes ...
9997    Title: Avoid auto-hyphenation of code in the d...
9998    Title: spark: activate R backend\nusername_0: ...
9999    Title: SliverChildDelegate should know which c...
Name: issue, Length: 10000, dtype: object

In [4]:
# count the missing entries in the `issue` column
print(pr_df["issue"].isna().sum())

0


In [5]:
# preview the raw issue text before preprocessing it (the "Title: " prefix is still present here)
print(pr_df["issue"].iloc[:5])


0                      Title: WIP - v3\nusername_0: \n
1    Title: [AppBar] Fix swipe to go back gesture f...
2    Title: Add initial support for iOS, tvOS and w...
3    Title: added fix for nav priority links render...
4    Title: coqPackages.CoLoR: 1.4.0 -> 1.6.0\nuser...
Name: issue, dtype: object


In [6]:
# extract the issue title: everything before the first "username_0: " marker
# (the username_0 comment is left out for now)
issue_head = pr_df['issue'].str.split("username_0: ", n=1).str[0].str.strip()

# now remove the redundant "Title: " prefix; the pattern is anchored at the start
# so a later "Title: " inside the title text itself is left untouched
pr_df['issue_title'] = issue_head.str.replace(r"^Title: ", "", regex=True)

print(pr_df.head())

                                          repo          parent_repo  \
0                       kaisermann/svelte-i18n           kaisermann   
1  material-components/material-components-ios  material-components   
2                                 dlang/phobos                dlang   
3               patternfly/patternfly-elements           patternfly   
4                                NixOS/nixpkgs                NixOS   

                child_repo   issue_id  issue_number  \
0              svelte-i18n  550510104            40   
1  material-components-ios  551064006          9444   
2                   phobos  551980198          7355   
3      patternfly-elements  552466427           686   
4                  nixpkgs  553379763         78253   

                                               issue  text_size  \
0                    Title: WIP - v3\nusername_0: \n       2398   
1  Title: [AppBar] Fix swipe to go back gesture f...        355   
2  Title: Add initial support for iOS, tv

In [7]:
# while we're at it, extract the user comments too; n=1 splits only on the first
# "username_0: " marker, so text after any later "username_0: " marker is kept
pr_df['issue_comments'] = pr_df['issue'].str.split("username_0: ", n=1).str[1].str.strip()

In [8]:
# let's remove special characters and markdown, whitespace
import re

def clean_text(text):
    """
    remove markdown punctuation and normalise whitespace in a piece of text.

    parameters:
    - text: the string to clean; non-string values (e.g. None / NaN from rows
      with missing text) are returned unchanged instead of raising

    returns:
    - the cleaned string: markdown characters (# ! [ ] < > - * _ |) removed,
      whitespace runs collapsed to a single space, and leading/trailing
      whitespace trimmed
    """
    # missing values cannot be passed to re.sub, so hand them back untouched
    if not isinstance(text, str):
        return text

    # remove markdown
    text = re.sub(r'[#!\[\]<>\-*_|]+', '', text)
    # collapse whitespace runs (including newlines) into single spaces
    text = re.sub(r'\s+', ' ', text)
    # removing markdown markers such as "## " leaves stray edge spaces behind
    return text.strip()

# run the cleaner over both text columns, storing each result under a "<column>_clean" name
for raw_col in ("issue_title", "issue_comments"):
    pr_df[f"{raw_col}_clean"] = pr_df[raw_col].apply(clean_text)


In [9]:
# preview the cleaned titles, then render the first rows of the whole frame
print(pr_df["issue_title_clean"].iloc[:5])
pr_df.iloc[:5]


0                                               WIP v3
1    AppBar Fix swipe to go back gesture for MDCApp...
2        Add initial support for iOS, tvOS and watchOS
3    added fix for nav priority links rendering wit...
4                       coqPackages.CoLoR: 1.4.0 1.6.0
Name: issue_title_clean, dtype: object


Unnamed: 0,repo,parent_repo,child_repo,issue_id,issue_number,issue,text_size,usernames,users,mock_number,issue_title,issue_comments,issue_title_clean,issue_comments_clean
0,kaisermann/svelte-i18n,kaisermann,svelte-i18n,550510104,40,Title: WIP - v3\nusername_0: \n,2398,"[kaisermann, elbourki]",elbourki,52812,WIP - v3,,WIP v3,
1,material-components/material-components-ios,material-components,material-components-ios,551064006,9444,Title: [AppBar] Fix swipe to go back gesture f...,355,"[jverkoey, bryanoltman]",bryanoltman,38978,[AppBar] Fix swipe to go back gesture for MDCA...,[AppBar] Fix swipe to go back gesture for MDCA...,AppBar Fix swipe to go back gesture for MDCApp...,AppBar Fix swipe to go back gesture for MDCApp...
2,dlang/phobos,dlang,phobos,551980198,7355,"Title: Add initial support for iOS, tvOS and w...",4306,"[wilzbach, etcimon, Geod24, CyberShadow, t...",jacob-carlborg,66742,"Add initial support for iOS, tvOS and watchOS",I've only tested this on a 64 bit iPhone runni...,"Add initial support for iOS, tvOS and watchOS",I've only tested this on a 64 bit iPhone runni...
3,patternfly/patternfly-elements,patternfly,patternfly-elements,552466427,686,Title: added fix for nav priority links render...,1995,"[starryeyez024, LyndseyR]",LyndseyR,14002,added fix for nav priority links rendering wit...,## Fix bug which causes arrows to appear on pr...,added fix for nav priority links rendering wit...,Fix bug which causes arrows to appear on prim...
4,NixOS/nixpkgs,NixOS,nixpkgs,553379763,78253,Title: coqPackages.CoLoR: 1.4.0 -> 1.6.0\nuser...,2544,"[vbgl, jpas]",jpas,72876,coqPackages.CoLoR: 1.4.0 -> 1.6.0,<!-- Nixpkgs has a lot of new incoming Pull Re...,coqPackages.CoLoR: 1.4.0 1.6.0,Nixpkgs has a lot of new incoming Pull Reques...


In [10]:
# persist the cleaned frame to disk so later steps can start from it
pr_df.to_parquet(
    path="data/intermediate_data/pr_df_clean_issues.parquet",
    engine="pyarrow",
)

## Now the fun stuff

### 1. Vectorize the Text

We'll convert the `issue_title_clean` column into numerical representations using **TF-IDF** (Term Frequency-Inverse Document Frequency). This method is well suited to capturing how important a word is in one document relative to the whole dataset.

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# gather the non-null cleaned titles into a plain Python list
issue_titles = list(pr_df['issue_title_clean'].dropna())

# build the tf-idf vectorizer (common english stop words are removed)
# and turn the titles into a sparse document-term matrix
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(issue_titles)


The output matrix is a sparse matrix where each row represents an issue title as a vector.

In [12]:
print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (10000, 13209)


Now we compute text similarities. We use the TF-IDF matrix to calculate pairwise cosine similarity, which measures how similar each issue title is to the others.

NOTE: I tried to run this code:
```py
from sklearn.metrics.pairwise import cosine_similarity

# calculate cosine similarity between issue titles
similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
```

but the Jupyter kernel crashed, probably because the full pairwise similarity matrix for this data set is too large to fit in memory. Let's instead try querying one single issue and comparing it against all the others.


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# use the first issue title as the query and score it against every title
query_idx = 0
query_vector = tfidf_matrix[query_idx]
similarity_scores = cosine_similarity(query_vector, tfidf_matrix).ravel()

# rank by similarity, highest first, and report the 10 best matches
top_indices = similarity_scores.argsort()[-10:][::-1]
for match_idx in top_indices:
    score, title = similarity_scores[match_idx], issue_titles[match_idx]
    print(f"similarity: {score:.2f} | title: {title}")


similarity: 1.00 | title: WIP v3
similarity: 0.68 | title: Feat/v3
similarity: 0.66 | title: Fixes for v3.0.4
similarity: 0.62 | title: bump to v3.8.5
similarity: 0.56 | title: WIP
similarity: 0.56 | title: WIP
similarity: 0.56 | title: WIP: See also
similarity: 0.50 | title: Slim v3
similarity: 0.46 | title: v3.6.0 with array helper
similarity: 0.44 | title: Bittrex api v3


Now, that works, which means we can work in chunks. But before we try that, let's still try something else. Let's try using **FAISS** (Facebook AI Similarity Search).

Note: Well that failed too. I did this:

```py
import faiss
import numpy as np

# convert the tf-idf matrix to numpy array
dense_matrix = tfidf_matrix.toarray()

# build faiss index
dimension = dense_matrix.shape[1]             # number of features
index = faiss.IndexFlatL2(dimension)          # build the index
index.add(dense_matrix.astype(np.float32))    # add vectors to the index
```

So I guess I will resort to using the previous method but in chunks.

We will process the rows of the TF-IDF matrix in manageable chunks and compute similarities against the entire dataset.

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
import os

def compute_chunked_similarity(tfidf_matrix, chunk_size=1000, output_dir = "data/intermediate_data/similarity_chunks/"):
    """
    compute cosine similarity in chunks to prevent memory overload.

    each chunk of rows is compared against the full matrix and the resulting
    sparse block is written to `output_dir` as an .npz file, so no similarity
    block is accumulated in memory across iterations.

    parameters:
    - tfidf_matrix: sparse matrix (tf-idf representation)
    - chunk_size: number of rows to process in each chunk (must be positive)
    - output_dir: directory the chunk files are written to (created if missing)

    returns:
    - list of the paths of the saved chunk files, in row order

    raises:
    - ValueError: if chunk_size is not positive
    """

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # save_npz does not create missing directories, so make sure the target exists
    os.makedirs(output_dir, exist_ok=True)

    num_rows = tfidf_matrix.shape[0]
    saved_paths = []

    for start_idx in range(0, num_rows, chunk_size):
        end_idx = min(start_idx + chunk_size, num_rows)
        print(f'processing rows {start_idx} to {end_idx}... ')

        # compute similarity for the current chunk against all rows
        chunk = tfidf_matrix[start_idx:end_idx]
        chunk_similarity = cosine_similarity(chunk, tfidf_matrix, dense_output = False)

        assert chunk_similarity.shape[1] == tfidf_matrix.shape[0], "Chunk similarity shape does not match tfidf matrix shape"

        # save the sparse similarity matrix to disk to avoid RAM overload
        output_path = os.path.join(output_dir, f'similarity_chunk_{start_idx}_{end_idx}.npz')
        sp.save_npz(output_path, chunk_similarity)
        saved_paths.append(output_path)

    print("all chunks processed and saved")
    return saved_paths

The above "solution" is still quite a lot for my computer. So I just ran it once and produced some chunks for the similarity. Below is how one would run it:

```py
# run the cosine similarity in chunks (each chunk is written to disk as an .npz file)
compute_chunked_similarity(tfidf_matrix, chunk_size=1000)
```