# Text Similarity

In [None]:
import pandas as pd



In [None]:
# read the cleaned intermediate pull-request table from parquet
pr_df = pd.read_parquet("data/intermediate_data/pr_df.parquet", engine="pyarrow")
# pr_df = pr_df.head(10000)

# sanity check: print the shape first, then the column index, one per line
print(pr_df.shape, pr_df.columns, sep="\n")


In [None]:
# let's have a look at the `issue` column
# (as the last expression of the cell, the Series is what the notebook displays)
pr_df["issue"]

In [None]:
# count how many entries of the `issue` column are missing
print(pr_df["issue"].isna().sum())

In [None]:
# preview the first few raw `issue` entries before any preprocessing
# NOTE(review): the original comment announced lowercasing and removing the
# "title" prefix, but this cell only prints the head; neither step happens here
print(pr_df['issue'].head())


In [None]:
# the issue title is the text in front of the first "username_0: " marker
# (the username_0 comment itself is left out for now)
pr_df['issue_title'] = (
    pr_df['issue']
    .str.split("username_0: ")
    .str.get(0)
    .str.strip()
)

In [None]:
# while we're at it, extract the user comments too: everything that follows
# the first "username_0: " marker.
# n=1 limits the split to that first marker, so a thread in which
# "username_0: " appears again is kept whole instead of being cut off at the
# second occurrence. Rows without the marker end up as NaN.
pr_df['issue_comments'] = pr_df['issue'].str.split("username_0: ", n=1).str[1].str.strip()

In [None]:
# let's remove special characters and markdown, whitespace
import re

def clean_text(text):
    """
    strip markdown / special characters from a piece of text and collapse whitespace.

    parameters:
    - text: the string to clean; missing values (None, NaN, or anything that
      is not a str) are returned unchanged so the function can be applied to
      columns containing nulls

    returns:
    - the cleaned string, or the original object when it is not a string
    """
    # missing values show up as None / float NaN; re.sub would raise a
    # TypeError on them, so pass them through untouched
    if not isinstance(text, str):
        return text

    # remove markdown
    text = re.sub(r'[#!\[\]<>\-*_|]+', '', text)
    # collapse every run of whitespace (spaces, tabs, newlines) into one space
    text = re.sub(r'\s+', ' ', text)
    return text

# run the cleaner over both extracted text columns, title first, then comments
for source_col, cleaned_col in (
    ('issue_title', 'issue_title_clean'),
    ('issue_comments', 'issue_comments_clean'),
):
    pr_df[cleaned_col] = pr_df[source_col].apply(clean_text)


In [None]:
# preview the cleaned titles, then show the first rows of the whole frame
# (the trailing expression is what the notebook displays)
print(pr_df['issue_title_clean'].head())
pr_df.head()


In [None]:
# persist the frame, including the new cleaned columns, for future use
pr_df.to_parquet(
    path="data/intermediate_data/pr_df_clean_issues.parquet",
    engine="pyarrow",
)

## Now the fun stuff

### 1. Vectorize the Text

We'll convert the `issue_title_clean` column into numerical representations using **TF-IDF** (Term Frequency-Inverse Document Frequency). This method weights each word by how important it is to one document relative to the whole dataset.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# collect the non-missing cleaned titles into a plain Python list
issue_titles = list(pr_df['issue_title_clean'].dropna())

# build a TF-IDF vectorizer that ignores common English stop words, then
# learn the vocabulary and encode every title in a single pass
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(issue_titles)


The output matrix is a sparse matrix where each row represents an issue title as a vector.

In [None]:
# report the matrix dimensions; each row is the vector of one issue title
print(f'TF-IDF matrix shape: {tfidf_matrix.shape}')

Now we compute text similarities. We use the TF-IDF matrix to calculate pairwise cosine similarity, which measures how similar each issue title is to the others.

NOTE: I tried to run this code:
```py
from sklearn.metrics.pairwise import cosine_similarity

# calculate cosine similarity between issue titles
similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
```

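but the Jupyter kernel crashed, probably because the full pairwise similarity matrix (every title compared against every other title) is too large to fit in memory. Let's instead query one single issue and compare it against all the others.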


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

query_idx = 0
top_n = 10    # how many neighbouring issues to report
query_vector = tfidf_matrix[query_idx]
print(f"query: {issue_titles[query_idx]}")

# compute similarity of the query issue with every title (itself included)
similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

# rank all titles from most to least similar, then drop the query's own row so
# the list shows *other* issues instead of the trivial self-match
ranked_indices = similarity_scores.argsort()[::-1]
top_indices = ranked_indices[ranked_indices != query_idx][:top_n]
for i in top_indices:
    print(f"similarity: {similarity_scores[i]:.2f} | title: {issue_titles[i]}")


Now, that works, which means we can work in chunks. But before we try that, let's still try something else. Let's try using **FAISS** (Facebook AI Similarity Search).

Note: Well that failed too. I did this:

```py
import faiss
import numpy as np

# convert the tf-idf matrix to numpy array
dense_matrix = tfidf_matrix.toarray()

# build faiss index
dimension = dense_matrix.shape[1]             # number of features
index = faiss.IndexFlatL2(dimension)          # build the index
index.add(dense_matrix.astype(np.float32))    # add vectors to the index
```

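So I will fall back to the previous method, but in chunks. (The FAISS attempt most likely failed because `tfidf_matrix.toarray()` has to materialise the whole matrix in dense form, which does not fit in memory.)

We will process the rows of the TF-IDF matrix in manageable chunks and compute similarities against the entire dataset.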

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
import os

def compute_chunked_similarity(tfidf_matrix, chunk_size=1000, output_dir = "data/intermediate_data/similarity_chunks/"):
    """
    compute cosine similarity in chunks to prevent memory overload.

    each chunk of rows is compared against the whole matrix and the resulting
    sparse block is written to `output_dir` as
    `similarity_chunk_<start>_<end>.npz` (`<end>` is exclusive); the full
    similarity matrix is never held in memory.

    parameters:
    - tfidf_matrix: sparse matrix (tf-idf representation)
    - chunk_size: number of rows to process in each chunk (must be >= 1)
    - output_dir: directory that receives the .npz chunk files (created if missing)

    returns:
    - list with the path of every chunk file written, in row order

    raises:
    - ValueError: if chunk_size is smaller than 1
    """

    # a negative step would make range() yield nothing and the function would
    # silently "succeed" without writing a single chunk
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # save_npz does not create missing directories, so make sure the target exists
    os.makedirs(output_dir, exist_ok=True)

    num_rows = tfidf_matrix.shape[0]
    saved_paths = []

    for start_idx in range(0, num_rows, chunk_size):
        end_idx = min(start_idx + chunk_size, num_rows)
        print(f'processing rows {start_idx} to {end_idx}... ')

        # compute similarity for the current chunk against all rows;
        # dense_output=False keeps the result sparse
        chunk = tfidf_matrix[start_idx:end_idx]
        chunk_similarity = cosine_similarity(chunk, tfidf_matrix, dense_output = False)

        # sanity check: one similarity column per row of the full matrix
        assert chunk_similarity.shape[1] == tfidf_matrix.shape[0], "Chunk similarity shape does not match tfidf matrix shape"

        # save the sparse similarity matrix to disk to avoid RAM overload
        output_path = os.path.join(output_dir, f'similarity_chunk_{start_idx}_{end_idx}.npz')
        sp.save_npz(output_path, chunk_similarity)
        saved_paths.append(output_path)

    print("all chunks processed and saved")
    return saved_paths

The above "solution" is still quite heavy for my computer, so I ran it only once to produce some similarity chunks, which are written to disk under `data/intermediate_data/similarity_chunks/`. Below is how one would run it:

```py
# run the cosine similarity in chunks
similarity_matrix = compute_chunked_similarity(tfidf_matrix, chunk_size=1000)
```