In [None]:
## Perform Topic Modeling using BERT

In [None]:
import pandas as pd
import numpy as np
import os
import re

In [None]:
# Load every requirements file as one string (one document == one project).
# NOTE: a file that fails to parse is skipped, so a document's position in
# all_docs is not guaranteed to equal its file number.
all_docs = []
for k in range(1, 123+1 ):
    try:
        # dtype=str keeps numeric-looking cells as text so they can be joined.
        data = pd.read_csv(f"/kaggle/input/requirements/{k}.txt", header=None, sep='\t', dtype=str)
    except pd.errors.ParserError:
        print(f"Error reading file: {k}.txt")
        continue

    # Rows shorter than the first row are padded with NaN by pandas; those
    # non-string cells would make ' '.join raise TypeError, so drop them.
    documents = [cell for cell in data.to_numpy().ravel() if isinstance(cell, str)]
    one_str = ' '.join(documents)
    all_docs.append(one_str)

In [None]:
pd.DataFrame({'Doc': all_docs})  #--> Every document is a project

In [None]:
pip install sentence_transformers

In [None]:
pip install umap-learn

In [None]:
pip install hdbscan

In [None]:
from sentence_transformers import SentenceTransformer
# Pretrained sentence-embedding model; reused by every model.encode() call in this notebook.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [None]:
# Encode every project document in all_docs with the sentence-embedding model.
embeddings = model.encode(all_docs, show_progress_bar=True)

In [None]:
embeddings.shape

In [None]:
import umap.umap_ as umap
# Reduce the document embeddings to 5 components with UMAP (cosine metric,
# fixed seed) before clustering.
umap_params = dict(n_neighbors=4, n_components=5, metric='cosine', random_state=123)
reducer = umap.UMAP(**umap_params)
umap_embeddings = reducer.fit_transform(embeddings)

In [None]:
import hdbscan
# Cluster the reduced embeddings with HDBSCAN; `cluster` keeps the fitted
# model and `hdbscan_labels` the label returned for each document.
hdbscan_params = {
    'min_cluster_size': 3,
    'metric': 'euclidean',
    'cluster_selection_method': 'eom',
}
cluster = hdbscan.HDBSCAN(**hdbscan_params)
hdbscan_labels = cluster.fit_predict(umap_embeddings)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Project the embeddings to 2-D purely for visualisation. The seed matches the
# other seeded UMAP calls in this notebook so the figure is reproducible.
umap_data = umap.UMAP(n_neighbors=4, n_components=2, min_dist=0.0, metric='cosine',
                      random_state=123).fit_transform(embeddings)

# Get unique cluster labels and their corresponding colors.
# NOTE: num_clusters here counts every distinct label, including the outlier
# label -1 when present; it is only used to size the colour map.
unique_labels = np.unique(cluster.labels_)
num_clusters = len(unique_labels)
colors = plt.cm.rainbow(np.linspace(0, 1, num_clusters))

# Plot scatter plot with different colors per cluster; outliers (-1) are grey.
fig, ax = plt.subplots(figsize=(10, 10))
for label, color in zip(unique_labels, colors):
    is_outlier = label == -1
    cluster_points = umap_data[cluster.labels_ == label]
    ax.scatter(cluster_points[:, 0],
               cluster_points[:, 1],
               color='#BDBDBD' if is_outlier else color,
               s=10,
               label='Outliers' if is_outlier else f'Cluster {label}')

ax.set_title('Clustering')
ax.set_xlabel('UMAP component 1')
ax.set_ylabel('UMAP component 2')
ax.legend()
plt.show()

In [None]:
# Label assigned to every document by the fitted clusterer (-1 marks noise points).
labels = cluster.labels_

# Distinct labels with the noise label removed give the number of clusters.
distinct_labels = set(labels)
distinct_labels.discard(-1)
num_clusters = len(distinct_labels)

print("Number of clusters:", num_clusters)

In [None]:
# One row per project: its text, its cluster label and its position in all_docs.
docs_df = pd.DataFrame({
    "Doc": all_docs,
    "Topic": cluster.labels_,
    "Doc_ID": range(len(all_docs)),
})

# One row per topic: the texts of all of its projects joined with spaces.
topic_groups = docs_df.groupby(['Topic'], as_index = False)
docs_per_topic = topic_groups.agg({'Doc': ' '.join})

In [None]:
# Keep a second name for the per-project frame: `docs_df` is rebound to a
# per-sentence frame inside the sub-clustering loop further down, while the
# final lookup of similar projects still needs this per-project table.
docs_with_topic = docs_df

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute a TF-IDF style score for every (term, document) pair.

    Parameters
    ----------
    documents : sequence of str
        Texts to score; the vocabulary is fitted on these same texts with
        English stop words removed.
    m : int
        Numerator of the IDF ratio (the caller passes the number of
        original, un-merged documents).
    ngram_range : tuple of int, default (1, 1)
        Forwarded to ``CountVectorizer``.

    Returns
    -------
    tuple
        ``(tf_idf, count)`` where ``tf_idf`` has one row per term and one
        column per document, and ``count`` is the fitted ``CountVectorizer``.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words="english")
    vectorizer.fit(documents)

    # Dense (n_documents, n_terms) matrix of raw term counts.
    term_counts = vectorizer.transform(documents).toarray()

    # Term frequency: each document's counts divided by its total count.
    words_per_document = term_counts.sum(axis=1)
    term_frequency = term_counts.T / words_per_document

    # Inverse frequency: log(m / total occurrences of the term), as a column.
    occurrences_per_term = term_counts.sum(axis=0)
    inverse_frequency = np.log(m / occurrences_per_term).reshape(-1, 1)

    return term_frequency * inverse_frequency, vectorizer
  
# Score the merged per-topic texts; m is the number of original project documents.
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(all_docs))

In [None]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring words of every topic.

    Parameters
    ----------
    tf_idf : array
        Score matrix with one row per word and one column per topic.
    count : object
        Fitted vectorizer; only ``get_feature_names_out()`` is used.
    docs_per_topic : DataFrame
        Frame whose ``Topic`` column lists the topic labels in the same
        order as the columns of ``tf_idf``.
    n : int, default 20
        Number of words to keep per topic.

    Returns
    -------
    dict
        Topic label -> list of ``(word, score)`` tuples, best score first.
    """
    vocabulary = count.get_feature_names_out()
    scores_by_topic = tf_idf.T
    # argsort is ascending, so the last n columns are the n best words.
    best_columns = scores_by_topic.argsort()[:, -n:]

    top_n_words = {}
    for row, topic_label in enumerate(list(docs_per_topic.Topic)):
        ranked_columns = best_columns[row][::-1]
        top_n_words[topic_label] = [
            (vocabulary[col], scores_by_topic[row][col]) for col in ranked_columns
        ]
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents of each topic.

    Parameters
    ----------
    df : DataFrame
        Frame with a ``Topic`` column and a ``Doc`` column.

    Returns
    -------
    DataFrame
        Columns ``Topic`` and ``Size`` (number of non-null ``Doc`` values),
        sorted from the largest topic to the smallest.
    """
    docs_per_label = df.groupby(['Topic'])['Doc'].count()
    topic_sizes = docs_per_label.reset_index()
    topic_sizes = topic_sizes.rename(columns={"Doc": "Size"})
    return topic_sizes.sort_values(by="Size", ascending=False)

# Top 20 words of every topic, and the number of projects assigned to each topic
# (the ten largest topics are displayed).
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)

In [None]:
import warnings

# Collect the top words of every topic into a two-column frame:
#   Key = topic label, New = list of that topic's top words (scores dropped).
# The rows are gathered first and the frame is built once: DataFrame.append
# was removed in pandas 2.0 (the warning silenced here previously was its
# deprecation notice), and growing a frame row by row is quadratic anyway.
topic_word_rows = [
    {'Key': key, 'New': [word for word, _ in value]}
    for key, value in top_n_words.items()
]
original_top_n_words = pd.DataFrame(topic_word_rows, columns=['Key', 'New'])

print(original_top_n_words)

In [None]:
pd.set_option('display.max_colwidth', None) # Run this if you want to display the whole content
# pd.set_option('display.max_colwidth', 50) # Run this to return to default display options

In [None]:
# Summary table: one row per topic with its top words and its number of projects.

# Topic label -> size, so each size is fetched as a scalar inside the loop.
size_by_topic = topic_sizes.set_index('Topic')['Size']

data = {
    'Topic': [],
    'Top Words': [],
    'Size': []
}

# Iterate over the (word, score) tuples of every topic
for topic, tuples in top_n_words.items():
    words = [word for word, _ in tuples]
    data['Topic'].append(topic)
    data['Top Words'].append(', '.join(words))
    # .loc yields the scalar directly; calling int() on a one-element Series
    # (the previous approach) is deprecated in recent pandas.
    data['Size'].append(int(size_by_topic.loc[topic]))


# Create a dataframe from the data dictionary
pd.DataFrame(data)

In [None]:
## Random search over n_neighbors, n_components and min_cluster_size to find a
## combination whose largest label group holds at most LARGEST_GROUP_LIMIT documents
import random

MAX_TRIALS = 1000          # upper bound so the search cannot run forever
LARGEST_GROUP_LIMIT = 80   # acceptance threshold for the biggest label group
search_rng = random.Random(123)  # local, seeded generator -> reproducible search

# First combination to try; later trials draw random values.
n_neighbors = 20
n_components = 2
min_cluster_size = 4

for trial in range(MAX_TRIALS):
    if trial > 0:
        # Draw new values BEFORE fitting, so that after the loop the three
        # parameters always describe the clustering held in `cluster`.
        n_neighbors = search_rng.randint(2, 10)
        n_components = search_rng.randint(2, 10)
        min_cluster_size = search_rng.randint(2, 10)

    umap_embeddings = umap.UMAP(n_neighbors=n_neighbors,
                                n_components=n_components,
                                metric='cosine',
                                random_state=123).fit_transform(embeddings)

    cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                              metric='euclidean',
                              cluster_selection_method='eom').fit(umap_embeddings)

    # NOTE(review): value_counts() also counts the noise label -1 as a group,
    # so a large noise set prevents acceptance -- confirm this is intended.
    cluster_sizes = pd.Series(cluster.labels_).value_counts()
    largest_cluster_size = cluster_sizes.max()

    if largest_cluster_size <= LARGEST_GROUP_LIMIT:
        break
else:
    print(f"No combination met the limit after {MAX_TRIALS} trials; reporting the last one tried.")

# Recompute from the clustering just fitted; previously the value printed
# below was left over from an earlier cell and described a different model.
labels = cluster.labels_
# Count the number of clusters (excluding noise points labeled as -1)
num_clusters = len(set(labels)) - (1 if -1 in labels else 0)

print("Final values:")
print("n_neighbors:", n_neighbors)
print("n_components:", n_components)
print("min_cluster_size:", min_cluster_size)
print("num_clusters:", num_clusters)

In [None]:
##################################################################################################################

In [None]:
## Internal topic modeling: cluster the sentences inside each topic into sub-clusters

In [None]:
# Sub-cluster every topic: split its merged text into sentences, embed them,
# reduce with UMAP and cluster with HDBSCAN. The per-topic results are
# collected in a list and concatenated once (DataFrame.append was removed in
# pandas 2.0).
sub_cluster_frames = []
sub_cluster_length = 0
for i in range(len(docs_per_topic)):
    # Explicit positional access: column 0 holds the topic label, column 1
    # the merged text of that topic.
    topic_id = docs_per_topic.iloc[i, 0]
    topic = re.split(r'(?<=[.!?])\s+', docs_per_topic.iloc[i, 1])
    internal_embeddings = model.encode(topic, show_progress_bar=True)

    # Topics with at least 300 sentences get 5 UMAP components and a minimum
    # cluster size of 10; smaller topics use 2 components and size 2.
    if len(topic) >= 300:
        sub_n_components, sub_min_cluster_size = 5, 10
    else:
        sub_n_components, sub_min_cluster_size = 2, 2

    # A single UMAP fit per topic (the earlier extra fit was always
    # overwritten), seeded like the other seeded UMAP calls in the notebook.
    internal_umap_embeddings = umap.UMAP(n_neighbors=4,
                                         n_components=sub_n_components,
                                         min_dist=0.0,
                                         metric='cosine',
                                         random_state=123).fit_transform(internal_embeddings)
    internal_cluster = hdbscan.HDBSCAN(min_cluster_size=sub_min_cluster_size,
                                       metric='euclidean',
                                       cluster_selection_method='eom').fit(internal_umap_embeddings)

    # NOTE: this rebinds `docs_df` to a per-sentence frame; the per-project
    # frame stays available as `docs_with_topic`.
    docs_df = pd.DataFrame(topic, columns=["Doc"])
    docs_df['Small_Cluster'] = internal_cluster.labels_
    docs_df['Doc_ID'] = range(len(docs_df))
    docs_per_topic_pertopic = docs_df.groupby(['Small_Cluster'], as_index = False).agg({'Doc': ' '.join})
    docs_per_topic_pertopic['Big_Cluster'] = topic_id
    sub_cluster_frames.append(docs_per_topic_pertopic)

    # Distinct sub-cluster labels in this topic (the noise label -1, if present, is counted too).
    sub_cluster_length = sub_cluster_length + len(np.unique(internal_cluster.labels_))

big_df = pd.concat(sub_cluster_frames, ignore_index=True) if sub_cluster_frames else pd.DataFrame()

# Average over the number of topics; dividing by the last loop index was off
# by one and raised ZeroDivisionError when there was a single topic.
if len(docs_per_topic) > 0:
    print("Mean sub-cluster length:", sub_cluster_length / len(docs_per_topic))

In [None]:
# Change the order of the columns: move the last column to the front
# (the loop above adds 'Big_Cluster' last to each per-topic frame).
cols = big_df.columns.tolist()

cols = cols[-1:] + cols[:-1]

all_clusters_df = big_df[cols]  # selecting with a list of names returns the columns in that order

all_clusters_df

In [None]:
# Example input: the requirement sentences of a new project (one per line),
# kept as a single string; the cells below match it against the existing topics.
new_sentence = """The system must be able to record a playing movie.
The system must be able to return to the menu when a movie is playing.
The system must be able to provide an option menu for a selected movie.
The system must be able to provide the ability to select subtitles for a selected movie.
The system must be able to provide a list of TV channels.
The user must be able to select a TV channel.
The system must be able to provide a list of program categories for a selected TV channel.
The system must be able to provide the weekly program for a selected channel.
The system must be able to project a selected program.
The system must be able to pause and stop a playing program.
The system must be able to forward and rewind a playing program.
The system must be able to record a playing program.
"""

from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tf_idf(new_sentence, existing_documents):
    """Score the words of a new text against an existing corpus with TF-IDF.

    Parameters
    ----------
    new_sentence : str
        Text to score.
    existing_documents : list of str
        Corpus the vectorizer is fitted on together with ``new_sentence``.

    Returns
    -------
    tuple
        ``(new_sentence_tfidf, vectorizer)``: the TF-IDF row of the new text
        and the fitted ``TfidfVectorizer``.
    """
    tfidf_model = TfidfVectorizer(ngram_range=(1, 1), stop_words="english")

    # The new text goes last, so its row is the last row of the fitted matrix.
    corpus = existing_documents + [new_sentence]
    weights = tfidf_model.fit_transform(corpus)

    return weights[-1], tfidf_model

# TF-IDF row of the new input, fitted together with all project documents.
new_sentence_tfidf, vectorizer = compute_tf_idf(new_sentence, all_docs)
feature_names = vectorizer.get_feature_names_out()

# argsort is ascending, so the last ten positions hold the ten best-scoring words.
dense_scores = new_sentence_tfidf.toarray()
top_n_words_indices = np.argsort(dense_scores)[0, -10:]  # Get indices of top 10 words

# NOTE: this rebinds `top_n_words` (previously the per-topic dictionary) to a
# list of (word, score) pairs for the new input, best score first.
top_n_words = []
for col in top_n_words_indices[::-1]:
    top_n_words.append((feature_names[col], new_sentence_tfidf[0, col]))

print("Top words in the new sentence:")
for word, tfidf_score in top_n_words:
    print(f"{word}: {tfidf_score}")

In [None]:
# Keep only the words (dropping the TF-IDF scores) of the new input's top
# terms. The list was previously built twice -- once by a comprehension whose
# result was thrown away, then again by an explicit loop.
new_list = [word for word, _ in top_n_words]

print(new_list)

In [None]:
print("The clusters: ")
print(original_top_n_words)
print("The top words of the new input:",new_list)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Find the topic whose top words are semantically closest to the new input.

# Initialize variables to track the most similar item
most_similar_item = None
assigned_cluster_id = None  # stays None when there is no topic to compare against
highest_similarity_score = -1

# The new input's embedding is the same for every comparison, so encode it once
# instead of once per row. encode() is left at its default NumPy output, which
# scikit-learn's cosine_similarity accepts directly; a torch tensor living on a
# GPU cannot be converted implicitly.
given_list_embedding = model.encode(" ".join(new_list)).reshape(1, -1)

# NOTE(review): the outlier topic (-1), if present in original_top_n_words, is
# a candidate as well -- confirm whether it should be excluded.
for index, row in original_top_n_words.iterrows():
    new_new_list = row['New']

    # Embed this topic's top words as one sentence, shaped (1, dim) for cosine_similarity
    new_list_embedding = model.encode(" ".join(new_new_list)).reshape(1, -1)

    # Compute the cosine similarity between the embeddings
    similarity_score = cosine_similarity(given_list_embedding, new_list_embedding)[0][0]

    # Strict '>' keeps the first topic when several share the best score
    if similarity_score > highest_similarity_score:
        highest_similarity_score = similarity_score
        most_similar_item = row['New']
        assigned_cluster_id = row['Key']

print("Most similar item:", most_similar_item)
print("Assigned Cluster id:", assigned_cluster_id)

In [None]:
# Select the projects whose topic equals the one assigned to the new input.
target_topic = int(assigned_cluster_id)
same_topic_mask = docs_with_topic['Topic'] == target_topic
filtered_df = docs_with_topic.loc[same_topic_mask]

# Doc_ID is the project's position in all_docs.
doc_ids = filtered_df['Doc_ID'].tolist()
print("Similar projects are the projects with ids:", doc_ids)