# Setup

In [1]:
!pip install -U -q transformers bitsandbytes accelerate sentence-transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.3/215.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.3 MB/s[0m e

In [17]:
from sentence_transformers import SentenceTransformer, util

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

from sklearn.cluster import KMeans

from datasets import load_dataset

from math import sqrt

## Create the sentence-embedder

In [3]:
class SentenceEncoder():
    """Thin wrapper around a ``SentenceTransformer`` embedding model.

    Args:
        model: Model name or path handed to ``SentenceTransformer``.
        device: Device string forwarded to ``SentenceTransformer``
            (defaults to ``'cuda'``).
    """

    def __init__(self, model='sentence-transformers/all-MiniLM-L12-v2', device='cuda'):
        # `device` has to be passed by keyword: the second positional
        # parameter of SentenceTransformer is `modules`, so a positional call
        # would not apply the requested device.
        self.model = SentenceTransformer(model, device=device)

    def encode(self, sentences, convert_to_tensor=True):
        """Embed ``sentences`` with the wrapped model.

        Args:
            sentences: Input forwarded unchanged to ``self.model.encode``.
            convert_to_tensor: Forwarded to ``self.model.encode``.

        Returns:
            Whatever ``self.model.encode`` returns for these arguments.
        """
        return self.model.encode(sentences, convert_to_tensor=convert_to_tensor)

## Method to set up the summarizer pipeline

In [4]:
def create_summarizer_pipeline(model_name="pszemraj/long-t5-tglobal-xl-16384-book-summary-8bit"):
  """Build a Hugging Face summarization pipeline.

  Args:
    model_name: Checkpoint name or path used for both the tokenizer and the
      seq2seq model. Defaults to the long-T5 book-summary 8-bit checkpoint.

  Returns:
    The object returned by ``pipeline(task="summarization", ...)``.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  # The loading options belong on from_pretrained: `device_map` / `model_kwargs`
  # given to pipeline() are only used when the pipeline loads the model from a
  # name, and are not applied to a model object that is already instantiated.
  model = AutoModelForSeq2SeqLM.from_pretrained(
      model_name,
      device_map="auto",
      load_in_8bit=True,
  )
  return pipeline(task="summarization", model=model, tokenizer=tokenizer)

# Clustering Algorithms

## kmeans

In [11]:
def cluster_k_means(corpus, corpus_embeddings, num_clusters, random_state=None):
  """Group sentences and their embeddings by K-means cluster.

  Args:
    corpus: Sequence of sentences, aligned index-by-index with
      ``corpus_embeddings``.
    corpus_embeddings: One embedding per sentence; passed to ``KMeans.fit``.
    num_clusters: Number of clusters to fit.
    random_state: Optional seed forwarded to ``KMeans`` so a run can be
      reproduced. ``None`` (the default) keeps the unseeded behaviour.

  Returns:
    A tuple ``(clustered_sentences, clustered_embeddings)``: two lists of
    ``num_clusters`` lists, where position ``k`` holds the sentences and the
    embeddings assigned to cluster ``k``.

  Raises:
    ValueError: If ``corpus`` and ``corpus_embeddings`` differ in length.
  """
  if len(corpus) != len(corpus_embeddings):
    raise ValueError(
        "corpus and corpus_embeddings must have the same length, got "
        f"{len(corpus)} and {len(corpus_embeddings)}"
    )

  clustering_model = KMeans(n_clusters=num_clusters, random_state=random_state)
  clustering_model.fit(corpus_embeddings)

  clustered_sentences = [[] for _ in range(num_clusters)]
  clustered_embeddings = [[] for _ in range(num_clusters)]
  # labels_ holds one cluster id per input row, in input order.
  for sentence, embedding, cluster_id in zip(corpus, corpus_embeddings, clustering_model.labels_):
    clustered_sentences[cluster_id].append(sentence)
    clustered_embeddings[cluster_id].append(embedding)
  return clustered_sentences, clustered_embeddings

# Quora Duplicate Questions Dataset

In [None]:
def get_quora_questions_slice(slice_n):
  """Load the Quora dataset's train split and return its first rows.

  Args:
    slice_n: Upper bound of the slice taken from the start of the split.

  Returns:
    The result of slicing the loaded split with ``[:slice_n]``.
  """
  quora_dataset = load_dataset("quora", split="train")
  return quora_dataset[:slice_n]

In [6]:
slice_n = 10000
# Load the train split in this cell: `quora_dataset` is otherwise only a local
# variable inside a function, so a fresh kernel would raise NameError here.
quora_dataset = load_dataset("quora", split="train")
slice_quora_dataset = quora_dataset[:slice_n]

In [7]:
# Collect the first two 'text' entries of every record into one flat list,
# keeping record order (entry 0, then entry 1).
questions = [
    record['text'][position]
    for record in slice_quora_dataset['questions']
    for position in (0, 1)
]

In [8]:
# Build the encoder with its default model and device arguments.
st = SentenceEncoder()

Downloading (…)5dded/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)4d81d5dded/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)81d5dded/config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ded/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5dded/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

Downloading (…)dded/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)4d81d5dded/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1d5dded/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [10]:
# Embed every collected question; `encode` defaults to convert_to_tensor=True.
embed_questions = st.encode(questions)

In [18]:
# Cluster count: integer part of the square root of the number of questions.
num_clusters = int(sqrt(len(questions)))

In [19]:
# Group the questions by K-means cluster; the embeddings are moved to the CPU
# before being handed to the clustering helper.
clustered_sentences, clustered_embeddings = cluster_k_means(
    corpus=questions,
    corpus_embeddings=embed_questions.cpu(),
    num_clusters=num_clusters,
)



In [16]:
# Untruncated square root of the number of questions (int() of this value is
# what the cluster count uses).
sqrt(len(questions))

141.4213562373095

In [20]:
# Display the sentences assigned to the first cluster.
clustered_sentences[0]

['What is the right etiquette for wishing a Jehovah Witness happy birthday?',
 'Who are the Rohingya Muslims?',
 'Why are there so many Christians in Kerala?',
 'How can I convince people to stop blaspheming against the Holy Spirit?',
 'Is circumcision allowed in Islam?',
 'Can a Muslim guy date a non Muslim girl? Is it based on how religious the individual is or are there other factors to it?',
 'What do non-Muslims expect from Muslims?',
 'Why did the Umayyad Caliphate fall? What could have been done to avoid his fall?',
 'History of Islam: Why did the Umayyad Caliphate invade Christian territory?',
 'Why is Persian word “ذليل” meaning ‘a Muslim’ translated incorrectly on Microsoft Translator? Is it intentional or careless?',
 'If the whole world were to follow only one religion, would it be more peaceful?',
 'What is the difference between Shia and Shiite?',
 'In what way is the Ismaili sect different from the other Shia groups?',
 'What is the appropriate way to wish a Muslim frien