In [1]:
def makePredictions(list_of_text, model):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    
    import torch
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    
    # padding = True vs padding = "max_length"
    sample = tokenizer(list_of_text, padding=True, truncation=True)

    sample['input_ids'] = torch.Tensor(sample['input_ids']).to(torch.int64)
    sample['token_type_ids'] = torch.Tensor(sample['token_type_ids']).to(torch.int64)
    sample['attention_mask'] = torch.Tensor(sample['attention_mask']).to(torch.int64)

    batch = {k: v.to(device) for k, v in sample.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = torch.argmax(outputs.logits, dim=-1)
    return predictions  

In [2]:
def getEmbeddings(text, model, tokenizer):
    
    import torch
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#     device = torch.device("cpu")
    model.to(device)
    # padding = True vs padding = "max_length"

    sample = tokenizer(text, padding=True, truncation=True)

    sample['input_ids'] = torch.Tensor(sample['input_ids']).to(torch.int64)
    sample['token_type_ids'] = torch.Tensor(sample['token_type_ids']).to(torch.int64)
    sample['attention_mask'] = torch.Tensor(sample['attention_mask']).to(torch.int64)

    batch = {k: v.to(device) for k, v in sample.items()}
    with torch.no_grad():
        outputs = model.bert(**batch)
    
    return outputs.pooler_output

In [43]:
# Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)

def updateDataFrame(csv_file_name, model_max_length=500):
    import pandas as pd
    data = pd.read_csv(csv_file_name)
    data = data.drop(columns=['Unnamed: 0'])
    
    def split_sentences(list_of_words):
        chunked_list = list()
        chunk_size = model_max_length
        for i in range(0, len(list_of_words), chunk_size):
            chunked_list.append(list_of_words[i:i+chunk_size])

        return chunked_list

    for idx in range(len(data['data_string'])):
#         print(data['data_string'][idx])
#         data['data_category_number'][idx] = data['data_category_number'][idx] - 1
        data['data_category_number'][idx] = data['data_category_number'][idx]
        if len(str(data['data_string'][idx]).split()) > 500:
#             print(True)
            tempString = data['data_string'][idx]
            tempStringSplit = tempString.split()
            chunkedLists = split_sentences(tempStringSplit)
        
            for sentence in chunkedLists:
                tempSentence = " ".join(sentence)
                data.loc[len(data.index)] = [
                                            data['data_id'][idx], 
                                             tempSentence,
                                            data['2d_coor'][idx],
                                            data['data_title'][idx],
                                            data['data_category'][idx],
                                            data['data_category_number'][idx],
                                            ]
    return data

In [48]:
def fineTuneModel(df, number_of_labels, number_of_epochs=3):
    from datasets import load_dataset, Dataset
    #     dataset = load_dataset('csv', data_files='Care_Reviews.csv', split='train')
    df = df.dropna()
    dataset = Dataset.from_pandas(df)
    #    dataset = dataset.remove_columns(['__index_level_0__'])

    from datasets import DatasetDict
    
    train_testvalid = dataset.train_test_split()
    test_valid = train_testvalid['test'].train_test_split()
    
    train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
    
    dataset = train_test_valid_dataset.remove_columns(['data_id', '2d_coor', 'data_title','data_category'])
    
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


    def tokenize_function(examples):
    # refer documentation: padding=True or padding="max_length"
        return tokenizer(examples["data_string"], padding=True, truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["valid"]
    test_dataset = tokenized_datasets['test']

    tokenized_datasets = tokenized_datasets.remove_columns(["data_string"])
    
#     tokenized_datasets = tokenized_datasets.remove_columns(["data_string"])
    tokenized_datasets = tokenized_datasets.rename_column("data_category_number", "labels")
    
    tokenized_datasets.set_format("torch")
    
    small_train_dataset = tokenized_datasets["train"]
    small_eval_dataset = tokenized_datasets["test"]
#     print(small_eval_dataset)
    
    from torch.utils.data import DataLoader
    train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
    eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)
    
    from transformers import AutoModelForSequenceClassification
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=number_of_labels)
    
    from torch.optim import AdamW
    optimizer = AdamW(model.parameters(), lr=5e-5)
    
    from transformers import get_scheduler

    num_epochs = number_of_epochs
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    
    import torch

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#     device = torch.device("cpu")
    model.to(device)
    
    from tqdm.auto import tqdm
    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            print(batch)
            outputs = model(**batch)
            print(outputs)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
#             break
#         break
            
    from datasets import load_metric
    metric = load_metric("accuracy")
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    final_score = metric.compute()
    return model, final_score

In [49]:
finetuneBertSeqModelWithCustomDataset(input_file_name='test_lda.csv', 
                                         model_max_length=500,
                                         number_of_labels=20,
                                         number_of_epochs=5,
                                         output_file_name="sample_debug.json")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,15549400,Floral visitors vary in their pollination effi...,,Effects of low-efficiency pollinators on plant...,"0.004*""mood"" + 0.003*""backfilling"" + 0.002*""in...",4.0
1,25451119,This experiment investigated how the esthetic ...,,Electrophysiological brain dynamics during the...,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
2,27236075,Women with high body dissatisfaction look less...,,Take a look at the bright side: Effects of pos...,"0.004*""parallel"" + 0.003*""work"" + 0.003*""herba...",9.0
3,15010496,We have used the technique of functional MRI t...,,Neural correlates of beauty.,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
4,18648595,Three aspects of hormesis with low doses of io...,,"Radiation hormesis: the good, the bad, and the...","0.005*""failure"" + 0.004*""pollinators"" + 0.003*...",5.0
...,...,...,...,...,...,...
95,9874951,Children with severe and profound disabilities...,,Children with severe and profound disabilities...,"0.003*""disabilities"" + 0.003*""aesthetic"" + 0.0...",3.0
96,18794733,As our ageing population demands to maintain y...,,"""Ethics in aesthetic nursing...avoiding the ug...","0.004*""west"" + 0.003*""nonsurgical"" + 0.002*""ae...",1.0
97,3984466,This study reviews the cases of 49 patients wi...,,[Subcutaneous tenotomy of the sternocleidomast...,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
98,30231330,Acute myeloid leukemia (AML) was initially sub...,,"Acute Myeloid Leukemia: The Good, the Bad, and...","0.004*""mood"" + 0.003*""backfilling"" + 0.002*""in...",4.0


Dataframe with NaN removed: 





100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 68.36ba/s][A[A


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 360.12ba/s][A[A


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 227.85ba/s][A[A
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassificatio

{'labels': tensor([11.,  2., 17.,  3.,  3.,  4., 15.,  2.]), 'input_ids': tensor([[  101,  5689,   156,  ...,     0,     0,     0],
        [  101,  1109,  2656,  ...,     0,     0,     0],
        [  101,  1188,  1692,  ...,     0,     0,     0],
        ...,
        [  101,  1109, 12641,  ...,     0,     0,     0],
        [  101, 13197,  5295,  ...,     0,     0,     0],
        [  101,  4503, 11432,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 20]))

In [5]:
def get_json_file(data, model, output_file_name="something.json"):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    
    embedding_list = []
    for idx in range(len(data['data_string'])):
        if type(data['data_string'][idx]) is float:
            continue
        embed = getEmbeddings([data['data_string'][idx]], model, tokenizer).tolist()[0]
        embedding_list.append(embed)
    
    import numpy as np
    embeddings_for_umap = np.array(embedding_list)
    
    import umap.umap_ as umap
    umap_embedding = umap.UMAP().fit_transform(embeddings_for_umap, y=list(data['data_category_number']))
    
    data['2d_coor'] = umap_embedding.tolist()
    
    list_of_points = []
    for idx in range(len(data['data_string'])):
        tmp_dict = {}
    #     tmp_var = data_df_china_news['2d_coor'][idx].strip('][').split(', ')
        tmp_dict["data_x"] = str(data['2d_coor'][idx][0])
        tmp_dict["data_y"] = str(data['2d_coor'][idx][1])
    #     tmp_dict["data_x"] = str(tmp_var[0])
    #     tmp_dict["data_y"] = str(tmp_var[1])
        tmp_dict["data_category_number"] = str(data['data_category_number'][idx])
        tmp_dict["data_id"] = str(data['data_id'][idx])
#         tmp_dict["data_string"] = str(data['data_string'][idx])
        tmp_dict["data_title"] = str(data['data_title'][idx])
        tmp_dict["data_category"] = str(data['data_category'][idx])

        list_of_points.append(tmp_dict)
        
    import json
    with open(output_file_name, "w") as outfile:
        json.dump(list_of_points, outfile)
        
    return data

In [6]:
def train_masked_bert(data, num_epochs=2, number_of_labels=5):
    from transformers import AutoTokenizer, BertForMaskedLM
    import torch

    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    model = BertForMaskedLM.from_pretrained('bert-base-cased')
    
    data = data.dropna()
    display(data)
    inputs = tokenizer(list(data['data_string']), return_tensors='pt', padding=True, truncation=True)
    
    inputs['labels'] = inputs.input_ids.detach().clone()
    
    # create random array of floats with equal dimensions to input_ids tensor
    rand = torch.rand(inputs.input_ids.shape)
    # create mask array
    mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
    
    selection = []

    for i in range(inputs.input_ids.shape[0]):
        selection.append(
            torch.flatten(mask_arr[i].nonzero()).tolist()
        )
    
    for i in range(inputs.input_ids.shape[0]):
        inputs.input_ids[i, selection[i]] = 103
        
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings):
            self.encodings = encodings
        def __getitem__(self, idx):
            return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        def __len__(self):
            return len(self.encodings.input_ids)
        
    dataset = CustomDataset(inputs)
    
    loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)
    
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # device = torch.device('cpu')
    # and move our model over to the selected device
    model.to(device)
    # activate training mode
    model.train()
    
    from transformers import AdamW
    # initialize optimizer
    optim = AdamW(model.parameters(), lr=5e-5)
    
    from tqdm import tqdm  # for our progress bar

    epochs = num_epochs

    for epoch in range(epochs):
        # setup loop with TQDM and dataloader
        loop = tqdm(loader, leave=True)
        for batch in loop:
            # initialize calculated gradients (from prev step)
            optim.zero_grad()
            # pull all tensor batches required for training
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # process
            outputs = model(input_ids, attention_mask=attention_mask,
                            labels=labels)
            # extract loss
            loss = outputs.loss
            # calculate loss for every parameter that needs grad update
            loss.backward()
            # update parameters
            optim.step()
            # print relevant info to progress bar
            loop.set_description(f'Epoch {epoch}')
            loop.set_postfix(loss=loss.item())
            torch.cuda.empty_cache()
#             break
#         break
            
    model.save_pretrained('pytorch_model_unsupervised_finetuned')
    return None

In [35]:
def fineTuneModelUnsupervised(df, number_of_labels=19, number_of_epochs=3):
    from datasets import load_dataset, Dataset
    #     dataset = load_dataset('csv', data_files='Care_Reviews.csv', split='train')
    df = df.dropna()
    display(df)
    dataset = Dataset.from_pandas(df)
    #dataset = dataset.remove_columns(['__index_level_0__'])

    from datasets import DatasetDict
    
    train_testvalid = dataset.train_test_split()
    test_valid = train_testvalid['test'].train_test_split()
    
    train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
    
    dataset = train_test_valid_dataset.remove_columns(['data_id', '2d_coor', 'data_title','data_category'])
    
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


    def tokenize_function(examples):
    # refer documentation: padding=True or padding="max_length"
        return tokenizer(examples["data_string"], padding=True, truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["valid"]
    test_dataset = tokenized_datasets['test']

    tokenized_datasets = tokenized_datasets.remove_columns(["data_string"])
    
#     tokenized_datasets = tokenized_datasets.remove_columns(["data_string"])
    tokenized_datasets = tokenized_datasets.rename_column("data_category_number", "labels")
    
    tokenized_datasets.set_format("torch")
    
    small_train_dataset = tokenized_datasets["train"]
    small_eval_dataset = tokenized_datasets["test"]
#     print(small_eval_dataset)
    
    from torch.utils.data import DataLoader
    train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
    eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)
    
    from transformers import AutoModelForSequenceClassification
    model = AutoModelForSequenceClassification.from_pretrained("pytorch_model_unsupervised_finetuned", num_labels=number_of_labels)
    
    from torch.optim import AdamW
    optimizer = AdamW(model.parameters(), lr=5e-5)
    
    from transformers import get_scheduler

    num_epochs = number_of_epochs
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    
    import torch

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    
    from tqdm.auto import tqdm
    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
#             print(outputs)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
#             break
#         break
            
    from datasets import load_metric
    metric = load_metric("accuracy")
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    final_score = metric.compute()
    return model, final_score

In [8]:
# Below are functions to delete files and directories (you will not be able to delete directories/files
# directly from the Jupyter Notebook UI)

In [9]:
import shutil
dir_name = 'pytorch_model_unsupervised_finetuned'
shutil.rmtree(dir_name)

FileNotFoundError: [Errno 2] No such file or directory: 'pytorch_model_unsupervised_finetuned'

In [9]:
# use the space below to play (call) with functions initialized above

In [19]:
import pandas as pd
pd.set_option('display.max_columns', None)

data = pd.read_json('pubmed_1k.json')
display(data)


data = data.rename(columns={2:'data_title', 5:'data_string',0:'data_id'})
data
data = data.drop(columns=[1,3,4,6])
data
data['2d_coor'] = ''
data['data_category'] = ''
data['data_category_number'] = ''
data = data.reindex(columns=['data_id', 'data_string', '2d_coor', 'data_title', 'data_category', 'data_category_number'])
data.dropna()
display(data)
data.to_csv('test.csv')

Unnamed: 0,0,1,2,3,4,5,6
0,15549400,journal,Effects of low-efficiency pollinators on plant...,[],2004-12-08,Floral visitors vary in their pollination effi...,
1,25451119,journal,Electrophysiological brain dynamics during the...,[],2015-01-12,This experiment investigated how the esthetic ...,
2,27236075,journal,Take a look at the bright side: Effects of pos...,[],2016-08-04,Women with high body dissatisfaction look less...,
3,15010496,journal,Neural correlates of beauty.,[],2004-04-07,We have used the technique of functional MRI t...,
4,18648595,journal,"Radiation hormesis: the good, the bad, and the...",[],2006-09-27,Three aspects of hormesis with low doses of io...,
...,...,...,...,...,...,...,...
95,9874951,journal,Children with severe and profound disabilities...,[],1998-12-28,Children with severe and profound disabilities...,
96,18794733,journal,"""Ethics in aesthetic nursing...avoiding the ug...",[],2008-10-16,As our ageing population demands to maintain y...,
97,3984466,journal,[Subcutaneous tenotomy of the sternocleidomast...,[],1985-02-15,This study reviews the cases of 49 patients wi...,
98,30231330,journal,"Acute Myeloid Leukemia: The Good, the Bad, and...",[],,Acute myeloid leukemia (AML) was initially sub...,


Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,15549400,Floral visitors vary in their pollination effi...,,Effects of low-efficiency pollinators on plant...,,
1,25451119,This experiment investigated how the esthetic ...,,Electrophysiological brain dynamics during the...,,
2,27236075,Women with high body dissatisfaction look less...,,Take a look at the bright side: Effects of pos...,,
3,15010496,We have used the technique of functional MRI t...,,Neural correlates of beauty.,,
4,18648595,Three aspects of hormesis with low doses of io...,,"Radiation hormesis: the good, the bad, and the...",,
...,...,...,...,...,...,...
95,9874951,Children with severe and profound disabilities...,,Children with severe and profound disabilities...,,
96,18794733,As our ageing population demands to maintain y...,,"""Ethics in aesthetic nursing...avoiding the ug...",,
97,3984466,This study reviews the cases of 49 patients wi...,,[Subcutaneous tenotomy of the sternocleidomast...,,
98,30231330,Acute myeloid leukemia (AML) was initially sub...,,"Acute Myeloid Leukemia: The Good, the Bad, and...",,


In [20]:
### Step 1 ###

# This function will create a csv in a format (mostly changing columns names) that we need for training models
# The following columns are needed: data_category_number, data_title, data_string, data_category, data_id, 2d_coor

# Note: This function will have to be modified according to the need as not all datasets have labels but above
# mentioned columns should be there
def create_structured_csv(csv_file_name):
    import pandas as pd
    data = pd.read_csv(csv_file_name)
    #data = data.drop(columns=['Unnamed: 0', 'year'])
    data = data.rename(columns={'seq':'data_category_number', 'title':'data_title', 'abstract':'data_string', 'CODE':'data_category','id':'data_id'})
    data['2d_coor'] = ''
    data = data.reindex(columns=['data_id', 'data_string', '2d_coor', 'data_title', 'data_category', 'data_category_number'])
    data.to_csv('test_cleaned.csv')
    return data

csv_file_name = 'test.csv'
data = create_structured_csv(csv_file_name)

In [21]:
### Step 2 ###

# Not all the datasets at DSC are labelled. Hence, we need to label some and we use LDA for that
def apply_lda_on_dataset(df):
    import gensim
    from gensim.utils import simple_preprocess
    from gensim.parsing.preprocessing import STOPWORDS
#     from nltk.stem.porter import *
    from gensim import corpora, models
    import numpy as np
    import pandas as pd
    np.random.seed(2018)

    import nltk
    nltk.download('wordnet')

    # Function that preprocesses all text documents before feeding to lda model
    def preprocess(text):
        result = []
        i = 0
        for token in gensim.utils.simple_preprocess(text):
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
                result.append(token)
        return result
    
    # the file name should be the one generated from the function above     
#     data = pd.read_csv('Care_Reviews.csv')
    data = df
    documents = data
    documents = documents.dropna(subset=['data_string'])
    processed_docs = documents['data_string'].map(preprocess)
    
    # Creates a dictionary from the documents (Note: Here the argument 'preprocessed_docs' is a 'list of lists')
    dictionary = gensim.corpora.Dictionary(processed_docs)
    
    # Creates a bag_of_words corpus     
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    # Creates a tfidf matrix/table required for training
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]
    
    # Trains an lda model with tfidf
    lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=20, id2word=dictionary, passes=2, workers=4)
    
    for idx in range(len(data['data_string'])):
        data_string = data['data_string'][idx]
#         print(type(data_string))
#         print(dictionary.doc2bow(preprocess(data_string)))
#         print(lda_model_tfidf.get_document_topics())
        data_string_topic_no = lda_model_tfidf.get_document_topics(dictionary.doc2bow(preprocess(data_string)))[0][0]
#         print(data_string_topic_no)
        data_string_topic = lda_model_tfidf.print_topic(data_string_topic_no)
        data['data_category'][idx] = data_string_topic
        data_string_topic_num = lda_model_tfidf.get_document_topics(dictionary.doc2bow(preprocess(data_string)))[0][0]
        data['data_category_number'][idx] = data_string_topic_num
        
    return data

In [24]:
data = apply_lda_on_dataset(data)

[nltk_data] Downloading package wordnet to /Users/ez/nltk_data...
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category'][idx] = data_string_topic
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data_string_topic_num


In [27]:
data.to_csv('test_lda.csv')

In [45]:
# The following function is responsible for fine-tuning an existing Bert Model (from huggingface) with a DSC dataset
def finetuneBertSeqModelWithCustomDataset(input_file_name='test_lda.csv', 
                                         model_max_length=500,
                                         number_of_labels=15,
                                         number_of_epochs=15,
                                         output_file_name="something.json"):
    # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)     
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    #data = data.drop(['Unnamed: 0']
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    #display(data)
    
    # Function that returns a fine-tuned model (fine-tuned on DSC dataset) and its score
    model, score = fineTuneModel(df=data, number_of_labels=number_of_labels, number_of_epochs=number_of_epochs)
    
    print('Here\s the fine-tuned model: ', model)
    print('Accuracy of the fine-tuned model on the test dataset is: ', score)
    
    # Function that returns the dataframe with embeddings (UMAP reduces high dimensional embedding to 2D)     
    data_with_embeddings = get_json_file(data, model, output_file_name)

In [46]:
finetuneBertSeqModelWithCustomDataset(input_file_name='test_lda.csv', 
                                         model_max_length=500,
                                         number_of_labels=20,
                                         number_of_epochs=5,
                                         output_file_name="sample_debug.json")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,15549400,Floral visitors vary in their pollination effi...,,Effects of low-efficiency pollinators on plant...,"0.004*""mood"" + 0.003*""backfilling"" + 0.002*""in...",4.0
1,25451119,This experiment investigated how the esthetic ...,,Electrophysiological brain dynamics during the...,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
2,27236075,Women with high body dissatisfaction look less...,,Take a look at the bright side: Effects of pos...,"0.004*""parallel"" + 0.003*""work"" + 0.003*""herba...",9.0
3,15010496,We have used the technique of functional MRI t...,,Neural correlates of beauty.,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
4,18648595,Three aspects of hormesis with low doses of io...,,"Radiation hormesis: the good, the bad, and the...","0.005*""failure"" + 0.004*""pollinators"" + 0.003*...",5.0
...,...,...,...,...,...,...
95,9874951,Children with severe and profound disabilities...,,Children with severe and profound disabilities...,"0.003*""disabilities"" + 0.003*""aesthetic"" + 0.0...",3.0
96,18794733,As our ageing population demands to maintain y...,,"""Ethics in aesthetic nursing...avoiding the ug...","0.004*""west"" + 0.003*""nonsurgical"" + 0.002*""ae...",1.0
97,3984466,This study reviews the cases of 49 patients wi...,,[Subcutaneous tenotomy of the sternocleidomast...,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
98,30231330,Acute myeloid leukemia (AML) was initially sub...,,"Acute Myeloid Leukemia: The Good, the Bad, and...","0.004*""mood"" + 0.003*""backfilling"" + 0.002*""in...",4.0


Dataframe with NaN removed: 




100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 71.73ba/s][A

100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 292.55ba/s][A

100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 187.69ba/s][A
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the c

ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 20]))

In [41]:
# The following function is responsible for pretraining a masked bert model and fine-tuning the same pretrained model 
# (from huggingface) with a DSC dataset
def finetuneBertModelAfterPretrainingOfMaskedBertWithCustomDataset(input_file_name='ultra_clean_abstract_with_text.csv', 
                                         model_max_length=500,
                                         number_of_labels=19,
                                         number_of_epochs_for_masked_bert=2,
                                         number_of_epochs_for_finetuning_masked_bert=5                   
                                         output_file_name="something.json"):
    
    # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Function that pretrains a masked bert model and saves that model in a directory: 'pytorch_model_unsupervised_finetuned'
    train_masked_bert(data, num_epochs=number_of_epochs_for_masked_bert, number_of_labels=number_of_labels)
    
    # Function that fine-tunes the above pretrained masked bert model
    new_model, score = fineTuneModelUnsupervised(df, number_of_labels=number_of_labels, number_of_epochs=number_of_epochs_for_finetuning_masked_bert)
    
    print('Here\s the fine-tuned model: ', model)
    print('Accuracy of the fine-tuned model on the test dataset is: ', score)
    
    # Function that returns the dataframe with embeddings (UMAP reduces high dimensional embedding to 2D)     
    data_with_embeddings = get_json_file(data, model, output_file_name)

SyntaxError: invalid syntax (3938428121.py, line 8)

In [50]:
def UMAPWithCustomDataset(input_file_name='Care_Reviews.csv', 
                        model_max_length=384,
                        output_json_file_name='test.json',
                         use_labels=True,
                         sentence_transformer_name='sentence-transformers/all-mpnet-base-v2'):
     # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Using a bert model from sentence_transformers to generate embeddings
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(sentence_transformer_name)
    sentences = data['data_string']
    embeddings_for_umap = model.encode(sentences)
    
    # Reducing the dimensionality of embeddings with UMAP
    import umap.umap_ as umap
    umap_embedding = umap.UMAP().fit_transform(embeddings_for_umap, y=list(data['data_category_number']) if use_labels else None)
    
    data['2d_coor'] = umap_embedding.tolist()
    display(data)
    
    list_of_points = []
    for idx in range(len(data['data_string'])):
        tmp_dict = {}
        tmp_dict["data_x"] = str(data['2d_coor'][idx][0])
        tmp_dict["data_y"] = str(data['2d_coor'][idx][1])
        tmp_dict["data_category_number"] = str(data['data_category_number'][idx])
        tmp_dict["data_id"] = str(data['data_id'][idx])

        tmp_dict["data_title"] = str(data['data_title'][idx])
        tmp_dict["data_category"] = str(data['data_category'][idx])

        list_of_points.append(tmp_dict)
        
    import json
    with open(output_json_file_name, "w") as outfile:
        json.dump(list_of_points, outfile)
        
    return data

In [53]:
d = UMAPWithCustomDataset(input_file_name='test_cleaned.csv', 
                        model_max_length=384,
                        output_json_file_name='test_umap.json',
                         use_labels=True)
d

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,15549400,Floral visitors vary in their pollination effi...,,Effects of low-efficiency pollinators on plant...,,
1,25451119,This experiment investigated how the esthetic ...,,Electrophysiological brain dynamics during the...,,
2,27236075,Women with high body dissatisfaction look less...,,Take a look at the bright side: Effects of pos...,,
3,15010496,We have used the technique of functional MRI t...,,Neural correlates of beauty.,,
4,18648595,Three aspects of hormesis with low doses of io...,,"Radiation hormesis: the good, the bad, and the...",,
...,...,...,...,...,...,...
95,9874951,Children with severe and profound disabilities...,,Children with severe and profound disabilities...,,
96,18794733,As our ageing population demands to maintain y...,,"""Ethics in aesthetic nursing...avoiding the ug...",,
97,3984466,This study reviews the cases of 49 patients wi...,,[Subcutaneous tenotomy of the sternocleidomast...,,
98,30231330,Acute myeloid leukemia (AML) was initially sub...,,"Acute Myeloid Leukemia: The Good, the Bad, and...",,


Dataframe with NaN removed: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,15549400,Floral visitors vary in their pollination effi...,,Effects of low-efficiency pollinators on plant...,,
1,25451119,This experiment investigated how the esthetic ...,,Electrophysiological brain dynamics during the...,,
2,27236075,Women with high body dissatisfaction look less...,,Take a look at the bright side: Effects of pos...,,
3,15010496,We have used the technique of functional MRI t...,,Neural correlates of beauty.,,
4,18648595,Three aspects of hormesis with low doses of io...,,"Radiation hormesis: the good, the bad, and the...",,
...,...,...,...,...,...,...
95,9874951,Children with severe and profound disabilities...,,Children with severe and profound disabilities...,,
96,18794733,As our ageing population demands to maintain y...,,"""Ethics in aesthetic nursing...avoiding the ug...",,
97,3984466,This study reviews the cases of 49 patients wi...,,[Subcutaneous tenotomy of the sternocleidomast...,,
98,30231330,Acute myeloid leukemia (AML) was initially sub...,,"Acute Myeloid Leukemia: The Good, the Bad, and...",,


  0%|                                                    | 0/50 [40:36<?, ?it/s]
  0%|                                                    | 0/50 [14:31<?, ?it/s]
  0%|                                                    | 0/50 [07:41<?, ?it/s]


ContextualVersionConflict: (numpy 1.23.0rc3 (/Users/ez/miniconda3/envs/torch-nightly/lib/python3.8/site-packages), Requirement.parse('numpy<1.23.0,>=1.16.5'), {'scipy'})

In [54]:
def TSNEWithCustomDataset(input_file_name='Care_Reviews.csv', 
                        model_max_length=384,
                        output_json_file_name='test.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                         use_labels=True):
     # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Using a bert model from sentence_transformers to generate embeddings
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(sentence_transformer_name)
    sentences = data['data_string']
    embeddings_for_tsne = model.encode(sentences)
    
    # Reducing the dimensionality of embeddings with TSNE
    from sklearn.manifold import TSNE
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_results = tsne.fit_transform(embeddings_for_tsne, list(data['data_category_number']) if use_labels else None)
    
    data['2d_coor'] = tsne_results.tolist()
    display(data)
    
    list_of_points = []
    for idx in range(len(data['data_string'])):
        tmp_dict = {}
        tmp_dict["data_x"] = str(data['2d_coor'][idx][0])
        tmp_dict["data_y"] = str(data['2d_coor'][idx][1])
        tmp_dict["data_category_number"] = str(data['data_category_number'][idx])
        tmp_dict["data_id"] = str(data['data_id'][idx])

        tmp_dict["data_title"] = str(data['data_title'][idx])
        tmp_dict["data_category"] = str(data['data_category'][idx])

        list_of_points.append(tmp_dict)
        
    import json
    with open(output_json_file_name, "w") as outfile:
        json.dump(list_of_points, outfile)
        
    return data

In [21]:
d = TSNEWithCustomDataset(input_file_name='news_articles.csv', 
                        model_max_length=384,
                        output_json_file_name='news_articles_tsne.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                         use_labels=True)
d

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,,The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,,Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,,An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...",,Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,,Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Dataframe with NaN removed: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,,The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,,Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,,An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...",,Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,,Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4




[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 193492 samples in 0.069s...
[t-SNE] Computed neighbors for 193492 samples in 1197.803s...
[t-SNE] Computed conditional probabilities for sample 1000 / 193492
[t-SNE] Computed conditional probabilities for sample 2000 / 193492
[t-SNE] Computed conditional probabilities for sample 3000 / 193492
[t-SNE] Computed conditional probabilities for sample 4000 / 193492
[t-SNE] Computed conditional probabilities for sample 5000 / 193492
[t-SNE] Computed conditional probabilities for sample 6000 / 193492
[t-SNE] Computed conditional probabilities for sample 7000 / 193492
[t-SNE] Computed conditional probabilities for sample 8000 / 193492
[t-SNE] Computed conditional probabilities for sample 9000 / 193492
[t-SNE] Computed conditional probabilities for sample 10000 / 193492
[t-SNE] Computed conditional probabilities for sample 11000 / 193492
[t-SNE] Computed conditional probabilities for sample 12000 / 193492
[t-SNE] Computed conditional pro

[t-SNE] Computed conditional probabilities for sample 118000 / 193492
[t-SNE] Computed conditional probabilities for sample 119000 / 193492
[t-SNE] Computed conditional probabilities for sample 120000 / 193492
[t-SNE] Computed conditional probabilities for sample 121000 / 193492
[t-SNE] Computed conditional probabilities for sample 122000 / 193492
[t-SNE] Computed conditional probabilities for sample 123000 / 193492
[t-SNE] Computed conditional probabilities for sample 124000 / 193492
[t-SNE] Computed conditional probabilities for sample 125000 / 193492
[t-SNE] Computed conditional probabilities for sample 126000 / 193492
[t-SNE] Computed conditional probabilities for sample 127000 / 193492
[t-SNE] Computed conditional probabilities for sample 128000 / 193492
[t-SNE] Computed conditional probabilities for sample 129000 / 193492
[t-SNE] Computed conditional probabilities for sample 130000 / 193492
[t-SNE] Computed conditional probabilities for sample 131000 / 193492
[t-SNE] Computed con

Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,"[4.397582054138184, -3.5684354305267334]",The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,"[-0.3279964327812195, 8.61110782623291]",Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,"[4.169193744659424, -4.005739212036133]",An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...","[-2.864100933074951, -0.9488449692726135]",Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,"[0.2897432744503021, 10.175537109375]",Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,"[-5.589747428894043, 3.172562599182129]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...","[4.083289623260498, 9.9985933303833]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,"[4.8006367683410645, 9.934613227844238]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...","[4.684049606323242, 9.922048568725586]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,"[4.397582054138184, -3.5684354305267334]",The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,"[-0.3279964327812195, 8.61110782623291]",Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,"[4.169193744659424, -4.005739212036133]",An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...","[-2.864100933074951, -0.9488449692726135]",Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,"[0.2897432744503021, 10.175537109375]",Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,"[-5.589747428894043, 3.172562599182129]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...","[4.083289623260498, 9.9985933303833]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,"[4.8006367683410645, 9.934613227844238]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...","[4.684049606323242, 9.922048568725586]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


In [56]:
def PCAWithCustomDataset(input_file_name='Care_Reviews.csv', 
                        model_max_length=384,
                        output_json_file_name='test.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                        p_components=2,
                        use_labels=True):
     # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Using a bert model from sentence_transformers to generate embeddings
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(sentence_transformer_name)
    sentences = data['data_string']
    embeddings_for_pca = model.encode(sentences)
    
    # Reducing the dimensionality of embeddings with PCA
    from sklearn.decomposition import PCA

    pca_2d_class = PCA(n_components=p_components).fit(embeddings_for_pca, list(data['data_category_number']) if use_labels else None)

    pca_2d = pca_2d_class.transform(embeddings_for_pca)
    
    data['2d_coor'] = pca_2d.tolist()
    display(data)
    
    
    list_of_points = []
    for idx in range(len(data['data_string'])):
        tmp_dict = {}
        tmp_dict["data_x"] = str(data['2d_coor'][idx][0])
        tmp_dict["data_y"] = str(data['2d_coor'][idx][1])
        tmp_dict["data_category_number"] = str(data['data_category_number'][idx])
        tmp_dict["data_id"] = str(data['data_id'][idx])

        tmp_dict["data_title"] = str(data['data_title'][idx])
        tmp_dict["data_category"] = str(data['data_category'][idx])

        list_of_points.append(tmp_dict)
        
    import json
    with open(output_json_file_name, "w") as outfile:
        json.dump(list_of_points, outfile)
        
    return data

In [57]:
d = PCAWithCustomDataset(input_file_name='test_lda.csv', 
                        model_max_length=384,
                        output_json_file_name='test_pca.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                        p_components=2,
                        use_labels=True)
d

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,15549400,Floral visitors vary in their pollination effi...,,Effects of low-efficiency pollinators on plant...,"0.004*""mood"" + 0.003*""backfilling"" + 0.002*""in...",4.0
1,25451119,This experiment investigated how the esthetic ...,,Electrophysiological brain dynamics during the...,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
2,27236075,Women with high body dissatisfaction look less...,,Take a look at the bright side: Effects of pos...,"0.004*""parallel"" + 0.003*""work"" + 0.003*""herba...",9.0
3,15010496,We have used the technique of functional MRI t...,,Neural correlates of beauty.,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
4,18648595,Three aspects of hormesis with low doses of io...,,"Radiation hormesis: the good, the bad, and the...","0.005*""failure"" + 0.004*""pollinators"" + 0.003*...",5.0
...,...,...,...,...,...,...
95,9874951,Children with severe and profound disabilities...,,Children with severe and profound disabilities...,"0.003*""disabilities"" + 0.003*""aesthetic"" + 0.0...",3.0
96,18794733,As our ageing population demands to maintain y...,,"""Ethics in aesthetic nursing...avoiding the ug...","0.004*""west"" + 0.003*""nonsurgical"" + 0.002*""ae...",1.0
97,3984466,This study reviews the cases of 49 patients wi...,,[Subcutaneous tenotomy of the sternocleidomast...,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
98,30231330,Acute myeloid leukemia (AML) was initially sub...,,"Acute Myeloid Leukemia: The Good, the Bad, and...","0.004*""mood"" + 0.003*""backfilling"" + 0.002*""in...",4.0


Dataframe with NaN removed: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,15549400,Floral visitors vary in their pollination effi...,,Effects of low-efficiency pollinators on plant...,"0.004*""mood"" + 0.003*""backfilling"" + 0.002*""in...",4.0
1,25451119,This experiment investigated how the esthetic ...,,Electrophysiological brain dynamics during the...,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
2,27236075,Women with high body dissatisfaction look less...,,Take a look at the bright side: Effects of pos...,"0.004*""parallel"" + 0.003*""work"" + 0.003*""herba...",9.0
3,15010496,We have used the technique of functional MRI t...,,Neural correlates of beauty.,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
4,18648595,Three aspects of hormesis with low doses of io...,,"Radiation hormesis: the good, the bad, and the...","0.005*""failure"" + 0.004*""pollinators"" + 0.003*...",5.0
...,...,...,...,...,...,...
95,9874951,Children with severe and profound disabilities...,,Children with severe and profound disabilities...,"0.003*""disabilities"" + 0.003*""aesthetic"" + 0.0...",3.0
96,18794733,As our ageing population demands to maintain y...,,"""Ethics in aesthetic nursing...avoiding the ug...","0.004*""west"" + 0.003*""nonsurgical"" + 0.002*""ae...",1.0
97,3984466,This study reviews the cases of 49 patients wi...,,[Subcutaneous tenotomy of the sternocleidomast...,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
98,30231330,Acute myeloid leukemia (AML) was initially sub...,,"Acute Myeloid Leukemia: The Good, the Bad, and...","0.004*""mood"" + 0.003*""backfilling"" + 0.002*""in...",4.0


Downloading: 100%|██████████████████████████| 1.18k/1.18k [00:00<00:00, 228kB/s]
Downloading: 100%|█████████████████████████████| 190/190 [00:00<00:00, 56.2kB/s]
Downloading: 100%|█████████████████████████| 10.2k/10.2k [00:00<00:00, 2.15MB/s]
Downloading: 100%|██████████████████████████████| 612/612 [00:00<00:00, 106kB/s]
Downloading: 100%|█████████████████████████████| 116/116 [00:00<00:00, 19.1kB/s]
Downloading: 100%|██████████████████████████| 39.3k/39.3k [00:00<00:00, 642kB/s]
Downloading: 100%|█████████████████████████| 90.9M/90.9M [00:09<00:00, 9.44MB/s]
Downloading: 100%|███████████████████████████| 53.0/53.0 [00:00<00:00, 6.57kB/s]
Downloading: 100%|█████████████████████████████| 112/112 [00:00<00:00, 21.1kB/s]
Downloading: 100%|███████████████████████████| 466k/466k [00:00<00:00, 1.59MB/s]
Downloading: 100%|█████████████████████████████| 350/350 [00:00<00:00, 36.8kB/s]
Downloading: 100%|█████████████████████████| 13.2k/13.2k [00:00<00:00, 2.14MB/s]
Downloading: 100%|██████████

Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,15549400,Floral visitors vary in their pollination effi...,"[0.27713698148727417, -0.10882366448640823]",Effects of low-efficiency pollinators on plant...,"0.004*""mood"" + 0.003*""backfilling"" + 0.002*""in...",4.0
1,25451119,This experiment investigated how the esthetic ...,"[0.5933237075805664, 0.09017793834209442]",Electrophysiological brain dynamics during the...,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
2,27236075,Women with high body dissatisfaction look less...,"[0.5550868511199951, 0.047366585582494736]",Take a look at the bright side: Effects of pos...,"0.004*""parallel"" + 0.003*""work"" + 0.003*""herba...",9.0
3,15010496,We have used the technique of functional MRI t...,"[0.4833657741546631, 0.09579300880432129]",Neural correlates of beauty.,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
4,18648595,Three aspects of hormesis with low doses of io...,"[-0.14024655520915985, -0.2112492322921753]","Radiation hormesis: the good, the bad, and the...","0.005*""failure"" + 0.004*""pollinators"" + 0.003*...",5.0
...,...,...,...,...,...,...
95,9874951,Children with severe and profound disabilities...,"[-0.21721360087394714, -0.22595451772212982]",Children with severe and profound disabilities...,"0.003*""disabilities"" + 0.003*""aesthetic"" + 0.0...",3.0
96,18794733,As our ageing population demands to maintain y...,"[0.06945352256298065, -0.00463707372546196]","""Ethics in aesthetic nursing...avoiding the ug...","0.004*""west"" + 0.003*""nonsurgical"" + 0.002*""ae...",1.0
97,3984466,This study reviews the cases of 49 patients wi...,"[-0.3445499837398529, 0.38034558296203613]",[Subcutaneous tenotomy of the sternocleidomast...,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
98,30231330,Acute myeloid leukemia (AML) was initially sub...,"[-0.20531556010246277, -0.16500544548034668]","Acute Myeloid Leukemia: The Good, the Bad, and...","0.004*""mood"" + 0.003*""backfilling"" + 0.002*""in...",4.0


Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,15549400,Floral visitors vary in their pollination effi...,"[0.27713698148727417, -0.10882366448640823]",Effects of low-efficiency pollinators on plant...,"0.004*""mood"" + 0.003*""backfilling"" + 0.002*""in...",4.0
1,25451119,This experiment investigated how the esthetic ...,"[0.5933237075805664, 0.09017793834209442]",Electrophysiological brain dynamics during the...,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
2,27236075,Women with high body dissatisfaction look less...,"[0.5550868511199951, 0.047366585582494736]",Take a look at the bright side: Effects of pos...,"0.004*""parallel"" + 0.003*""work"" + 0.003*""herba...",9.0
3,15010496,We have used the technique of functional MRI t...,"[0.4833657741546631, 0.09579300880432129]",Neural correlates of beauty.,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
4,18648595,Three aspects of hormesis with low doses of io...,"[-0.14024655520915985, -0.2112492322921753]","Radiation hormesis: the good, the bad, and the...","0.005*""failure"" + 0.004*""pollinators"" + 0.003*...",5.0
...,...,...,...,...,...,...
95,9874951,Children with severe and profound disabilities...,"[-0.21721360087394714, -0.22595451772212982]",Children with severe and profound disabilities...,"0.003*""disabilities"" + 0.003*""aesthetic"" + 0.0...",3.0
96,18794733,As our ageing population demands to maintain y...,"[0.06945352256298065, -0.00463707372546196]","""Ethics in aesthetic nursing...avoiding the ug...","0.004*""west"" + 0.003*""nonsurgical"" + 0.002*""ae...",1.0
97,3984466,This study reviews the cases of 49 patients wi...,"[-0.3445499837398529, 0.38034558296203613]",[Subcutaneous tenotomy of the sternocleidomast...,"0.003*""challenging"" + 0.003*""bonds"" + 0.003*""c...",7.0
98,30231330,Acute myeloid leukemia (AML) was initially sub...,"[-0.20531556010246277, -0.16500544548034668]","Acute Myeloid Leukemia: The Good, the Bad, and...","0.004*""mood"" + 0.003*""backfilling"" + 0.002*""in...",4.0


In [32]:
def KMeansAndPCAWithCustomDataset(input_file_name='Care_Reviews.csv', 
                        model_max_length=384,
                        output_json_file_name='test.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                        p_components=2,
                        # Keep use_labels=True always for KMeans+PCA (reason: there can be countless clusters without labels)                                   
                        use_labels=True):
     # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Using a bert model from sentence_transformers to generate embeddings
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(sentence_transformer_name)
    sentences = data['data_string']
    embeddings_for_kmeans = model.encode(sentences)
    
    # Reducing the dimensionality of embeddings with PCA After applying KMeans
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    
    kmeans = KMeans(n_clusters=len(data['data_category_number'].unique()))
    labels = kmeans.fit_predict(embeddings_for_kmeans, list(data['data_category_number']) if use_labels else None)
    labels_scale = kmeans.labels_

    pca_2d_class = PCA(n_components=p_components).fit(embeddings_for_kmeans, labels.tolist())

    pca_2d = pca_2d_class.transform(embeddings_for_kmeans)
    
    data['2d_coor'] = pca_2d.tolist()
    display(data)
    
    new_labels = labels.tolist()
    list_of_points = []
    for idx in range(len(data['data_string'])):
        tmp_dict = {}
        tmp_dict["data_x"] = str(data['2d_coor'][idx][0])
        tmp_dict["data_y"] = str(data['2d_coor'][idx][1])
#         tmp_dict["data_category_number"] = str(data['data_category_number'][idx])
        tmp_dict["data_category_number"] = str(new_labels[idx])
        tmp_dict["data_id"] = str(data['data_id'][idx])

        tmp_dict["data_title"] = str(data['data_title'][idx])
        tmp_dict["data_category"] = str(data['data_category'][idx])

        list_of_points.append(tmp_dict)
        
    import json
    with open(output_json_file_name, "w") as outfile:
        json.dump(list_of_points, outfile)
        
    return data

In [33]:
d = KMeansAndPCAWithCustomDataset(input_file_name='news_articles.csv', 
                        model_max_length=384,
                        output_json_file_name='news_articles_kmeanspca.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                        p_components=2,
                        # Keep use_labels=True always for KMeans+PCA (reason: there can be countless clusters without labels)                                   
                        use_labels=True)
d

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,,The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,,Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,,An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...",,Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,,Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Dataframe with NaN removed: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,,The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,,Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,,An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...",,Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,,Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,"[-0.21102222800254822, -0.1435994803905487]",The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,"[0.1290389597415924, 0.03235282376408577]",Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,"[-0.14857149124145508, -0.17933329939842224]",An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...","[0.3369511067867279, -0.1775447428226471]",Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,"[-0.026973845437169075, 0.012201634235680103]",Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,"[0.3518417477607727, -0.1885872483253479]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...","[0.3118555545806885, -0.04097635671496391]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,"[0.1637514978647232, 0.08637033402919769]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...","[0.14689454436302185, 0.08255422115325928]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,"[-0.21102222800254822, -0.1435994803905487]",The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,"[0.1290389597415924, 0.03235282376408577]",Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,"[-0.14857149124145508, -0.17933329939842224]",An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...","[0.3369511067867279, -0.1775447428226471]",Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,"[-0.026973845437169075, 0.012201634235680103]",Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,"[0.3518417477607727, -0.1885872483253479]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...","[0.3118555545806885, -0.04097635671496391]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,"[0.1637514978647232, 0.08637033402919769]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...","[0.14689454436302185, 0.08255422115325928]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


In [None]:
def PCAAndKMeansWithCustomDataset(input_file_name='Care_Reviews.csv', 
                        model_max_length=384,
                        output_json_file_name='test.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                        p_components=2,
                        # Keep use_labels=True always for KMeans+PCA (reason: there can be countless clusters without labels)                                   
                        use_labels=True):
     # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Using a bert model from sentence_transformers to generate embeddings
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(sentence_transformer_name)
    sentences = data['data_string']
    embeddings_for_kmeans = model.encode(sentences)
    
    # Reducing the dimensionality of embeddings with PCA After applying KMeans
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    
    pca_2d_class = PCA(n_components=p_components).fit(embeddings_for_kmeans, list(data['data_category_number']) if use_labels else None)
    pca_2d = pca_2d_class.transform(embeddings_for_kmeans)
    
    kmeans = KMeans(n_clusters=len(data['data_category_number'].unique()))
    labels = kmeans.fit_predict(pca_2d, list(data['data_category_number']) if use_labels else None)
    labels_scale = kmeans.labels_

    data['2d_coor'] = pca_2d.tolist()
    display(data)
    
    new_labels = labels.tolist()
    list_of_points = []
    for idx in range(len(data['data_string'])):
        tmp_dict = {}
        tmp_dict["data_x"] = str(data['2d_coor'][idx][0])
        tmp_dict["data_y"] = str(data['2d_coor'][idx][1])
#         tmp_dict["data_category_number"] = str(data['data_category_number'][idx])
        tmp_dict["data_category_number"] = str(new_labels[idx])
        tmp_dict["data_id"] = str(data['data_id'][idx])

        tmp_dict["data_title"] = str(data['data_title'][idx])
        tmp_dict["data_category"] = str(data['data_category'][idx])

        list_of_points.append(tmp_dict)
        
    import json
    with open(output_json_file_name, "w") as outfile:
        json.dump(list_of_points, outfile)
        
    return data

In [None]:
d = PCAAndKMeansWithCustomDataset(input_file_name='news_articles.csv', 
                        model_max_length=384,
                        output_json_file_name='news_articles_pcakmeans.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                        p_components=2,
                        # Keep use_labels=True always for KMeans+PCA (reason: there can be countless clusters without labels)                                   
                        use_labels=True)