In [23]:
def makePredictions(list_of_text, model):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    
    import torch
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    
    # padding = True vs padding = "max_length"
    sample = tokenizer(list_of_text, padding=True, truncation=True)

    sample['input_ids'] = torch.Tensor(sample['input_ids']).to(torch.int64)
    sample['token_type_ids'] = torch.Tensor(sample['token_type_ids']).to(torch.int64)
    sample['attention_mask'] = torch.Tensor(sample['attention_mask']).to(torch.int64)

    batch = {k: v.to(device) for k, v in sample.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = torch.argmax(outputs.logits, dim=-1)
    return predictions  

In [24]:
def getEmbeddings(text, model, tokenizer):
    
    import torch
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#     device = torch.device("cpu")
    model.to(device)
    # padding = True vs padding = "max_length"

    sample = tokenizer(text, padding=True, truncation=True)

    sample['input_ids'] = torch.Tensor(sample['input_ids']).to(torch.int64)
    sample['token_type_ids'] = torch.Tensor(sample['token_type_ids']).to(torch.int64)
    sample['attention_mask'] = torch.Tensor(sample['attention_mask']).to(torch.int64)

    batch = {k: v.to(device) for k, v in sample.items()}
    with torch.no_grad():
        outputs = model.bert(**batch)
    
    return outputs.pooler_output

In [25]:
# Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)

def updateDataFrame(csv_file_name, model_max_length=500):
    import pandas as pd
    data = pd.read_csv(csv_file_name)
    #data = data.drop(columns=['Unnamed: 0'])
    
    def split_sentences(list_of_words):
        chunked_list = list()
        chunk_size = model_max_length
        for i in range(0, len(list_of_words), chunk_size):
            chunked_list.append(list_of_words[i:i+chunk_size])

        return chunked_list

    for idx in range(len(data['data_string'])):
#         print(data['data_string'][idx])
#         data['data_category_number'][idx] = data['data_category_number'][idx] - 1
        data['data_category_number'][idx] = data['data_category_number'][idx]
        if len(str(data['data_string'][idx]).split()) > 500:
#             print(True)
            tempString = data['data_string'][idx]
            tempStringSplit = tempString.split()
            chunkedLists = split_sentences(tempStringSplit)
        
            for sentence in chunkedLists:
                tempSentence = " ".join(sentence)
                data.loc[len(data.index)] = [
                                            data['data_id'][idx], 
                                             tempSentence,
                                            data['2d_coor'][idx],
                                            data['data_title'][idx],
                                            data['data_category'][idx],
                                            data['data_category_number'][idx],
                                            ]
    return data

In [26]:
def fineTuneModel(df, number_of_labels, number_of_epochs=3):
    from datasets import load_dataset, Dataset
#     dataset = load_dataset('csv', data_files='Care_Reviews.csv', split='train')
    df = df.dropna()
    dataset = Dataset.from_pandas(df)
    dataset = dataset.remove_columns(['__index_level_0__'])

    from datasets import DatasetDict
    
    train_testvalid = dataset.train_test_split()
    test_valid = train_testvalid['test'].train_test_split()
    
    train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
    
    dataset = train_test_valid_dataset.remove_columns(['data_id', '2d_coor', 'data_title','data_category'])
    print(dataset)
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


    def tokenize_function(examples):
    # refer documentation: padding=True or padding="max_length"
        return tokenizer(examples["data_string"], padding=True, truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["valid"]
    test_dataset = tokenized_datasets['test']

    tokenized_datasets = tokenized_datasets.remove_columns(["data_string"])
    
#     tokenized_datasets = tokenized_datasets.remove_columns(["data_string"])
    tokenized_datasets = tokenized_datasets.rename_column("data_category_number", "labels")
    
    tokenized_datasets.set_format("torch")
    
    small_train_dataset = tokenized_datasets["train"]
    small_eval_dataset = tokenized_datasets["test"]
    print(small_train_dataset)
    
    from torch.utils.data import DataLoader
    train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=4)
    eval_dataloader = DataLoader(small_eval_dataset, batch_size=4)
    
    from transformers import AutoModelForSequenceClassification
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=number_of_labels)
    
    from torch.optim import AdamW
    optimizer = AdamW(model.parameters(), lr=5e-5)
    
    from transformers import get_scheduler

    num_epochs = number_of_epochs
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    
    import torch

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#     device = torch.device("cpu")
    model.to(device)
    
    from tqdm.auto import tqdm
    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            print(batch)
            outputs = model(**batch)
#             print(outputs)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            break
        break
            
    from datasets import load_metric
    metric = load_metric("accuracy")
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    final_score = metric.compute()
    return model, final_score

In [27]:
def get_json_file(data, model, output_file_name="something.json"):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    
    embedding_list = []
    for idx in range(len(data['data_string'])):
        if type(data['data_string'][idx]) is float:
            continue
        embed = getEmbeddings([data['data_string'][idx]], model, tokenizer).tolist()[0]
        embedding_list.append(embed)
    
    import numpy as np
    embeddings_for_umap = np.array(embedding_list)
    
    import umap.umap_ as umap
    umap_embedding = umap.UMAP().fit_transform(embeddings_for_umap, y=list(data['data_category_number']))
    
    data['2d_coor'] = umap_embedding.tolist()
    
    list_of_points = []
    for idx in range(len(data['data_string'])):
        tmp_dict = {}
    #     tmp_var = data_df_china_news['2d_coor'][idx].strip('][').split(', ')
        tmp_dict["data_x"] = str(data['2d_coor'][idx][0])
        tmp_dict["data_y"] = str(data['2d_coor'][idx][1])
    #     tmp_dict["data_x"] = str(tmp_var[0])
    #     tmp_dict["data_y"] = str(tmp_var[1])
        tmp_dict["data_category_number"] = str(data['data_category_number'][idx])
        tmp_dict["data_id"] = str(data['data_id'][idx])
#         tmp_dict["data_string"] = str(data['data_string'][idx])
        tmp_dict["data_title"] = str(data['data_title'][idx])
        tmp_dict["data_category"] = str(data['data_category'][idx])

        list_of_points.append(tmp_dict)
        
    import json
    with open(output_file_name, "w") as outfile:
        json.dump(list_of_points, outfile)
        
    return data

In [28]:
def train_masked_bert(data, num_epochs=2, number_of_labels=5):
    from transformers import AutoTokenizer, BertForMaskedLM
    import torch

    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    model = BertForMaskedLM.from_pretrained('bert-base-cased')
    
    data = data.dropna()
    display(data)
    inputs = tokenizer(list(data['data_string']), return_tensors='pt', padding=True, truncation=True)
    
    inputs['labels'] = inputs.input_ids.detach().clone()
    
    # create random array of floats with equal dimensions to input_ids tensor
    rand = torch.rand(inputs.input_ids.shape)
    # create mask array
    mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
    
    selection = []

    for i in range(inputs.input_ids.shape[0]):
        selection.append(
            torch.flatten(mask_arr[i].nonzero()).tolist()
        )
    
    for i in range(inputs.input_ids.shape[0]):
        inputs.input_ids[i, selection[i]] = 103
        
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings):
            self.encodings = encodings
        def __getitem__(self, idx):
            return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        def __len__(self):
            return len(self.encodings.input_ids)
        
    dataset = CustomDataset(inputs)
    
    loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)
    
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # device = torch.device('cpu')
    # and move our model over to the selected device
    model.to(device)
    # activate training mode
    model.train()
    
    from transformers import AdamW
    # initialize optimizer
    optim = AdamW(model.parameters(), lr=5e-5)
    
    from tqdm import tqdm  # for our progress bar

    epochs = num_epochs

    for epoch in range(epochs):
        # setup loop with TQDM and dataloader
        loop = tqdm(loader, leave=True)
        for batch in loop:
            torch.cuda.empty_cache() #############
            # initialize calculated gradients (from prev step)
            optim.zero_grad()
            # pull all tensor batches required for training
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # process
            outputs = model(input_ids, attention_mask=attention_mask,
                            labels=labels)
            # extract loss
            loss = outputs.loss
            # calculate loss for every parameter that needs grad update
            loss.backward()
            # update parameters
            optim.step()
            # print relevant info to progress bar
            loop.set_description(f'Epoch {epoch}')
            loop.set_postfix(loss=loss.item())
            torch.cuda.empty_cache()
#             break
#         break
            
    model.save_pretrained('pytorch_model_unsupervised_finetuned')
    return None

In [10]:
def fineTuneModelUnsupervised(df, number_of_labels=19, number_of_epochs=3):
    from datasets import load_dataset, Dataset
#     dataset = load_dataset('csv', data_files='Care_Reviews.csv', split='train')
    df = df.dropna()
    display(df)
    dataset = Dataset.from_pandas(df)
    dataset = dataset.remove_columns(['__index_level_0__'])

    from datasets import DatasetDict
    
    train_testvalid = dataset.train_test_split()
    test_valid = train_testvalid['test'].train_test_split()
    
    train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
    
    dataset = train_test_valid_dataset.remove_columns(['data_id', '2d_coor', 'data_title','data_category'])
    
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


    def tokenize_function(examples):
    # refer documentation: padding=True or padding="max_length"
        return tokenizer(examples["data_string"], padding=True, truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["valid"]
    test_dataset = tokenized_datasets['test']

    tokenized_datasets = tokenized_datasets.remove_columns(["data_string"])
    
#     tokenized_datasets = tokenized_datasets.remove_columns(["data_string"])
    tokenized_datasets = tokenized_datasets.rename_column("data_category_number", "labels")
    
    tokenized_datasets.set_format("torch")
    
    small_train_dataset = tokenized_datasets["train"]
    small_eval_dataset = tokenized_datasets["test"]
#     print(small_eval_dataset)
    
    from torch.utils.data import DataLoader
    train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
    eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)
    
    from transformers import AutoModelForSequenceClassification
    model = AutoModelForSequenceClassification.from_pretrained("pytorch_model_unsupervised_finetuned", num_labels=number_of_labels)
    
    from torch.optim import AdamW
    optimizer = AdamW(model.parameters(), lr=5e-5)
    
    from transformers import get_scheduler

    num_epochs = number_of_epochs
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    
    import torch

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    
    from tqdm.auto import tqdm
    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
#             print(outputs)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
#             break
#         break
            
    from datasets import load_metric
    metric = load_metric("accuracy")
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    final_score = metric.compute()
    return model, final_score

In [8]:
# Below are functions to delete files and directories (you will not be able to delete directories/files
# directly from the Jupyter Notebook UI)

In [11]:
# use the space below to play (call) with functions initialized above

In [13]:
# Convert .json to .csv
import pandas as pd
with open('Care_reviews_10k.json', encoding='utf-8') as inputfile:
    df = pd.read_json(inputfile)
df
df = df.drop(columns=[1,3,4,6])
df = df.rename(columns={0:'data_id', 2:'data_title', 5:'data_string'})

df['2d_coor'] = ''
df['data_category'] = ''
df['data_category_number'] = ''
df = df.reindex(columns=['data_id', 'data_string', '2d_coor', 'data_title', 'data_category', 'data_category_number'])
df.dropna()

for index in range(len(df)):
    pos=df.loc[index, "data_title"].find("rating: ")
    #print(pos)
   
    #print(aaa)
    df['data_category_number'][index]=df.loc[index, "data_title"][pos+8]
    if df.loc[index, "data_title"][pos+8] == "1":
        df['data_category'][index]="Poor"
    elif df.loc[index, "data_title"][pos+8] == "2":
        df['data_category'][index]="Fair"
    elif df.loc[index, "data_title"][pos+8] == "3":
        df['data_category'][index]="Good"
    elif df.loc[index, "data_title"][pos+8] == "4":
        df['data_category'][index]="Very Good"
    elif df.loc[index, "data_title"][pos+8] == "5":
        df['data_category'][index]="Excelent"
    df.loc[index, "data_title"]=df.loc[index, "data_title"][0:pos-2]
df.head()

df.to_csv('Care_review_full.csv', encoding='utf-8', index=False)
display(df)

Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,a224545f-1a07-47eb-96ee-a0d0cab23100,Good place to visit,,urgentCare: Access Urgent Medical Care Pickeri...,Very Good,4
1,01a8cfc8-e207-4c72-a9fa-fa6353fde529,Went here for a swollen Jaw. Even though I was...,,urgentCare: Access Urgent Medical Care Pickeri...,Poor,1
2,1a4fb4a6-6c56-4610-a151-b4b25a774cc8,I was seen relatively quickly and the staff wa...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
3,9356a4fd-54fd-4604-a6a8-2d3067eba7cc,Reception and service couldn't have been more ...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
4,77526eed-5ff3-4f2f-80ce-da1048137cf4,I came in they were very busy the receptionist...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
...,...,...,...,...,...,...
9995,0669d71c-f596-4183-a6ef-6ceb660cac3f,I needed a prescription and thought going to t...,,urgentCare: Cleveland Clinic Express Care Clinic,Poor,1
9996,7a73d727-b0eb-48e2-8c01-b6faf07ea776,"Excellent, friendly staff. Got me in and out i...",,urgentCare: CareFirst Urgent Care - Kenwood,Excelent,5
9997,baf90288-9dc4-4de4-9b8f-f31322f48747,Awesome staff!!,,urgentCare: CareFirst Urgent Care - Kenwood,Excelent,5
9998,672cf0b1-0bc6-45aa-be18-b80d196870f7,I'm really glad that CareFirst open this facil...,,urgentCare: CareFirst Urgent Care - Kenwood,Excelent,5


In [74]:
### Step 1 ###

# This function will create a csv in a format (mostly changing columns names) that we need for training models
# The following columns are needed: data_category_number, data_title, data_string, data_category, data_id, 2d_coor

# Note: This function will have to be modified according to the need as not all datasets have labels but above
# mentioned columns should be there
def create_structured_csv(csv_file_name):
    import pandas as pd
    data = pd.read_csv(csv_file_name)
    #data = data.drop(columns=['Unnamed: 0', 'year'])
    data = data.rename(columns={'seq':'data_category_number', 'title':'data_title', 'abstract':'data_string', 'CODE':'data_category','id':'data_id'})
    data['2d_coor'] = ''
    data = data.reindex(columns=['data_id', 'data_string', '2d_coor', 'data_title', 'data_category', 'data_category_number'])
    data.to_csv('Care_review_full.csv')
    return data # data is pd object of a csv

csv_file_name = 'Care_review_full.csv'
data = create_structured_csv(csv_file_name)

In [75]:
### Step 2 ###

# Not all the datasets at DSC are labelled. Hence, we need to label some and we use LDA for that
def apply_lda_on_dataset(df):
    import gensim
    from gensim.utils import simple_preprocess
    from gensim.parsing.preprocessing import STOPWORDS
#     from nltk.stem.porter import *
    from gensim import corpora, models
    import numpy as np
    import pandas as pd
    np.random.seed(2018)

    import nltk
    nltk.download('wordnet')

    # Function that preprocesses all text documents before feeding to lda model
    def preprocess(text):
        result = []
        i = 0
        for token in gensim.utils.simple_preprocess(text):
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
                result.append(token)
        return result
    
    # the file name should be the one generated from the function above     
#     data = pd.read_csv('Care_Reviews.csv')
    data = df
    documents = data
    documents = documents.dropna(subset=['data_string'])
    processed_docs = documents['data_string'].map(preprocess)
    
    # Creates a dictionary from the documents (Note: Here the argument 'preprocessed_docs' is a 'list of lists')
    dictionary = gensim.corpora.Dictionary(processed_docs)
    
    # Creates a bag_of_words corpus     
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    # Creates a tfidf matrix/table required for training
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]
    
    # Trains an lda model with tfidf
    lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=20, id2word=dictionary, passes=2, workers=4)
    
    for idx in range(len(data['data_string'])):
        data_string = data['data_string'][idx]
#         print(type(data_string))
#         print(dictionary.doc2bow(preprocess(data_string)))
#         print(lda_model_tfidf.get_document_topics())
        data_string_topic_no = lda_model_tfidf.get_document_topics(dictionary.doc2bow(preprocess(data_string)))[0][0]
#         print(data_string_topic_no)
        data_string_topic = lda_model_tfidf.print_topic(data_string_topic_no)
        data['data_category'][idx] = data_string_topic
        data_string_topic_num = lda_model_tfidf.get_document_topics(dictionary.doc2bow(preprocess(data_string)))[0][0]
        data['data_category_number'][idx] = data_string_topic_num
        
    return data

In [76]:
data= pd.read_csv("Care_review_full.csv")
data = apply_lda_on_dataset(data)

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category'][idx] = data_string_topic
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data_string_topic_num


KeyboardInterrupt: 

In [None]:
data.to_csv('news_articles.csv')

In [11]:
# The following function is responsible for fine-tuning an existing Bert Model (from huggingface) with a DSC dataset
def finetuneBertSeqModelWithCustomDataset(input_file_name="Care_review_full.csv", 
                                         model_max_length=500,
                                         number_of_labels=15,
                                         number_of_epochs=15,
                                         output_file_name="First_Care_review_full.json"):
    # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)     
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Function that returns a fine-tuned model (fine-tuned on DSC dataset) and its score
    model, score = fineTuneModel(df=data, number_of_labels=number_of_labels, number_of_epochs=number_of_epochs)
    
    print('Here\s the fine-tuned model: ', model)
    print('Accuracy of the fine-tuned model on the test dataset is: ', score)
    
    # Function that returns the dataframe with embeddings (UMAP reduces high dimensional embedding to 2D)     
    data_with_embeddings = get_json_file(data, model, output_file_name)

In [12]:
finetuneBertSeqModelWithCustomDataset(input_file_name="Care_review_full.csv", 
                                         model_max_length=500,
                                         number_of_labels=15,
                                         number_of_epochs=15,
                                         output_file_name="First_Care_review_full.json")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,a224545f-1a07-47eb-96ee-a0d0cab23100,Good place to visit,,urgentCare: Access Urgent Medical Care Pickeri...,Very Good,4
1,01a8cfc8-e207-4c72-a9fa-fa6353fde529,Went here for a swollen Jaw. Even though I was...,,urgentCare: Access Urgent Medical Care Pickeri...,Poor,1
2,1a4fb4a6-6c56-4610-a151-b4b25a774cc8,I was seen relatively quickly and the staff wa...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
3,9356a4fd-54fd-4604-a6a8-2d3067eba7cc,Reception and service couldn't have been more ...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
4,77526eed-5ff3-4f2f-80ce-da1048137cf4,I came in they were very busy the receptionist...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
...,...,...,...,...,...,...
10039,e05ca558-6fae-42bb-8271-8f7427fd22dd,doctor but I think I know the difference betwe...,,urgentCare: CareFirst Urgent Care - West Chester,Poor,1
10040,dba4b761-92ad-4b74-8fdf-f1b0e12debcb,So my experience started out well this morning...,,urgentCare: Eastside Urgent Care,Poor,1
10041,dba4b761-92ad-4b74-8fdf-f1b0e12debcb,what a waste of an hour drive one way and then...,,urgentCare: Eastside Urgent Care,Poor,1
10042,9aa30e73-9c55-47b2-a181-b1073b60058f,I have mixed emotions about this On Demand cen...,,urgentCare: Kettering Health On-Demand Care - ...,Good,3


Dataframe with NaN removed: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,a224545f-1a07-47eb-96ee-a0d0cab23100,Good place to visit,,urgentCare: Access Urgent Medical Care Pickeri...,Very Good,4
1,01a8cfc8-e207-4c72-a9fa-fa6353fde529,Went here for a swollen Jaw. Even though I was...,,urgentCare: Access Urgent Medical Care Pickeri...,Poor,1
2,1a4fb4a6-6c56-4610-a151-b4b25a774cc8,I was seen relatively quickly and the staff wa...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
3,9356a4fd-54fd-4604-a6a8-2d3067eba7cc,Reception and service couldn't have been more ...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
4,77526eed-5ff3-4f2f-80ce-da1048137cf4,I came in they were very busy the receptionist...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
...,...,...,...,...,...,...
10039,e05ca558-6fae-42bb-8271-8f7427fd22dd,doctor but I think I know the difference betwe...,,urgentCare: CareFirst Urgent Care - West Chester,Poor,1
10040,dba4b761-92ad-4b74-8fdf-f1b0e12debcb,So my experience started out well this morning...,,urgentCare: Eastside Urgent Care,Poor,1
10041,dba4b761-92ad-4b74-8fdf-f1b0e12debcb,what a waste of an hour drive one way and then...,,urgentCare: Eastside Urgent Care,Poor,1
10042,9aa30e73-9c55-47b2-a181-b1073b60058f,I have mixed emotions about this On Demand cen...,,urgentCare: Kettering Health On-Demand Care - ...,Good,3


  from .autonotebook import tqdm as notebook_tqdm
100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 10.12ba/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 15.92ba/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00,  9.12ba/s]
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Be

KeyboardInterrupt: 

In [41]:
# The following function is responsible for pretraining a masked bert model and fine-tuning the same pretrained model 
# (from huggingface) with a DSC dataset
def finetuneBertModelAfterPretrainingOfMaskedBertWithCustomDataset(input_file_name="Care_review_full.csv", 
                                         model_max_length=500,
                                         number_of_labels=19,
                                         number_of_epochs_for_masked_bert=2,
                                         number_of_epochs_for_finetuning_masked_bert=5,                 
                                         output_file_name="Second_Care_review_full.json"):
    
    # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Function that pretrains a masked bert model and saves that model in a directory: 'pytorch_model_unsupervised_finetuned'
    train_masked_bert(data, num_epochs=number_of_epochs_for_masked_bert, number_of_labels=number_of_labels)
    
    # Function that fine-tunes the above pretrained masked bert model
    new_model, score = fineTuneModelUnsupervised(df, number_of_labels=number_of_labels, number_of_epochs=number_of_epochs_for_finetuning_masked_bert)
    
    print('Here\s the fine-tuned model: ', model)
    print('Accuracy of the fine-tuned model on the test dataset is: ', score)
    
    # Function that returns the dataframe with embeddings (UMAP reduces high dimensional embedding to 2D)     
    data_with_embeddings = get_json_file(data, model, output_file_name)

In [43]:
finetuneBertModelAfterPretrainingOfMaskedBertWithCustomDataset(input_file_name="Care_review_full.csv", 
                                         model_max_length=500,
                                         number_of_labels=19,
                                         number_of_epochs_for_masked_bert=2,
                                         number_of_epochs_for_finetuning_masked_bert=5,                 
                                         output_file_name="Second_Care_review_full.json")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,a224545f-1a07-47eb-96ee-a0d0cab23100,Good place to visit,,urgentCare: Access Urgent Medical Care Pickeri...,Very Good,4
1,01a8cfc8-e207-4c72-a9fa-fa6353fde529,Went here for a swollen Jaw. Even though I was...,,urgentCare: Access Urgent Medical Care Pickeri...,Poor,1
2,1a4fb4a6-6c56-4610-a151-b4b25a774cc8,I was seen relatively quickly and the staff wa...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
3,9356a4fd-54fd-4604-a6a8-2d3067eba7cc,Reception and service couldn't have been more ...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
4,77526eed-5ff3-4f2f-80ce-da1048137cf4,I came in they were very busy the receptionist...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
...,...,...,...,...,...,...
33491,37b547d6-931d-417d-b732-95845b0bcc22,anyways the whole experience was awful and I h...,,hospital: Licking Memorial Hospital,Poor,1
33492,26efef6b-9f07-4a25-8a91-f85926b2012e,went to this ER twice and each time the doctor...,,hospital: Mount Carmel East,Poor,1
33493,26efef6b-9f07-4a25-8a91-f85926b2012e,ER. The doctors don't listen or care about the...,,hospital: Mount Carmel East,Poor,1
33494,017ced75-7179-4f6f-b467-32113a9c1704,"I usually review restaurants, but occasionally...",,hospital: University Hospitals Parma Medical C...,Very Good,4


Dataframe with NaN removed: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,a224545f-1a07-47eb-96ee-a0d0cab23100,Good place to visit,,urgentCare: Access Urgent Medical Care Pickeri...,Very Good,4
1,01a8cfc8-e207-4c72-a9fa-fa6353fde529,Went here for a swollen Jaw. Even though I was...,,urgentCare: Access Urgent Medical Care Pickeri...,Poor,1
2,1a4fb4a6-6c56-4610-a151-b4b25a774cc8,I was seen relatively quickly and the staff wa...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
3,9356a4fd-54fd-4604-a6a8-2d3067eba7cc,Reception and service couldn't have been more ...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
4,77526eed-5ff3-4f2f-80ce-da1048137cf4,I came in they were very busy the receptionist...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
...,...,...,...,...,...,...
33491,37b547d6-931d-417d-b732-95845b0bcc22,anyways the whole experience was awful and I h...,,hospital: Licking Memorial Hospital,Poor,1
33492,26efef6b-9f07-4a25-8a91-f85926b2012e,went to this ER twice and each time the doctor...,,hospital: Mount Carmel East,Poor,1
33493,26efef6b-9f07-4a25-8a91-f85926b2012e,ER. The doctors don't listen or care about the...,,hospital: Mount Carmel East,Poor,1
33494,017ced75-7179-4f6f-b467-32113a9c1704,"I usually review restaurants, but occasionally...",,hospital: University Hospitals Parma Medical C...,Very Good,4


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,a224545f-1a07-47eb-96ee-a0d0cab23100,Good place to visit,,urgentCare: Access Urgent Medical Care Pickeri...,Very Good,4
1,01a8cfc8-e207-4c72-a9fa-fa6353fde529,Went here for a swollen Jaw. Even though I was...,,urgentCare: Access Urgent Medical Care Pickeri...,Poor,1
2,1a4fb4a6-6c56-4610-a151-b4b25a774cc8,I was seen relatively quickly and the staff wa...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
3,9356a4fd-54fd-4604-a6a8-2d3067eba7cc,Reception and service couldn't have been more ...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
4,77526eed-5ff3-4f2f-80ce-da1048137cf4,I came in they were very busy the receptionist...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
...,...,...,...,...,...,...
33491,37b547d6-931d-417d-b732-95845b0bcc22,anyways the whole experience was awful and I h...,,hospital: Licking Memorial Hospital,Poor,1
33492,26efef6b-9f07-4a25-8a91-f85926b2012e,went to this ER twice and each time the doctor...,,hospital: Mount Carmel East,Poor,1
33493,26efef6b-9f07-4a25-8a91-f85926b2012e,ER. The doctors don't listen or care about the...,,hospital: Mount Carmel East,Poor,1
33494,017ced75-7179-4f6f-b467-32113a9c1704,"I usually review restaurants, but occasionally...",,hospital: University Hospitals Parma Medical C...,Very Good,4


RuntimeError: CUDA out of memory. Tried to allocate 86.00 MiB (GPU 0; 15.78 GiB total capacity; 4.93 GiB already allocated; 13.00 MiB free; 4.97 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [46]:
def UMAPWithCustomDataset(input_file_name="Care_review_full.csv", 
                        model_max_length=384,
                        output_json_file_name='Third_Care_review_full.json',
                         use_labels=True,
                         sentence_transformer_name='sentence-transformers/all-mpnet-base-v2'):
     # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Using a bert model from sentence_transformers to generate embeddings
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(sentence_transformer_name)
    sentences = data['data_string']
    embeddings_for_umap = model.encode(sentences)
    
    # Reducing the dimensionality of embeddings with UMAP
    import umap.umap_ as umap
    umap_embedding = umap.UMAP().fit_transform(embeddings_for_umap, y=list(data['data_category_number']) if use_labels else None)
    
    data['2d_coor'] = umap_embedding.tolist()
    display(data)
    
    list_of_points = []
    for idx in range(len(data['data_string'])):
        tmp_dict = {}
        tmp_dict["data_x"] = str(data['2d_coor'][idx][0])
        tmp_dict["data_y"] = str(data['2d_coor'][idx][1])
        tmp_dict["data_category_number"] = str(data['data_category_number'][idx])
        tmp_dict["data_id"] = str(data['data_id'][idx])

        tmp_dict["data_title"] = str(data['data_title'][idx])
        tmp_dict["data_category"] = str(data['data_category'][idx])

        list_of_points.append(tmp_dict)
        
    import json
    with open(output_json_file_name, "w") as outfile:
        json.dump(list_of_points, outfile)
        
    return data

In [47]:
UMAPWithCustomDataset(input_file_name="Care_review_full.csv", 
                        model_max_length=384,
                        output_json_file_name='Third_Care_review_full.json',
                         use_labels=True,
                         sentence_transformer_name='sentence-transformers/all-mpnet-base-v2')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,a224545f-1a07-47eb-96ee-a0d0cab23100,Good place to visit,,urgentCare: Access Urgent Medical Care Pickeri...,Very Good,4
1,01a8cfc8-e207-4c72-a9fa-fa6353fde529,Went here for a swollen Jaw. Even though I was...,,urgentCare: Access Urgent Medical Care Pickeri...,Poor,1
2,1a4fb4a6-6c56-4610-a151-b4b25a774cc8,I was seen relatively quickly and the staff wa...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
3,9356a4fd-54fd-4604-a6a8-2d3067eba7cc,Reception and service couldn't have been more ...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
4,77526eed-5ff3-4f2f-80ce-da1048137cf4,I came in they were very busy the receptionist...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
...,...,...,...,...,...,...
33498,37b547d6-931d-417d-b732-95845b0bcc22,and asked me like a hundred ridiculous questio...,,hospital: Licking Memorial Hospital,Poor,1
33499,26efef6b-9f07-4a25-8a91-f85926b2012e,went to this ER twice and each time the doctor...,,hospital: Mount Carmel East,Poor,1
33500,26efef6b-9f07-4a25-8a91-f85926b2012e,TO HELP ME. Why would you work in the medical ...,,hospital: Mount Carmel East,Poor,1
33501,017ced75-7179-4f6f-b467-32113a9c1704,"I usually review restaurants, but occasionally...",,hospital: University Hospitals Parma Medical C...,Very Good,4


Dataframe with NaN removed: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,a224545f-1a07-47eb-96ee-a0d0cab23100,Good place to visit,,urgentCare: Access Urgent Medical Care Pickeri...,Very Good,4
1,01a8cfc8-e207-4c72-a9fa-fa6353fde529,Went here for a swollen Jaw. Even though I was...,,urgentCare: Access Urgent Medical Care Pickeri...,Poor,1
2,1a4fb4a6-6c56-4610-a151-b4b25a774cc8,I was seen relatively quickly and the staff wa...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
3,9356a4fd-54fd-4604-a6a8-2d3067eba7cc,Reception and service couldn't have been more ...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
4,77526eed-5ff3-4f2f-80ce-da1048137cf4,I came in they were very busy the receptionist...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
...,...,...,...,...,...,...
33498,37b547d6-931d-417d-b732-95845b0bcc22,and asked me like a hundred ridiculous questio...,,hospital: Licking Memorial Hospital,Poor,1
33499,26efef6b-9f07-4a25-8a91-f85926b2012e,went to this ER twice and each time the doctor...,,hospital: Mount Carmel East,Poor,1
33500,26efef6b-9f07-4a25-8a91-f85926b2012e,TO HELP ME. Why would you work in the medical ...,,hospital: Mount Carmel East,Poor,1
33501,017ced75-7179-4f6f-b467-32113a9c1704,"I usually review restaurants, but occasionally...",,hospital: University Hospitals Parma Medical C...,Very Good,4


RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 15.78 GiB total capacity; 4.93 GiB already allocated; 13.00 MiB free; 4.97 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [19]:
d = UMAPWithCustomDataset(input_file_name='news_articles.csv', 
                        model_max_length=384,
                        output_json_file_name='news_articles_umap.json',
                         use_labels=True)
d

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,,The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,,Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,,An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...",,Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,,Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Dataframe with NaN removed: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,,The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,,Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,,An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...",,Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,,Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,"[6.541574001312256, 5.994854927062988]",The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,"[-0.31491324305534363, 5.71586799621582]",Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,"[0.19933286309242249, 8.378273963928223]",An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...","[6.3932318687438965, 10.417623519897461]",Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,"[7.935702800750732, -7.721569061279297]",Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,"[3.2640509605407715, 16.858848571777344]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...","[3.2133913040161133, 16.883222579956055]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,"[2.730613946914673, 17.219083786010742]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...","[9.606926918029785, 17.593767166137695]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,"[6.541574001312256, 5.994854927062988]",The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,"[-0.31491324305534363, 5.71586799621582]",Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,"[0.19933286309242249, 8.378273963928223]",An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...","[6.3932318687438965, 10.417623519897461]",Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,"[7.935702800750732, -7.721569061279297]",Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,"[3.2640509605407715, 16.858848571777344]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...","[3.2133913040161133, 16.883222579956055]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,"[2.730613946914673, 17.219083786010742]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...","[9.606926918029785, 17.593767166137695]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


In [20]:
def TSNEWithCustomDataset(input_file_name='Care_Reviews.csv', 
                        model_max_length=384,
                        output_json_file_name='test.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                         use_labels=True):
     # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Using a bert model from sentence_transformers to generate embeddings
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(sentence_transformer_name)
    sentences = data['data_string']
    embeddings_for_tsne = model.encode(sentences)
    
    # Reducing the dimensionality of embeddings with TSNE
    from sklearn.manifold import TSNE
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_results = tsne.fit_transform(embeddings_for_tsne, list(data['data_category_number']) if use_labels else None)
    
    data['2d_coor'] = tsne_results.tolist()
    display(data)
    
    list_of_points = []
    for idx in range(len(data['data_string'])):
        tmp_dict = {}
        tmp_dict["data_x"] = str(data['2d_coor'][idx][0])
        tmp_dict["data_y"] = str(data['2d_coor'][idx][1])
        tmp_dict["data_category_number"] = str(data['data_category_number'][idx])
        tmp_dict["data_id"] = str(data['data_id'][idx])

        tmp_dict["data_title"] = str(data['data_title'][idx])
        tmp_dict["data_category"] = str(data['data_category'][idx])

        list_of_points.append(tmp_dict)
        
    import json
    with open(output_json_file_name, "w") as outfile:
        json.dump(list_of_points, outfile)
        
    return data

In [21]:
d = TSNEWithCustomDataset(input_file_name='news_articles.csv', 
                        model_max_length=384,
                        output_json_file_name='news_articles_tsne.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                         use_labels=True)
d

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,,The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,,Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,,An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...",,Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,,Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Dataframe with NaN removed: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,,The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,,Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,,An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...",,Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,,Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4




[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 193492 samples in 0.069s...
[t-SNE] Computed neighbors for 193492 samples in 1197.803s...
[t-SNE] Computed conditional probabilities for sample 1000 / 193492
[t-SNE] Computed conditional probabilities for sample 2000 / 193492
[t-SNE] Computed conditional probabilities for sample 3000 / 193492
[t-SNE] Computed conditional probabilities for sample 4000 / 193492
[t-SNE] Computed conditional probabilities for sample 5000 / 193492
[t-SNE] Computed conditional probabilities for sample 6000 / 193492
[t-SNE] Computed conditional probabilities for sample 7000 / 193492
[t-SNE] Computed conditional probabilities for sample 8000 / 193492
[t-SNE] Computed conditional probabilities for sample 9000 / 193492
[t-SNE] Computed conditional probabilities for sample 10000 / 193492
[t-SNE] Computed conditional probabilities for sample 11000 / 193492
[t-SNE] Computed conditional probabilities for sample 12000 / 193492
[t-SNE] Computed conditional pro

[t-SNE] Computed conditional probabilities for sample 118000 / 193492
[t-SNE] Computed conditional probabilities for sample 119000 / 193492
[t-SNE] Computed conditional probabilities for sample 120000 / 193492
[t-SNE] Computed conditional probabilities for sample 121000 / 193492
[t-SNE] Computed conditional probabilities for sample 122000 / 193492
[t-SNE] Computed conditional probabilities for sample 123000 / 193492
[t-SNE] Computed conditional probabilities for sample 124000 / 193492
[t-SNE] Computed conditional probabilities for sample 125000 / 193492
[t-SNE] Computed conditional probabilities for sample 126000 / 193492
[t-SNE] Computed conditional probabilities for sample 127000 / 193492
[t-SNE] Computed conditional probabilities for sample 128000 / 193492
[t-SNE] Computed conditional probabilities for sample 129000 / 193492
[t-SNE] Computed conditional probabilities for sample 130000 / 193492
[t-SNE] Computed conditional probabilities for sample 131000 / 193492
[t-SNE] Computed con

Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,"[4.397582054138184, -3.5684354305267334]",The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,"[-0.3279964327812195, 8.61110782623291]",Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,"[4.169193744659424, -4.005739212036133]",An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...","[-2.864100933074951, -0.9488449692726135]",Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,"[0.2897432744503021, 10.175537109375]",Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,"[-5.589747428894043, 3.172562599182129]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...","[4.083289623260498, 9.9985933303833]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,"[4.8006367683410645, 9.934613227844238]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...","[4.684049606323242, 9.922048568725586]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,"[4.397582054138184, -3.5684354305267334]",The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,"[-0.3279964327812195, 8.61110782623291]",Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,"[4.169193744659424, -4.005739212036133]",An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...","[-2.864100933074951, -0.9488449692726135]",Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,"[0.2897432744503021, 10.175537109375]",Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,"[-5.589747428894043, 3.172562599182129]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...","[4.083289623260498, 9.9985933303833]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,"[4.8006367683410645, 9.934613227844238]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...","[4.684049606323242, 9.922048568725586]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


In [22]:
def PCAWithCustomDataset(input_file_name='Care_Reviews.csv', 
                        model_max_length=384,
                        output_json_file_name='test.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                        p_components=2,
                        use_labels=True):
     # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Using a bert model from sentence_transformers to generate embeddings
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(sentence_transformer_name)
    sentences = data['data_string']
    embeddings_for_pca = model.encode(sentences)
    
    # Reducing the dimensionality of embeddings with PCA
    from sklearn.decomposition import PCA

    pca_2d_class = PCA(n_components=p_components).fit(embeddings_for_pca, list(data['data_category_number']) if use_labels else None)

    pca_2d = pca_2d_class.transform(embeddings_for_pca)
    
    data['2d_coor'] = pca_2d.tolist()
    display(data)
    
    
    list_of_points = []
    for idx in range(len(data['data_string'])):
        tmp_dict = {}
        tmp_dict["data_x"] = str(data['2d_coor'][idx][0])
        tmp_dict["data_y"] = str(data['2d_coor'][idx][1])
        tmp_dict["data_category_number"] = str(data['data_category_number'][idx])
        tmp_dict["data_id"] = str(data['data_id'][idx])

        tmp_dict["data_title"] = str(data['data_title'][idx])
        tmp_dict["data_category"] = str(data['data_category'][idx])

        list_of_points.append(tmp_dict)
        
    import json
    with open(output_json_file_name, "w") as outfile:
        json.dump(list_of_points, outfile)
        
    return data

In [23]:
d = PCAWithCustomDataset(input_file_name='news_articles.csv', 
                        model_max_length=384,
                        output_json_file_name='news_articles_pca.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                        p_components=2,
                        use_labels=True)
d

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,,The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,,Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,,An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...",,Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,,Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Dataframe with NaN removed: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,,The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,,Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,,An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...",,Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,,Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...",,Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,"[-0.21102119982242584, -0.1436595618724823]",The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,"[0.12903806567192078, 0.03234309330582619]",Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,"[-0.14857056736946106, -0.17932890355587006]",An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...","[0.33694982528686523, -0.17755252122879028]",Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,"[-0.026972472667694092, 0.012225701473653316]",Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,"[0.3518408536911011, -0.1886298656463623]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...","[0.31185394525527954, -0.04102354869246483]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,"[0.1637495756149292, 0.08634164929389954]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...","[0.14689122140407562, 0.08248250186443329]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,115,A collaboration between artist Christina Kelly...,"[-0.21102119982242584, -0.1436595618724823]",The History of Gowanus Cemented in Sculpture,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
1,118,As Hurricane Irma draws closer to the Florida ...,"[0.12903806567192078, 0.03234309330582619]",Emergency Services Rush to Save Expensive Wine...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
2,119,Raúl Ortega Ayala’s new exhibition at Proyecto...,"[-0.14857056736946106, -0.17932890355587006]",An Artist Serves Up Food for Thought About Exc...,"0.005*""colbert"" + 0.005*""corden"" + 0.004*""week...",7
3,122,"Welcome to the public markets, Snapchat. Stock...","[0.33694982528686523, -0.17755252122879028]",Snap stock took a beating Monday and fell more...,"0.003*""apple"" + 0.003*""facebook"" + 0.002*""goog...",11
4,125,Vox Sentences is written by Dylan Matthews and...,"[-0.026972472667694092, 0.012225701473653316]",Vox Sentences: There’s a coup underway in Turkey,"0.005*""percent"" + 0.004*""reuters"" + 0.004*""com...",12
...,...,...,...,...,...,...
193487,813146,VNO's case approximately 90% of EBITDA will be...,"[0.3518408536911011, -0.1886298656463623]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193488,813146,"Individually or Collectively, Lead to Negative...","[0.31185394525527954, -0.04102354869246483]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193489,813146,THIS SITE. DIRECTORS AND SHAREHOLDERS RELEVANT...,"[0.1637495756149292, 0.08634164929389954]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4
193490,813146,"work of experts, including independent auditor...","[0.14689122140407562, 0.08248250186443329]",Fitch Affirms American Assets Trust's IDR at '...,"0.011*""million"" + 0.009*""versus"" + 0.009*""eiko...",4


In [50]:
def KMeansAndPCAWithCustomDataset(input_file_name="Care_review_full.csv", 
                        model_max_length=384,
                        output_json_file_name='Fifth_Care_review_full.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                        p_components=2,
                        # Keep use_labels=True always for KMeans+PCA (reason: there can be countless clusters without labels)                                   
                        use_labels=True):
     # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Using a bert model from sentence_transformers to generate embeddings
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(sentence_transformer_name)
    sentences = data['data_string']
    embeddings_for_kmeans = model.encode(sentences)
    
    # Reducing the dimensionality of embeddings with PCA After applying KMeans
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    
    kmeans = KMeans(n_clusters=len(data['data_category_number'].unique()))
    labels = kmeans.fit_predict(embeddings_for_kmeans, list(data['data_category_number']) if use_labels else None)
    labels_scale = kmeans.labels_

    pca_2d_class = PCA(n_components=p_components).fit(embeddings_for_kmeans, labels.tolist())

    pca_2d = pca_2d_class.transform(embeddings_for_kmeans)
    
    data['2d_coor'] = pca_2d.tolist()
    display(data)
    
    new_labels = labels.tolist()
    list_of_points = []
    for idx in range(len(data['data_string'])):
        tmp_dict = {}
        tmp_dict["data_x"] = str(data['2d_coor'][idx][0])
        tmp_dict["data_y"] = str(data['2d_coor'][idx][1])
#         tmp_dict["data_category_number"] = str(data['data_category_number'][idx])
        tmp_dict["data_category_number"] = str(new_labels[idx])
        tmp_dict["data_id"] = str(data['data_id'][idx])

        tmp_dict["data_title"] = str(data['data_title'][idx])
        tmp_dict["data_category"] = str(data['data_category'][idx])

        list_of_points.append(tmp_dict)
        
    import json
    with open(output_json_file_name, "w") as outfile:
        json.dump(list_of_points, outfile)
        
    return data

In [51]:
d = KMeansAndPCAWithCustomDataset(input_file_name='news_articles.csv', 
                        model_max_length=384,
                        output_json_file_name='news_articles_kmeanspca.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                        p_components=2,
                        # Keep use_labels=True always for KMeans+PCA (reason: there can be countless clusters without labels)                                   
                        use_labels=True)
d

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


ValueError: cannot set a row with mismatched columns

In [48]:
def PCAAndKMeansWithCustomDataset(input_file_name='Care_Reviews.csv', 
                        model_max_length=384,
                        output_json_file_name='test.json',
                         sentence_transformer_name='all-MiniLM-L6-v2',
                        p_components=2,
                        # Keep use_labels=True always for KMeans+PCA (reason: there can be countless clusters without labels)                                   
                        use_labels=True):
     # Function that returns a new dataframe with reduced sentence sizes (as most bert models have a max_seq_length)
    data = updateDataFrame(input_file_name, model_max_length=model_max_length)
    print('Dataframe with reduced sentence sizes: \n')
    display(data)
    
    # Replacing all NaN fields under '2d_coor' column with an empty string
    print('Dataframe with NaN removed: \n')
    data['2d_coor'] = ''
    display(data)
    
    # Using a bert model from sentence_transformers to generate embeddings
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(sentence_transformer_name)
    sentences = data['data_string']
    embeddings_for_kmeans = model.encode(sentences)
    
    # Reducing the dimensionality of embeddings with PCA After applying KMeans
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    
    pca_2d_class = PCA(n_components=p_components).fit(embeddings_for_kmeans, list(data['data_category_number']) if use_labels else None)
    pca_2d = pca_2d_class.transform(embeddings_for_kmeans)
    
    kmeans = KMeans(n_clusters=len(data['data_category_number'].unique()))
    labels = kmeans.fit_predict(pca_2d, list(data['data_category_number']) if use_labels else None)
    labels_scale = kmeans.labels_

    data['2d_coor'] = pca_2d.tolist()
    display(data)
    
    new_labels = labels.tolist()
    list_of_points = []
    for idx in range(len(data['data_string'])):
        tmp_dict = {}
        tmp_dict["data_x"] = str(data['2d_coor'][idx][0])
        tmp_dict["data_y"] = str(data['2d_coor'][idx][1])
#         tmp_dict["data_category_number"] = str(data['data_category_number'][idx])
        tmp_dict["data_category_number"] = str(new_labels[idx])
        tmp_dict["data_id"] = str(data['data_id'][idx])

        tmp_dict["data_title"] = str(data['data_title'][idx])
        tmp_dict["data_category"] = str(data['data_category'][idx])

        list_of_points.append(tmp_dict)
        
    import json
    with open(output_json_file_name, "w") as outfile:
        json.dump(list_of_points, outfile)
        
    return data

In [49]:
d = PCAAndKMeansWithCustomDataset(input_file_name="Care_review_full.csv", 
                        model_max_length=384,
                        output_json_file_name="Forth_Care_review_full.json",
                         sentence_transformer_name='all-MiniLM-L6-v2',
                        p_components=2,
                        # Keep use_labels=True always for KMeans+PCA (reason: there can be countless clusters without labels)                                   
                        use_labels=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['data_category_number'][idx] = data['data_category_number'][idx]


Dataframe with reduced sentence sizes: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,a224545f-1a07-47eb-96ee-a0d0cab23100,Good place to visit,,urgentCare: Access Urgent Medical Care Pickeri...,Very Good,4
1,01a8cfc8-e207-4c72-a9fa-fa6353fde529,Went here for a swollen Jaw. Even though I was...,,urgentCare: Access Urgent Medical Care Pickeri...,Poor,1
2,1a4fb4a6-6c56-4610-a151-b4b25a774cc8,I was seen relatively quickly and the staff wa...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
3,9356a4fd-54fd-4604-a6a8-2d3067eba7cc,Reception and service couldn't have been more ...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
4,77526eed-5ff3-4f2f-80ce-da1048137cf4,I came in they were very busy the receptionist...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
...,...,...,...,...,...,...
33498,37b547d6-931d-417d-b732-95845b0bcc22,and asked me like a hundred ridiculous questio...,,hospital: Licking Memorial Hospital,Poor,1
33499,26efef6b-9f07-4a25-8a91-f85926b2012e,went to this ER twice and each time the doctor...,,hospital: Mount Carmel East,Poor,1
33500,26efef6b-9f07-4a25-8a91-f85926b2012e,TO HELP ME. Why would you work in the medical ...,,hospital: Mount Carmel East,Poor,1
33501,017ced75-7179-4f6f-b467-32113a9c1704,"I usually review restaurants, but occasionally...",,hospital: University Hospitals Parma Medical C...,Very Good,4


Dataframe with NaN removed: 



Unnamed: 0,data_id,data_string,2d_coor,data_title,data_category,data_category_number
0,a224545f-1a07-47eb-96ee-a0d0cab23100,Good place to visit,,urgentCare: Access Urgent Medical Care Pickeri...,Very Good,4
1,01a8cfc8-e207-4c72-a9fa-fa6353fde529,Went here for a swollen Jaw. Even though I was...,,urgentCare: Access Urgent Medical Care Pickeri...,Poor,1
2,1a4fb4a6-6c56-4610-a151-b4b25a774cc8,I was seen relatively quickly and the staff wa...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
3,9356a4fd-54fd-4604-a6a8-2d3067eba7cc,Reception and service couldn't have been more ...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
4,77526eed-5ff3-4f2f-80ce-da1048137cf4,I came in they were very busy the receptionist...,,urgentCare: Access Urgent Medical Care Pickeri...,Excelent,5
...,...,...,...,...,...,...
33498,37b547d6-931d-417d-b732-95845b0bcc22,and asked me like a hundred ridiculous questio...,,hospital: Licking Memorial Hospital,Poor,1
33499,26efef6b-9f07-4a25-8a91-f85926b2012e,went to this ER twice and each time the doctor...,,hospital: Mount Carmel East,Poor,1
33500,26efef6b-9f07-4a25-8a91-f85926b2012e,TO HELP ME. Why would you work in the medical ...,,hospital: Mount Carmel East,Poor,1
33501,017ced75-7179-4f6f-b467-32113a9c1704,"I usually review restaurants, but occasionally...",,hospital: University Hospitals Parma Medical C...,Very Good,4


RuntimeError: CUDA out of memory. Tried to allocate 46.00 MiB (GPU 0; 15.78 GiB total capacity; 4.93 GiB already allocated; 13.00 MiB free; 4.97 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF