In [None]:
!pip install transformers tensorflow-text xlsxwriter pandas

In [None]:
# Mount Google Drive; later cells read their input files from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Preparation
- Create a function that creates the table (later saved as an Excel file) that holds the results of the models (it will be used later).
- Extract the relevant data (the client sentences) from the dataset and store it in a list.

In [4]:
import pandas as pd

def creating_excel() -> pd.DataFrame:
    """Build the empty results table.

    Returns:
        An empty DataFrame with the two columns 'Sentence' and 'category'.
    """
    column_names = ('Sentence', 'category')

    # One empty list per column gives a frame with zero rows.
    empty_table = pd.DataFrame({column: [] for column in column_names})
    print('DataFrame Created')
    return empty_table


In [5]:
import pandas
from xlsxwriter import Workbook

# Load the labelled dataset; the first column holds the client sentences.
input_excel = pandas.read_csv('/content/drive/MyDrive/src/GPTClassification.csv') # data set

# rows num
n_rows = len(input_excel.index)

# columns num
n_cols_ = len(input_excel.columns)

# Select the first column purely by position. The previous `iloc[row][0]`
# indexed a label-indexed row Series with an integer, which relies on a
# deprecated positional fallback and builds a throw-away Series per row.
client_message = input_excel.iloc[:, 0].tolist() # 364 sentence by client with nouns



# Unsupervised  classification

In unsupervised classification, the algorithm tries to identify patterns or structures within the data by analyzing the similarities and differences between data points. The goal is to partition the data into groups or clusters based on their similarity, so that data points within the same cluster are more similar to each other than to those in other clusters.

- First step: We use a BERT token-classification model (set up like Named Entity Recognition, but predicting part-of-speech tags) to get the nouns in each sentence.
- Second step: We use the smaller-LaBSE (Language-agnostic BERT Sentence Embedding) model to get the sentence embeddings.

A note about smaller-LaBSE : LaBSE is a pre-trained model that produces fixed-length vectors to represent input sentences in a way that preserves their semantic meaning across languages.

# First step
We use a BERT token-classification model (set up like Named Entity Recognition, but predicting part-of-speech tags) to get the nouns in each sentence.

In [None]:

import torch
import numpy as np

# Load the model and the tokenizer from the downloaded files.
# map_location=torch.device('cpu') keeps every tensor on the CPU, so no GPU is needed.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load files you trust.
# NOTE(review): newer PyTorch releases default to weights_only=True, which presumably
# rejects whole-object pickles like these; confirm against the installed version.
model = torch.load(r"/content/drive/MyDrive/src/my_model_3.pth", map_location=torch.device('cpu'))
tokenizer = torch.load(r"/content/drive/MyDrive/src/my_tokenizer.pth", map_location=torch.device('cpu'))

# Inference only: put the network in evaluation mode so layers such as dropout
# are disabled and the predicted tags are deterministic.
model.eval()

# Tag vocabulary: position i is the tag name for predicted class index i.
# It never changes, so it is defined once instead of on every loop iteration.
tag_values = ['DT', 'POS', 'NNS', 'VBG', 'CD', ';', 'JJS', 'NN', 'RP', '.', 'WP', 'PRP', 'CC', 'WRB', 'RBR', 'MD', 'VBZ', 'UH', 'FW', 'PDT',
              'NNP', ':', 'JJ', 'JJR', 'RRB', '$', 'VB', ',', 'VBP', 'PRP$', 'NNPS', '``', 'IN', 'EX', 'TO', 'RB', 'VBN', 'RBS', 'WDT', 'LRB', 'VBD', 'WP$', 'PAD']

all_sentence_nouns = []

# Our input
for sentence in client_message:
  tokenized_sentence = tokenizer.encode(sentence)  # list of token ids for the sentence
  input_ids = torch.tensor([tokenized_sentence])  # batch of one; stays on the CPU

  with torch.no_grad():
      output = model(input_ids)

  # output[0] holds the per-token class scores; take the best class for every token
  label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

  tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])

  # join bpe split tokens: a "##" piece is glued onto the previous word,
  # and the word keeps the label predicted for its first piece
  new_tokens, new_labels = [], []
  for token, label_idx in zip(tokens, label_indices[0]):
      if token.startswith("##"):
          new_tokens[-1] = new_tokens[-1] + token[2:]
      else:
          new_labels.append(tag_values[label_idx])
          new_tokens.append(token)

  # keep every word tagged with a noun tag (NN, NNS, NNP, NNPS),
  # skipping the special tokens and anything containing a question mark
  nouns_from_sentence = [
      token
      for token, label in zip(new_tokens, new_labels)
      if 'NN' in label and '[SEP]' not in token and '[CLS]' not in token and '?' not in token
  ]

  all_sentence_nouns.append(nouns_from_sentence) # [['noun1','noun2',..],[]]

print(f'all_sentence_nouns = {all_sentence_nouns}')
print(f'client_message = {client_message}')


# Second step
We use smaller-LaBSE(Language-agnostic BERT Sentence Embedding) model to get the sentences embeddings.

We have 3 options:
1. Get the vector for *all* the sentence.
2. Get the vector for a *concatenation string of the nouns* in the sentence.
3. Get a vector *for each noun in the sentence*, sum the values obtained for each category, then calculate the average over all the nouns in the sentence and return the category with the maximum value.

For each option we check which category the sentence belongs to (by calculating the cosine similarity between the vector that represents the sentence and the vectors that represent the categories).

# The first option
Get the vector for *all* the sentence:

In [None]:
import tensorflow as tf
import tensorflow_text 
import tensorflow_hub as hub
from xlsxwriter import Workbook

# Minimum cosine similarity for a category to be assigned to a sentence.
TRESHOLD = 0.25

# Loading models from tfhub.dev
encoder = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang/1")
preprocessor = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang_preprocess/1")

# Constructing model to encode texts into high-dimensional vectors
sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
encoder_inputs = preprocessor(sentences)
sentence_representation = encoder(encoder_inputs)["pooled_output"]
normalized_sentence_representation = tf.nn.l2_normalize(sentence_representation, axis=-1)  # for cosine similarity
model = tf.keras.Model(sentences, normalized_sentence_representation)

# Start Algo

index_category = {0:'Environment and climate resilience',1:'Mobility (transport)',2:'Local identity',3:'Future of work',4:'Land use'}

# The category embeddings do not depend on the client sentence, so they are
# encoded once here instead of once per sentence. The texts are derived from
# index_category so that score column i always matches index_category[i].
categories_sentences = tf.constant([index_category[i] for i in range(len(index_category))])
categories_embeds = model(categories_sentences)

output_excel = creating_excel()  # create an Excel file
excel_index = 0

for sentence in client_message:
  # Encoding the message.
  messages_sentences = tf.constant([sentence])
  messages_embeds = model(messages_sentences)

  # Messages-categories similarity: one row (the sentence) x one column per category
  result = tf.tensordot(messages_embeds, categories_embeds, axes=[[1], [1]])

  # write the sentence in the excel
  output_excel.loc[excel_index, 'Sentence'] = sentence

  # Every category whose similarity passes the threshold, comma separated
  # (needs to be changed according to the result from ChatGPT)
  category = ', '.join(
      index_category[i]
      for i, v in enumerate(result.numpy()[0])
      if float(v) > TRESHOLD
  )

  output_excel.loc[excel_index, 'category'] = category
  excel_index += 1

# NOTE(review): this is saved in the working directory, while the analysis cell
# reads Results1.xlsx from /content/drive/MyDrive/src -- confirm the file is copied there.
output_excel.to_excel("Results1.xlsx", index=False)  # save the Excel file

# The second option
Get the vector for a *concatenation string of the nouns* in the sentence:

In [None]:
import tensorflow as tf
import tensorflow_text  # noqa
import tensorflow_hub as hub
from xlsxwriter import Workbook

# Minimum cosine similarity for a category to be assigned to a sentence.
TRESHOLD = 0.27

# Loading models from tfhub.dev
encoder = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang/1")
preprocessor = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang_preprocess/1")

# Constructing model to encode texts into high-dimensional vectors
sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
encoder_inputs = preprocessor(sentences)
sentence_representation = encoder(encoder_inputs)["pooled_output"]
normalized_sentence_representation = tf.nn.l2_normalize(sentence_representation, axis=-1)  # for cosine similarity
model = tf.keras.Model(sentences, normalized_sentence_representation)

# Start Algo
index_category = {0:'Environment and climate resilience',1:'Mobility (transport)',2:'Local identity',3:'Future of work',4:'Land use'}
results_vectors = []

# The category embeddings do not depend on the client sentence, so they are
# encoded once here instead of once per sentence. The texts are derived from
# index_category so that score column i always matches index_category[i].
categories_sentences = tf.constant([index_category[i] for i in range(len(index_category))])
categories_embeds = model(categories_sentences)

output_excel = creating_excel()  # create an Excel file
excel_index = 0


for nouns,sentence in zip(all_sentence_nouns,client_message):

  # write the sentence in the excel
  output_excel.loc[excel_index, 'Sentence'] = sentence

  # when list of nouns is empty, leave the category blank and continue to the next iteration
  if len(nouns) == 0:
    output_excel.loc[excel_index, 'category'] = ''
    excel_index += 1
    continue

  # creates a concatenated string of all nouns
  conca_string = ' '.join(nouns)

  # Encoding the concatenated nouns.
  messages_sentences = tf.constant([conca_string])
  messages_embeds = model(messages_sentences)

  # Messages-categories similarity: one row (the nouns string) x one column per category
  result = tf.tensordot(messages_embeds, categories_embeds, axes=[[1], [1]])
  results_vectors.append(result[0]) # save all the vectors for concatenated nouns

  # Every category whose similarity passes the threshold, comma separated
  # (needs to be changed according to the result from ChatGPT)
  category = ', '.join(
      index_category[i]
      for i, v in enumerate(result.numpy()[0])
      if float(v) > TRESHOLD
  )

  output_excel.loc[excel_index, 'category'] = category
  excel_index += 1

# NOTE(review): this is saved in the working directory, while the analysis cell
# reads Results2.xlsx from /content/drive/MyDrive/src -- confirm the file is copied there.
output_excel.to_excel("Results2.xlsx", index=False)  # save the Excel file



# The third option
Get a vector *for each noun in the sentence*, sum the values obtained for each category, then calculate the average over all the nouns in the sentence and return the category with the maximum value:

**example:**

  sentence = "I think there should be many various sitting and studying    places for students, both inside and outside of the building."

  nouns = places students building

  call the model on each noun -> we get [value1,value2,...,value5]

  Sum all the values for each category and then return the category with the max value.

  

smaller-LaBSE.py

In [None]:
import tensorflow as tf
import tensorflow_text  # noqa
import tensorflow_hub as hub

# Minimum average cosine similarity for a category to be assigned to a sentence.
# NOTE(review): the average is now taken over the nouns of each sentence (see below),
# so this value may need re-tuning.
TRESHOLD = 0.25

# Loading models from tfhub.dev
encoder = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang/1")
preprocessor = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang_preprocess/1")

# Constructing model to encode texts into high-dimensional vectors
sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
encoder_inputs = preprocessor(sentences)
sentence_representation = encoder(encoder_inputs)["pooled_output"]
normalized_sentence_representation = tf.nn.l2_normalize(sentence_representation, axis=-1)  # for cosine similarity
model = tf.keras.Model(sentences, normalized_sentence_representation)

# Start Algo
index_category = {0:'Environment and climate resilience',1:'Mobility (transport)',2:'Local identity',3:'Future of work',4:'Land use'}

# The category embeddings do not depend on the nouns, so they are encoded once.
# The texts are derived from index_category so that score column i always
# matches index_category[i].
categories_sentences = tf.constant([index_category[i] for i in range(len(index_category))])
categories_embeds = model(categories_sentences)

output_excel = creating_excel()  # create an Excel file
excel_index = 0

for nouns,sentence in zip(all_sentence_nouns,client_message):

  output_excel.loc[excel_index, 'Sentence'] = sentence

  # when list of nouns is empty
  if len(nouns) == 0:
    # write the category in the excel
    output_excel.loc[excel_index, 'category'] = ''
    excel_index += 1
    continue

  # Encode all nouns of this sentence in one batch.
  messages_sentences = tf.constant(nouns)
  messages_embeds = model(messages_sentences)

  # Nouns-categories similarity: one row per noun, one column per category
  result = tf.tensordot(messages_embeds, categories_embeds, axes=[[1], [1]])

  # Per-category totals for THIS sentence only. The totals are rebuilt for every
  # sentence; previously one running list was shared by all sentences, so
  # earlier sentences leaked into the scores of later ones.
  sum_result_column = result.numpy().sum(axis=0).tolist()

  # Average over the nouns of the sentence (previously divided by the number of
  # categories, 5, regardless of how many nouns were summed).
  avg_result_column = [total / len(nouns) for total in sum_result_column]

  # Every category whose average similarity passes the threshold, comma separated
  category = ', '.join(
      index_category[i]
      for i, value in enumerate(avg_result_column)
      if float(value) > TRESHOLD
  )

  output_excel.loc[excel_index, 'category'] = category
  excel_index += 1


# NOTE(review): this is saved in the working directory, while the analysis cell
# reads Results3.xlsx from /content/drive/MyDrive/src -- confirm the file is copied there.
output_excel.to_excel("Results3.xlsx", index=False)  # save the Excel file

# Analyze the data Unsupervised
* Convert the results from the unsupervised methods and ChatGPT to vectors such that each element of the vector represents a category (1 = in the category, 0 = not in the category).
* For each option, check the accuracy, precision, recall and F1 score (using the ChatGPT results as the ground truth).

Prepare ChatGPT results

In [None]:
import pandas as pd
import tensorflow as tf
def organize_data(path: str, is_csv: bool):
  """Load a labelled sentence table and normalise its 'category' column.

  Args:
    path: location of the file to read; it must contain the columns
      'Sentence' and 'category'.
    is_csv: True to read the file as CSV, False to read it as an Excel file.

  Returns:
    A DataFrame with duplicated sentences removed (the first occurrence is
    kept) whose 'category' column holds a tuple of category names per row.
    Rows without any category get the single label 'other'.
  """
  df_original = pd.read_csv(path) if is_csv else pd.read_excel(path)

  # .copy() gives an independent frame, so the column assignment below does not
  # write into a slice of df_original (SettingWithCopyWarning).
  df_clean = df_original.drop_duplicates(subset="Sentence", keep="first").copy()

  def _split_categories(raw) -> tuple:
    """Turn one raw cell ('a; b', 'a,b', NaN, ...) into a tuple of category names."""
    if pd.isna(raw):
      return ('other',)  # replace null in category with "other"
    # Accept ';' and ',' as separators with or without surrounding spaces.
    # Previously 'a;b' became 'a,b' and was then never split on ', '.
    names = [name.strip() for name in str(raw).replace(';', ',').split(',')]
    names = [name for name in names if name]
    return tuple(names) if names else ('other',)

  # apply() works on any dtype, including an all-empty (float NaN) column
  # on which the .str accessor would fail.
  df_clean['category'] = df_clean['category'].apply(_split_categories)
  return df_clean


# Organzie the DATA

In [None]:
# Load the ChatGPT labels and the results of the three options.
# NOTE(review): the option cells save Results1-3.xlsx with a relative path, while
# they are read here from the Drive folder -- confirm the files are copied there first.
df_clean_GPT = organize_data('/content/drive/MyDrive/src/GPTClassification.csv', True)

df_clean_option_1, df_clean_option_2, df_clean_option_3 = [
    organize_data(results_path, False)
    for results_path in (
        '/content/drive/MyDrive/src/Results1.xlsx',
        '/content/drive/MyDrive/src/Results2.xlsx',
        '/content/drive/MyDrive/src/Results3.xlsx',
    )
]

# One ragged string tensor per table: each row holds that sentence's categories.
categories_GPT, categories_option_1, categories_option_2, categories_option_3 = [
    tf.ragged.constant(frame["category"].values)
    for frame in (df_clean_GPT, df_clean_option_1, df_clean_option_2, df_clean_option_3)
]

# Multi-hot encoder for the labels; its vocabulary is learned from the ChatGPT labels only.
# NOTE(review): with num_oov_indices=0 there is no out-of-vocabulary slot, so a category
# that appears only in an option's results presumably cannot be encoded -- confirm.
lookup = tf.keras.layers.StringLookup(num_oov_indices=0, output_mode="multi_hot")
lookup.adapt(categories_GPT)
vocab = lookup.get_vocabulary()

def invert_multi_hot(encoded_labels):
    """Map one multi-hot encoded label back to the vocab terms it switches on."""
    # nonzero()[0] yields the indices along the first axis of the entries equal to 1.0
    hot_indices = np.nonzero(encoded_labels == 1.0)[0]
    return np.take(vocab, hot_indices)


# Encode every table with the same lookup so the columns line up across tables.
label_binarized_chatGPT, label_binarized_option_1, label_binarized_option_2, label_binarized_option_3 = [
    lookup(categories).numpy()
    for categories in (categories_GPT, categories_option_1, categories_option_2, categories_option_3)
]

## Option 1

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import matplotlib.pyplot as plt
from tensorflow.python.keras import metrics
from tensorflow.keras import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns

print(f"option 1:\n")

# One binary evaluation per category column, with the ChatGPT labels as ground truth.
for i in range(label_binarized_option_1.shape[1]):
    # vocab is the StringLookup vocabulary; column i of the multi-hot matrices
    # is expected to correspond to vocab[i].
    category_name = vocab[i]
    print(f"Category {i+1} ({category_name}):\n")
    true_labels = label_binarized_chatGPT[:, i]
    predicted_labels_1 = label_binarized_option_1[:, i]

    # Calculate metrics. zero_division=0 reports 0 (instead of warning) when a
    # category has no predicted or no actual positives.
    accuracy = accuracy_score(true_labels, predicted_labels_1)
    precision = precision_score(true_labels, predicted_labels_1, zero_division=0)
    recall = recall_score(true_labels, predicted_labels_1, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels_1, zero_division=0)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}\n")

    # Plot confusion matrix. labels=[0, 1] keeps it 2x2 even when one class is
    # absent from both vectors (it would otherwise collapse to 1x1).
    cm = confusion_matrix(true_labels, predicted_labels_1, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt="d", ax=ax)
    ax.set_title(f'Confusion matrix for category {i+1} ({category_name})')
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')
    plt.show()
    plt.close(fig)  # release the figure; one is created per category

## Option 2

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt


print(f"option 2:\n")

# One binary evaluation per category column, with the ChatGPT labels as ground truth.
for i in range(label_binarized_option_2.shape[1]):
    # vocab is the StringLookup vocabulary; column i of the multi-hot matrices
    # is expected to correspond to vocab[i].
    category_name = vocab[i]
    print(f"Category {i+1} ({category_name}):\n")
    true_labels = label_binarized_chatGPT[:, i]
    predicted_labels_2 = label_binarized_option_2[:, i]

    # Calculate metrics. zero_division=0 reports 0 (instead of warning) when a
    # category has no predicted or no actual positives.
    accuracy = accuracy_score(true_labels, predicted_labels_2)
    precision = precision_score(true_labels, predicted_labels_2, zero_division=0)
    recall = recall_score(true_labels, predicted_labels_2, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels_2, zero_division=0)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}\n")

    # Plot confusion matrix. labels=[0, 1] keeps it 2x2 even when one class is
    # absent from both vectors (it would otherwise collapse to 1x1).
    cm = confusion_matrix(true_labels, predicted_labels_2, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt="d", ax=ax)
    ax.set_title(f'Confusion matrix for category {i+1} ({category_name})')
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')
    plt.show()
    plt.close(fig)  # release the figure; one is created per category

## Option 3

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
import numpy as np

print(f"option 3:\n")

# One binary evaluation per category column, with the ChatGPT labels as ground truth.
for i in range(label_binarized_option_3.shape[1]):
    # vocab is the StringLookup vocabulary; column i of the multi-hot matrices
    # is expected to correspond to vocab[i].
    category_name = vocab[i]
    print(f"Category {i+1} ({category_name}):\n")
    true_labels = label_binarized_chatGPT[:, i]
    predicted_labels_3 = label_binarized_option_3[:, i]

    # Calculate metrics. zero_division=0 reports 0 (instead of warning) when a
    # category has no predicted or no actual positives.
    accuracy = accuracy_score(true_labels, predicted_labels_3)
    precision = precision_score(true_labels, predicted_labels_3, zero_division=0)
    recall = recall_score(true_labels, predicted_labels_3, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels_3, zero_division=0)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}\n")

    # Plot confusion matrix. labels=[0, 1] keeps it 2x2 even when one class is
    # absent from both vectors (it would otherwise collapse to 1x1).
    cm = confusion_matrix(true_labels, predicted_labels_3, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt="d", ax=ax)
    ax.set_title(f'Confusion matrix for category {i+1} ({category_name})')
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')
    plt.show()
    plt.close(fig)  # release the figure; one is created per category