<a href="https://colab.research.google.com/github/vlavrent/NLP_Dash/blob/main/Topic_Modelling_for_English_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install bertopic

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:

class English_Preprocess():
    def __init__(self,data_path,comment_col,normalised_col):
        self.path = data_path
        self.data = pd.read_csv(self.path,index_col=0)
        self.col = comment_col
        self.norm_col = normalised_col


    # Read data
    def read_data(self):
        return pd.read_csv(self.path,index_col=0)

    # Remove emoji
    def remove_emojis(self,text):
      emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)

      return emoji_pattern.sub(r'', text)

    # Remove emoji per row
    def convert_emoji(self):
        self.data[self.norm_col] = self.data[self.norm_col].apply(lambda x:  self.remove_emojis(x))
        self.data = self.data.reset_index()

    # Remove null rows
    def remove_null(self):
        self.data = self.data[self.data[self.col]!='\\N']
        self.data = self.data.dropna()

    # Add space between numbers and text
    def create_number_string_space(self):
        self.data[self.norm_col] = self.data[self.norm_col].apply(lambda x: re.sub('\d{5,}', r' ', x))
        self.data[self.norm_col] = self.data[self.norm_col].apply(lambda x: re.sub('i x e', r' ', x))
        self.data[self.norm_col] = self.data[self.norm_col].apply(lambda x: re.sub('(\d+(\.\d+)?)', r' \1 ', x))


    # Lowercase text
    def lower_data(self):
        self.data[self.norm_col] = self.data[self.col].str.lower()

    # Remove Punctuation
    def remove_punctuation(self):
        self.data[self.norm_col] = self.data[self.norm_col].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

    def replace_words_characters(self,x):

        x = re.sub("don’t|dont|don't","do not",x)
        x = re.sub("doesn’t|doesnt","does not",x)
        x = re.sub("didn’t|didnt","did not",x)
        x = re.sub("i'm|i’m","i am",x)
        x = re.sub("fair","fare",x)
        x = re.sub("\\\\n"," ",x)

        return x

   # Replace words
    def replace_values(self):
        self.data[self.norm_col] = self.data[self.norm_col].apply(lambda x: self.replace_words_characters(x))

    # Remove two or more spaces
    def remove_multiple_space(self):
        #self.data[self.norm_col] = self.data[self.norm_col].apply(lambda x: re.sub(' +', ' ',x))

        self.data[self.norm_col] = self.data[self.norm_col].apply(lambda x: " ".join(x.split()))

    # Dropna
    def empty_string(self):
      self.data = self.data[self.data[self.norm_col]!='']

    # Remove 3 character words
    def filter_char(self):
      self.data[self.norm_col] = self.data[self.norm_col].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
      #self.data = self.data[self.data['char_bool']!=True]


    def return_data(self):

        self.remove_null()

        self.lower_data()

        self.convert_emoji()

        self.replace_values()

        self.create_number_string_space()

        self.remove_multiple_space()

        self.empty_string()

        return self.data

# Split train and label
def split_data(data):
  X = data[['normalised']]
  y = data[['New_Topic']]
  return X,y

# Define Encoder
def encoder(train):
  le = LabelEncoder().fit(train)
  return le

# Label Encoder
def label_encode_decode(train,predict,action,le):

  if action=='encode':
    return le.transform(train)
  elif action=='decode':
    return le.inverse_transform(predict)



In [4]:
clean_data = English_Preprocess('/content/drive/My Drive/NLP Dash/train_test/courier_csat_train_set.csv','csat_comment','normalised')
data  = clean_data.return_data()
data
X,y = split_data(data)
enc = encoder(y)
y_encoded = label_encode_decode(y,None,'encode',enc)

<H1>Topic model</H1>

In [9]:
empty_dimensionality_model = BaseDimensionalityReduction()
clf = LogisticRegression()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Create a fully supervised BERTopic instance
topic_model= BERTopic(
        umap_model=empty_dimensionality_model,
        hdbscan_model=clf,
        ctfidf_model=ctfidf_model
)

topics, probs = topic_model.fit_transform(X['normalised'].values.tolist(), y=y_encoded)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

<H1>Predict Topics</H1>

In [12]:
clean_data = English_Preprocess('/content/drive/My Drive/NLP Dash/train_test/courier_csat_test_set.csv','reason','normalised')
pred_data  = clean_data.return_data()


topic, _ = topic_model.transform(pred_data['normalised'].values.tolist())
pred_topic = label_encode_decode(y,topic,'decode')
pred_topic = pd.DataFrame(pred_topic,columns = ['Topic'])
pred_data['Topic'] = pred_topic
pred_data

Unnamed: 0,driver_uuid,datestr,score,reason,normalised,Topic
0,bf63836f-01cf-41e9-b655-60b97b92b076,2024-01-01,10,All the things are feb and excellent,all the things are feb and excellent,Account deactivation
1,35dd1b42-b547-4764-830a-4bc127fbcf48,2024-01-01,7,As a plus The setup is good\nBut as a negative...,as a plus the setup is good but as a negative ...,Demand
2,03b0300d-7d73-404e-9a1c-ed0439b341e4,2024-01-01,10,Because am very happy and Lecky to work with U...,because am very happy and lecky to work with u...,Account deactivation
3,461fad3f-7934-47c0-a581-03abf381729d,2024-01-01,10,Because is the best delivery and app I love uber,because is the best delivery and app i love uber,Account deactivation
4,bc5a4a73-ef73-47e6-8a3d-6edd4320f16c,2024-01-01,10,Because it's a best service. And it's easy to ...,because it's a best service. and it's easy to ...,Account deactivation
...,...,...,...,...,...,...
5191,665ba686-9cb6-44a8-b998-b655be796ebd,2024-05-31,10,Good for acquisitions my money 💰 money 💴,good for acquisitions my money money,
5192,431000e5-dc2b-4928-abac-e50b7e35c52f,2024-05-31,6,The pay rate could be a little better when you...,the pay rate could be a little better when you...,
5193,a6d7def1-9d64-415a-a7be-543fbcfddcaa,2024-05-31,10,Worth it,worth it,
5194,86a76278-f166-4fcd-840c-b566fff070b8,2024-05-31,10,you are good,you are good,


In [13]:
pred_topic

Unnamed: 0,Topic
0,Account deactivation
1,Demand
2,Account deactivation
3,Account deactivation
4,Account deactivation
...,...
5179,Account deactivation
5180,Flexible hours
5181,Account deactivation
5182,Account deactivation
