<a href="https://colab.research.google.com/github/vlavrent/NLP_Dash/blob/main/Topic_Modelling_for_English_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install bertopic

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

class English_Preprocess():
    """Clean free-text English comments stored in a CSV file.

    The CSV at ``data_path`` is loaded into ``self.data`` on construction,
    using its first column as the index. Every cleaning step rewrites
    ``self.data``; ``return_data`` runs the standard sequence of steps and
    returns the resulting frame.

    Args:
        data_path: Path of the CSV file to load.
        comment_col: Name of the column holding the raw comments.
        normalised_col: Name of the column that receives the cleaned text.
    """

    # Compiled once for the class rather than once per row.
    # NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also
    # spans non-emoji blocks such as the CJK ideographs, so characters from
    # those scripts are stripped as well.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)

    # (pattern, replacement) pairs applied in order by
    # replace_words_characters. Each contraction accepts the typographic
    # apostrophe, the ASCII apostrophe, or no apostrophe at all.
    # NOTE(review): "fair" is replaced wherever it occurs, including inside
    # longer words ("unfair" becomes "unfare") -- confirm this is intended.
    _REPLACEMENTS = (
        (re.compile(r"don[’']?t"), "do not"),
        (re.compile(r"doesn[’']?t"), "does not"),
        (re.compile(r"didn[’']?t"), "did not"),
        (re.compile(r"i[’']m"), "i am"),
        (re.compile(r"fair"), "fare"),
        # A literal backslash followed by "n" (an escaped newline left in
        # the text), not a real newline character.
        (re.compile(r"\\n"), " "),
    )

    # Digit runs of five or more characters are dropped entirely.
    _LONG_NUMBER = re.compile(r'\d{5,}')
    # Integers and decimals that get padded with a space on both sides.
    _NUMBER = re.compile(r'(\d+(\.\d+)?)')

    def __init__(self, data_path, comment_col, normalised_col):
        self.path = data_path
        self.data = pd.read_csv(self.path, index_col=0)
        self.col = comment_col
        self.norm_col = normalised_col

    def _apply_to_norm(self, func):
        """Replace the normalised column with ``func`` applied to each value."""
        self.data[self.norm_col] = self.data[self.norm_col].apply(func)

    # Read data
    def read_data(self):
        """Re-read the CSV at ``self.path`` and return it as a new frame."""
        return pd.read_csv(self.path, index_col=0)

    # Remove emoji
    def remove_emojis(self, text: str) -> str:
        """Return ``text`` with every character matched by the emoji pattern removed."""
        return self._EMOJI_PATTERN.sub('', text)

    # Remove emoji per row
    def convert_emoji(self):
        """Strip emojis from the normalised column, then reset the index.

        ``reset_index`` moves the current index into a regular column.
        """
        self._apply_to_norm(self.remove_emojis)
        self.data = self.data.reset_index()

    # Remove null rows
    def remove_null(self):
        """Drop placeholder and missing rows.

        Rows whose comment column is the two-character string made of a
        backslash followed by ``N`` are removed first, then every row that
        has a missing value in any column.
        """
        self.data = self.data[self.data[self.col] != '\\N']
        self.data = self.data.dropna()

    def _space_numbers(self, text: str) -> str:
        """Drop long digit runs and pad the remaining numbers with spaces."""
        text = self._LONG_NUMBER.sub(' ', text)
        # NOTE(review): the meaning of the "i x e" token is not visible in
        # this file; it is replaced by a single space.
        text = re.sub('i x e', ' ', text)
        return self._NUMBER.sub(r' \1 ', text)

    # Add space between numbers and text
    def create_number_string_space(self):
        """Separate numbers from adjacent text in the normalised column."""
        self._apply_to_norm(self._space_numbers)

    # Lowercase text
    def lower_data(self):
        """Create the normalised column as the lower-cased comment column."""
        self.data[self.norm_col] = self.data[self.col].str.lower()

    # Remove Punctuation
    def remove_punctuation(self):
        """Replace every character that is neither a word character nor whitespace with a space."""
        self._apply_to_norm(lambda x: re.sub(r'[^\w\s]', ' ', x))

    def replace_words_characters(self, x: str) -> str:
        """Expand contractions and apply the other fixed replacements to ``x``.

        The patterns are lower-case only, so ``x`` is expected to have been
        lower-cased already.
        """
        for pattern, replacement in self._REPLACEMENTS:
            x = pattern.sub(replacement, x)
        return x

    # Replace words
    def replace_values(self):
        """Apply ``replace_words_characters`` to every normalised comment."""
        self._apply_to_norm(self.replace_words_characters)

    # Remove two or more spaces
    def remove_multiple_space(self):
        """Collapse whitespace runs to one space and trim both ends."""
        self._apply_to_norm(lambda x: " ".join(x.split()))

    # Drop rows whose normalised text is empty
    def empty_string(self):
        """Remove rows whose normalised column is the empty string."""
        self.data = self.data[self.data[self.norm_col] != '']

    # Remove words of one or two characters
    def filter_char(self):
        """Keep only the words longer than two characters in each comment."""
        self._apply_to_norm(
            lambda x: ' '.join([w for w in x.split() if len(w) > 2]))

    def return_data(self) -> pd.DataFrame:
        """Run the cleaning sequence and return the cleaned frame.

        Steps, in order: drop null rows, lower-case, strip emojis (which
        also resets the index), replace words, space out numbers, collapse
        whitespace, drop empty rows. ``remove_punctuation`` and
        ``filter_char`` are not part of this sequence.
        """
        self.remove_null()
        self.lower_data()
        self.convert_emoji()
        self.replace_values()
        self.create_number_string_space()
        self.remove_multiple_space()
        self.empty_string()

        return self.data

# Separate the feature column from the label column
def split_data(data):
    """Return ``(X, y)``: the 'normalised' and 'New_Topic' columns of ``data``.

    Both items are one-column selections made with a list of column names,
    so each keeps its two-dimensional, frame-like shape.
    """
    feature_cols, label_cols = ['normalised'], ['New_Topic']
    return data[feature_cols], data[label_cols]

# Label Encoder
def label_encode_decode(train, predict, action):
    """Encode training labels, or decode predicted label ids back to labels.

    A new ``LabelEncoder`` is fitted on ``train`` on every call, so decoding
    only round-trips when ``train`` holds the same label set that was used
    for encoding.

    Args:
        train: Labels to fit the encoder on. A one-column table of shape
            (n, 1) is flattened to (n,) first; passing it unflattened makes
            scikit-learn emit a column-vector conversion warning.
        predict: Encoded labels to map back to the original labels. Only
            read when ``action`` is 'decode'.
        action: Either 'encode' or 'decode'.

    Returns:
        The encoded form of ``train`` for 'encode', or the decoded form of
        ``predict`` for 'decode'.

    Raises:
        ValueError: If ``action`` is neither 'encode' nor 'decode'.
    """
    def _as_1d(values):
        # Flatten only genuine column vectors; any other shape is passed on
        # unchanged so the encoder's own validation still applies to it.
        arr = np.asarray(values)
        if arr.ndim == 2 and arr.shape[1] == 1:
            return arr.ravel()
        return arr

    if action not in ('encode', 'decode'):
        # Previously an unknown action fell through and returned None.
        raise ValueError(
            "action must be 'encode' or 'decode', got {!r}".format(action))

    labels = _as_1d(train)
    le = LabelEncoder().fit(labels)

    if action == 'encode':
        return le.transform(labels)
    return le.inverse_transform(_as_1d(predict))



In [None]:
# Load the CSV at this Drive path; raw comments are read from the
# 'csat_comment' column and cleaned text is written to 'normalised'.
clean_data = English_Preprocess('/content/drive/My Drive/NLP Dash/train_test/courier_csat_train_set.csv','csat_comment','normalised')
# Run the preprocessing sequence and keep the cleaned DataFrame.
data  = clean_data.return_data()
# NOTE(review): this bare expression is not the last statement of the cell,
# so a notebook will not display it -- presumably left over from inspection.
data
# X holds the 'normalised' column, y the 'New_Topic' column.
X,y = split_data(data)
# Fit a label encoder on y and encode it; `predict` is unused when encoding,
# hence None.
y_encoded = label_encode_decode(y,None,'encode')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
