In [17]:
# !pip install --upgrade googletrans==4.0.0-rc1
# !pip install --upgrade pyspellchecker
# !pip install --upgrade spacy

In [18]:
import numpy as np
import pandas as pd 
import pickle
import re

import nltk
from googletrans import Translator
from spellchecker import SpellChecker
import spacy

In [19]:
less_informative_words = pd.read_csv('final_words_to_remove_updated.csv')
less_informative_words = less_informative_words['words_to_remove'].tolist()

In [20]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [21]:
text1 ='login issue'

text2 = '-verified user details.(employee# & manager name)_x000D_\n-checked the user name in ad and reset the password._x000D_\n-advised the user to login and check._x000D_\n-caller confirmed that he was able to login._x000D_\n-issue resolved.'

In [22]:
class TextProcessor:
    def __init__(self):
        self.translator = Translator()
        self.nlp = spacy.load('en_core_web_sm')

    def text_cleaning_steps_1(self, short_text, long_text):
        text = short_text + ' ' + long_text
        text = text.lower()
        translated = self.translator.translate(text, src='de', dest='en')
        text = translated.text
        text = text.replace('x000d', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('â€', ' ')
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', ' ', text)
        text = re.sub(r'\S+\.\S+@gmail\.com', ' ', text)
        text = re.sub(r'\[\s*cid:[^\]]+\]', ' ', text)
        text = re.sub(r'\d+', ' ', text)
        text = re.sub(r"[^\w\s']", ' ', text)
        text = text.encode("ascii", "ignore").decode()
        text = re.sub(r"[^a-zA-Z0-9\s]", ' ', text)
        text = ' '.join(text.split())
        text = text.strip()

        return text

    def lemmatize_text(self, text):
        lemmas = []
        for token in self.nlp(text):
            if token.text.strip():
                lemmas.append(token.lemma_.lower())
        return lemmas
    
    def remove_stopwords(self,lemmas):
        tokens = [token for token in lemmas if token not in stop_words]
        return tokens

    def removing_noise(self,tokens):
        tokens = [token for token in tokens if len(token) > 1 and token not in less_informative_words]
        return tokens
    
    def additional_cleaning(self,tokens):
        tokens = ['abended' if token == 'evening' else token for token in tokens]
        tokens = list(set(tokens))
        text = ' '.join(tokens)
        return text
    
    def process_text(self, short_text, long_text):
        cleaned_text = self.text_cleaning_steps_1(short_text, long_text)
        lemmas = self.lemmatize_text(cleaned_text)
        tokens = self.remove_stopwords(lemmas)
        tokens = self.removing_noise(tokens)
        text = self.additional_cleaning(tokens)
        
        print('The text has been cleaned')
        return text

In [23]:
processor = TextProcessor()
cleaned_text = processor.process_text(text1, text2)
cleaned_text

The text has been cleaned


'name check ad advise caller resolve issue verified login employee reset password manager confirmed user details'

In [24]:
import joblib

In [25]:
text_length = len(cleaned_text)

In [26]:
text_length

111

In [27]:
import sklearn
print(sklearn.__version__)

1.5.2


In [28]:
label_encoder = joblib.load('LabelEncoder.pkl')
scaler = joblib.load('MinMaxScaler.pkl')
vectorize = joblib.load('TfidfVectorizer.pkl')
model = joblib.load('OVO_LR_model.pkl')

In [33]:
def predict_label(text_length, cleaned_text, scaler, vectorize, model, label_encoder):
    test_description_dense = np.hstack((
        vectorize.transform([cleaned_text]).toarray(),
        scaler.transform(np.array(text_length).reshape(-1, 1))
    ))
    
    pred = model.predict(test_description_dense)
    print(pred)
    return label_encoder.inverse_transform(pred)

In [34]:
predicted_lable = predict_label(text_length, cleaned_text, scaler, vectorize, model, label_encoder)[0]
predicted_lable

[0]


'GRP_0'

In [37]:
label_encoder.classes_

array(['GRP_0', 'GRP_10', 'GRP_12', 'GRP_13', 'GRP_14', 'GRP_19', 'GRP_2',
       'GRP_24', 'GRP_25', 'GRP_3', 'GRP_33', 'GRP_4', 'GRP_5', 'GRP_6',
       'GRP_8', 'GRP_9'], dtype=object)