# Part 4 - Build Logistic Regression Model

## 1. Import and analyse the data set.

In [1]:
import pandas as pd # read data file, data processing
import numpy as np # linear algebra
import matplotlib.pyplot as plt # plotting graph for EDA , Metrics analysis
%matplotlib inline
import seaborn as sns # plotting graph for EDA , Metrics analysis

### Load the data 

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Input data files has been processed for 
# 1. carriage return characters like '_x000D_' and \n 
# 2. Accented encoding character like äº§å“æ‰€åœ¨ä»“åº“å‡ºé”™ã€ , è¿žæŽ¥åŽè‡ªåŠ¨æ–­å¼€ï¼Œæ
# 3. Translation of words in non english language especially German, Italian, French
# Above 3 steps are done separately and output from these steps are used for further processing in Part 2
# 4. Update of Assigment group - fewer data groups , grouped to Group others
# 5. Pre-process for having only English data after translation, removal of spaces 
# 6. Treatment of Null values
# Above step 4,5,6 are done in part2 and processed data is stored in input_data_trans_preprocess.csv

data_dir = "/content/drive/MyDrive/AIML/projects/Capstone-NLP-Ticketing/"
data_file_name='input_data_trans_preprocess.csv'
data_file_path = data_dir+data_file_name
data_file_path

'/content/drive/MyDrive/AIML/projects/Capstone-NLP-Ticketing/input_data_trans_preprocess.csv'

In [4]:
#df_data = pd.read_excel(data_file_path)
df_data = pd.read_csv(data_file_path)

In [5]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8467 entries, 0 to 8466
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Short description       8467 non-null   object
 1   Description             8467 non-null   object
 2   Caller                  8467 non-null   object
 3   Assignment group        8467 non-null   object
 4   orig_desc               8466 non-null   object
 5   orig_short_desc         8459 non-null   object
 6   Lang                    8467 non-null   object
 7   Translated_ShortDesc    8450 non-null   object
 8   Translated_Description  8467 non-null   object
 9   orig_assign_group       8467 non-null   object
dtypes: object(10)
memory usage: 661.6+ KB


**Feature with both description - Merging both Description and Short description**

In [6]:
df_data['Desc_All'] = df_data['Short description'] + ' '+ df_data['Description']
# Strip unwanted spaces
df_data['Desc_All'] = df_data['Desc_All'].apply(lambda x: x.strip())

**Feature with description and caller - Merging both Description ,  Short description and Caller**

In [7]:
# Merge Desc, Short Desc and Caller 
df_data['Desc_all_Caller'] = df_data['Desc_All']  + ' '+ df_data['Caller']

In [8]:
# Import stop words list from NLTK
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords # Import stop words
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

● Text preprocessing
include lemmatization

In [29]:

from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import word_tokenize
    
def preprocess_vocab(df_column):
    corpus=[]
    stop_words=set(stopwords.words('english'))
    #stem=PorterStemmer()
    lem=WordNetLemmatizer()
    for tickets in df_column:
      words=[w for w in word_tokenize(tickets) if (w not in stop_words)]
      words=[lem.lemmatize(w) for w in words if len(w)>2]
      corpus.append(words)

    df_column_upd = df_column.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    
    return  df_column_upd, corpus


## Train a simple ML Model - Logistic Regression

In [92]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

def calc_metrics(actual,predicted):
  print('Accuracy score: ', round(accuracy_score(actual, predicted),2))
  print("precision_weighted:", round(precision_score(actual, predicted,average='weighted', zero_division=1),2))
  print("recall_weighted:", round(recall_score(actual, predicted,average='weighted', zero_division=1 ),2))
  print("f1_weighted:", round(f1_score(actual, predicted,average='weighted', zero_division=1 ),2))
  #print("Classification Report:")
  #print(classification_report(y_test_mdl_cat, y_pred_bow,zero_division=1))

In [93]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical

def run_model(model, X_train, X_val, y_train, y_val):

  # convert X_train to BOW values  - Featurization
  vectorizer = CountVectorizer(binary=True)
  X_train_bow = vectorizer.fit_transform(X_train)
  X_val_bow = vectorizer.transform(X_val)

  # Convert Target variable to categorical value using label encoding
  y = df_data['Assignment group'].values
  le = preprocessing.LabelEncoder()
  le.fit(y)
  y_train_mdl_lbl_enc = le.transform(y_train)
  y_train_mdl_cat = to_categorical(y_train_mdl_lbl_enc)
  y_val_mdl_lbl_enc = le.transform(y_val)
  y_val_mdl_cat = to_categorical(y_val_mdl_lbl_enc)

  # Train the model
  history = model.fit(X_train_bow, y_train_mdl_cat)
  print("Model trainded")
  # Predict for given validation value

  y_pred_bow = model.predict(X_val_bow)
  print("Model Prediction completed")
  actual = y_val_mdl_cat
  predicted = y_pred_bow

  calc_metrics(actual,predicted)
  return history

In [94]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

def run_LR_model(X_train, X_val, y_train, y_val):
  clf = LogisticRegression(solver='lbfgs', max_iter=250)
  clf = OneVsRestClassifier(clf)
  #model = clf
  history = run_model(clf, X_train, X_val, y_train, y_val)
  return clf, history

In [62]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8467 entries, 0 to 8466
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Short description       8467 non-null   object
 1   Description             8467 non-null   object
 2   Caller                  8467 non-null   object
 3   Assignment group        8467 non-null   object
 4   orig_desc               8466 non-null   object
 5   orig_short_desc         8459 non-null   object
 6   Lang                    8467 non-null   object
 7   Translated_ShortDesc    8450 non-null   object
 8   Translated_Description  8467 non-null   object
 9   orig_assign_group       8467 non-null   object
 10  Desc_All                8467 non-null   object
 11  Desc_all_Caller         8467 non-null   object
dtypes: object(12)
memory usage: 793.9+ KB


####Model : Logistic Regression ; Feature type : Bag of words;  Features : Description


In [80]:
feature_name = "Description"
model1, history1, xp, yp = run_LR_model(feature_name)


feature is Description
Prod Shape (424,)
Train shape 6032
Val shape 2011
Model trainded
Model Prediction completed
Accuracy score:  0.52
precision_weighted: 0.78
recall_weighted: 0.53
f1_weighted: 0.56


####Model : Logistic Regression ; Feature type : Bag of words;  Features : Short Description


In [67]:
feature_name = "Short description"
model2, history2, xp, yp = run_LR_model(feature_name)


feature is Short description
Prod Shape (424,)
Train shape 6032
Val shape 2011
Model trainded
Model Prediction completed
Accuracy score:  0.52
precision_weighted: 0.83
recall_weighted: 0.53
f1_weighted: 0.56


####Model : Logistic Regression ; Feature type : Bag of words;  Features :  Description & Short Description


In [68]:
feature_name = "Desc_All"
model3, history3, xp, yp = run_LR_model(feature_name)

feature is Desc_All
Prod Shape (424,)
Train shape 6032
Val shape 2011
Model trainded
Model Prediction completed
Accuracy score:  0.55
precision_weighted: 0.82
recall_weighted: 0.57
f1_weighted: 0.6


####Model : Logistic Regression ; Feature type : Bag of words;  Features :  Description & Short Description & Caller


In [95]:
feature_name = "Desc_all_Caller"
X, corpus = preprocess_vocab(df_data[feature_name])
y = df_data['Assignment group'].values
# Split data into Train, and Test - Test data would be used for testing the model 
X_train, X_prod, y_train, y_prod = train_test_split(X, y, test_size=0.05, random_state=0, stratify=y)
print('Prod Shape', X_prod.shape )
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, stratify=y)
print('Train shape', len(X_train))
print('Val shape', len(X_val))
model4, history4 = run_LR_model(X_train, X_val, y_train, y_val )

Prod Shape (424,)
Train shape 6350
Val shape 2117
Model trainded
Model Prediction completed
Accuracy score:  0.55
precision_weighted: 0.81
recall_weighted: 0.56
f1_weighted: 0.6


Test model with best model metrics

In [135]:
vectorizer4 = CountVectorizer(binary=True)
vectorizer4.fit(X_train)
#print(X_prod)
X_prod_bow = vectorizer4.transform(X_prod)
#print(X_prod_bow)
y_pred_bow = model4.predict(X_prod_bow)
print("Model Prediction completed")

y = df_data['Assignment group'].values
le = preprocessing.LabelEncoder()
le.fit(y)
y_prod_mdl_lbl_enc = le.transform(y_prod)
y_prod_mdl_cat = to_categorical(y_prod_mdl_lbl_enc)

actual = y_prod_mdl_cat
predicted = y_pred_bow

print(X_prod[:10])
print(y_prod[:10])

print("Input ticket", X_prod.iloc[2])
print("Actual assignment group", y_prod[2])
print(predicted[2])



Model Prediction completed
6053    scanning recording comes message path exist sc...
4145    expense report error manager needs authorizati...
4252    dell system slow mscrm slow dell system slow c...
2624    unable inwarehouse tool received nuhfwplj ojcw...
3426    erp pur wrong subcontracting demand nd materia...
7629    customer service enters order acct get error s...
6755    job job failed job scheduler received monitori...
140     prtgghj k password reset please reset hr tool ...
6550    cannot open pptx file attached email gives rep...
127       update inplant update inplant rbozivdq gmlhrtvp
Name: Desc_all_Caller, dtype: object
['GRP_33' 'GRP_2' 'GRP_0' 'GRP_13' 'GRP_29' 'GRP_13' 'GRP_8' 'GRP_0'
 'GRP_3' 'GRP_0']
Input ticket dell system slow mscrm slow dell system slow connected user system using teamviewer cleared cache cookies temp files updated symantec user updated system bios restarted pc advised user try check user launched mscrm working fine issue resolved c jziwhldq qs

In [124]:
le.inverse_transform([0])

array(['GRP_0'], dtype=object)