# <a name="Part4_2_5">2.5. Hyper parameter tuning for Multinomial Naive Bayes - TF-IDF Vectorizer</a>

## 1. Import and analyse the data set.

In [None]:
import pandas as pd # read data file, data processing
import numpy as np # linear algebra
import matplotlib.pyplot as plt # plotting graph for EDA , Metrics analysis
%matplotlib inline
import seaborn as sns # plotting graph for EDA , Metrics analysis

from sklearn.pipeline import Pipeline

### Load the data 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys
import os

py_file_location = "/content/drive/MyDrive/AIML/projects/Capstone-NLP-Ticketing/"
sys.path.append(os.path.abspath(py_file_location))



In [None]:
# Input data files has been processed for 
# 1. carriage return characters like '_x000D_' and \n 
# 2. Accented encoding character like äº§å“æ‰€åœ¨ä»“åº“å‡ºé”™ã€ , è¿žæŽ¥åŽè‡ªåŠ¨æ–­å¼€ï¼Œæ
# 3. Translation of words in non english language especially German, Italian, French
# Above 3 steps are done separately and output from these steps are used for further processing in Part 2
# 4. Update of Assigment group - fewer data groups , grouped to Group others
# 5. Pre-process for having only English data after translation, removal of spaces 
# 6. Treatment of Null values
# Above step 4,5,6 are done in part2 and processed data is stored in input_data_trans_preprocess.csv

data_dir = "/content/drive/MyDrive/AIML/projects/Capstone-NLP-Ticketing/"
data_file_name='input_data_trans_preprocess.csv'
#data_file_name='input_data.xlsx'
data_file_path = data_dir+data_file_name
data_file_path

'/content/drive/MyDrive/AIML/projects/Capstone-NLP-Ticketing/input_data_trans_preprocess.csv'

In [None]:
#df_data = pd.read_excel(data_file_path)
df_data = pd.read_csv(data_file_path)

In [None]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8467 entries, 0 to 8466
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Short description       8467 non-null   object
 1   Description             8467 non-null   object
 2   Caller                  8467 non-null   object
 3   Assignment group        8467 non-null   object
 4   orig_desc               8466 non-null   object
 5   orig_short_desc         8459 non-null   object
 6   Lang                    8467 non-null   object
 7   Translated_ShortDesc    8450 non-null   object
 8   Translated_Description  8467 non-null   object
 9   orig_assign_group       8467 non-null   object
dtypes: object(10)
memory usage: 661.6+ KB


In [None]:
df_data.head(5)

Unnamed: 0,Short description,Description,Caller,Assignment group,orig_desc,orig_short_desc,Lang,Translated_ShortDesc,Translated_Description,orig_assign_group
0,login issue,verified user details employee manager name ch...,spxjnwir pjlcoqds,GRP_0,-verified user details.(employee# & manager na...,login issue,en,login issue,-verified user details.(employee# & manager na...,GRP_0
1,outlook,received from hmjdrvpb komuaywn gmail com hell...,hmjdrvpb komuaywn,GRP_0,_x000D_\n_x000D_\nreceived from: hmjdrvpb.komu...,outlook,en,outlook,received from: hmjdrvpb.komuaywn@gmail.com...,GRP_0
2,cant log in to vpn,received from eylqgodm ybqkwiam gmail com hi i...,eylqgodm ybqkwiam,GRP_0,_x000D_\n_x000D_\nreceived from: eylqgodm.ybqk...,cant log in to vpn,en,cant log in to vpn,received from: eylqgodm.ybqkwiam@gmail.com...,GRP_0
3,unable to access hr tool page,unable to access hr tool page,xbkucsvz gcpydteq,GRP_0,unable to access hr_tool page,unable to access hr_tool page,en,unable to access hr_tool page,unable to access hr_tool page,GRP_0
4,skype error,skype error,owlgqjme qhcozdfx,GRP_0,skype error,skype error,no,skype error,skype error,GRP_0


**Feature with both description - Merging both Description and Short description**

In [None]:
# Create a new column by merging both description field and use this for model training and prediction
df_data['Desc_All'] = df_data['Short description'] + ' '+ df_data['Description']

#### Create Train Test data

In [None]:
from sklearn.model_selection import train_test_split

feature_name = "Desc_All"
X= df_data[feature_name]
y = df_data['Assignment group'].values
# Split data into Train, and Test - Test data would be used for testing the model 
X_train, X_prod, y_train, y_prod = train_test_split(X, y, test_size=0.05, random_state=0, stratify=y)
print('Prod Shape', X_prod.shape )
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, stratify=y)
print('Train shape', len(X_train))
print('Val shape', len(X_val))



Prod Shape (424,)
Train shape 6350
Val shape 2117


#### Create TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
#tfidf = TfidfVectorizer(strip_accents='unicode', lowercase=True, 
#                        preprocessor=preProcessData, 
#                        ngram_range=(1,1))

#### Convert Target variable to Categorical type

In [None]:
# Convert Target variable to categorical value using label encoding
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical

y = df_data['Assignment group'].values
le = preprocessing.LabelEncoder()
le.fit(y)
num_classes = len(le.classes_)
y_train_mdl_lbl_enc = le.transform(y_train)
y_train_mdl_cat = to_categorical(y_train_mdl_lbl_enc, num_classes)
y_val_mdl_lbl_enc = le.transform(y_val)
y_val_mdl_cat = to_categorical(y_val_mdl_lbl_enc, num_classes)

## Hyper parameter tuning - Feature Type : TF-IDF Vectorizer

### Hyper parameter tuning - Multinomial NB

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

MNB_pipeline_hyper_tuning = Pipeline( steps = [ 
                            ('vectorizer', tfidf), 
                            ('clf_MNB', OneVsRestClassifier(MultinomialNB()))
                            ])




In [None]:
MNB_pipeline_hyper_tuning.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vectorizer', 'clf_MNB', 'vectorizer__analyzer', 'vectorizer__binary', 'vectorizer__decode_error', 'vectorizer__dtype', 'vectorizer__encoding', 'vectorizer__input', 'vectorizer__lowercase', 'vectorizer__max_df', 'vectorizer__max_features', 'vectorizer__min_df', 'vectorizer__ngram_range', 'vectorizer__norm', 'vectorizer__preprocessor', 'vectorizer__smooth_idf', 'vectorizer__stop_words', 'vectorizer__strip_accents', 'vectorizer__sublinear_tf', 'vectorizer__token_pattern', 'vectorizer__tokenizer', 'vectorizer__use_idf', 'vectorizer__vocabulary', 'clf_MNB__estimator__alpha', 'clf_MNB__estimator__class_prior', 'clf_MNB__estimator__fit_prior', 'clf_MNB__estimator', 'clf_MNB__n_jobs'])

In [None]:
from sklearn.model_selection import GridSearchCV

grid_param = [{
                'vectorizer__lowercase': [True],
                'vectorizer__strip_accents': ['unicode'],
                'vectorizer__ngram_range': [(1,1), (1, 2), (1, 3)],
                'clf_MNB__estimator__alpha': [1, 1e-1, 1e-2]
               }]

gridsearch = GridSearchCV(MNB_pipeline_hyper_tuning, grid_param, cv=5, scoring='f1_weighted', verbose=0,n_jobs=-1) 
best_model = gridsearch.fit(X_train,y_train_mdl_lbl_enc)


In [None]:
#sklearn.metrics.SCORERS.keys()

In [None]:
best_model.best_params_

{'clf_MNB__estimator__alpha': 0.01,
 'vectorizer__lowercase': True,
 'vectorizer__ngram_range': (1, 2),
 'vectorizer__strip_accents': 'unicode'}

In [None]:
best_model.score(X_val,y_val_mdl_lbl_enc)

0.6335323454521343

In [None]:
# Access the best set of parameters
best_params = gridsearch.best_params_
print(best_params)
# Stores the optimum model in best_pipe
best_MNB_pipe = gridsearch.best_estimator_
print(best_MNB_pipe)
 
result_df = pd.DataFrame.from_dict(gridsearch.cv_results_, orient='columns')
print(result_df.columns)

{'clf_MNB__estimator__alpha': 0.01, 'vectorizer__lowercase': True, 'vectorizer__ngram_range': (1, 2), 'vectorizer__strip_accents': 'unicode'}
Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(ngram_range=(1, 2), strip_accents='unicode')),
                ('clf_MNB',
                 OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01)))])
Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_clf_MNB__estimator__alpha', 'param_vectorizer__lowercase',
       'param_vectorizer__ngram_range', 'param_vectorizer__strip_accents',
       'params', 'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score'],
      dtype='object')


In [None]:
predictions = gridsearch.predict(X_val)


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

def calc_metrics(actual,predicted):
  print('Accuracy score: ', round(accuracy_score(actual, predicted),2))
  print("precision_weighted:", round(precision_score(actual, predicted,average='weighted', zero_division=1),2))
  print("recall_weighted:", round(recall_score(actual, predicted,average='weighted', zero_division=1 ),2))
  print("f1_weighted:", round(f1_score(actual, predicted,average='weighted', zero_division=1 ),2))
  print("Classification Report:")
  print(classification_report(actual, predicted,zero_division=1))

In [None]:
calc_metrics(y_val_mdl_lbl_enc,predictions)

Accuracy score:  0.68
precision_weighted: 0.68
recall_weighted: 0.68
f1_weighted: 0.63
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.96      0.82       994
           1       1.00      0.25      0.40         8
           2       0.58      0.43      0.49        35
           3       0.00      0.00      0.00         8
           4       0.54      0.61      0.57        64
           5       0.52      0.64      0.57        36
           6       0.64      0.30      0.41        30
           7       0.25      0.10      0.14        10
           8       0.80      0.19      0.31        21
           9       1.00      0.85      0.92        20
          10       0.50      0.50      0.50        22
          11       0.50      0.31      0.39        54
          12       0.63      0.52      0.57        60
          13       0.33      0.11      0.17         9
          14       1.00      0.00      0.00         7
          15       0.00  

In [None]:
# Save the pipeline as a pickle file.
import pickle

pickle.dump(best_MNB_pipe, open(data_dir+"saved_pipeline_MNB.pkl", 'wb'))

In [None]:
print("Lets try to use the stored Pickled pipeline\n")
# Load the pickled model
pickled_pipeline = pickle.load(open(data_dir+"saved_pipeline_MNB.pkl", 'rb'))
pickled_le = pickle.load(open(data_dir+"label_encoder.pkl", 'rb'))

Lets try to use the stored Pickled pipeline



In [None]:
print("\nPrediction from the pickel model for the input feature values\n")
# Use the loaded pickled model to make predictions
y_pred = pickled_pipeline.predict(X_prod)


Prediction from the pickel model for the input feature values



In [None]:
result_lbl_enc = pickled_le.inverse_transform(y_pred)
print("Predicted :" , result_lbl_enc[0:10])

Predicted : ['GRP_0' 'GRP_0' 'GRP_0' 'GRP_13' 'GRP_29' 'GRP_13' 'GRP_8' 'GRP_0'
 'GRP_3' 'GRP_0']


In [None]:
print("Actual   :" ,y_prod[0:10])

Actual   : ['GRP_33' 'GRP_2' 'GRP_0' 'GRP_13' 'GRP_29' 'GRP_13' 'GRP_8' 'GRP_0'
 'GRP_3' 'GRP_0']


In [None]:
# End of the program
print("HYper tuning of Model MNB ; Vectorizer TF-IDF completed")

HYper tuning of Model MNB ; Vectorizer TF-IDF completed
