# <a name="Part4_2_4">2.4. Hyper parameter tuning for Random Forest - TF-IDF Vectorizer</a>

## 1. Import and analyse the data set.

In [None]:
import pandas as pd # read data file, data processing
import numpy as np # linear algebra
import matplotlib.pyplot as plt # plotting graph for EDA , Metrics analysis
%matplotlib inline
import seaborn as sns # plotting graph for EDA , Metrics analysis

from sklearn.pipeline import Pipeline
from pprint import pprint

### Load the data 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
import os

py_file_location = "/content/drive/MyDrive/AIML/projects/Capstone-NLP-Ticketing/"
sys.path.append(os.path.abspath(py_file_location))



In [None]:
# Input data files has been processed for 
# 1. carriage return characters like '_x000D_' and \n 
# 2. Accented encoding character like äº§å“æ‰€åœ¨ä»“åº“å‡ºé”™ã€ , è¿žæŽ¥åŽè‡ªåŠ¨æ–­å¼€ï¼Œæ
# 3. Translation of words in non english language especially German, Italian, French
# Above 3 steps are done separately and output from these steps are used for further processing in Part 2
# 4. Update of Assigment group - fewer data groups , grouped to Group others
# 5. Pre-process for having only English data after translation, removal of spaces 
# 6. Treatment of Null values
# Above step 4,5,6 are done in part2 and processed data is stored in input_data_trans_preprocess.csv

data_dir = "/content/drive/MyDrive/AIML/projects/Capstone-NLP-Ticketing/"
data_file_name='input_data_trans_preprocess.csv'
#data_file_name='input_data.xlsx'
data_file_path = data_dir+data_file_name
data_file_path

'/content/drive/MyDrive/AIML/projects/Capstone-NLP-Ticketing/input_data_trans_preprocess.csv'

In [None]:
#df_data = pd.read_excel(data_file_path)
df_data = pd.read_csv(data_file_path)

In [None]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8467 entries, 0 to 8466
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Short description       8467 non-null   object
 1   Description             8467 non-null   object
 2   Caller                  8467 non-null   object
 3   Assignment group        8467 non-null   object
 4   orig_desc               8466 non-null   object
 5   orig_short_desc         8459 non-null   object
 6   Lang                    8467 non-null   object
 7   Translated_ShortDesc    8450 non-null   object
 8   Translated_Description  8467 non-null   object
 9   orig_assign_group       8467 non-null   object
dtypes: object(10)
memory usage: 661.6+ KB


In [None]:
df_data.head(5)

Unnamed: 0,Short description,Description,Caller,Assignment group,orig_desc,orig_short_desc,Lang,Translated_ShortDesc,Translated_Description,orig_assign_group
0,login issue,verified user details employee manager name ch...,spxjnwir pjlcoqds,GRP_0,-verified user details.(employee# & manager na...,login issue,en,login issue,-verified user details.(employee# & manager na...,GRP_0
1,outlook,received from hmjdrvpb komuaywn gmail com hell...,hmjdrvpb komuaywn,GRP_0,_x000D_\n_x000D_\nreceived from: hmjdrvpb.komu...,outlook,en,outlook,received from: hmjdrvpb.komuaywn@gmail.com...,GRP_0
2,cant log in to vpn,received from eylqgodm ybqkwiam gmail com hi i...,eylqgodm ybqkwiam,GRP_0,_x000D_\n_x000D_\nreceived from: eylqgodm.ybqk...,cant log in to vpn,en,cant log in to vpn,received from: eylqgodm.ybqkwiam@gmail.com...,GRP_0
3,unable to access hr tool page,unable to access hr tool page,xbkucsvz gcpydteq,GRP_0,unable to access hr_tool page,unable to access hr_tool page,en,unable to access hr_tool page,unable to access hr_tool page,GRP_0
4,skype error,skype error,owlgqjme qhcozdfx,GRP_0,skype error,skype error,no,skype error,skype error,GRP_0


**Feature with both description - Merging both Description and Short description**

In [None]:
# Create a new column by merging both description field and use this for model training and prediction
df_data['Desc_All'] = df_data['Short description'] + ' '+ df_data['Description']

#### Create Train Test data

In [None]:
from sklearn.model_selection import train_test_split

feature_name = "Desc_All"
X= df_data[feature_name]
y = df_data['Assignment group'].values
# Split data into Train, and Test - Test data would be used for testing the model 
X_train, X_prod, y_train, y_prod = train_test_split(X, y, test_size=0.05, random_state=0, stratify=y)
print('Prod Shape', X_prod.shape )
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, stratify=y)
print('Train shape', len(X_train))
print('Val shape', len(X_val))



Prod Shape (424,)
Train shape 6350
Val shape 2117


#### Create TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
#tfidf = TfidfVectorizer(strip_accents='unicode', lowercase=True, 
#                        preprocessor=preProcessData, 
#                        ngram_range=(1,1))

#### Convert Target variable to Categorical type

In [None]:
# Convert Target variable to categorical value using label encoding
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical

y = df_data['Assignment group'].values
le = preprocessing.LabelEncoder()
le.fit(y)
num_classes = len(le.classes_)
y_train_mdl_lbl_enc = le.transform(y_train)
y_train_mdl_cat = to_categorical(y_train_mdl_lbl_enc, num_classes)
y_val_mdl_lbl_enc = le.transform(y_val)
y_val_mdl_cat = to_categorical(y_val_mdl_lbl_enc, num_classes)

## Hyper parameter tuning - Feature Type : TF-IDF Vectorizer

### Hyper parameter tuning - Multinomial NB

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

RF_pipeline_hyper_tuning = Pipeline( steps = [ 
                            ('vectorizer', tfidf), 
                            ('clf_RF', RandomForestClassifier())
                            ])




In [None]:
RF_pipeline_hyper_tuning.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vectorizer', 'clf_RF', 'vectorizer__analyzer', 'vectorizer__binary', 'vectorizer__decode_error', 'vectorizer__dtype', 'vectorizer__encoding', 'vectorizer__input', 'vectorizer__lowercase', 'vectorizer__max_df', 'vectorizer__max_features', 'vectorizer__min_df', 'vectorizer__ngram_range', 'vectorizer__norm', 'vectorizer__preprocessor', 'vectorizer__smooth_idf', 'vectorizer__stop_words', 'vectorizer__strip_accents', 'vectorizer__sublinear_tf', 'vectorizer__token_pattern', 'vectorizer__tokenizer', 'vectorizer__use_idf', 'vectorizer__vocabulary', 'clf_RF__bootstrap', 'clf_RF__ccp_alpha', 'clf_RF__class_weight', 'clf_RF__criterion', 'clf_RF__max_depth', 'clf_RF__max_features', 'clf_RF__max_leaf_nodes', 'clf_RF__max_samples', 'clf_RF__min_impurity_decrease', 'clf_RF__min_samples_leaf', 'clf_RF__min_samples_split', 'clf_RF__min_weight_fraction_leaf', 'clf_RF__n_estimators', 'clf_RF__n_jobs', 'clf_RF__oob_score', 'clf_RF__random_state', 'clf_RF__verbose'

Lets try Random Search , we are not trying every combination, but selecting at random to sample a wide range of values.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {
                'vectorizer__lowercase': [True],
                'vectorizer__strip_accents': ['unicode'],
                'vectorizer__ngram_range': [(1,1), (1, 2), (1, 3)],
                'clf_RF__n_estimators': n_estimators,
                'clf_RF__max_features': max_features,
                'clf_RF__max_depth': max_depth,
                'clf_RF__min_samples_split': min_samples_split,
                'clf_RF__min_samples_leaf': min_samples_leaf,
                'clf_RF__bootstrap': bootstrap
               }


In [None]:
print(random_grid)


{'vectorizer__lowercase': [True], 'vectorizer__strip_accents': ['unicode'], 'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)], 'clf_RF__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'clf_RF__max_features': ['auto', 'sqrt'], 'clf_RF__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'clf_RF__min_samples_split': [2, 5, 10], 'clf_RF__min_samples_leaf': [1, 2, 4], 'clf_RF__bootstrap': [True, False]}


In [None]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
randomsearch = RandomizedSearchCV(estimator = RF_pipeline_hyper_tuning, param_distributions = random_grid, n_iter = 100, cv = 3, scoring='f1_weighted', verbose=0,n_jobs=-1)
# Fit the random search model
best_model = randomsearch.fit(X_train,y_train_mdl_lbl_enc)
best_model.best_params_




{'vectorizer__strip_accents': 'unicode', 'vectorizer__ngram_range': (1, 1), 'vectorizer__lowercase': True, 'clf_RF__n_estimators': 1000, 'clf_RF__min_samples_split': 5, 'clf_RF__min_samples_leaf': 1, 'clf_RF__max_features': 'auto', 'clf_RF__max_depth': None, 'clf_RF__bootstrap': False}

In [None]:
pprint(best_model.best_params_)

{'clf_RF__bootstrap': False,
 'clf_RF__max_depth': None,
 'clf_RF__max_features': 'auto',
 'clf_RF__min_samples_leaf': 1,
 'clf_RF__min_samples_split': 5,
 'clf_RF__n_estimators': 1000,
 'vectorizer__lowercase': True,
 'vectorizer__ngram_range': (1, 1),
 'vectorizer__strip_accents': 'unicode'}


Random search allowed us to narrow down the range for each hyperparameter, we can use GridSearch to explicitly specify every combination of settings

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

def calc_metrics(actual,predicted):
  print('Accuracy score: ', round(accuracy_score(actual, predicted),2))
  print("precision_weighted:", round(precision_score(actual, predicted,average='weighted', zero_division=1),2))
  print("recall_weighted:", round(recall_score(actual, predicted,average='weighted', zero_division=1 ),2))
  print("f1_weighted:", round(f1_score(actual, predicted,average='weighted', zero_division=1 ),2))
  print("Classification Report:")
  print(classification_report(actual, predicted,zero_division=1))

In [None]:
from sklearn.model_selection import GridSearchCV

n_estimators = [800, 1000, 1200]
max_features = ['auto']
max_depth = [None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2]
bootstrap = [True]

grid_param = [{
                'vectorizer__lowercase': [True],
                'vectorizer__strip_accents': ['unicode'],
                'vectorizer__ngram_range': [(1,1)],
                'clf_RF__n_estimators': n_estimators,
                'clf_RF__max_features': max_features,
                'clf_RF__max_depth': max_depth,
                'clf_RF__min_samples_split': min_samples_split,
                'clf_RF__min_samples_leaf': min_samples_leaf,
                'clf_RF__bootstrap': bootstrap
               }]

gridsearch = GridSearchCV(RF_pipeline_hyper_tuning, grid_param, cv=5, scoring='f1_weighted', verbose=0,n_jobs=-1) 
best_model = gridsearch.fit(X_train,y_train_mdl_lbl_enc)

best_model.best_params_

best_model.score(X_val,y_val_mdl_lbl_enc)

# Access the best set of parameters
best_params = gridsearch.best_params_
print(best_params)
# Stores the optimum model in best_pipe
best_RF_pipe = gridsearch.best_estimator_
print(best_RF_pipe)
 
result_df = pd.DataFrame.from_dict(gridsearch.cv_results_, orient='columns')
print(result_df.columns)

predictions = gridsearch.predict(X_val)

calc_metrics(y_val_mdl_lbl_enc,predictions)



{'clf_RF__bootstrap': True, 'clf_RF__max_depth': None, 'clf_RF__max_features': 'auto', 'clf_RF__min_samples_leaf': 1, 'clf_RF__min_samples_split': 2, 'clf_RF__n_estimators': 800, 'vectorizer__lowercase': True, 'vectorizer__ngram_range': (1, 1), 'vectorizer__strip_accents': 'unicode'}
Pipeline(steps=[('vectorizer', TfidfVectorizer(strip_accents='unicode')),
                ('clf_RF', RandomForestClassifier(n_estimators=800))])
Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_clf_RF__bootstrap', 'param_clf_RF__max_depth',
       'param_clf_RF__max_features', 'param_clf_RF__min_samples_leaf',
       'param_clf_RF__min_samples_split', 'param_clf_RF__n_estimators',
       'param_vectorizer__lowercase', 'param_vectorizer__ngram_range',
       'param_vectorizer__strip_accents', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'mean_test_score', 'std_test_score',
       'ra

In [None]:
# Save the pipeline as a pickle file.
import pickle

pickle.dump(best_RF_pipe, open(data_dir+"saved_pipeline_RF.pkl", 'wb'))

In [None]:
print("Lets try to use the stored Pickled pipeline\n")
# Load the pickled model
pickled_pipeline = pickle.load(open(data_dir+"saved_pipeline_RF.pkl", 'rb'))
pickled_le = pickle.load(open(data_dir+"label_encoder.pkl", 'rb'))

Lets try to use the stored Pickled pipeline



In [None]:
print("\nPrediction from the pickel model for the input feature values\n")
# Use the loaded pickled model to make predictions
y_pred = pickled_pipeline.predict(X_prod)


Prediction from the pickel model for the input feature values



In [None]:
result_lbl_enc = pickled_le.inverse_transform(y_pred)
print("Predicted :" , result_lbl_enc[0:10])

Predicted : ['GRP_0' 'GRP_0' 'GRP_0' 'GRP_0' 'GRP_0' 'GRP_0' 'GRP_8' 'GRP_0' 'GRP_0'
 'GRP_0']


In [None]:
print("Actual   :" ,y_prod[0:10])

Actual   : ['GRP_33' 'GRP_2' 'GRP_0' 'GRP_13' 'GRP_29' 'GRP_13' 'GRP_8' 'GRP_0'
 'GRP_3' 'GRP_0']


In [None]:
# End of the program
print("HYper tuning of Model MNB ; Vectorizer TF-IDF completed")

HYper tuning of Model MNB ; Vectorizer TF-IDF completed
