In [None]:

# # Automatic Ticket Assignment - Capstone Project - Method 1
# 
# ## Problem Statement - 
# 
# In most of the IT organizations, the assignment of incidents to appropriate IT groups is still a manual process. Manual assignment of incidents is time consuming and requires human efforts. There may be mistakes due to human errors and resource consumption is carried out ineffectively because of the misaddressing. On the other hand, manual assignment increases the response and resolution times which result in user satisfaction deterioration / poor customer service. 
# 
# _<font color=blue>This capstone project intends to reduce the manual intervention of IT operations or Service desk teams by automating the ticket assignment process.The goal here is to create a text classification based ML model that can automatically  classify any new tickets by analysing ticket description to one of the relevant Assignment groups, which could be later integrated to any ITSM tool like Service Now. Based on the ticket description our model will output the probability of assigning it to one of the 74 Groups.</font>_
# 
# The solution would be implemented using below approach:
# - Approach 1 - Using a traditional machine learning algorithm we would be classifying the tickets into one of the groups having more than 100 tickets.
# 

# ### Import necessary libraries


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import warnings 
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import cross_val_score

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline


In [None]:
# Load the cleaned ticket dataset (CSV, UTF-8) that the rest of the notebook works on.
df_tranlated_inc = pd.read_csv('../dataset/cleaned_data.csv',encoding='utf-8')


In [None]:
# Preview the first rows of the loaded frame.
df_tranlated_inc.head()


In [None]:
# Give the label column a space-free name; later cells read it as an attribute
# (e.g. test.Assignment_group). The rename is applied in place.
df_tranlated_inc.rename(columns={'Assignment group':'Assignment_group'},inplace=True)

In [None]:
# Preview again to confirm the renamed column.
df_tranlated_inc.head()

In [None]:
# Keep only the tickets whose assignment group has more than 100 tickets overall.
df_inc_sample = df_tranlated_inc[df_tranlated_inc['Assignment_group'].map(df_tranlated_inc['Assignment_group'].value_counts()) > 100]
# Model input: the 'token_desc' text column; target: the assignment group label.
x = df_inc_sample['token_desc']
y = df_inc_sample['Assignment_group']


In [None]:
from sklearn import preprocessing
# Map group names to integer class ids. The encoder is fit on every label of the
# filtered sample (before the train/test split), and `y` is then rebound to the
# encoded array, so re-running this cell alone would re-fit on the integer ids.
encoder = preprocessing.LabelEncoder()
encoder.fit(y)
y = encoder.transform(y)

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split with a fixed seed, so each assignment group keeps its
# share in both partitions and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=13,stratify=y)

from sklearn.utils.class_weight import compute_class_weight
# "Balanced" weight per class, computed from the training labels only.
# `classes` and `y` are passed by keyword: recent scikit-learn releases declare
# them keyword-only, and the keyword form is accepted by older releases as well.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights

In [None]:
# One weight per training sample: y_train holds the integer class ids produced
# by the label encoder, so it can index the per-class weight array directly.
w_array = np.asarray(class_weights, dtype='float')[y_train]

# Scoreboard that collects one row (name, accuracy, weighted F1) per classifier.
log_cols = ["Classifier", "accuracy", "f1_score"]
log = pd.DataFrame(columns=log_cols)


In [None]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Compute the multi-class logarithmic loss.

    :param actual: Array of true class indices (1-D), or an already one-hot
        encoded matrix with one row per sample
    :param predicted: Matrix of predicted probabilities, one column per class
    :param eps: Probabilities are clipped to [eps, 1 - eps] so the logarithm
        never sees 0 or 1
    :return: Mean negative log-likelihood over the rows of ``actual``
    """
    # One-hot encode a 1-D label vector so it lines up with `predicted`.
    if actual.ndim == 1:
        n_samples = actual.shape[0]
        one_hot = np.zeros((n_samples, predicted.shape[1]))
        one_hot[np.arange(n_samples), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / actual.shape[0] * log_likelihood


In [None]:
# ### Naive Bayes classifier for multinomial models


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Baseline pipeline: token counts -> TF-IDF weighting -> Multinomial Naive Bayes,
# fitted on the raw training text and the encoded labels.
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)



In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.metrics import classification_report
# Evaluate the Naive Bayes pipeline on the held-out test split.
y_pred = nb.predict(X_test)

# Class-probability matrix, needed for the log-loss below.
predictions = nb.predict_proba(X_test)

# scikit-learn metrics expect (y_true, y_pred). Accuracy is symmetric, but the
# weighted F1 averages per-class scores by the support of its FIRST argument,
# so the ground truth has to come first.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print('f1 score %s' % f1_score(y_test, y_pred, average='weighted'))
print ("logloss: %0.3f " % multiclass_logloss(y_test,predictions))

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test,y_pred))

In [None]:
# Record the Naive Bayes scores. Metrics take the ground truth first, which
# matters for the support-weighted F1.
log_entry = pd.DataFrame([["MultinomialNB",accuracy_score(y_test, y_pred),f1_score(y_test, y_pred,average='weighted')]], columns=log_cols)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result on every version.
log = pd.concat([log, log_entry])


In [None]:

from sklearn.svm import LinearSVC

# Linear SVM pipeline: token counts -> TF-IDF -> one-vs-rest LinearSVC with hinge
# loss. class_weight='balanced' is set on the estimator to counter the class imbalance.
svc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', OneVsRestClassifier(LinearSVC(loss='hinge',random_state=42,class_weight='balanced'))),
               ])
svc.fit(X_train, y_train)

In [None]:
# Evaluate the linear SVM pipeline on the held-out test split.
y_pred = svc.predict(X_test)
# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print('f1 score %s' % f1_score(y_test, y_pred,average='weighted'))

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test,y_pred))

log_entry = pd.DataFrame([["LinearSVC",accuracy_score(y_test, y_pred),f1_score(y_test, y_pred,average='weighted')]], columns=log_cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
log = pd.concat([log, log_entry])

In [None]:
from sklearn.linear_model import SGDClassifier

# SGD-trained linear classifier on TF-IDF features: hinge loss, L2 penalty,
# fixed seed, balanced class weights. tol=None together with max_iter=100
# disables the tolerance-based stopping criterion.
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=100, tol=None,class_weight='balanced')),
               ])
sgd.fit(X_train, y_train)

In [None]:
# Evaluate the SGD pipeline on the held-out test split.
y_pred = sgd.predict(X_test)
# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print('f1 score %s' % f1_score(y_test, y_pred,average='weighted'))


print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test,y_pred))
log_entry = pd.DataFrame([["SGDClassifier",accuracy_score(y_test, y_pred),f1_score(y_test, y_pred,average='weighted')]], columns=log_cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
log = pd.concat([log, log_entry])


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on TF-IDF features. C=1e5 means very weak regularisation;
# class_weight='balanced' counters the class imbalance.
logreg_1 = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5,class_weight='balanced')),
               ])
logreg_1.fit(X_train, y_train)


# In[13]:


# Evaluate the logistic-regression pipeline on the held-out test split.
y_pred = logreg_1.predict(X_test)
# Class-probability matrix, needed for the log-loss below.
predictions = logreg_1.predict_proba(X_test)

# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print('f1 score %s' % f1_score(y_test, y_pred,average='weighted'))
print ("logloss: %0.3f " % multiclass_logloss(y_test,predictions))

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test,y_pred))
log_entry = pd.DataFrame([["LogisticRegression",accuracy_score(y_test, y_pred),f1_score(y_test, y_pred,average='weighted')]], columns=log_cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
log = pd.concat([log, log_entry])

In [None]:
# ### Xgboost

# In[14]:


import xgboost as xgb

# Gradient-boosted trees on TF-IDF features. XGBClassifier is not given a
# class_weight here, so the imbalance is handled through the per-sample weights
# in w_array, routed to the final pipeline step via the `clf__` prefix.
# Note: the variable is named `xgboost`; the library itself is imported as `xgb`.
xgboost = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)),
               ])
xgboost.fit(X_train, y_train,clf__sample_weight=w_array)


# In[15]:


# Evaluate the XGBoost pipeline on the held-out test split.
y_pred = xgboost.predict(X_test)

# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print('f1 score %s' % f1_score(y_test, y_pred,average='weighted'))
print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test,y_pred))

log_entry = pd.DataFrame([["Xgboost",accuracy_score(y_test, y_pred),f1_score(y_test, y_pred,average='weighted')]], columns=log_cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
log = pd.concat([log, log_entry])



In [None]:
# ## Hyperparameter Tune GridSearchCV

# ### LinearSVC

# In[16]:


from sklearn.model_selection import GridSearchCV

# Grid over the LinearSVC inside the one-vs-rest wrapper; the key path
# `clf__estimator__<param>` goes pipeline step -> OneVsRestClassifier -> LinearSVC.
params = {"clf__estimator__C": [0.1, 1, 10, 100, 1000],  
              'clf__estimator__loss': ['hinge','squared_hinge'],}  
  
# Select by cross-validated weighted F1; refit=True retrains the best
# configuration on the full training split.
clf_svc = GridSearchCV(svc, param_grid=params, refit = True, verbose = 1,scoring='f1_weighted') 
# fitting the model for grid search 
clf_svc.fit(X_train, y_train)

print("Best Score: ", clf_svc.best_score_)
print("Best Params: ", clf_svc.best_params_)


In [None]:
# Evaluate the tuned LinearSVC pipeline on the held-out test split.
# (LinearSVC has no predict_proba, so no log-loss is reported here.)
y_pred = clf_svc.best_estimator_.predict(X_test)

# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print('f1 score %s' % f1_score(y_test, y_pred,average='weighted'))

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test,y_pred))

log_entry = pd.DataFrame([["LinearSVC_best_estimator_gcv",accuracy_score(y_test, y_pred),f1_score(y_test, y_pred,average='weighted')]], columns=log_cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
log = pd.concat([log, log_entry])


In [None]:
# ### SGD Classifier



# Grid over the SGD classifier's loss, regularisation strength and penalty.
# NOTE(review): "log" and "none" are the spellings used by older scikit-learn
# releases; newer releases rename them — confirm against the installed version.
params = {
    "clf__loss" : ["hinge", "log", "squared_hinge", "modified_huber"],
    "clf__alpha" : [0.0001, 0.001, 0.01, 0.1],
    "clf__penalty" : ["l2", "l1", "none"],
}

# Select by cross-validated weighted F1; refit=True retrains the best
# configuration on the full training split.
clf_sgd = GridSearchCV(sgd, param_grid=params,refit = True, verbose = 1,scoring='f1_weighted')
clf_sgd.fit(X_train, y_train)

print("Best Score: ", clf_sgd.best_score_)
print("Best Params: ", clf_sgd.best_params_)


In [None]:

# Evaluate the tuned SGD pipeline on the held-out test split.
y_pred = clf_sgd.best_estimator_.predict(X_test)

# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print('f1 score %s' % f1_score(y_test, y_pred,average='weighted'))

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test,y_pred))

log_entry = pd.DataFrame([["SGD_best_estimator_gcv",accuracy_score(y_test, y_pred),f1_score(y_test, y_pred,average='weighted')]], columns=log_cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
log = pd.concat([log, log_entry])

In [None]:
# ### Logistic Regression


In [None]:

# Grid over the logistic regression's inverse regularisation strength C and the
# iteration cap (L2 penalty only).
params = {
  'clf__penalty': ['l2'],
  'clf__C': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0,1e2,1e4,1e5],
  'clf__max_iter': [100,4000,5000],
}

# Select by cross-validated weighted F1; refit=True retrains the best
# configuration on the full training split.
clf_lr = GridSearchCV(logreg_1, param_grid=params,refit = True,verbose = 1,scoring='f1_weighted')
clf_lr.fit(X_train, y_train)

print("Best Score: ", clf_lr.best_score_)
print("Best Params: ", clf_lr.best_params_)


In [None]:
# Evaluate the tuned logistic-regression pipeline on the held-out test split.
y_pred = clf_lr.best_estimator_.predict(X_test)

# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print('f1 score %s' % f1_score(y_test, y_pred,average='weighted'))

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test,y_pred))

log_entry = pd.DataFrame([["LogisticRegression_best_estimator_gcv",accuracy_score(y_test, y_pred),f1_score(y_test, y_pred,average='weighted')]], columns=log_cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
log = pd.concat([log, log_entry])


In [None]:
# ### Word2vec embedding and Logistic Regression
# 

# Let's use pretrained GloVe embeddings to train the model 

In [None]:
import logging
import nltk
from gensim.models import Word2Vec
import gensim


In [None]:
# Load the GloVe vectors that were converted to word2vec text format.
# NOTE(review): this path is relative to the working directory ('dataset/...'),
# whereas the other data files in this notebook use '../dataset/...' — confirm.
wv = gensim.models.KeyedVectors.load_word2vec_format("dataset/glove.6B/glove.6B.100d.w2vformat.txt")
# Prepare the normalised vectors (legacy gensim API); word_averaging reads them
# through wv.syn0norm.
wv.init_sims(replace=True)

In [None]:
def word_averaging(wv, words):
    """Return the unit-normalised mean embedding of ``words``.

    :param wv: gensim ``KeyedVectors`` exposing the legacy ``vocab`` /
        ``syn0norm`` attributes (normalised vectors must already be prepared)
    :param words: iterable of tokens; items that are already ``np.ndarray``
        vectors are averaged as they are, tokens missing from the vocabulary
        are skipped
    :return: ``float32`` unit vector, or a zero vector of length
        ``wv.vector_size`` when nothing could be looked up
    """
    # The original also collected the vocabulary indices in a set that was
    # never read; that dead bookkeeping has been dropped.
    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack the averaged embedding of every tokenised document into one matrix.

    :param wv: word-vector model passed through to ``word_averaging``
    :param text_list: iterable of token lists, one per document
    :return: 2-D array with one row per document
    """
    rows = []
    for tokens in text_list:
        rows.append(word_averaging(wv, tokens))
    return np.vstack(rows)


In [None]:
def w2v_tokenize_text(text):
    """Split ``text`` into word tokens, dropping tokens shorter than two characters.

    The text is first split into sentences and each sentence into words with
    NLTK's English tokenizers.

    :param text: raw document string
    :return: flat list of tokens in document order
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]

In [None]:
# Preview the filtered sample that feeds the embedding experiments.
df_inc_sample.head()


In [None]:
# Split the full frame (not just text/labels) with the same seed, test size and
# stratification column as the earlier TF-IDF split.
train, test = train_test_split(df_inc_sample, test_size=0.2, random_state = 13,stratify=df_inc_sample['Assignment_group'])

# Tokenise every ticket description; .values yields an object array of token lists.
test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['token_desc']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['token_desc']), axis=1).values


In [None]:
# Turn each tokenised ticket into a single averaged word-embedding vector.
X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)



In [None]:
# ### Logistic Regression 


In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on the averaged embeddings. The targets here are the
# original group-name strings from the frame, not the encoded integer ids.
logreg = LogisticRegression(n_jobs=1, C=1e5,class_weight='balanced')
logreg = logreg.fit(X_train_word_average, train['Assignment_group'])
y_pred = logreg.predict(X_test_word_average)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,f1_score
# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(test.Assignment_group, y_pred))
print('f1 score %s' % f1_score(test.Assignment_group, y_pred,average='weighted'))
print(classification_report(test.Assignment_group, y_pred))


print(confusion_matrix(test.Assignment_group,y_pred))
log_entry = pd.DataFrame([["Word2Vec - LogisticRegression",accuracy_score(test.Assignment_group, y_pred),f1_score(test.Assignment_group, y_pred,average='weighted')]], columns=log_cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
log = pd.concat([log, log_entry])


In [None]:
# <b> We see that the performance is much poorer than the benchmark model, as the pretrained embedding used is not specific to ITSM data and is more related to generic English text </b>

# ### Xgboost



# XGBoost on the averaged embeddings, trained on the group-name strings.
# NOTE(review): w_array was built from y_train of the earlier TF-IDF split; it
# lines up with `train` only if both splits select the same rows in the same
# order (same seed, test size and stratification are used) — confirm.
xgb_clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
xgb_clf = xgb_clf.fit(X_train_word_average, train['Assignment_group'],sample_weight=w_array)
y_pred = xgb_clf.predict(X_test_word_average)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,f1_score
# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(test.Assignment_group, y_pred))
print('f1 score %s' % f1_score(test.Assignment_group, y_pred,average='weighted'))
print(classification_report(test.Assignment_group, y_pred))

print(confusion_matrix(test.Assignment_group,y_pred))


In [None]:
# ### Doc2vec and Logistic Regression
# 
# Let's try using Doc2vec, doc2vec is an extension to the word2vec-approach towards documents. Its intention is encode (whole) docs, consisting of lists of sentences, rather than lists of ungrouped sentences. 
# 
# There are two approaches in Doc2vec:
#     - Distributed Memory
#     - Distributed BOW

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re


In [None]:
def label_sentences(corpus, label_type):
    """Wrap every document of ``corpus`` in a gensim ``TaggedDocument``.

    Doc2Vec needs a tag per document. The tag is ``"<label_type>_<i>"`` where
    ``i`` is the document's position in ``corpus``; the words are the
    whitespace-separated tokens of the document.

    :param corpus: iterable of document strings
    :param label_type: tag prefix, e.g. ``'Train'`` or ``'Test'``
    :return: list of ``TaggedDocument`` objects in corpus order
    """
    return [
        TaggedDocument(document.split(), tags=[label_type + '_' + str(position)])
        for position, document in enumerate(corpus)
    ]


In [None]:
# Re-split for the Doc2Vec experiments. This REBINDS X_train / X_test / y_train /
# y_test, so the raw-text split used by the TF-IDF pipelines is no longer
# available under these names after this cell.
X_train, X_test, y_train, y_test  = train_test_split(df_inc_sample.token_desc, df_inc_sample.Assignment_group, test_size=0.2, random_state = 13,stratify=df_inc_sample['Assignment_group'])
# Replace the text with tagged documents ("Train_i" / "Test_i").
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
# Doc2Vec is trained on the train and test documents together.
all_data = X_train + X_test



from sklearn import preprocessing
# A fresh encoder (replacing the earlier one), fit on the training labels only
# and then applied to both splits.
encoder = preprocessing.LabelEncoder()
encoder.fit(y_train)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)


In [None]:
# Balanced class weights for the re-encoded Doc2Vec training labels.
# `classes` and `y` are passed by keyword: recent scikit-learn releases declare
# them keyword-only, and the keyword form is accepted by older releases as well.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights

# Per-sample weights: each training row gets the weight of its class.
w_array = np.ones(y_train.shape[0], dtype = 'float')
for i, val in enumerate(y_train):
    w_array[i] = class_weights[val]


# In[36]:


# Peek at the first two tagged training documents.
X_train[:2]


In [None]:
# ### Distributed BOW
# DBOW is the doc2vec model analogous to Skip-gram model in word2vec. The paragraph vectors are obtained by training a neural network on the task of predicting a probability distribution of words in a paragraph given a randomly-sampled word from the paragraph.



In [None]:
def train_doc2vec(corpus):
    """Create a DBOW Doc2Vec model and build its vocabulary from ``corpus``.

    Despite the name, no training pass is run here: the returned model has a
    vocabulary but still has to be trained by the caller.

    :param corpus: iterable of tagged documents
    :return: the ``Doc2Vec`` model (dm=0, 300-dimensional vectors)
    """
    logging.info('Building Doc2Vec model')
    dbow = Doc2Vec(
        dm=0,
        vector_size=300,
        negative=5,
        min_count=1,
        alpha=0.065,
        min_alpha=0.065,
    )
    dbow.build_vocab(corpus)
    return dbow

In [None]:
# #### Building a Vocabulary


In [None]:

# DBOW model (dm=0) with 300-dimensional document vectors. alpha and min_alpha
# start equal because the training loop lowers the learning rate manually.
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
# Build the vocabulary from the train and test documents; tqdm shows progress.
model_dbow.build_vocab([x for x in tqdm(all_data)])


In [None]:
#### Training DBOW model for 30 epochs


In [None]:
# Train for 30 passes, reshuffling each time and lowering the learning rate by
# 0.002 per pass (0.065 -> 0.007 on the last pass). The loop previously ran 40
# passes, which pushed alpha below zero from the 34th pass onward
# (0.065 - 33 * 0.002 < 0), i.e. training with a negative learning rate; 30 is
# also the epoch count stated in the notebook text.
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in all_data]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    # Keep min_alpha equal to alpha so each single-epoch call trains at a fixed rate.
    model_dbow.min_alpha = model_dbow.alpha


In [None]:
# #### Building the Final Vector Feature for the Classifier


In [None]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Collect document vectors from a trained doc2vec model.

    Documents are looked up by the tags ``"<vectors_type>_0"`` ...
    ``"<vectors_type>_<corpus_size - 1>"``.

    :param model: Trained Doc2Vec model (read through ``model.docvecs``)
    :param corpus_size: Number of documents to fetch
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Tag prefix, e.g. 'Train' or 'Test'
    :return: array of shape (corpus_size, vectors_size)
    """
    vectors = np.zeros((corpus_size, vectors_size))
    tags = ('_'.join((vectors_type, str(idx))) for idx in range(corpus_size))
    for row, tag in enumerate(tags):
        vectors[row] = model.docvecs[tag]
    return vectors


In [None]:
# Pull the learned 300-dimensional DBOW vector of every train / test document.
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')


In [None]:
# #### Train the Logistic Regression Classifier.


In [None]:
# Logistic regression on the DBOW document vectors (encoded integer labels).
logreg = LogisticRegression(n_jobs=1, C=1e5,class_weight='balanced')
logreg.fit(train_vectors_dbow, y_train)
y_pred = logreg.predict(test_vectors_dbow)

In [None]:
# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print('f1 score %s' % f1_score(y_test, y_pred,average='weighted'))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

log_entry = pd.DataFrame([["Doc2Vec (dbow) - LogisticRegression",accuracy_score(y_test, y_pred),f1_score(y_test, y_pred,average='weighted')]], columns=log_cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
log = pd.concat([log, log_entry])



In [None]:
# XGBoost on the DBOW document vectors, using the per-sample class weights.
xgb_clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
xgb_clf = xgb_clf.fit(train_vectors_dbow, y_train,sample_weight=w_array)
y_pred = xgb_clf.predict(test_vectors_dbow)

# Only accuracy and the per-class report are printed; this run is not added to `log`.
print('accuracy %s' % accuracy_score(y_pred, y_test))
#print('f1 score %s' % f1_score(y_pred,y_test,average='weighted'))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
# Distributed Memory (DM) acts as a memory that remembers what is missing from the current context — or as the topic of the paragraph. While the word vectors represent the concept of a word, the document vector intends to represent the concept of a document. We again instantiate a Doc2Vec model with a vector size with 300 words and iterating over the training corpus 30 times.


In [None]:

# Distributed Memory model (dm=1) that averages context vectors (dm_mean=1),
# with a context window of 3 and 300-dimensional document vectors.
model_dm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=3, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
# Vocabulary is built from the train and test documents together.
model_dm.build_vocab([x for x in all_data])


In [None]:
# Train for 30 passes, reshuffling each time and lowering the learning rate by
# 0.002 per pass (0.065 -> 0.007 on the last pass). The loop previously ran 40
# passes, which pushed alpha below zero from the 34th pass onward
# (0.065 - 33 * 0.002 < 0), i.e. training with a negative learning rate; 30 is
# also the iteration count stated in the notebook text.
for epoch in range(30):
    model_dm.train(utils.shuffle([x for x in all_data]), total_examples=len(all_data), epochs=1)
    model_dm.alpha -= 0.002
    # Keep min_alpha equal to alpha so each single-epoch call trains at a fixed rate.
    model_dm.min_alpha = model_dm.alpha


In [None]:
# Pull the learned 300-dimensional DM vector of every train / test document.
train_vectors_dm = get_vectors(model_dm, len(X_train), 300, 'Train')
test_vectors_dm = get_vectors(model_dm, len(X_test), 300, 'Test')



In [None]:
# Re-fit the existing logistic-regression estimator on the DM document vectors.
logreg = logreg.fit(train_vectors_dm, y_train)
y_pred = logreg.predict(test_vectors_dm)

# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print('f1 score %s' % f1_score(y_test, y_pred,average='weighted'))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

log_entry = pd.DataFrame([["Doc2Vec (dm) - LogisticRegression",accuracy_score(y_test, y_pred),f1_score(y_test, y_pred,average='weighted')]], columns=log_cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
log = pd.concat([log, log_entry])


In [None]:
# XGBoost on the DM document vectors, using the per-sample class weights.
xgb_clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
xgb_clf = xgb_clf.fit(train_vectors_dm, y_train,sample_weight=w_array)
y_pred = xgb_clf.predict(test_vectors_dm)

# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print('f1 score %s' % f1_score(y_test, y_pred,average='weighted'))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


In [None]:
# We also tried combining a paragraph vector from Distributed Bag of Words (DBOW) and Distributed Memory (DM) together for evaluation to check if it improves performance. 


In [None]:
# Release training-only state of both models now that training is finished;
# the flags ask gensim to keep the document-tag vectors and inference support,
# which the concatenation step below still reads via model.docvecs.
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)


In [None]:
# #### Concatenate two models with feature vectors


In [None]:
def get_concat_vectors(model1,model2, corpus_size, vectors_size, vectors_type):
    """Concatenate the document vectors of two doc2vec models row by row.

    For every tag ``"<vectors_type>_<i>"`` the vector from ``model1`` is
    followed by the vector from ``model2``.

    :param model1: first trained model (read through ``model1.docvecs``)
    :param model2: second trained model (read through ``model2.docvecs``)
    :param corpus_size: number of documents to fetch
    :param vectors_size: length of one concatenated vector, i.e. the sum of
        both models' vector sizes
    :param vectors_type: tag prefix, e.g. 'Train' or 'Test'
    :return: array of shape (corpus_size, vectors_size)
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = np.append(model1.docvecs[prefix],model2.docvecs[prefix])
    # The original returned the undefined name `vector`, which raised NameError.
    return vectors

In [None]:

# 600 = 300 (DBOW) + 300 (DM): one concatenated vector per document.
train_vecs_dbow_dm = get_concat_vectors(model_dbow,model_dm, len(X_train), 600, 'Train')
test_vecs_dbow_dm = get_concat_vectors(model_dbow,model_dm, len(X_test), 600, 'Test')


In [None]:
# #### Train Logistic Regression Models


In [None]:
# Re-fit the existing logistic-regression estimator on the concatenated vectors.
logreg = logreg.fit(train_vecs_dbow_dm, y_train)
y_pred = logreg.predict(test_vecs_dbow_dm)

# Metrics take (y_true, y_pred); the order matters for the weighted F1, which
# weights per-class scores by the support of its first argument.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

log_entry = pd.DataFrame([["Doc2Vec (dbow + dm) - LogisticRegression",accuracy_score(y_test, y_pred),f1_score(y_test, y_pred,average='weighted')]], columns=log_cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
log = pd.concat([log, log_entry])


In [None]:
# XGBoost on the concatenated DBOW + DM vectors.
# NOTE(review): unlike the other XGBoost fits in this notebook, no
# sample_weight is passed here — confirm whether that is intentional.
xgb_clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
xgb_clf = xgb_clf.fit(train_vecs_dbow_dm, y_train)
y_pred = xgb_clf.predict(test_vecs_dbow_dm)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))



In [None]:
# <b> We don't see much improvement in the performance using these word embeddings compared to the benchmark model using the TF-IDF approach </b>


In [None]:

# Index the scoreboard by classifier name and show it in ascending weighted-F1 order.
# set_index(..., inplace=True) mutates `log`, so re-running this cell fails
# because the 'Classifier' column is gone after the first run.
log.set_index(["Classifier"],inplace=True)
log.sort_values(by=['f1_score'])


In [None]:
# Horizontal bar chart of accuracy and weighted F1 per classifier, sorted by F1.
log.sort_values(by=['f1_score']).plot(kind='barh',figsize=[7,6])


In [None]:

from scipy import spatial
# Sample ticket text for a smoke test; a one-element list, because the
# pipelines' predict() expects an iterable of documents.
sentence = ['The job did not start this morning on time']


In [None]:
# Predict the sample ticket with the XGBoost and logistic-regression TF-IDF
# pipelines and map the integer class ids back to group names.
print(encoder.inverse_transform(xgboost.predict(sentence)))
print(encoder.inverse_transform(logreg_1.predict(sentence)))

In [None]:
### Save the model
from sklearn.externals import joblib
# Persist the TF-IDF + logistic-regression pipeline. The original target
# '..dataset/...' lacked the path separator; '../dataset/...' is the location
# the model is loaded from later in the notebook.
joblib.dump(logreg_1, '../dataset/auto_ticket_assignment.pkl', compress=1)

In [None]:
# Lookup table from group name to the integer class id assigned by the encoder.
le_name_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
le_name_mapping


In [None]:
# NOTE(review): sklearn.externals.joblib is only available in older scikit-learn
# releases — confirm against the installed version.
from sklearn.externals import joblib
# Reload the persisted pipeline. joblib files are pickle-based, so only load
# files this project wrote itself.
model = joblib.load('../dataset/auto_ticket_assignment.pkl')

# Round-trip check: classify one raw sentence and map the id back to a group name.
sentence = 'not able to connect to my system'
encoder.inverse_transform(model.predict([sentence]))

In [None]:
# # Final Conclusions for Approach 1
# 
# - We first analysed the dataset provided to us and understood its structure - number of columns, fields, datatypes etc.
# - We did Exploratory Data Analysis to derive further insights from this data set and we found that
#     - Data is very much imbalanced, there are around ~45% of the Groups with less than 20 tickets.
#     - Few of the tickets are in foreign language like German
#     - The data has lot of noise in it, for eg- few tickets related to account setup are spread across multiple assignment groups.
#     
# - We performed the data cleaning and preprocessing
#     - Translation: A small number of tickets were written in German. Hence, we used the Google translate python api  to convert German to English to generate the input data for the next steps. However, the google translator rest api can only process a limited number of texts on a daily basis, so we translated the text in batches and saved the file for further processing.
#     - Make text all lowercase so that the algorithm does not treat the same words in different cases as different
#     - Removing Noise i.e everything that isn’t in a standard number or letter i.e Punctuation, Numerical values
#     - Removing extra spaces
#     - Removed punctuations
#     - Removed words containing numbers
#     - Stopword Removal: Sometimes, some extremely common words which would appear to be of little value in helping select documents matching a user need are excluded from the vocabulary entirely. These words are called stop words
#     - Lemmatization
#     - Tokenization: Tokenization is just the term used to describe the process of converting the normal text strings into a list of tokens i.e words that we actually want. Sentence tokenizer can be used to find the list of sentences and Word tokenizer can be used to find the list of words in strings.
#     
# 
# - We then ran a basic benchmark model using the cleaned and preprocessed dataset
#     - Since the dataset is very imbalanced, We considered a subset of groups for predictions.  In 74 groups, 46% of tickets belong to group 1 and 16 groups just have more than 100 tickets, rest of the Assignment groups have very less ticket counts which might not add much value to the model prediction. If we conducted random sampling towards all the subcategories, then we would face a problem that we might miss all the tickets in some categories. Hence, we considered the groups that have more than 100 tickets. 
#     - We trained the data using below models:
#         - Multinomial NB
#         - Linear Support vector Machine
#         - Logistic regression
#         - Xgboost
#         
# - LinearSVC gives better performance with 
#     - accuracy 0.833642
#     - f1 score 0.818053
# 
# <b> Although, it seems like the call is biased towards GRP_0 which has a majority of samples. </b>
# 
# 
# 
# - Even after downsampling the data we see that the predictions are biased towards GRP_0 which has a majority of samples.
# - Imbalance causes two problems:
#     - Training is inefficient as most samples are easy examples that contribute no useful learning signal;
#     - The easy examples can overwhelm training and lead to degenerate models.
#     A common solution is to perform some form of hard negative mining that samples hard examples during training or more complex sampling/re weighing schemes.In order to handle the imbalance problem  we used class_weight=balanced hyperparameter while training the model, which tells the model to "pay more attention" to samples from an under-represented class.  
# - Although the accuracy and f1_score went down, this ensured that the classes were being classified with fewer misclassifications and good precision/recall scores for all the classes.
# 
# - Next, we also tried using pretrained word embedding, but the only challenge was that we could not find any embeddings trained on ITSM data. We used the glove model with 100d for this, and then used logistic regression and Xgboost to train the model. But, the scores were poorer than the benchmark model.
#  
# - Then, we also tried vector space modelling using Doc2Vec with DistributedBOW and Distributed Memory approach, though ‘Doc2Vec’ is a more advanced model in NLP rather than ‘Tf-Idf’, but still in our case, it is not giving proper results. We have tried with a linear  & boosting based classifier respectively.
#   
#   In our dataset, ‘texts’ are domain-specific. Furthermore, the ‘Doc2Vec’ model is more suitable for well-written, grammatically correct texts. In our case, texts are quite rough in nature. It has also been shown in various examples and data scientists’ experiments that, although the ‘Tf-Idf’ model is considered inferior to ‘Doc2Vec’, it still gives better results when classifying very domain-specific texts.
#  
#  
#  - Linear SVC gave better performance with hyperparameter tuning and this model would be used for classifying the tickets into one of the groups.
#     - accuracy 0.797441
#     - f1 score 0.797100
# 
# The performance can be further improved by collecting more data for tickets and by running deep learning models like RNN and LSTM's.
