In [17]:
import heapq
import pickle
from collections import Counter

import imblearn
import nltk
import numpy as np
import pandas as pd
import spacy
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from langdetect import detect
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from xgboost import XGBClassifier

import utils.list_helping_functions as lhf
import utils.model_helping_functions as mhf
import utils.processing_helping_functions as phf

## Load and clean the data

In [18]:
# Load the ticket export and keep only rows that have both a team label and a
# description; three teams are excluded from the classification task.
df = pd.read_excel("DFI -Ticketing.xlsx")
df = df.dropna(subset=["Team", "Ticket-Description"])
df = df[~df['Team'].isin(['SUPPORT BT', 'INFOGERANCE GLOBALE', 'OZITEM'])]

# One raw document per ticket: subject immediately followed by description.
# NOTE(review): the two columns are concatenated without a separator and
# 'Incident-Subject' is not NaN-filtered -- confirm phf.cleaning copes with both.
corpus = (df['Incident-Subject'] + df['Ticket-Description']).tolist()

# phf.cleaning returns the cleaned texts plus the positions of the documents it
# discarded; the same positions are removed from df so rows and texts stay aligned.
cleaned_texts, drop_ind = phf.cleaning(corpus)
df = df.drop(df.index[drop_ind])
assert df.shape[0] == len(cleaned_texts), (
    "df and cleaned texts are misaligned: %d rows vs %d texts"
    % (df.shape[0], len(cleaned_texts))
)

# The lemmatised / stop-word-filtered corpus is cached on disk.
# To regenerate the cache, run:
#corpus=phf.Lem_stopwords(cleaned_texts)
#lhf.write_list(corpus,'documents_processed_subject_description.txt')
corpus = lhf.read_list('documents_processed_subject_description.txt')

# The cached corpus is what gets vectorised later, so it must line up with the
# labels taken from df; fail here rather than deep inside the resampling step.
assert len(corpus) == df.shape[0], (
    "cached corpus is out of sync with the spreadsheet: %d documents vs %d rows"
    % (len(corpus), df.shape[0])
)

True


In [3]:
### Identify tickets by type
# Features: the preprocessed ticket texts. Labels: the team assigned to each ticket.
X = corpus
y = df['Team'].tolist()

# Show how many tickets each team has (the classes are heavily imbalanced).
Counter(y)

Counter({'PILOTAGE': 53719,
         'SUPPORT BT NETAPP': 77,
         'SAUVEGARDE': 3144,
         'AS400': 6433,
         'RESEAUX': 2846,
         'WINDOWS': 8807,
         'LINUX': 2320,
         'COBICOM': 1461,
         'SDM': 1781,
         'BT TOOLS': 1325,
         'DBA': 861,
         'OPENINFO': 45,
         'PROJET RC': 53,
         'SUPPORT QUICK-EDD': 5,
         'PROD APPLICATIVE': 1018,
         'DIR-PROD': 136,
         'SAP': 216,
         'IT INTERNE': 3516,
         'SUPPORT PLA': 1195,
         'SG-IS-SUPPORT': 32,
         'DATACENTER': 8,
         'TOYS-INNERWORK': 8,
         'AIX': 9})

## Turn words to features

In [4]:
# Turn words into features.
# Step 1: bag-of-words counts limited to the 3000 most frequent terms, ignoring
# terms present in more than 60% of the documents and French/English stop words.
vectorizer = CountVectorizer(max_features=3000, max_df=0.6, stop_words=list(fr_stop) + list(en_stop))
# Keep the counts sparse: TfidfTransformer accepts sparse input, so materialising
# a dense documents x 3000 count matrix first only costs memory.
term_counts = vectorizer.fit_transform(corpus)

# Step 2: re-weight the counts with TF-IDF. Only the final matrix is densified,
# so X has the same type and values as before for the following cells.
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(term_counts).toarray()

  'stop_words.' % sorted(inconsistent))


## Resampling the data for nearly balanced teams

In [5]:
oversampl_dict={'PILOTAGE': 53719,
         'SUPPORT BT NETAPP': 500,
         'SAUVEGARDE': 3144,
         'AS400': 6437,
         'RESEAUX': 2849,
         'WINDOWS': 8815,
         'LINUX': 2322,
         'COBICOM': 1461,
         'SDM': 1785,
         'BT TOOLS': 1325,
         'DBA': 861,
         'OPENINFO': 500,
         'PROJET RC': 500,
         'SUPPORT QUICK-EDD': 500,
         'PROD APPLICATIVE': 1018,

         'DIR-PROD': 1000,
         'SAP': 1000,
         'IT INTERNE': 3517,
         'SUPPORT PLA': 1195,
         'SG-IS-SUPPORT': 500,
         'DATACENTER': 500,
         'TOYS-INNERWORK':500,
         'AIX': 500}


In [6]:
# Oversample the minority teams up to the per-team targets in `oversampl_dict`.
# k_neighbors=3 is presumably needed because the rarest team shown in the class
# counts has only 5 tickets, too few for SMOTE's default of 5 neighbours.
sm = SMOTE(random_state=42, sampling_strategy=oversampl_dict, k_neighbors=3, n_jobs=-1)
X, y = sm.fit_resample(X, y)

# A second SMOTE pass with the same targets used to follow here. After the pass
# above every team already sits at its target, so it had nothing left to
# generate and only re-validated the whole matrix; it has been removed.

# Class distribution after resampling (pd.value_counts is deprecated in recent
# pandas; Series.value_counts is the supported equivalent).
pd.Series(y).value_counts()

## XGBoost for team classification

In [7]:
# Hold out 30% of the (resampled) data for evaluation.
# NOTE(review): X and y were oversampled with SMOTE before this split, so
# synthetic points interpolated from held-out tickets can end up in the
# training set; the test score is therefore likely optimistic.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Build the estimator from the directly imported class instead of
# `xgb.XGBClassifier`: this assignment rebinds `xgb` from the xgboost module to
# the estimator, so the old form raised AttributeError when the cell was run a
# second time. The fitted classifier keeps the name `xgb` so existing references
# to it elsewhere in the notebook keep working.
xgb = XGBClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
xgb.fit(X_train, Y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=-1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [9]:
# File holding the pickled XGBoost team classifier, resolved against the
# current working directory.
Pkl_Filename = "XGB_teams_class_.pkl"

# Saving is currently disabled. To overwrite the pickle with the classifier
# trained in this session, run:
#
#     with open(Pkl_Filename, 'wb') as file:
#         pickle.dump(xgb, file)

# Restore the previously saved classifier as `model`.
# NOTE(review): unpickling can execute arbitrary code -- only load a file that
# was produced by this notebook or comes from a trusted source.
with open(Pkl_Filename, 'rb') as pkl_in:
    model = pickle.load(pkl_in)


## XGBoost evaluation

In [10]:
# Predict on the training split first, then on the held-out split, using the
# classifier restored from the pickle.
model_train_pred, model_test_pred = (
    model.predict(split) for split in (X_train, X_test)
)

# Compare predictions with the true labels; the captured output shows the
# train/test accuracy followed by a confusion matrix.
mhf.evaluate_models(Y_train, Y_test, model_train_pred, model_test_pred)

Accuracy on Training set 0.9691588643685811
Accuracy on Testing set 0.9193223928004235
[[  158     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     1     0     0     0     0     0     0]
 [    0  1680     0     0     0     1     3     3     3     0   170     6
      0     3     0     2    36     0     0     2     0     0     6]
 [    0     2   216     1     0     2     0     4   108     1    19     0
      0     9     3     0     9     0     0     0     0     0    11]
 [    0     1     0   433     0     0     0     2     1     0     9     0
      0     1     0     1     6     0     0     0     0     0     2]
 [    0     0     0     0   141     0     0     0     0     0     1     0
      0     0     0     0     0     0     0     0     0     0     0]
 [    0     5     8     0     0   191     0     0     3     0    53     0
      0     0     0     1     9     0     0     0     0     0     3]
 [    0     1     1     0     0     1   280     2     0

## Testing a new ticket type

In [11]:
# Ask the user for the free text of a new ticket; the cell blocks until an
# answer is typed, and the raw string is what the following cells classify.
Ticket = input("Enter Ticket Subject: ") 


Enter Ticket Subject: Bonjour, mon outil de supervision m'a detectee l'alerte suivante  <https://supervision.infogerance>


In [12]:
# Classify the ticket with `model`, the classifier restored from the pickle and
# the one whose accuracy is reported in the evaluation section. The previous
# argument, `xgb`, is only a classifier when the training cell has been run in
# this session; otherwise it is still the xgboost module and prediction fails.
# The first element returned by mhf.predict is taken as the team label.
ticket_type = mhf.predict(Ticket, vectorizer, tfidfconverter, model)[0]
ticket_type


'PILOTAGE'

## Showing 5 similar tickets

In [16]:
# Find the historical tickets of the predicted team that are most similar to
# the new ticket, using spaCy document similarity on the French medium model.
nlp = spacy.load('fr_core_news_md')
ticket_ = nlp(Ticket)

# Two per-team lists: the preprocessed texts are compared against, the original
# texts are displayed.
# NOTE(review): this assumes both files list the same tickets in the same
# order -- confirm against the code that writes them.
similar_type_list_processed = lhf.read_list('subjects_teams//'+ticket_type+'preprocessed.txt')
similar_type_list_original = lhf.read_list('subjects_teams//'+ticket_type+'original.txt')

max_candidates = 3000  # cap on the number of tickets compared, to bound runtime
top_k = 5              # number of matches to display
n = min(len(similar_type_list_processed), max_candidates)

print("Please wait few minutes for processing")
# nlp.pipe processes the candidate texts as a batch stream, which is faster
# than calling nlp() once per document.
similarity = [
    ticket_.similarity(candidate)
    for candidate in nlp.pipe(similar_type_list_processed[:n])
]

# Positions of the highest similarity scores, best first.
biggest_match_index = heapq.nlargest(top_k, range(len(similarity)), key=similarity.__getitem__)
res = [similar_type_list_original[idx] for idx in biggest_match_index]

# Iterate over the matches actually found: a team with fewer than five stored
# tickets used to raise IndexError in the fixed `range(5)` loop.
for rank, match in enumerate(res, start=1):
    print("Match", rank, "\n")
    print(match)
    

Please wait few minutes for processing


  "__main__", mod_spec)


Match 1 

 ﻿Bonjour,     Notre outil de supervision nous a remonté l'alerte suivante :     Cordialement,   Pilotage D-FI﻿   
Match 2 

   Bonjour,     Notre outil de supervision nous remonte l'alerte suivante:           Cordialement.   Pilotage D-FI        
Match 3 

Bonjour,                       Notre plateforme de supervision nous remonte l'alerte suivante:         Host: FRVE-SRV-PHIND-WEB07  Service:  Service : W32Time Status :  Stopped    Best regards / Cordialement Mohamed / Pilotage D-FI 
Match 4 

 Bonjour,  Veuillez suspendre la supervision des serveurs suivants :  Du : 10/11/2018 à 11h  Au : Nous informerons de l'heure de la reprise  Motif : PRA X86    BEDAAD02  ESBAAD01  FRDAERELIA2  FRDAERELIA3  FRDAERELIA4  FRDAERELIA5  FRDAERELIA6  FRDABARTENDER1  FRDAED01  FRDAED02   FRDAED03  TEDAFDV3  FRREET30  FRDATSE2   FRDATL06  FRDABI02  FRDABI04     Cordialement ,  
Match 5 

 Bonjour,                         Notre outil de supervision nous remonte l'alerte suivante:          
