In [18]:
import pandas as pd
import numpy as np
import nltk
import spacy
import heapq
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
import imblearn
from imblearn.over_sampling import SMOTE
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
import pickle 
import utils.helping_functions as hf
import utils.processing_helping_functions as phf
import utils.list_helping_functions as lhf
import utils.model_helping_functions as mhf

from collections import Counter


## Discovering Data


In [2]:
# Load the raw ticket export; the workbook is expected next to the notebook.
tickets_workbook = "DFI -Ticketing.xlsx"
df = pd.read_excel(tickets_workbook)
# Print (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

(89058, 29)


Unnamed: 0,ID,Structure,Team,Operator,Type,Status,Ticket Title,Incident-Subject,Ticket-Description,Number of Changes,...,Achievement%,Charge réelle,Charge estimée,SLA,Priority,Application,Environment,ServerName,Category,Sub-Category
0,55019,NUTRITION ET SANTE,PILOTAGE,ADIL ABBOU,Incidents,Clôturé définitivement,[CRITICAL] () FRDAPLANIF1 / Service: _SMAOpCon...,Bonsoir Notre outil de supervision a remo...,"Bonsoir, Le service Minos a redémarré aut...",2,...,100,00h 10min,,SLA-INC-C1,C1,Système,Windows,FRDAPLANIF1 / C1,Supervision,
1,55020,POCLAIN HYDRAULICS,PILOTAGE,ADIL ABBOU,Incidents,Clôturé définitivement,[CRITICAL] FRVE-SRV-PHIND-DB06 / Partition : E,"Bonjour, Notre plateforme de supervision ...","Bonjour, Partition OK E:\ - total: 19...",4,...,100,00h 15min,,SLA-INCIDENT-CRITICITE-2-HNO,C2-HNO,,CEGID/LINKKI,,,
2,55021,RATPDEV,PILOTAGE,PHILIPPE DUVAL,Incidents,Clôturé définitivement,Job RENUM.VAC en message,"Bonjour, Message reçu dans QSYSOPR T...",Bonjour Je clôture ce ticket : Trava...,3,...,100,00h 23min,,Incident sévérité 1,Incident Sévérité 2,AS400,AS/400,RATP-OPERA400 / C1,,
3,55023,D-FI,SUPPORT BT NETAPP,AURELIEN CHASSAIN,Incidents,Clôturé définitivement,srvsanrdcb // NetApp : Etat Filer,"Bonjour, Notre outil de supervision nous ...","bon pour cloture Cordialement, Aurélien Cha...",2,...,100,00h 15min,,,3-Haut,Autre,Autre,,,
4,55024,POCLAIN HYDRAULICS,PILOTAGE,ADIL ABBOU,Incidents,Clôturé définitivement,[FRVE-SRV-H34VM] Service PeopleSoft_D__PS_HOME...,"Bonjour, Host: FRVE-SRV-H34VM @NAT : 10....","Bonjour, Le service a été relancé avec su...",2,...,100,00h 10min,,SLA-INCIDENT-CRITICITE-2-HNO,C2-HNO,,CEGID/LINKKI,,,


In [8]:
# Number of missing values per column (isna is the canonical name for isnull).
df.isna().sum()

ID                        0
Structure                 0
Team                     17
Operator                 49
Type                      0
Status                    0
Ticket Title              0
Incident-Subject          0
Ticket-Description       24
Number of Changes         0
Author                    0
Site                      0
Creation-date             0
update-date               0
Validation Time           0
Treatment Date            2
Delay                     0
Action-Time           11735
Wait Time                 0
Achievement%              0
Charge réelle             2
Charge estimée        89050
SLA                   11445
Priority               3098
Application           23126
Environment            4202
ServerName            60311
Category              31646
Sub-Category          76718
dtype: int64

In [9]:
# Print how many distinct values each column holds; NaN counts as a value,
# exactly as len(Series.unique()) would report it.
for column_name, column_values in df.items():
    print(column_name, " ", column_values.nunique(dropna=False))

ID   89058
Structure   105
Team   27
Operator   114
Type   3
Status   7
Ticket Title   60596
Incident-Subject   80064
Ticket-Description   69076
Number of Changes   93
Author   874
Site   62
Creation-date   85177
update-date   18210
Validation Time   1
Treatment Date   84519
Delay   2
Action-Time   8728
Wait Time   13289
Achievement%   9
Charge réelle   797
Charge estimée   6
SLA   199
Priority   165
Application   219
Environment   226
ServerName   1244
Category   148
Sub-Category   36


## Edit Data


In [3]:
# Delete the 'Validation Time' column since it holds only one distinct value
# Re-assign instead of mutating in place, and ignore an already-missing column,
# so that re-running this cell does not raise KeyError on 'Validation Time'.
df = df.drop(columns=['Validation Time'], errors='ignore')
# Keep only tickets that have both a team and a description.
df = df.dropna(subset=["Team", "Ticket-Description"])
# Exclude tickets owned by these three teams.
# NOTE(review): the reason for excluding them is not recorded in the notebook - confirm.
df = df[~df['Team'].isin(['SUPPORT BT','INFOGERANCE GLOBALE','OZITEM'])]
# One raw document per ticket: subject immediately followed by description.
# NOTE(review): the two fields are joined without a separator, so the last word of
# the subject fuses with the first word of the description. Inserting ' ' between
# them would fix that, but the cached processed corpus read later was built from
# the current text and would have to be regenerated at the same time.
corpus = (df['Incident-Subject'] + df['Ticket-Description']).tolist()


## Preprocessing Subject and description (cleaning and lemmatization)

In [4]:
# phf.cleaning returns the cleaned documents plus the positions of the documents
# it discarded; those positions are used to drop the matching rows from df.
# NOTE(review): this cell is not re-runnable on its own - `corpus` is overwritten
# below and the drop is positional, so a second run would remove other rows.
y, drop_ind = phf.cleaning(corpus)
df.drop(df.index[drop_ind], inplace=True)

# The cleaned documents and the remaining rows must stay aligned one-to-one.
rows_match_cleaned = df.shape[0] == len(y)
print(rows_match_cleaned)
assert rows_match_cleaned, "df rows and cleaned documents are out of sync"

# The lemmatised corpus is expensive to compute, so it is read from a cache file.
# To regenerate the cache, run the two lines below once:
#corpus=phf.Lem_stopwords(y) 
#lhf.write_list(corpus,'documents_processed_subject_description.txt')
corpus = lhf.read_list('documents_processed_subject_description.txt')

# Fail fast when the cache was produced from a different set of tickets; otherwise
# the features built from `corpus` would be paired with the wrong labels.
assert len(corpus) == df.shape[0], (
    "cached corpus has %d documents but df has %d rows; regenerate the cache"
    % (len(corpus), df.shape[0])
)

True


## Transforming Corpus to word vector

In [5]:
# French and English spaCy stop words are both removed from the corpus.
stop_words_fr_en = list(fr_stop) + list(en_stop)
# L1 normalisation is requested so each document contributes on a comparable scale.
tfidf = TfidfVectorizer(
    stop_words=stop_words_fr_en,
    norm='l1',
    smooth_idf=True,
)
docs_tfidf = tfidf.fit_transform(corpus)


  'stop_words.' % sorted(inconsistent))


## Get the relevance of each vocabulary term by summing its TF-IDF score over all documents

In [6]:
# Relevance of a term = sum of its TF-IDF weights over every document (column sum).
column_totals = docs_tfidf.sum(axis=0)
# Turn the single row of totals into a column and export it as a nested list:
# one single-element list per vocabulary index.
sum_tfidf = column_totals.T.tolist()

## Extracting top 20 Keywords

In [7]:
# Vocabulary indices of the 20 terms with the largest summed TF-IDF.
keywords_index = heapq.nlargest(
    20,
    range(len(sum_tfidf)),
    key=lambda term_idx: sum_tfidf[term_idx],
)
# Map every index back to its term through the fitted vocabulary
# (hf.get_key presumably performs the value -> key lookup; verify in utils).
top_keywords = []
for term_idx in keywords_index:
    top_keywords.append(hf.get_key(term_idx, tfidf.vocabulary_))
top_keywords
# NOTE: "FI" was originally "D-FI"; TfidfVectorizer automatically removes the dash.

['cordialement',
 'bonjour',
 'fr',
 'supervision',
 'host',
 'cgi',
 'effectuer',
 'météo',
 'service',
 'envoi',
 'jour',
 'message',
 'infogerance',
 'pilotage',
 'https',
 'srv',
 'ticket',
 'alerte',
 'gb',
 'pouvoir']

## Discover Class Balance

In [8]:
# Classification target: the ticket type; the processed corpus is the input.
X = corpus
y = df['Type']
# Show the class distribution twice: as a pandas summary and as a plain Counter.
print(y.value_counts())
Counter(y)
# The classes are imbalanced: 'Changements' is a very small minority class.

Incidents      48944
Demandes       39876
Changements      195
Name: Type, dtype: int64


Counter({'Incidents': 48944, 'Demandes': 39876, 'Changements': 195})

## CountVectorizer of corpus for text classification

In [9]:
# Bag-of-words over the 1500 most frequent terms, ignoring terms that appear in
# more than 70% of the documents and French/English stop words.
vectorizer = CountVectorizer(max_features=1500, max_df=0.7, stop_words=list(fr_stop) + list(en_stop))
# Keep the counts sparse: TfidfTransformer accepts sparse input directly, so
# densifying here would only allocate a second full documents x 1500 array.
term_counts = vectorizer.fit_transform(corpus)
tfidfconverter = TfidfTransformer()
# Densify once, at the end; the following cells work on the dense X.
X = tfidfconverter.fit_transform(term_counts).toarray()

  'stop_words.' % sorted(inconsistent))


## Oversampling the minority 'Changements' class

In [10]:
# Target class sizes after resampling.
# NOTE(review): the 'Incidents' and 'Demandes' targets are slightly above the real
# class sizes (48944 / 39876), so SMOTE also synthesises a few majority samples and
# warns about it. Listing only 'Changements' would leave the other classes untouched.
target_class_sizes = {'Incidents': 48952, 'Demandes': 39900, 'Changements': 10000}
# n_jobs is not passed: it is deprecated/removed in recent imbalanced-learn releases
# and does not affect the generated samples.
sm = SMOTE(random_state=42, sampling_strategy=target_class_sizes)
# NOTE(review): oversampling happens before the train/test split, so synthetic
# samples interpolated from future test rows can land in the training set (and the
# reverse). This inflates the 'Changements' scores; resample the training part only.
X, y = sm.fit_resample(X, y)
# pd.value_counts() is deprecated; go through a Series instead.
pd.Series(y).value_counts()


  n_samples_majority))


Incidents      48952
Demandes       39900
Changements    10000
dtype: int64

## Random forest model training

In [11]:
# Hold out 30% of the (resampled) data for testing, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
# 1000-tree random forest trained on all available cores, with a fixed seed.
rf = RandomForestClassifier(
    n_estimators=1000,
    n_jobs=-1,
    random_state=0,
)
rf.fit(X_train, Y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [14]:
# Persist / restore the model in the current working directory.
Pkl_Filename = "RF_type_class_.pkl"

# SECURITY: pickle.load can execute arbitrary code; only load a file you produced.
# NOTE(review): a model loaded from disk may have been trained on a different
# split than the current X_train / X_test; delete the file to force a refresh.
try:
    # Reuse the previously saved model when one exists.
    with open(Pkl_Filename, 'rb') as file:
        model = pickle.load(file)
except FileNotFoundError:
    # First run / fresh checkout: save the forest trained above and use it,
    # instead of failing on a missing file.
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump(rf, file)
    model = rf


## Random forest evaluation

In [15]:
# Predict on the training and the test partition (in that order), then hand
# labels and predictions to the project's evaluation helper for reporting.
model_train_pred, model_test_pred = [
    model.predict(features) for features in (X_train, X_test)
]
mhf.evaluate_models(Y_train, Y_test, model_train_pred, model_test_pred)

Accuracy on Training set 0.9990028325336725
Accuracy on Testing set 0.9432492581602374
[[ 2960     7     0]
 [    7 11416   583]
 [    0  1086 13597]]
              precision    recall  f1-score   support

 Changements       1.00      1.00      1.00      2967
    Demandes       0.91      0.95      0.93     12006
   Incidents       0.96      0.93      0.94     14683

    accuracy                           0.94     29656
   macro avg       0.96      0.96      0.96     29656
weighted avg       0.94      0.94      0.94     29656



## Testing with an example

In [17]:
# Smoke test: classify one hand-written ticket using the fitted vectorizer,
# the fitted TF-IDF transformer and the loaded model.
mytest = "si il vous plait j'ai detecte cet erreur opera cordialement nadine"
example_prediction = mhf.predict(mytest, vectorizer, tfidfconverter, model)
# The helper's result is indexable; its first element is the predicted ticket type.
example_prediction[0]

'Incidents'