In [227]:
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

from nltk import word_tokenize
import seaborn as sns

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
from sklearn import metrics

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yihuiwang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yihuiwang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yihuiwang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/yihuiwang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [228]:
df = pd.read_csv('Final_augmented.csv')

In [229]:
df.head()

Unnamed: 0,text,label2,translated
0,Le boitier métallique sur le toit de cabine es...,11,0
1,Le bon fonctionnement du système appel de seco...,0,0
2,Les données du propriétaire sont à placer en c...,4,0
3,Le test parachute ne fonctionne pas ( pas d ' ...,9,0
4,Il manque l ' attestation de régularisation da...,16,0


In [230]:
print(df['text'].apply(lambda x: len(x.split(' '))).sum())

7714


In [231]:
special_character_remover = re.compile('[/(){}\[\]\|@,;:]')
extra_symbol_remover = re.compile('[^0-9a-z #+_]')
STOPWORDS = nltk.corpus.stopwords.words('french')

In [232]:
def clean_text(text):
    #text = text.lower()
    text = special_character_remover.sub(' ', text)
    text = extra_symbol_remover.sub('', text)
    text = ''.join(c for c in text if not c.isdigit())
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    
    return text
    
df['text'] = df['text'].apply(clean_text)

In [233]:
df.head()

Unnamed: 0,text,label2,translated
0,e boitier mtallique toit cabine refixer correc...,11,0
1,e bon fonctionnement systme appel secours cabi...,0,0
2,donnes propritaire placer cabine,4,0
3,e test parachute fonctionne enclenchement bobi...,9,0
4,manque attestation rgularisation dossier scuri...,16,0


In [234]:
print(df['text'].apply(lambda x: len(x.split(' '))).sum())

4096


In [235]:
df = df.drop_duplicates(subset='text', keep="last")

In [236]:
print(df['text'].apply(lambda x: len(x.split(' '))).sum())

3832


In [237]:
df.head()

Unnamed: 0,text,label2,translated
0,e boitier mtallique toit cabine refixer correc...,11,0
1,e bon fonctionnement systme appel secours cabi...,0,0
2,donnes propritaire placer cabine,4,0
3,e test parachute fonctionne enclenchement bobi...,9,0
5,e rapport analyse risque manque dossier scurit...,17,0


In [238]:
df=df.rename(columns={"label2": "label"})

## Split the train and test

In [239]:
from sklearn.model_selection import train_test_split
X = df.text
y = df.label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [240]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((392,), (99,), (392,), (99,))

## Apply Logistic Regression

In [241]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

lr = Pipeline([('vect', TfidfVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', LogisticRegression()),
              ])

lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

print(f"Accuracy is : {accuracy_score(y_pred,y_test)}")

Accuracy is : 0.8686868686868687


In [242]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      1.00      1.00         1
           2       1.00      0.50      0.67         2
           3       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         3
           9       1.00      1.00      1.00         2
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         2
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         2
          14       1.00      1.00      1.00         3
          15       1.00      1.00      1.00         1
          16       1.00      1.00      1.00         3
          17       1.00      0.67      0.80         3
          18       1.00    

## Applying Naive Bayes Classifier

In [243]:
from sklearn.naive_bayes import MultinomialNB


naivebayes = Pipeline([('vect', TfidfVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
naivebayes.fit(X_train, y_train)

y_pred = naivebayes.predict(X_test)

print(f'accuracy {accuracy_score(y_pred,y_test)}')

accuracy 0.7171717171717171


In [244]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      1.00      1.00         1
           2       0.50      0.50      0.50         2
           3       1.00      1.00      1.00         2
           4       0.00      0.00      0.00         0
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       1.00      0.67      0.80         3
           9       1.00      1.00      1.00         2
          10       0.00      0.00      0.00         1
          11       1.00      1.00      1.00         2
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         2
          14       1.00      0.67      0.80         3
          15       1.00      1.00      1.00         1
          16       1.00      0.33      0.50         3
          17       0.12    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Confusion Matrix

In [245]:
# Confusion Matrix
cf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix \n', cf_matrix)

Confusion Matrix 
 [[1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 2 0 0]
 [0 0 0 ... 0 2 0]
 [0 0 0 ... 0 0 1]]


## Applying Xgboost Classifier

In [None]:
from xgboost import XGBClassifier

xgboost = Pipeline([('vect', TfidfVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', XGBClassifier()),
              ])
xgboost.fit(X_train, y_train)

y_pred = xgboost.predict(X_test)

print(f'accuracy {accuracy_score(y_pred,y_test)}')



accuracy 0.42424242424242425


In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      1.00      1.00         1
           2       0.00      0.00      0.00         2
           3       1.00      0.50      0.67         2
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       1.00      0.33      0.50         3
           9       1.00      1.00      1.00         2
          10       0.50      1.00      0.67         1
          11       1.00      0.50      0.67         2
          12       0.50      1.00      0.67         1
          13       1.00      0.50      0.67         2
          14       0.50      0.33      0.40         3
          15       1.00      1.00      1.00         1
          16       1.00      1.00      1.00         3
          17       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
text = "Eclairage de la gaine , à compléter rappel : 2"
cleaned_text = clean_text(text)
print(cleaned_text)

clairage gaine complter rappel


In [None]:
prediction = lr.predict([cleaned_text])
print(prediction)

[44]


In [None]:
df_original = pd.read_csv('Clean_dataset.csv')

In [None]:
df_original.head()

Unnamed: 0.1,Unnamed: 0,text,label,translated
0,0,"Eclairage de la gaine , à compléter rappel : 2",SH16,0
1,4,Le boitier métallique sur le toit de cabine es...,CAR23,0
2,5,Plaque d ' identification n ' est pas complète...,CAR26,0
3,6,Le bon fonctionnement du système appel de seco...,CAR01,0
4,8,Les données du propriétaire sont à placer en c...,CAR06,0


In [None]:
#Create a labels map
X = df_original["text"].tolist()
y = pd.get_dummies(df_original['label'])
mapping = {i: name for i, name in enumerate(y.columns)}
mapping

{0: 'CAR01',
 1: 'CAR02',
 2: 'CAR03',
 3: 'CAR04',
 4: 'CAR06',
 5: 'CAR08',
 6: 'CAR09',
 7: 'CAR12',
 8: 'CAR15',
 9: 'CAR16',
 10: 'CAR18',
 11: 'CAR23',
 12: 'CAR25',
 13: 'CAR26',
 14: 'CD02',
 15: 'CW01',
 16: 'DOC01',
 17: 'DOC02',
 18: 'DOC03',
 19: 'DOC04',
 20: 'DOC08',
 21: 'DOC09',
 22: 'DOC10',
 23: 'LD01',
 24: 'LD05',
 25: 'LD11',
 26: 'LD13',
 27: 'LD14',
 28: 'LD16',
 29: 'MOD',
 30: 'MR04',
 31: 'MR05',
 32: 'MR07',
 33: 'MR08',
 34: 'MR15',
 35: 'MR16',
 36: 'OOS',
 37: 'OTHER',
 38: 'SH01',
 39: 'SH02',
 40: 'SH03',
 41: 'SH07',
 42: 'SH11',
 43: 'SH14',
 44: 'SH16',
 45: 'SH19',
 46: 'SH23',
 47: 'SH26',
 48: 'SH27',
 49: 'SI02'}

In [None]:
print(prediction[0])

44


In [None]:
mapping[prediction[0]]

'SH16'