rozpoznawanie typów reklamacji

In [1]:
import sys, os, re, time
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

# import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import metrics

from imblearn.over_sampling import RandomOverSampler

import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras import regularizers

from tensorflow.keras import backend as K

Ładowanie danych z pliku xlsx

In [2]:
datapath='/mnt/c/dev/DOP-categories/'
datafile='Cases categorization.xlsx'
dane_surowe=pd.read_excel(os.path.join(datapath,datafile))


In [None]:
#dane_surowe.info()

In [3]:
dane_surowe.columns

Index(['casenumber', 'jira', 'contactemail', 'origin', 'status', '_type',
       'record_name', 'record_desc', 'createddate', 'lastmodifieddate',
       'closeddate', 'moneyrefund', 'owner_name', 'lastmod_name', 'case_desc',
       'Unnamed: 15'],
      dtype='object')

In [4]:
input_data=pd.DataFrame()
input_data[['content','category']]=dane_surowe[['case_desc','Unnamed: 15']]

In [5]:
input_data.describe()

Unnamed: 0,content,category
count,190,189
unique,186,102
top,Zawieszanie transmisji danych.,network; data transmission
freq,4,22


In [6]:
input_data.head()

Unnamed: 0,content,category
0,Problem z wyświetlaniem informacji w FAQ - w ...,FAQ; wrong copy;
1,W żadnej z kategorii nie ma sekcji 'najczęstsz...,FAQ; content categorization;
2,moje konto> mój plan > szczegóły oferty aplika...,app crashes
3,Klient zgłsza problem z wyborem numerów. Przy ...,UX
4,Klienta nie może p.rzejść przez weryfikację nu...,OTP


In [7]:
# clearing data
# remove duplicates
input_data.drop_duplicates(inplace=True)
# remove empty
input_data=input_data.dropna()

In [8]:
# find duplicates
input_data[input_data.duplicated(keep=False)]

Unnamed: 0,content,category


In [None]:
# input_data['content']

załadowanie słowników tłumaczeń

In [9]:
# Słownik synonimów / podmian

podmiany=pd.read_excel(os.path.join(datapath,'preproc_dict.xlsx'))

preprocessing of content text

In [10]:
def preprocess_texts(raw_texts,replacements):
    """
    texts: np.Series containing strings to be preprocessed
    replacements: pairs of what convert to what
    return np.Series with corrected texts
    """
    resulttext=raw_texts.str.lower()
    for [co,naco,_] in replacements.values:
       resulttext=resulttext.str.replace(re.compile(str(co)),str(naco))
    return resulttext


In [11]:
prep_texts=preprocess_texts(input_data['content'],podmiany)

In [12]:
prep_texts.head()

0    problem z wyświetlaniem informacji w faq w kat...
1    w żadnej z kategorii nie ma sekcji najczęstsze...
2    moje konto mój plan szczegóły oferty aplikacja...
3    klient zgłsza problem z wyborem numerów przy w...
4    klienta nie może p rzejść przez weryfikację nu...
Name: content, dtype: object

In [None]:
#type(prep_texts)

In [None]:
# prep_texts.to_csv(os.path.join(datapath,'texts_for_emb.txt'))
# incorrect <- validation set użyty do nauki - trzeba poprawić
# prep_texts.to_csv(os.path.join(datapath,'texts_for_emb.txt'),sep='\n',index=False)

In [19]:
def preprocess_labels(raw_labels):
    result_labels=raw_labels.split(';')  # split on ';'
    # remove leading space and replace spaces inside to underscore
    result_labels=['__' + x.strip().replace(' ','_') + '__' for x in result_labels]
    result_string=' '.join(result_labels)
    return result_string

In [21]:

prep_labels=input_data['category'].apply(preprocess_labels)

In [22]:
type(prep_labels)

pandas.core.series.Series

In [None]:
def categorize_labels(labels,dictionary=None):
    if dictionary==None:
        cat_labels, uniques = pd.factorize(labels)
    else:
        None # dorobić mapowania jeśli słownik już był podany
    return cat_labels, uniques

In [None]:
cat_labels,label_dict=categorize_labels(prep_labels)


In [None]:
# pd.Series(cat_labels).value_counts()

In [23]:
prep_labels.head()


0                __FAQ__ __wrong_copy__ ____
1    __FAQ__ __content_categorization__ ____
2                            __app_crashes__
3                                     __UX__
4                                    __OTP__
Name: category, dtype: object

split data to train and validation parts

In [None]:
texts_train,texts_val,y_train,y_val=train_test_split(prep_texts.values,
                                                     cat_labels,test_size=0.25,random_state=1, shuffle=True)

In [None]:
#save file for fasttext
pd.Series(texts_train).to_csv(os.path.join(datapath,'texts_for_emb.txt'),sep='\n',index=False)

In [None]:
# oversampling to boost minority classes
ros = RandomOverSampler(random_state=0)

texts_train, y_train = ros.fit_resample(np.reshape(texts_train, (-1, 1)),y_train)

# shuffle to be sure 
texts_train, y_train = shuffle(texts_train, y_train, random_state=0)

texts_train=texts_train.reshape(-1)

In [None]:
# prepare files for fasstext
fasttext_train_set=[]
for i,t in enumerate(y_train):
    fasttext_train_set.append('__label__'+str(t)+' '+texts_train[i])

fasttext_val_set=texts_val
    

In [None]:
pd.Series(fasttext_train_set).to_csv(os.path.join(datapath,'fasttext_train_set.txt'),sep='\n',index=False)
pd.Series(fasttext_val_set).to_csv(os.path.join(datapath,'fasttext_val_set.txt'),sep='\n',index=False)

In [None]:
# teraz odpalamy fasttext i w wyniku otrzymujemy predictions.txt

In [None]:
y_val_predicted=pd.read_csv(os.path.join(datapath,'predictions.txt'), header=None)

In [None]:
drop_fasttext_label = lambda x: int(re.sub('__label__','',x))

In [None]:
y_val_predicted=y_val_predicted.applymap(drop_fasttext_label)

In [None]:
y_val_pred=y_val_predicted.values
#print(y_val_predicted)
print(y_val_pred.shape)

In [None]:
# oversampling also validation set 
#texts_val, y_val = ros.fit_resample(np.reshape(texts_val, (-1, 1)),y_val)
#texts_val=texts_val.reshape(-1)

In [None]:

print(texts_train.shape)
print(y_train.shape)
print(texts_val.shape)
print(y_val.shape)

Data analysis

In [None]:
# oversampling to boost minority classes
# ros = RandomOverSampler(random_state=0)

# x_train, y_train = ros.fit_resample(x_train,y_train)


create mlp model

calculate statistics per class

In [None]:
#y_prob = mymodel.predict(x_val) 
#y_preds = y_prob.argmax(axis=-1)
y_preds = y_val_pred
cm = metrics.confusion_matrix(y_val, y_preds)

In [None]:
print(metrics.classification_report(list(y_val),list(y_preds),labels=[1,2,3,4],target_names=label_dict[1:]))
#print(metrics.classification_report(list(y_val),list(y_preds),target_names=label_dict))

In [None]:
cm

In [None]:
df_cm = pd.DataFrame(cm, label_dict, label_dict)
plt.figure(figsize = (10,7))
sn.set(font_scale=1.0)#for label size
sn.heatmap(df_cm, annot=True, annot_kws={"size": 12})# font size

In [None]:
pd.Series(y_val).value_counts()

In [None]:
print(mymodel.summary())

In [None]:
y_prob_train = mymodel.predict(x_train) 
y_preds_train = y_prob_train.argmax(axis=-1)
cm_train = metrics.confusion_matrix(y_train, y_preds_train)

In [None]:
cm_train

In [None]:
df_cm_train = pd.DataFrame(cm_train, label_dict, label_dict)
plt.figure(figsize = (10,7))
sn.set(font_scale=1.0)#for label size
sn.heatmap(df_cm_train, annot=True, annot_kws={"size": 12})# font size

In [None]:
print(metrics.classification_report(list(y_train),list(y_preds_train),labels=[1,2,3,4],target_names=label_dict[1:]))

In [None]:
def tune_ngram_model(data):
    """Tunes n-gram model on the given dataset.

    # Arguments
        data: tuples of training and test texts and labels.
    """
    
    # Select parameter values to try.
    num_layers = [1, 2, 3]
    num_units = [4, 6, 10]
    #dropouts =[0.3,0.4,0.5]

    # Save parameter combination and results.
    params = {
        'layers': [],
        'units': [],
        'accuracy': [],
        'loss':[],
        'f1':[],
    }
    
    (x_train, y_train), (x_val, y_val) = data

    # Iterate over all parameter combinations.
    for layers in num_layers:
        for units in num_units:
                params['layers'].append(layers)
                params['units'].append(units)
                print(f'parameters: layers-{layers}, units-{units}')
                myaccuracy, myloss, mymodel = train_ngram_model(data,
                      num_classes=len(label_dict),
                      learning_rate=4e-3,
                      epochs=7,
                      batch_size=128,
                      layers=layers,
                      units=units,
                      dropout_rate=0.4,
                      l2=0.005)
                y_prob = mymodel.predict(x_val) 
                y_preds = y_prob.argmax(axis=-1)
                myf1=metrics.f1_score(list(y_val),list(y_preds),labels=[1,2,3,4])
                print((f'Accuracy: {myaccuracy}, Loss: {myloss}, F1: {myf1}, Parameters: (layers={layers}, units={units})'))
                params['accuracy'].append(myaccuracy)
                params['loss'].append(myloss)
                params['f1'].append(myf1)
    #_plot_parameters(params)
    return params
    
def _plot_parameters(params):
    """Creates a 3D surface plot of given parameters.

    # Arguments
        params: dict, contains layers, units and accuracy value combinations.
    """
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.plot_trisurf(params['layers'],
                    params['units'],
                    params['accuracy'],
                    cmap=cm.coolwarm,
                    antialiased=False)
    plt.show()

In [None]:
wyniki = tune_ngram_model(mydata)