rozpoznawanie typów reklamacji

In [None]:
import sys, os, re, time
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

# import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import metrics

from imblearn.over_sampling import RandomOverSampler

import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K

In [None]:
# Cap TensorFlow at 4 threads, both inside a single op and across independent ops.
session_config = tf.ConfigProto(
    intra_op_parallelism_threads=4,
    inter_op_parallelism_threads=4,
)
K.set_session(tf.Session(config=session_config))

Ładowanie danych z pliku xlsx

In [None]:
# Directory that holds the input workbooks; later cells also read from and write to it.
datapath='/mnt/c/dev/reklamacje/'
# Workbook with the training records (judging by its file name).
datafile='reklamacje_20181106_train.xlsx'
# Load the workbook into a DataFrame.
dane_surowe=pd.read_excel(os.path.join(datapath,datafile))


In [None]:
# dane_surowe.info()

In [None]:
dane_surowe.columns

In [None]:
# Keep only the text column and the label column, renamed to the names
# the rest of the notebook works with.
input_data = dane_surowe[['tresc_zgl', 'typ_train']].rename(
    columns={'tresc_zgl': 'content', 'typ_train': 'category'}
)

In [None]:
input_data.describe()

In [None]:
# Data cleaning: drop exact duplicate rows first, then rows with a missing value.
input_data = input_data.drop_duplicates().dropna()

In [None]:
# find duplicates
input_data[input_data.duplicated(keep=False)]

In [None]:
# input_data['content']

załadowanie słowników tłumaczeń

In [None]:
# Dictionary of synonyms / replacements.
# It is passed to preprocess_texts below, which reads each row as a
# (pattern, replacement) pair.

podmiany=pd.read_excel(os.path.join(datapath,'roboczy_slownik_synonimow.xlsx'))

preprocessing of content text

In [None]:
def preprocess_texts(raw_texts,replacements):
    """
    Lower-case the texts and apply a table of regex substitutions to them.

    raw_texts: pd.Series containing strings to be preprocessed
    replacements: two-column table (e.g. a pd.DataFrame) whose rows are
        (pattern, replacement) pairs; every pattern is compiled as a regular
        expression and the pairs are applied in row order, each one working
        on the output of the previous one
    return pd.Series with corrected texts (same index as raw_texts)
    """
    resulttext = raw_texts.str.lower()
    for co, naco in replacements.values:
        # regex=True must be explicit: with the regex=False default of recent
        # pandas releases a compiled pattern is rejected with a ValueError.
        resulttext = resulttext.str.replace(re.compile(str(co)), str(naco), regex=True)
    return resulttext


In [None]:
prep_texts=preprocess_texts(input_data['content'],podmiany)

In [None]:
prep_texts.head()

In [None]:
def preprocess_labels(raw_labels,interesting_labels):
    """
    Collapse every label that is not on the whitelist into the 'OTHER' bucket.

    raw_labels: pd.Series with labels
    interesting_labels: list of labels that should be kept unchanged

    returns a new pd.Series (same index) in which whitelisted labels are
    preserved and all remaining ones are replaced by 'OTHER'
    """
    fallback = 'OTHER'

    def _keep_or_fallback(label):
        if label in interesting_labels:
            return label
        return fallback

    return raw_labels.apply(_keep_or_fallback)

In [None]:
interesting_categories=['XDR','XOA','XRF','XSP']
prep_labels=preprocess_labels(input_data['category'],interesting_categories)

In [None]:
prep_labels.value_counts()

In [None]:
def categorize_labels(labels,dictionary=None):
    """
    Encode labels as integer codes.

    labels: pd.Series (or other array-like) with label values
    dictionary: optional sequence of unique label values; when given, the code
        of a label is its position in this sequence, so a dictionary produced
        by an earlier call can be reused to encode new data consistently

    returns (cat_labels, uniques):
        cat_labels: integer array with one code per label; labels missing
            from the dictionary get -1
        uniques: the label values, ordered by their code
    """
    # `is None` rather than `== None`: comparing an array-like dictionary with
    # `==` is element-wise and cannot be used as a condition.
    if dictionary is None:
        cat_labels, uniques = pd.factorize(labels)
    else:
        uniques = pd.Index(dictionary)
        cat_labels = uniques.get_indexer(labels)
    return cat_labels, uniques

In [None]:
cat_labels,label_dict=categorize_labels(prep_labels)


In [None]:
# pd.Series(cat_labels).value_counts()

In [None]:
label_dict

In [None]:
# save dictionary to the disk
pd.DataFrame(label_dict).to_excel(os.path.join(datapath,'slownik_kategorii.xlsx'))

Split the data into training and validation parts

In [None]:
# Hold out 25% of the samples for validation; the fixed random_state makes the
# shuffled split reproducible.
# NOTE(review): the split is not stratified, so rare classes may be distributed
# unevenly between the two parts - consider stratify=cat_labels; confirm.
texts_train,texts_val,y_train,y_val=train_test_split(prep_texts.values,
                                                     cat_labels,test_size=0.25,random_state=1, shuffle=True)

In [None]:
pd.Series(y_train).value_counts()

In [None]:
pd.Series(y_val).value_counts()

In [None]:
# oversampling also validation set 
#texts_val, y_val = ros.fit_resample(np.reshape(texts_val, (-1, 1)),y_val)
#texts_val=texts_val.reshape(-1)

In [None]:

print(texts_train.shape)
print(y_train.shape)
print(texts_val.shape)
print(y_val.shape)

In [None]:
pd.Series(y_train).value_counts()

Data analysis

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def get_num_words_per_sample(sample_texts):
    """Computes the median word count over a collection of texts.

    Words are the whitespace-separated tokens produced by `split()`.

    # Arguments
        sample_texts: iterable of strings, the sample texts.

    # Returns
        The median number of words per sample, as computed by `np.median`.
    """
    tokenized = (text.split() for text in sample_texts)
    words_per_sample = [len(tokens) for tokens in tokenized]
    return np.median(words_per_sample)

def plot_sample_length_distribution(sample_texts):
    """Shows a 50-bin histogram of the sample lengths.

    The length of a sample is `len(sample)`, i.e. the number of characters
    when the samples are strings.

    # Arguments
        sample_texts: iterable of sized items (strings), the sample texts.
    """
    sample_lengths = [len(text) for text in sample_texts]
    plt.hist(sample_lengths, bins=50)
    plt.title('Sample length distribution')
    plt.xlabel('Length of a sample')
    plt.ylabel('Number of samples')
    plt.show()


In [None]:
get_num_words_per_sample(texts_train)

In [None]:
plot_sample_length_distribution(texts_train)

In [None]:
# Vectorization parameters (read by ngram_vectorize below).
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Upper limit on the number of features kept by the feature selector.
TOP_K = 500

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

# Maximum relative document/corpus frequency above which a token will be
# discarded (1.0 means no token is dropped for being too frequent).
MAX_DOCUMENT_FREQUENCY = 1.0

# Limit on the length of text sequences. Sequences longer than this
# will be truncated.
# MAX_SEQUENCE_LENGTH = 800

def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as tf-idf n-gram vectors and keeps the best features.

    1 text = 1 tf-idf vector over the n-gram vocabulary learned from the
    training texts, reduced to at most TOP_K features ranked by `f_classif`
    against the training labels.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val, vectorizer: the vectorized training and validation
        texts (float32, selected features only) and the fitted
        TfidfVectorizer.
        NOTE(review): the fitted feature selector is not returned, so the
        vectorizer alone cannot reproduce these features for new texts.
    """

    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
        'ngram_range': NGRAM_RANGE,  # n-gram sizes to extract.
        # tf-idf weights are fractional; TfidfVectorizer only accepts float
        # dtypes and warns about (then overrides) an integer dtype.
        'dtype': np.float64,
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,  # Word-level or character-level tokens.
        'min_df': MIN_DOCUMENT_FREQUENCY,
        'max_df': MAX_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)
    print(x_train.shape)
    # Vectorize validation texts with the vocabulary learned above.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features; k is capped by the vocabulary
    # size. The selector is fitted on the training data only.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    print(x_train.shape)
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val, vectorizer

In [None]:
tic=time.time()
print('vectorizing texts...')
x_train, x_val, myvectorizer = ngram_vectorize(texts_train, y_train, texts_val)
toc=time.time()
print('time:',toc-tic)


Balance the training classes and create the MLP model

In [None]:
# balance classes in training set

# oversampling to boost minority classes
ros = RandomOverSampler(random_state=0)

x_train_balanced, y_train_balanced = ros.fit_resample(x_train,y_train)
# shuffle to be sure 
x_train_balanced, y_train_balanced = shuffle(x_train_balanced, y_train_balanced, random_state=0)

#x_train_balanced = x_train_balanced.reshape(-1)

In [None]:
# NOTE(review): this rebinds the "balanced" names to the original, non-oversampled
# training data, so running this cell discards the RandomOverSampler output of the
# preceding cell. Presumably a manual switch for training without oversampling -
# confirm, and skip this cell when balanced training is intended.
x_train_balanced, y_train_balanced = x_train, y_train

In [None]:
y_train_balanced.shape

In [None]:
def mlp_model(layers, units, dropout_rate, input_shape, num_classes,l2):
    """Builds a multi-layer perceptron as a Keras `Sequential` model.

    The network is: Dropout on the input, then `layers - 1` hidden blocks
    (ReLU `Dense` with an L2 kernel regularizer, followed by Dropout), then a
    softmax `Dense` output layer.

    # Arguments
        layers: int, number of `Dense` layers in the model, output included.
        units: int, output dimension of the hidden layers.
        dropout_rate: float, fraction of inputs dropped at Dropout layers.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.
        l2: float, L2 regularization factor for the hidden layers' kernels.

    # Returns
        An (uncompiled) MLP model instance.
    """
    stack = [Dropout(rate=dropout_rate, input_shape=input_shape)]

    hidden_blocks = layers - 1
    for _ in range(hidden_blocks):
        stack.append(Dense(units=units,
                           activation='relu',
                           kernel_regularizer=regularizers.l2(l2)))
        stack.append(Dropout(rate=dropout_rate))

    stack.append(Dense(units=num_classes, activation='softmax'))
    return models.Sequential(stack)

In [None]:
def train_ngram_model(data,num_classes,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.2,
                      l2=0):
    """Trains an MLP on vectorized n-gram data, plots the curves and saves it.

    # Arguments
        data: ((x_train, y_train), (x_val, y_val)), vectorized training and
            validation inputs with their integer class labels.
        num_classes: int, number of output classes.
        learning_rate: float, learning rate for training model.
        epochs: int, maximum number of epochs.
        batch_size: int, number of samples per batch.
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of Dense layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.
        l2: float, L2 regularization factor passed on to `mlp_model`.

    # Returns
        (val_acc, val_loss, model): validation accuracy and validation loss
        of the last epoch that was run (not necessarily the best epoch), and
        the trained model.

    # Side effects
        Prints progress, shows two plots and writes the model to
        'reklamacje_mlp_model.h5' in the working directory, overwriting any
        previous file.
    """
    # Get the data.
    (x_train, y_train), (x_val, y_val) = data

    # Create model instance. The timer covers model creation and training.
    tic=time.time()
    print('creating model...')
    model = mlp_model(layers=layers,
                      units=units,
                      dropout_rate=dropout_rate,
                      input_shape=x_train.shape[1:],
                      num_classes=num_classes,
                      l2=l2)

    # Compile model with learning parameters. The sparse loss takes integer
    # class ids as targets.
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['acc'])

    # Create callback for early stopping on validation loss: training stops
    # once the loss has not improved for 5 consecutive epochs (patience=5).
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=5)]

    # Train and validate model.
    print('training...')

    history = model.fit(
            x_train,
            y_train,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(x_val, y_val),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)
    toc=time.time()
    print('time:',toc-tic)

    # Print results of the last epoch.
    history = history.history
    final_val_acc = history['val_acc'][-1]
    final_val_loss = history['val_loss'][-1]
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=final_val_acc, loss=final_val_loss))

    # Plot training & validation accuracy and loss values.
    _plot_training_curves(history, 'acc', 'Model accuracy', 'Accuracy')
    _plot_training_curves(history, 'loss', 'Model loss', 'Loss')

    # Save model.
    model.save('reklamacje_mlp_model.h5')
    return final_val_acc, final_val_loss, model


def _plot_training_curves(history, metric, title, ylabel):
    """Plots one metric's per-epoch training and validation curves."""
    plt.plot(history[metric])
    plt.plot(history['val_' + metric])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

In [None]:
mydata=((x_train_balanced, y_train_balanced), (x_val, y_val))
myaccuracy, myloss, mymodel = train_ngram_model(mydata,
                      num_classes=len(label_dict),
                      learning_rate=3e-3,
                      epochs=80,
                      batch_size=128,
                      layers=1,
                      units=8,
                      dropout_rate=0.4,
                      l2=0.000)

calculate statistics per class

In [None]:
y_prob = mymodel.predict(x_val) 
y_preds = y_prob.argmax(axis=-1)
cm = metrics.confusion_matrix(y_val, y_preds)

In [None]:
cm


In [None]:
df_cm = pd.DataFrame(cm, label_dict, label_dict)
plt.figure(figsize = (10,7))
sn.set(font_scale=1.0)#for label size
sn.heatmap(df_cm, annot=True, annot_kws={"size": 12})# font size

In [None]:
#print(metrics.classification_report(list(y_val),list(y_preds),labels=[1,2,3,4],target_names=label_dict[1:]))
print(metrics.classification_report(list(y_val),list(y_preds),target_names=label_dict))


In [None]:
pd.Series(y_val).value_counts()

In [None]:
print(mymodel.summary())

In [None]:
y_prob = mymodel.predict(x_train) 
y_preds = y_prob.argmax(axis=-1)
cm_train = metrics.confusion_matrix(y_train, y_preds)

In [None]:
cm_train

In [None]:
df_cm_train = pd.DataFrame(cm_train, label_dict, label_dict)
plt.figure(figsize = (10,7))
sn.set(font_scale=1.0)#for label size
sn.heatmap(df_cm_train, annot=True, annot_kws={"size": 12})# font size

In [None]:
print(metrics.classification_report(list(y_train),list(y_preds)))


In [None]:
def tune_ngram_model(data):
    """Tunes n-gram model on the given dataset.

    Trains one model per (layers, units) combination of a fixed 3x3 grid,
    with the remaining hyper-parameters fixed, and records the validation
    metrics of each run. The number of classes is taken from the
    module-level `label_dict`. Every run goes through `train_ngram_model`,
    so it also shows that function's plots and overwrites its saved model
    file.

    # Arguments
        data: ((x_train, y_train), (x_val, y_val)), as expected by
            `train_ngram_model`.

    # Returns
        dict with the parallel lists 'layers', 'units', 'accuracy', 'loss'
        and 'f1', one entry per trained combination.
    """

    # Select parameter values to try.
    num_layers = [1, 2, 3]
    num_units = [4, 6, 10]
    #dropouts =[0.3,0.4,0.5]

    # Save parameter combination and results.
    params = {
        'layers': [],
        'units': [],
        'accuracy': [],
        'loss':[],
        'f1':[],
    }

    # Only the validation part is needed here; training data is consumed by
    # train_ngram_model through `data`.
    _, (x_val, y_val) = data

    # Iterate over all parameter combinations.
    for layers in num_layers:
        for units in num_units:
            params['layers'].append(layers)
            params['units'].append(units)
            print(f'parameters: layers-{layers}, units-{units}')
            myaccuracy, myloss, mymodel = train_ngram_model(data,
                  num_classes=len(label_dict),
                  learning_rate=4e-3,
                  epochs=7,
                  batch_size=128,
                  layers=layers,
                  units=units,
                  dropout_rate=0.4,
                  l2=0.005)
            y_prob = mymodel.predict(x_val)
            y_preds = y_prob.argmax(axis=-1)
            # The targets are multiclass, so an explicit averaging mode is
            # required (the default 'binary' raises ValueError). 'macro'
            # weighs the listed classes equally.
            # NOTE(review): labels=[1,2,3,4] leaves out class code 0,
            # presumably the 'OTHER' bucket - confirm against label_dict.
            myf1=metrics.f1_score(list(y_val),list(y_preds),labels=[1,2,3,4],average='macro')
            print((f'Accuracy: {myaccuracy}, Loss: {myloss}, F1: {myf1}, Parameters: (layers={layers}, units={units})'))
            params['accuracy'].append(myaccuracy)
            params['loss'].append(myloss)
            params['f1'].append(myf1)
    #_plot_parameters(params)
    return params
    
def _plot_parameters(params):
    """Creates a 3D surface plot of given parameters.

    # Arguments
        params: dict, contains layers, units and accuracy value combinations
            (parallel lists under the keys 'layers', 'units', 'accuracy').
    """
    # Importing Axes3D registers the '3d' projection on older Matplotlib
    # releases; newer ones register it automatically.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    fig = plt.figure()
    # add_subplot is used because newer Matplotlib releases no longer accept
    # keyword arguments in Figure.gca().
    ax = fig.add_subplot(111, projection='3d')
    # The colormap is given by name: the module-level `cm` in this notebook
    # is the confusion matrix, not `matplotlib.cm`.
    ax.plot_trisurf(params['layers'],
                    params['units'],
                    params['accuracy'],
                    cmap='coolwarm',
                    antialiased=False)
    plt.show()

In [None]:
wyniki = tune_ngram_model(mydata)