rozpoznawanie typów reklamacji

In [None]:
import sys, os, re, time
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

# import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import metrics

from imblearn.over_sampling import RandomOverSampler

import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SeparableConv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Conv1D

from tensorflow.keras import regularizers
from tensorflow.python.keras import initializers
from tensorflow.keras import backend as K

from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text


In [None]:
# Install a TensorFlow session limited to 4 threads inside a single op and
# 4 threads across independent ops as the Keras backend session.
# NOTE(review): tf.Session / tf.ConfigProto / K.set_session are TensorFlow 1.x
# style APIs - confirm the TensorFlow version this notebook targets.
K.set_session(tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=4, inter_op_parallelism_threads=4)))

Ładowanie danych z pliku xlsx

In [None]:
# Directory holding the input spreadsheets; later cells also write their
# output files (dictionaries) there.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
datapath='/mnt/c/dev/reklamacje/'
datafile='reklamacje_20181106_train.xlsx'
# Raw complaint data exactly as read from the Excel file.
dane_surowe=pd.read_excel(os.path.join(datapath,datafile))


In [None]:
# dane_surowe.info()

In [None]:
dane_surowe.columns

In [None]:
# Keep only the complaint text and its label, under the working column names
# 'content' and 'category' used by the rest of the notebook.
input_data=dane_surowe[['tresc_zgl','typ_train']].rename(
    columns={'tresc_zgl':'content','typ_train':'category'})

In [None]:
input_data.describe()

In [None]:
# Data cleaning: first drop rows that are exact duplicates of an earlier row,
# then drop rows with a missing value in any column.
input_data=input_data.drop_duplicates().dropna()

In [None]:
# find duplicates
input_data[input_data.duplicated(keep=False)]

In [None]:
# input_data['content']

Załadowanie słownika synonimów / podmian

In [None]:
# Dictionary of synonyms / replacements
# NOTE(review): preprocess_texts unpacks every row as a (pattern, replacement)
# pair, so the sheet presumably has exactly two columns - confirm.

podmiany=pd.read_excel(os.path.join(datapath,'roboczy_slownik_synonimow.xlsx'))

preprocessing of content text

In [None]:
def preprocess_texts(raw_texts,replacements):
    """Lower-cases the texts and applies regex replacements from a table.

    # Arguments
        raw_texts: pd.Series containing the strings to be preprocessed.
        replacements: pd.DataFrame with two columns; every row is a pair
            (regular expression to look for, text to put in its place).
            Rows are applied in order, each one to the result of the
            previous ones. Patterns are matched against lower-cased text.

    # Returns
        pd.Series with the corrected texts (same index as raw_texts).
    """
    resulttext=raw_texts.str.lower()
    for co,naco in replacements.values:
        # An empty pattern cell arrives as NaN; str(NaN) would silently turn
        # it into the regex 'nan', so rows without a pattern are skipped.
        if pd.isna(co):
            continue
        # An empty replacement cell means "remove the match" rather than
        # "insert the text 'nan'".
        replacement='' if pd.isna(naco) else str(naco)
        # regex=True must be explicit: newer pandas defaults to literal
        # matching and refuses a compiled pattern in that mode.
        resulttext=resulttext.str.replace(re.compile(str(co)),replacement,regex=True)
    return resulttext


In [None]:
prep_texts=preprocess_texts(input_data['content'],podmiany)

In [None]:
prep_texts.head()

In [None]:
def preprocess_labels(raw_labels,interesting_labels):
    """
    Collapse every label outside the list of interest into one 'OTHER' label.

    raw_labels: pd.Series with the original labels
    interesting_labels: collection of labels that are kept unchanged

    returns pd.Series (same index as raw_labels) in which each value is either
    one of interesting_labels or 'OTHER'
    """
    def keep_or_other(label):
        # Membership test is exact, so it is case-sensitive for strings.
        if label in interesting_labels:
            return label
        return 'OTHER'

    return raw_labels.apply(keep_or_other)

In [None]:
# Categories that keep their own class; preprocess_labels replaces every other
# category with 'OTHER'.
interesting_categories=['XDR','XOA','XRF','XSP']
prep_labels=preprocess_labels(input_data['category'],interesting_categories)

In [None]:
prep_labels.value_counts()

In [None]:
def categorize_labels(labels,dictionary=None):
    """Encodes labels as integer codes.

    # Arguments
        labels: pd.Series (or other array-like) of labels.
        dictionary: optional sequence / pd.Index of unique labels; the code of
            a label is its position in this sequence. When None, the
            dictionary is built from `labels` with pd.factorize, i.e. in
            order of first appearance.

    # Returns
        (codes, uniques): integer array with one code per label, and the
        dictionary of unique labels the codes refer to. When a dictionary is
        given, labels that are not in it get the code -1.

    # Raises
        pandas InvalidIndexError if a supplied dictionary contains duplicates.
    """
    # `is None` instead of `== None`: comparing an array-like dictionary with
    # `==` yields an array whose truth value is ambiguous.
    if dictionary is None:
        return pd.factorize(labels)
    uniques = pd.Index(dictionary)
    return uniques.get_indexer(labels), uniques

In [None]:
cat_labels,label_dict=categorize_labels(prep_labels)


In [None]:
# pd.Series(cat_labels).value_counts()

In [None]:
label_dict

In [None]:
# save dictionary to the disk
pd.DataFrame(label_dict).to_excel(os.path.join(datapath,'slownik_kategorii.xlsx'))

split data to train and validation parts

In [None]:
# Hold out 25% of the samples for validation; shuffling uses a fixed seed.
# NOTE(review): the split is not stratified, so the class proportions of the
# two parts may differ - consider passing stratify=cat_labels.
texts_train,texts_val,y_train,y_val=train_test_split(prep_texts.values,
                                                     cat_labels,test_size=0.25,random_state=1, shuffle=True)

In [None]:
pd.Series(y_train).value_counts()

In [None]:
pd.Series(y_val).value_counts()

In [None]:

print(texts_train.shape)
print(y_train.shape)
print(texts_val.shape)
print(y_val.shape)

In [None]:
pd.Series(y_train).value_counts()

Data analysis

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def get_num_words_per_sample(sample_texts):
    """Computes the median word count over a corpus.

    # Arguments
        sample_texts: iterable of str, the sample texts.

    # Returns
        float, median number of whitespace-separated tokens per sample.
    """
    word_counts = []
    for sample in sample_texts:
        word_counts.append(len(sample.split()))
    return np.median(word_counts)

def plot_sample_length_distribution(sample_texts):
    """Draws a 50-bin histogram of sample lengths and shows it.

    The length of a sample is its number of characters (len of the string),
    not its number of words.

    # Arguments
        sample_texts: iterable of str, the sample texts.
    """
    char_counts = list(map(len, sample_texts))
    plt.hist(char_counts, bins=50)
    plt.xlabel('Length of a sample')
    plt.ylabel('Number of samples')
    plt.title('Sample length distribution')
    plt.show()


In [None]:
get_num_words_per_sample(texts_train)

In [None]:
plot_sample_length_distribution(texts_train)

In [None]:
# Vocabulary size limit: passed as num_words to the tokenizer and used as the
# upper bound on the number of rows of the embedding matrix.
TOP_K = 10000
# Limit on the length of text sequences. Sequences longer than this
# will be truncated.
MAX_SEQUENCE_LENGTH = 500


def sequence_vectorize(train_texts, val_texts):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length. The vocabulary is built
    from the training texts only and limited by TOP_K; the sequence length is
    the length of the longest training sequence, capped at
    MAX_SEQUENCE_LENGTH.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary.
    """
    # Create vocabulary with training texts.
    # The backslash in the filter list is written as '\\' - a bare '\]' is an
    # invalid escape sequence that newer Python versions warn about; the
    # resulting string is unchanged.
    # NOTE(review): underscore, tab and newline are not in the filter list -
    # confirm that this is intentional.
    tokenizer = text.Tokenizer(num_words=TOP_K,filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~')
    tokenizer.fit_on_texts(train_texts)
    print(f'Number of words in word_index={len(tokenizer.word_index)}')
    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Longest training sequence, capped at MAX_SEQUENCE_LENGTH.
    max_length = min(max(len(seq) for seq in x_train), MAX_SEQUENCE_LENGTH)

    # Fix sequence length to max value. With the pad_sequences defaults,
    # shorter sequences are padded at the beginning and longer ones are
    # truncated at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index


In [None]:
tic=time.time()
print('vectorizing texts...')
# Vectorize texts.
x_train, x_val, word_index = sequence_vectorize(texts_train, texts_val)
toc=time.time()
print('time:',toc-tic)


In [None]:
max(x_train[1])

create CNN model

In [None]:
# balance classes in training set

# oversampling to boost minority classes
ros = RandomOverSampler(random_state=0)

x_train_balanced, y_train_balanced = ros.fit_resample(x_train,y_train)
# NOTE(review): the validation set is oversampled as well, so the validation
# loss/accuracy reported during training are computed on duplicated samples
# with an artificial class balance. The per-class statistics later in the
# notebook use the original x_val / y_val instead. Confirm this is intended.
x_val_balanced, y_val_balanced = ros.fit_resample(x_val,y_val)

# shuffle the oversampled training set (fixed seed); the validation set is
# left in the order returned by the sampler
x_train_balanced, y_train_balanced = shuffle(x_train_balanced, y_train_balanced, random_state=0)
#x_train_balanced = x_train_balanced.reshape(-1)

In [None]:
print(texts_train.shape)
print(y_train.shape)
print(texts_val.shape)
print(y_val.shape)
print(x_train_balanced.shape)
print(y_train_balanced.shape)
print(x_val_balanced.shape)
print(y_val_balanced.shape)

In [None]:
# Read the pre-trained embedding file and get word to word vector mappings.
embedding_matrix_all = {}
# We are using fasttext generated embeddings.
fname = os.path.join(datapath, 'emb200ft.vec')
# The encoding is stated explicitly so that non-ASCII words are decoded the
# same way regardless of the platform's default locale.
# NOTE(review): assumes the .vec file is UTF-8 encoded - confirm.
with open(fname, encoding='utf-8') as f:
    # get first line with emb size
    firstline = f.readline()
    # embedding size is the second number in the first line of the file
    embedding_dim=int(firstline.split(' ')[1])
    print (f'embedding_dim={embedding_dim}')
    for line in f:  # Every line contains word followed by the vector value
        values = line.split()
        # Skip blank lines and lines that do not split into exactly one word
        # plus embedding_dim numbers; they would otherwise fail in the float
        # conversion or produce a vector of the wrong length.
        if len(values) != embedding_dim + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_matrix_all[word] = coefs

# Prepare embedding matrix with just the words in our word_index dictionary
num_words = min(len(word_index) + 1, TOP_K)
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    # Indexes at or above TOP_K have no row in the matrix.
    if i >= TOP_K:
        continue
    embedding_vector = embedding_matrix_all.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# save dictionary to disk
pd.DataFrame(list(word_index.items())).to_excel(os.path.join(datapath,'slownik_word_index.xlsx'))
        

In [None]:
print(embedding_matrix.shape)
#print(embedding_matrix[1])
print(len(word_index))


In [None]:
def sepcnn_model(blocks,
                 filters,
                 kernel_size,
                 embedding_dim,
                 dropout_rate,
                 pool_size,
                 input_shape,
                 num_classes,
                 num_features,
                 use_pretrained_embedding=False,
                 is_embedding_trainable=False,
                 embedding_matrix=None):
    """Builds a 1D convolutional text classifier on top of an embedding layer.

    Despite its name the model stacks plain Conv1D layers, not
    SeparableConv1D ones. Layout: Embedding -> `blocks` x (Conv1D,
    MaxPooling1D, Dropout) -> GlobalAveragePooling1D -> softmax Dense.

    # Arguments
        blocks: int, number of convolution + pooling + dropout blocks.
        filters: int, number of filters of the first convolution; block
            number k (counting from 1) uses k * filters.
        kernel_size: int, length of the convolution window.
        embedding_dim: int, dimension of the embedding vectors.
        dropout_rate: float, rate passed to the Dropout layers.
        pool_size: int, factor by which to downscale input at MaxPooling layer.
        input_shape: tuple, shape of one input sample; only its first
            element (the sequence length) is used.
        num_classes: int, number of output classes.
        num_features: int, number of words (embedding input dimension).
        use_pretrained_embedding: bool, when true the embedding layer is
            initialised with embedding_matrix.
        is_embedding_trainable: bool, trainable flag of the embedding layer;
            only applied when use_pretrained_embedding is true.
        embedding_matrix: initial embedding weights, used only when
            use_pretrained_embedding is true.

    # Returns
        An uncompiled Sequential model instance.
    """
    # Arguments shared by both embedding variants; the pre-trained variant
    # additionally gets its initial weights and the trainable flag.
    embedding_kwargs = dict(input_dim=num_features,
                            output_dim=embedding_dim,
                            input_length=input_shape[0])
    if use_pretrained_embedding:
        embedding_kwargs.update(weights=[embedding_matrix],
                                trainable=is_embedding_trainable)

    model = models.Sequential()
    model.add(Embedding(**embedding_kwargs))

    for block_no in range(1, blocks + 1):
        model.add(Conv1D(filters=block_no*filters, kernel_size=kernel_size, activation='relu'))
        model.add(MaxPooling1D(pool_size=pool_size))
        model.add(Dropout(rate=dropout_rate))

    model.add(GlobalAveragePooling1D())
    model.add(Dense(units=num_classes, activation='softmax'))
    return model

In [None]:
# Limit on the number of features (vocabulary size): the TOP_K constant set
# earlier in the notebook (10000) is reused here. Former setting, kept for reference:
# TOP_K = 20000


def _plot_history_metric(history, metric, title, ylabel):
    """Plots the per-epoch training and validation curves of one metric.

    # Arguments
        history: dict, Keras training history (metric name -> list of values).
        metric: str, key of the training values; the validation values are
            read from 'val_' + metric.
        title: str, plot title.
        ylabel: str, label of the y axis.
    """
    plt.plot(history[metric])
    plt.plot(history['val_' + metric])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()


def train_sequence_model(data,
                         num_classes,
                         learning_rate=1e-3,
                         epochs=1000,
                         batch_size=128,
                         blocks=2,
                         filters=64,
                         dropout_rate=0.2,
                         embedding_dim=200,
                         kernel_size=3,
                         pool_size=3):
    """Trains sequence model on the given dataset.

    The model is built by sepcnn_model with a frozen pre-trained embedding
    layer. The embedding weights are NOT an argument: they are read from the
    module-level variable `embedding_matrix`, and the vocabulary limit from
    the module-level constant TOP_K.

    Side effects: prints progress, shows accuracy and loss plots, and saves
    the trained model to 'reklamacje_sepcnn_model.h5' in the current working
    directory.

    # Arguments
        data: tuple ((x_train, y_train), (x_val, y_val), word_index) with the
            vectorized training and validation texts, their integer labels
            and the tokenizer's word index.
        num_classes: int, number of output classes; valid labels are
            0 .. num_classes - 1.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        blocks: int, number of convolution and pooling blocks in the model.
        filters: int, base number of convolution filters in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.
        embedding_dim: int, dimension of the embedding vectors.
        kernel_size: int, length of the convolution window.
        pool_size: int, factor by which to downscale input at MaxPooling layer.

    # Returns
        Tuple (validation accuracy, validation loss, model); both metrics are
        the values of the last epoch.

    # Raises
        ValueError: If validation data has label values outside
            range(num_classes).
    """
    # Get the data.
    (x_train, y_train), (x_val, y_val), word_index = data

    print('creating model...')

    # Verify that validation labels are in the same range as training labels.
    unexpected_labels = [v for v in y_val if v not in range(num_classes)]
    if unexpected_labels:
        raise ValueError('Unexpected label values found in the validation set:'
                         ' {unexpected_labels}. Please make sure that the '
                         'labels in the validation set are in the same range '
                         'as training labels.'.format(
                             unexpected_labels=unexpected_labels))

    # Number of features will be the embedding input dimension. Add 1 for the
    # reserved index 0.
    num_features = min(len(word_index) + 1, TOP_K)

    # Create model instance. `embedding_matrix` is the module-level matrix of
    # pre-trained vectors.
    model = sepcnn_model(blocks=blocks,
                         filters=filters,
                         kernel_size=kernel_size,
                         embedding_dim=embedding_dim,
                         dropout_rate=dropout_rate,
                         pool_size=pool_size,
                         input_shape=x_train.shape[1:],
                         num_classes=num_classes,
                         num_features=num_features,
                         use_pretrained_embedding=True,
                         is_embedding_trainable=False,
                         embedding_matrix=embedding_matrix)

    # Compile model with learning parameters. Labels are integer codes, hence
    # the sparse variant of the cross-entropy loss.
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['acc'])

    # Early stopping on validation loss is currently disabled, so training
    # always runs for the full number of epochs. Former setting:
    # callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7)]

    # Train and validate model.
    history = model.fit(
            x_train,
            y_train,
            epochs=epochs,
            # callbacks=callbacks,
            validation_data=(x_val, y_val),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)

    # Print results (values of the last epoch).
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Plot training & validation accuracy and loss values.
    _plot_history_metric(history, 'acc', 'Model accuracy', 'Accuracy')
    _plot_history_metric(history, 'loss', 'Model loss', 'Loss')

    # Save model.
    model.save('reklamacje_sepcnn_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1], model

In [None]:
# Train on the oversampled training and validation sets; train_sequence_model
# uses word_index only to size the embedding input.
mydata=((x_train_balanced, y_train_balanced), (x_val_balanced, y_val_balanced), word_index)

# learning_rate, epochs, filters and dropout_rate override the function
# defaults; embedding_dim is the value read from the embedding file header.
myaccuracy, myloss, mymodel = train_sequence_model(mydata,
                     num_classes=len(label_dict),
                     learning_rate=1e-2,
                     epochs=7,
                     batch_size=128,
                     blocks=2,
                     filters=16,
                     dropout_rate=0.1,
                     embedding_dim=embedding_dim,
                     kernel_size=3,
                     pool_size=3)

calculate statistics per class

In [None]:
# Predict on the original (not oversampled) validation set and take the most
# probable class of every sample.
y_prob = mymodel.predict(x_val) 
y_preds = y_prob.argmax(axis=-1)
# NOTE(review): the name `cm` is also what _plot_parameters expects to be a
# colormap module - this assignment rebinds it to a confusion matrix.
cm = metrics.confusion_matrix(y_val, y_preds)

In [None]:
cm


In [None]:
# Per-class report for the categories of interest only. The codes come from
# pd.factorize, i.e. from the order of first appearance in the data, so the
# code of 'OTHER' is looked up by name instead of being assumed to be 0.
report_labels=[code for code,name in enumerate(label_dict) if name!='OTHER']
print(metrics.classification_report(list(y_val),list(y_preds),labels=report_labels,
                                    target_names=[str(label_dict[code]) for code in report_labels]))
#print(metrics.classification_report(list(y_val),list(y_preds),target_names=label_dict))


In [None]:
pd.Series(y_val).value_counts()

In [None]:
print(mymodel.summary())

In [None]:
# Same statistics for the original (not oversampled) training set.
# NOTE(review): y_prob / y_preds are reused here, so from this cell on they
# hold training-set predictions and no longer match y_val.
y_prob = mymodel.predict(x_train) 
y_preds = y_prob.argmax(axis=-1)
cm_train = metrics.confusion_matrix(y_train, y_preds)

In [None]:
cm_train

In [None]:
# Annotated heatmap of the training-set confusion matrix, with rows and
# columns labelled by category name.
df_cm_train = pd.DataFrame(cm_train, label_dict, label_dict)
plt.figure(figsize = (10,7))
sn.set(font_scale=1.0)#for label size
sn.heatmap(df_cm_train, annot=True, annot_kws={"size": 12})# font size

In [None]:
print(metrics.classification_report(list(y_train),list(y_preds)))


In [None]:
def tune_ngram_model(data):
    """Tunes n-gram model on the given dataset.

    Trains one model per (layers, units) combination and records its
    validation accuracy, loss and macro-averaged F1 over the categories of
    interest (every class except 'OTHER').

    # Arguments
        data: tuple whose first two items are (x_train, y_train) and
            (x_val, y_val). Further items (e.g. a word index) are not used
            here; the whole tuple is passed on to train_ngram_model.

    # Returns
        dict of equally long lists with the keys 'layers', 'units',
        'accuracy', 'loss' and 'f1'; one entry per trained combination.
    """
    # Select parameter values to try.
    num_layers = [1, 2, 3]
    num_units = [4, 6, 10]
    #dropouts =[0.3,0.4,0.5]

    # Save parameter combination and results.
    params = {
        'layers': [],
        'units': [],
        'accuracy': [],
        'loss':[],
        'f1':[],
    }

    # Only the validation part is needed here. Indexing instead of unpacking
    # into exactly two items also accepts the 3-item tuple
    # ((x_train, y_train), (x_val, y_val), word_index) used in this notebook.
    x_val, y_val = data[1]

    # Codes of the categories of interest, looked up by name: pd.factorize
    # numbers labels by first appearance, so 'OTHER' is not necessarily 0.
    f1_labels = [code for code, name in enumerate(label_dict) if name != 'OTHER']

    # Iterate over all parameter combinations.
    for layers in num_layers:
        for units in num_units:
            print(f'parameters: layers-{layers}, units-{units}')
            # NOTE(review): train_ngram_model is not defined in the visible
            # part of this notebook - it must be defined before this
            # function is called; confirm what `data` layout it expects.
            myaccuracy, myloss, mymodel = train_ngram_model(data,
                  num_classes=len(label_dict),
                  learning_rate=4e-3,
                  epochs=7,
                  batch_size=128,
                  layers=layers,
                  units=units,
                  dropout_rate=0.4,
                  l2=0.005)
            y_prob = mymodel.predict(x_val)
            y_preds = y_prob.argmax(axis=-1)
            # An explicit average is required for multiclass targets (the
            # default 'binary' raises); 'macro' weighs every class equally.
            myf1=metrics.f1_score(list(y_val),list(y_preds),labels=f1_labels,average='macro')
            print((f'Accuracy: {myaccuracy}, Loss: {myloss}, F1: {myf1}, Parameters: (layers={layers}, units={units})'))
            # Record the combination only after it has been trained and
            # scored, so that all lists in params stay equally long.
            params['layers'].append(layers)
            params['units'].append(units)
            params['accuracy'].append(myaccuracy)
            params['loss'].append(myloss)
            params['f1'].append(myf1)
    #_plot_parameters(params)
    return params
    
def _plot_parameters(params):
    """Creates a 3D surface plot of given parameters.

    # Arguments
        params: dict, contains layers, units and accuracy value combinations
            (equally long lists under the keys 'layers', 'units', 'accuracy').
    """
    # Imported locally under a distinct name: the notebook rebinds the global
    # name `cm` to a confusion matrix, which has no `coolwarm` attribute.
    from matplotlib import cm as mpl_cm
    # Registers the '3d' projection on older Matplotlib versions; a harmless
    # no-op on newer ones.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    fig = plt.figure()
    # add_subplot is used because passing keyword arguments to fig.gca() is
    # no longer supported by current Matplotlib.
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_trisurf(params['layers'],
                    params['units'],
                    params['accuracy'],
                    cmap=mpl_cm.coolwarm,
                    antialiased=False)
    plt.show()

In [None]:
wyniki = tune_ngram_model(mydata)