# rozpoznawanie typów zgłoszeń DOP

In [1]:
import sys, os, re, time
import pandas as pd
import numpy as np
#import seaborn as sn
import matplotlib.pyplot as plt

# import random
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import f_classif
# from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss
from sklearn.model_selection import KFold

#from imblearn.over_sampling import RandomOverSampler

import fastText



## przygotowanie danych uczących

In [2]:
# Directory that holds the input workbook; the files generated further down
# in this notebook are written to the same directory.
datapath='/mnt/c/dev/DOP-categories/'
# Excel workbook with the raw, manually categorized cases.
datafile='Cases categorization.xlsx'
dane_surowe=pd.read_excel(os.path.join(datapath,datafile))


In [3]:
#dane_surowe.info()

In [None]:
dane_surowe.columns

In [4]:
# Keep only the case description and its category column, under working names.
# NOTE(review): 'Unnamed: 15' is presumably the header-less category column of
# the workbook — confirm against the spreadsheet.
input_data = dane_surowe[['case_desc', 'Unnamed: 15']].copy()
input_data.columns = ['content', 'category']

In [None]:
input_data.head()

In [5]:
# data cleaning: discard repeated rows first, then rows with any missing value
input_data = input_data.drop_duplicates().dropna()

In [6]:
# find duplicates
input_data[input_data.duplicated(keep=False)]

Unnamed: 0,content,category


## label freq analysis & selection

In [None]:
def parse_labels(raw_labels):
    """
    Split a ';'-separated label string into a list of normalized labels.

    raw_labels: string such as 'payments; activation'
    return list of labels with surrounding whitespace removed and inner
        spaces replaced by underscores; empty fragments are dropped
    """
    # Strip BEFORE filtering, otherwise whitespace-only fragments
    # (e.g. the tail of 'app; ') would end up as '' labels.
    stripped = (part.strip() for part in raw_labels.split(';'))
    return [part.replace(' ', '_') for part in stripped if part]


In [None]:
label_lists=input_data['category'].apply(parse_labels)

In [None]:
label_lists.head()

In [None]:
from itertools import chain
from collections import Counter
label_count=Counter(chain.from_iterable(label_lists.values))

In [None]:
plt.barh(list(label_count.keys()),list(label_count.values()))

In [None]:
min_label_freq=5  # minimum number of occurrences a label needs in order to be kept
pruned_label_count = {key:value for (key,value) in label_count.items() if value>=min_label_freq}

In [None]:
plt.barh(list(pruned_label_count.keys()),list(pruned_label_count.values()))

In [None]:
interesting_labels=list(pruned_label_count.keys())
pd.Series(interesting_labels).to_csv(os.path.join(datapath,'interesting_labels.csv'),header=['label'],index=False)


## preprocessing

### preprocessing content text

In [7]:
# Słownik synonimów / podmian

podmiany=pd.read_excel(os.path.join(datapath,'preproc_dict.xlsx'))

In [8]:
def preprocess_texts(raw_texts,replacements):
    """
    Lower-case the texts and apply the replacement rules in order.

    raw_texts: pd.Series containing strings to be preprocessed
    replacements: pd.DataFrame of rules, one per row; the first column is
        used as a regular expression, the second as its substitution, any
        further columns are ignored
    return pd.Series with corrected texts
    """
    resulttext=raw_texts.str.lower()
    for co, naco, *_ in replacements.values:
        # regex=True has to be explicit: newer pandas defaults to
        # regex=False and then rejects a compiled pattern with ValueError.
        resulttext=resulttext.str.replace(re.compile(str(co)),str(naco),regex=True)
    return resulttext


In [9]:
prep_texts=preprocess_texts(input_data['content'],podmiany)

In [10]:
prep_texts[0]

'problem z wyświetlaniem informacji w faq w kategorii faq ogólne pierwszy artykuł ma tytuł tytuł dokumentu i treść przykładowy konwent dokumentu login _email msisdn _phonenumber numer zamówienia _number wystąpiło _number _number _number _number _number jira dsd _number'

### preprocessing labels

In [11]:
interesting_labels=pd.read_csv(os.path.join(datapath,'interesting_labels.csv'))

In [12]:
def preprocess_labels(raw_labels, interesting_labels):
    """
    Turn a ';'-separated label string into the list of labels worth keeping.

    raw_labels: string with labels separated by ';'
    interesting_labels: container of allowed labels; membership is checked
        with the `in` operator
    return list of normalized labels (whitespace stripped, inner spaces
        replaced by underscores) that are present in interesting_labels
    """
    kept = []
    for piece in raw_labels.split(';'):
        label = piece.strip().replace(' ', '_')
        if label in interesting_labels:
            kept.append(label)
    return kept

In [13]:

prep_labels=input_data['category'].apply(preprocess_labels, args=(interesting_labels.values,))

In [14]:
prep_labels.tail()

184                        []
185                  [OnFido]
186    [payments, activation]
187                     [app]
188              [UX, layout]
Name: category, dtype: object

## model learning - fasttext

In [None]:
# texts_train,texts_val,y_train,y_val=train_test_split(prep_texts.values, prep_labels.values,
#                                                      test_size=0.25,random_state=1, shuffle=True)

In [None]:
#save file for fasttext emb
# pd.Series(texts_train).to_csv(os.path.join(datapath,'texts_for_emb.txt'),sep='\n',index=False)

In [None]:
# oversampling to boost minority classes
# ros = RandomOverSampler(random_state=0)

# texts_train, y_train = ros.fit_resample(np.reshape(texts_train, (-1, 1)),y_train)

# shuffle to be sure 
# texts_train, y_train = shuffle(texts_train, y_train, random_state=0)

# texts_train=texts_train.reshape(-1)

In [15]:
# prepare files for fastText
def prepare_fasttext(texts, labels):
    """
    Build fastText-style training lines: '__label__a __label__b text'.

    texts: iterable of strings
    labels: iterable of label lists, aligned with texts by position
    return list of strings, one per sample; a sample without labels yields
        a line that starts with a single space followed by the text
    """
    fasttext_set=[]
    # Pair by position with zip: integer indexing (texts[i]) is label-based
    # on a pandas Series and fails when its index is not 0..n-1.
    for text, sample_labels in zip(texts, labels):
        labs=' '.join('__label__'+x for x in sample_labels)
        fasttext_set.append(labs + ' ' + text)
    return fasttext_set

def prepare_fasttext_file(texts, labels, filename):
    """
    Write the samples as a fastText input file, one sample per line.

    texts: iterable of strings
    labels: iterable of label lists, aligned with texts
    filename: name of the file to create (or overwrite) inside datapath
    """
    fasttext_set=prepare_fasttext(texts, labels)
    # Plain line-by-line writing instead of Series.to_csv: to_csv emits a
    # header row by default in current pandas and applies CSV quoting, both
    # of which would end up in the file as bogus content.
    with open(os.path.join(datapath,filename),'w',encoding='utf-8',newline='\n') as out:
        out.writelines(line + '\n' for line in fasttext_set)


In [16]:
# training & evaluation
def _f1_from(precision, recall):
    """Return the F1 score for the given precision and recall (0.0 if both are 0)."""
    total = precision + recall
    return 2*precision*recall/total if total else 0.0


def train_and_evaluate(texts, labels, params, fold_n=4, k=2):
    """
    Cross-validate a fastText supervised classifier and report averaged metrics.

    texts: array of preprocessed texts; must support indexing with an array
        of positions (e.g. a numpy array), as produced by KFold.split
    labels: array of label lists, aligned with texts and indexable the same way
    params: dict of keyword arguments forwarded to fastText.train_supervised
    fold_n: number of KFold splits (default 4)
    k: number of top predictions per sample passed to classifier.test (default 2)
    return (avg_precision, avg_recall, avg_f1); avg_f1 is derived from the
        averaged precision and recall, not averaged over the per-fold f1 values

    Side effect: every fold overwrites 'fasttext_train.txt' and
    'fasttext_val.txt' under datapath.
    """
    print(params)
    avg_precision = avg_recall = 0
    folds = KFold(n_splits=fold_n, shuffle=True, random_state=1)
    for i, (train_index, val_index) in enumerate(folds.split(texts)):
        print(f'Calculating fold {i+1}/{fold_n}...')
        # Generate batches from indices
        texts_train, texts_val = texts[train_index], texts[val_index]
        y_train, y_val = labels[train_index], labels[val_index]
        # prepare files for train and validation
        prepare_fasttext_file(texts_train, y_train, 'fasttext_train.txt')
        prepare_fasttext_file(texts_val, y_val, 'fasttext_val.txt')
        # train classifier
        classifier = fastText.train_supervised(os.path.join(datapath,'fasttext_train.txt'), **params)
        # test classifier; the first returned value (sample count) is not needed
        _, precision, recall = classifier.test(os.path.join(datapath,'fasttext_val.txt'), k=k)
        # guarded F1: a fold with precision == recall == 0 must not crash the run
        f1 = _f1_from(precision, recall)
        print(f'precision={precision:.2}, recall={recall:.2}, f1={f1:.2}')
        avg_precision+=precision/fold_n
        avg_recall+=recall/fold_n
    avg_f1 = _f1_from(avg_precision, avg_recall)
    print(f'** Average results: precision={avg_precision:.2}, recall={avg_recall:.2}, f1={avg_f1:.2}')
    return avg_precision, avg_recall, avg_f1
    

In [19]:
train_and_evaluate(prep_texts.values, prep_labels.values, {'epoch':6000, 'dim':40, 'wordNgrams':1})

{'epoch': 6000, 'dim': 40, 'wordNgrams': 1}
Calculating fold 1/4...
precision=0.37, recall=0.45, f1=0.4
Calculating fold 2/4...
precision=0.55, recall=0.64, f1=0.59
Calculating fold 3/4...
precision=0.47, recall=0.55, f1=0.51
Calculating fold 4/4...
precision=0.44, recall=0.51, f1=0.47
** Average results: precision=0.46, recall=0.54, f1=0.49


(0.45691647194095175, 0.5381419037583421, 0.4942140201494178)

## final model training

In [20]:
# training
# NOTE(review): the cross-validated run shown above used dim=40, while the
# final model is trained with dim=50 — confirm which value is intended.
best_params = {'epoch':6000, 'dim':50, 'wordNgrams':1}
# The final model is fitted on the complete data set (no hold-out part).
prepare_fasttext_file(prep_texts.values, prep_labels.values, 'fasttext_final_train.txt')
classifier = fastText.train_supervised(os.path.join(datapath,'fasttext_final_train.txt'),**best_params)

In [21]:
supp,precision,recall=classifier.test(os.path.join(datapath,'fasttext_final_train.txt'), k=2)
f1=2*precision*recall/(precision+recall)
print(f'Final model (on TRAIN data): precision={precision:.2}, recall={recall:.2}, f1={f1:.2}')

Final model (on TRAIN data): precision=0.81, recall=0.95, f1=0.88


In [22]:
classifier.save_model(os.path.join(datapath,'model'))

## result analysis

In [None]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Hamming score (label-based accuracy) for multi-label predictions.

    For every sample the score is |true AND predicted| / |true OR predicted|
    over the sets of active label positions; a sample with no true and no
    predicted labels scores 1. The mean over all samples is returned.
    normalize and sample_weight are accepted but not used.
    http://stackoverflow.com/q/32239577/395857
    '''
    per_sample = []
    for row in range(y_true.shape[0]):
        actual = set(np.where(y_true[row])[0])
        guessed = set(np.where(y_pred[row])[0])
        if not actual and not guessed:
            # nothing expected and nothing predicted counts as a perfect match
            per_sample.append(1)
            continue
        overlap = actual & guessed
        combined = actual | guessed
        per_sample.append(len(overlap) / float(len(combined)))
    return np.mean(per_sample)

In [None]:
# binarize labels in validation set
# NOTE(review): y_train / y_val are not assigned at notebook level in the
# visible cells (only inside train_and_evaluate and in a commented-out
# train_test_split cell) — confirm they are defined before running this cell.
mlb = MultiLabelBinarizer()
mlb.fit(y_train)
y_val_bin = mlb.transform(y_val)

In [None]:
mlb.classes_

In [None]:
def make_predictions(clf, texts, threshold=0.5, k=5):
    """
    Predict labels for each text and keep those reaching the threshold.

    clf: classifier exposing predict(text, k=..., threshold=...) that returns
        a (labels, probabilities) pair
    texts: iterable of strings
    threshold: minimum probability (inclusive) for a label to be kept
    k: number of top labels requested from the classifier
    return (predicted_labels, predicted_probs): two lists with one inner list
        per text; labels have the '__label__' marker removed
    """
    all_labels, all_probs = [], []
    for text in texts:
        # ask for the top-k labels unfiltered, the cut-off is applied below
        raw_labels, raw_probs = clf.predict(text, k=k, threshold=0)
        kept = [
            (name.replace('__label__', ''), prob)
            for name, prob in zip(raw_labels, raw_probs)
            if prob >= threshold
        ]
        all_labels.append([name for name, _ in kept])
        all_probs.append([prob for _, prob in kept])
    return all_labels, all_probs

In [None]:
# NOTE(review): texts_val is not assigned at notebook level in the visible
# cells. Also, if the cells ran top to bottom, `classifier` is the final model
# fitted on the full data set, so these texts were seen during training —
# confirm this evaluation is meant to be run that way.
pred_labels, pred_probs = make_predictions(classifier, texts_val, threshold=0.4, k=5)

In [None]:
y_val_preds=mlb.transform(pred_labels)

In [None]:
print(y_val_bin[0])
print(y_val_preds[0])

In [None]:
print(f"Hamming loss: {hamming_loss(y_val_bin, y_val_preds)}")
print(f"Hamming score: {hamming_score(y_val_bin, y_val_preds)}")

## fasttext internal test

In [None]:
# NOTE(review): 'fasttext_val.txt' is rewritten on every fold by
# train_and_evaluate, so at this point it holds the validation part of the
# last fold only — confirm that this is the intended test file.
result=classifier.test(os.path.join(datapath,'fasttext_val.txt'),k=5)

In [None]:
result

In [None]:
result=classifier.test_label(os.path.join(datapath,'fasttext_val.txt'),k=5)

In [None]:
result

## predictions

In [None]:
# NOTE(review): read_csv splits on commas by default and only column 0 is used
# below, so anything after a comma in a line would be ignored — confirm the
# file holds one comma-free text per line. The file name is spelled
# 'fasttest_test.txt'; verify that this matches the file on disk.
texts_test=pd.read_csv(os.path.join(datapath,'fasttest_test.txt'), header=None)
# apply the same text preprocessing as for the training data
test_set=preprocess_texts(texts_test[0],podmiany)

In [None]:
test_set.values.shape

In [None]:
def make_formatted_predictions(clf, texts, k=5, threshold=0.1):
    """
    Predict labels for each text and format them as one line per text.

    clf: classifier exposing predict(text, k=..., threshold=...) that returns
        a (labels, probabilities) pair
    texts: iterable of strings
    k: number of top labels requested from the classifier (default 5)
    threshold: probability threshold forwarded to the classifier (default 0.1)
    return list of strings of the form 'label: prob; label: prob; ' with the
        '__label__' marker removed; a text without predictions yields ''
    """
    predictions=[]
    for t in texts:
        labels, probs = clf.predict(t,k=k,threshold=threshold)
        # every entry keeps its trailing '; ' so the output format is unchanged
        predictions.append(''.join(
            l.replace('__label__','') + ': ' + str(p) + '; '
            for l, p in zip(labels, probs)
        ))
    return predictions
    

In [None]:
result = make_formatted_predictions(classifier, test_set.values)

In [None]:
pd.Series(result).to_csv(os.path.join(datapath,'predictions.csv'),index=False)