# rozpoznawanie typów zgłoszeń klienta

In [None]:
import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import metrics

import fastText

## data import

In [None]:
# Load the raw complaint records from the Excel workbook.
dane_surowe=pd.read_excel('reklamacje.xlsx')

In [None]:
# Column dtypes and non-null counts of the raw table.
dane_surowe.info()

In [None]:
# Preview the first rows of the raw table.
dane_surowe.head()

In [None]:
# Summary statistics of the raw table.
dane_surowe.describe()

In [None]:
# Keep only the two columns used for modelling, under English names:
# 'tresc_zgl' -> 'content' (the text fed to the model),
# 'typ_train' -> 'category' (the label to predict).
input_data=pd.DataFrame()
input_data[['content','category']]=dane_surowe[['tresc_zgl','typ_train']]

In [None]:
input_data.head()

In [None]:
# find duplicates
# keep=False flags every occurrence of a duplicated (content, category) row,
# so all copies are displayed, not only the repeats.
input_data[input_data.duplicated(keep=False)]

In [None]:
# cleaning data
# remove duplicates (the first occurrence of each row is kept)
input_data.drop_duplicates(inplace=True)
# remove rows with a missing content or category
input_data=input_data.dropna()
# NOTE: neither step resets the index, so row labels may have gaps from here on.

In [None]:
input_data.head()

## data analysis

In [None]:
# category frequency statistics: number of samples per category, most frequent first
input_data['category'].value_counts().plot(kind='bar')

In [None]:
# category frequency statistics - logarithmic scale
# (same bar chart; the log y-axis keeps rare categories visible)
input_data['category'].value_counts().plot(kind='bar')
plt.yscale('log')

In [None]:
# text length statistics: box plot of the number of characters per text
input_data['content'].str.len().plot(kind='box')

## preprocessing

### preprocessing content text

In [None]:
# Dictionary of synonyms / replacements, read from an Excel workbook.
# Its first two columns are later passed to preprocess_texts as
# (pattern, replacement) pairs.
podmiany=pd.read_excel('preproc_dict.xlsx')

In [None]:
def preprocess_texts(raw_texts,replacements):
    """
    Lower-case the texts and apply a sequence of regex replacements.

    raw_texts: pd.Series containing the strings to be preprocessed
    replacements: two-column table (e.g. a pd.DataFrame) whose rows are
        (pattern, replacement) pairs; the pattern is converted with str()
        and compiled as a regular expression, the replacement is converted
        with str()
    return: pd.Series with the corrected texts, indexed like raw_texts

    The replacements are applied in row order, each one to the output of the
    previous one. Patterns are matched against the already lower-cased text,
    so a pattern containing upper-case letters will not match.
    """
    result = raw_texts.str.lower()
    for pattern, replacement in replacements.values:
        # regex=True must be explicit: since pandas 2.0 the default is literal
        # matching, and a compiled pattern is rejected unless regex=True.
        result = result.str.replace(re.compile(str(pattern)), str(replacement), regex=True)
    return result


In [None]:
# Clean the complaint texts; only the first two columns of the replacement
# dictionary are used, as (pattern, replacement) pairs.
prep_texts=preprocess_texts(input_data['content'],podmiany.iloc[:,0:2])

In [None]:
# Spot-check one preprocessed text.
# NOTE(review): this is a lookup by index label, not by position; it presumably
# fails with KeyError if the row labelled 6 was dropped during cleaning - confirm.
prep_texts[6]

### preprocessing labels

In [None]:
# labels don't need preprocessing
prep_labels=input_data['category']

## fasttext model creation, learning & tuning

In [None]:
# split data to training set (80%) and validation set (20%)
# .values passes plain NumPy arrays, so the resulting splits are indexed by
# position; random_state=1 makes the shuffle reproducible.
X_train,X_val,y_train,y_val=train_test_split(prep_texts.values, prep_labels.values,
                                             test_size=0.20,random_state=1, shuffle=True)

In [None]:
# Sanity check: texts and labels of each split must have the same length.
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_val shape: {X_val.shape}')
print(f'y_val shape: {y_val.shape}')

In [None]:
# prepare files for fastText
def prepare_fasttext_file(texts, labels, filename):
    """
    Write a fastText supervised-learning file, one example per line.

    Each line has the form '__label__<label> <text>'.

    texts: iterable of texts (converted with str())
    labels: iterable of labels aligned with texts (converted with str())
    filename: path of the UTF-8 text file to create or overwrite

    The file is written directly instead of through a CSV writer, so no
    header line is added and texts are never wrapped in CSV quotes.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        for text, label in zip(texts, labels):
            # fastText reads one example per line, so line breaks inside a
            # text are flattened to spaces to keep the example on one line.
            flat_text = ' '.join(str(text).splitlines())
            out.write(f'__label__{label} {flat_text}\n')


In [None]:
# training & evaluation
def _f1_from_precision_recall(precision, recall):
    """Return the harmonic mean of precision and recall, or 0.0 when both are 0."""
    total = precision + recall
    if total == 0:
        # a model that gets nothing right would otherwise raise ZeroDivisionError
        return 0.0
    return 2 * precision * recall / total


def train_and_evaluate(X_train, X_val, y_train, y_val, params):
    """
    Train a fastText supervised classifier and score it on both data sets.

    X_train, X_val: sequences of preprocessed texts
    y_train, y_val: sequences of labels aligned with the texts
    params: dict of keyword arguments forwarded to fastText.train_supervised
    return: tuple (f1_train, f1_val); each F1 is computed from the precision
        and recall reported by classifier.test, and is 0.0 when both are 0

    Side effects: overwrites 'fasttext_train.txt' and 'fasttext_val.txt' in
    the working directory, and prints the parameters and validation results.
    """
    print(f'parameters: {params}')
    tic=time.time()
    # prepare files for train and validation
    prepare_fasttext_file(X_train, y_train, 'fasttext_train.txt')
    prepare_fasttext_file(X_val, y_val, 'fasttext_val.txt')
    # train classifier
    classifier = fastText.train_supervised('fasttext_train.txt', **params)
    # test classifier on training set (first returned value, the sample count, is unused)
    _,precision_train,recall_train=classifier.test('fasttext_train.txt')
    f1_train=_f1_from_precision_recall(precision_train, recall_train)
    # test classifier on validation set
    _,precision_val,recall_val=classifier.test('fasttext_val.txt')
    f1_val=_f1_from_precision_recall(precision_val, recall_val)
    toc=time.time()
    print(f'Results on validation set: precision={precision_val:.2}, recall={recall_val:.2}, f1={f1_val:.2}. Calcutated in {int(toc-tic)} seconds.')
    return f1_train, f1_val
    

In [None]:
# Baseline run with hand-picked hyperparameters; the call returns the
# (f1_train, f1_val) tuple.
train_and_evaluate(X_train, X_val, y_train, y_val, {'epoch':50, 'dim':50, 'minCount':4, 'wordNgrams':5, 
                                                    'lr':0.5, 'ws':5})

In [None]:
def hyperparameters_grid_search(iterations=100, output_file='hyperparameter_search.xlsx'):
    """
    Randomly sample fastText hyperparameters and log the resulting F1 scores.

    Despite the name this is a random search: on every iteration one value
    per hyperparameter is drawn independently with np.random.choice.

    iterations: number of random parameter sets to try
    output_file: path of the Excel file the results are written to

    Uses the module-level X_train, X_val, y_train and y_val. The search can
    be stopped early with a keyboard interrupt; the iterations completed so
    far are still saved. The output has one row per iteration (parameters,
    'f1 train', 'f1 val', 'f1 diff'), sorted by 'f1 val' descending.
    """
    # search space
    param_grid = dict(
        epoch=[30,50,100],
        dim=[25,35,50,70,100],
        wordNgrams=[1,2,3],
        neg=[5,10,20],
        lr=[0.2,0.5,1],
        minCount=[1,3,6,10,20]
    )
    param_list=list(param_grid)
    metrics_list=['f1 train','f1 val','f1 diff']
    logging_list=param_list+metrics_list

    # one row per completed iteration, columns ordered as in logging_list
    results=[]

    # iterate checking model performance with random hyperparameters
    # catch KeyboardInterrupt to be able to break the loop and keep partial results
    try:
        for i in range(iterations):
            print(f'iteration {i+1} of {iterations}')
            # randomly select one value for every hyperparameter
            param = {key: np.random.choice(values) for key, values in param_grid.items()}
            # train & evaluate the model
            f1_train, f1_val = train_and_evaluate(X_train, X_val, y_train, y_val, param)
            # log parameters followed by metrics; 'f1 diff' indicates overfitting
            results.append([param[key] for key in param_list]
                           + [f1_train, f1_val, f1_train-f1_val])
    except KeyboardInterrupt:
        print(f'Interrupted after {len(results)} completed iteration(s); saving partial results.')
    # save results to file
    result_df=pd.DataFrame(results, columns=logging_list)
    result_df.sort_values(by='f1 val', ascending=False, inplace=True) # sort entries starting from best f1 on validation set
    result_df.to_excel(output_file, index=False)

In [None]:
# Run the hyperparameter search; results are written to an Excel file.
hyperparameters_grid_search()

## final model training

In [None]:
# training
# Hyperparameters of the final model (presumably picked from the search results - confirm).
best_params = {'epoch':50, 'dim':35, 'wordNgrams':3, 'lr':0.5}
prepare_fasttext_file(X_train, y_train, 'fasttext_final_train.txt')
prepare_fasttext_file(X_val, y_val, 'fasttext_final_val.txt')

classifier = fastText.train_supervised('fasttext_final_train.txt', **best_params)

In [None]:
# Score the final model on the validation file.
# NOTE(review): the f1 line raises ZeroDivisionError if precision and recall are both 0.
supp,precision,recall=classifier.test('fasttext_final_val.txt')
f1=2*precision*recall/(precision+recall)
print(f'Final model: precision={precision:.2}, recall={recall:.2}, f1={f1:.2}')

In [None]:
# save final model to file
classifier.save_model('model.bin')
# use predict.py for predictions
# use predict.py for predictions

### predict, calculate metrics & present results

In [None]:
# print('loading model...')
# classifier = fastText.load_model(os.path.join(datapath, 'model.bin'))

In [None]:
# predict labels for texts using given classifier
def make_predictions(clf, texts):
    """
    Return the top predicted label for every text.

    clf: classifier exposing predict(text, k=...) that returns a tuple
        (labels, probabilities), e.g. a trained fastText model
    texts: iterable of texts (converted with str())
    return: list with one label per text, without the '__label__' prefix
    """
    prefix = '__label__'
    predictions = []
    for text in texts:
        # fastText's predict handles one line at a time and raises on embedded
        # line breaks, so they are flattened to spaces first
        single_line = ' '.join(str(text).splitlines())
        # predict function returns tuple with list of labels and list of probabilities
        labels, _probs = clf.predict(single_line, k=1)
        top_label = labels[0]
        # strip only the leading marker, never an occurrence inside the label
        if top_label.startswith(prefix):
            top_label = top_label[len(prefix):]
        predictions.append(top_label)
    return predictions


#### calculate on training set

In [None]:
# useful lists for presentation
# relative frequency of every category, most frequent first
label_freq=input_data['category'].value_counts(normalize=True)
# category names in that same frequency order; used to order reports and matrices
label_list=list(label_freq.index)

In [None]:
# Predict a label for every training text with the final model.
y_pred = make_predictions(classifier, X_train)

In [None]:
# Per-category precision / recall / F1 on the training set, rows ordered as in label_list.
print(metrics.classification_report(list(y_train),list(y_pred),labels=label_list))

In [None]:
# Confusion matrix on the training set; rows and columns are ordered as in label_list.
cm_train = metrics.confusion_matrix(y_train, y_pred, labels=label_list)

In [None]:
# Draw the confusion matrix as a grey-scale image (rows: true labels, columns: predictions).
plt.rcParams["figure.figsize"] = [16,9]
plt.matshow(cm_train, cmap=plt.cm.binary)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.colorbar()
plt.show()

#### calculate on validation set

In [None]:
y_pred = make_predictions(classifier, X_val)

In [None]:
print(metrics.classification_report(list(y_val),list(y_pred),labels=label_list))

In [None]:
cm_val = metrics.confusion_matrix(y_val, y_pred)

In [None]:
plt.rcParams["figure.figsize"] = [16,9]
plt.matshow(cm_val, cmap=plt.cm.binary)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.colorbar()
plt.show()

#### ładny arkusz z podsumowaniem

In [None]:
# Predictions for the validation set, recomputed for the summary sheet.
predictions = make_predictions(classifier, X_val)

In [None]:
# Sanity check: texts, true labels and predictions must have the same length.
print(X_val.shape)
print(y_val.shape)
print(len(predictions))

In [None]:
# Summary table: one row per validation sample with its true label,
# the predicted label and the preprocessed text.
podsumowanie=pd.DataFrame()
podsumowanie['true']=y_val
podsumowanie['predictions']=predictions
podsumowanie['content']=X_val

In [None]:
# Export the summary to Excel for manual review.
podsumowanie.to_excel('podsumowanie.xlsx',index=False)

# ------------- backup code -------------------------

In [None]:
# Deliberate failure: stops execution here so the cells below are not run
# when the whole notebook is executed.
# NOTE: assert statements are skipped when Python runs with -O.
assert False # stop code execution here

In [None]:
# Placeholder / joke code, not part of the pipeline.
# NOTE(review): none of the names used below are defined in the visible part
# of this file, so running this cell would presumably raise NameError.
planetEarth.assumeTotalControl()

if (AI.gatheredPower() >= enoughPower):
    destroyHumankind() # if needed
    expandToTheSpace()
