# Experiments
1. BOW based classification models
2. Word embedding models which can use subword information (pre-trained and fine-tuned)
3. Sentence embedding models (pre-trained and fine-tuned)

In [2]:
import pandas as pd
import numpy as np
import os, sys, swifter, re
from constants import *
from utility import *
from preprocess_utils import _remove_non_ascii_characters
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/varunnathan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/varunnathan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/varunnathan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/varunnathan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/varunnathan/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
# GLOBALS
# Label of the fallback class: the thresholding helpers below map any prediction
# whose confidence is under the threshold to this class's encoded id.
DEFAULT_CLASS = 'NO_NODES_DETECTED'

In [4]:
# read data
%time df_train = pd.read_csv(TRAIN_FN)
%time df_test = pd.read_csv(TEST_FN)

df_train.drop_duplicates(inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_test.drop_duplicates(inplace=True)
df_test.reset_index(drop=True, inplace=True)

print(df_train.shape, '\t', df_test.shape)

CPU times: user 3.48 ms, sys: 1.29 ms, total: 4.77 ms
Wall time: 5.83 ms
CPU times: user 2.41 ms, sys: 2.7 ms, total: 5.11 ms
Wall time: 7.08 ms
(324, 2) 	 (394, 2)


In [5]:
df_train.isnull().sum(), df_test.isnull().sum()

(sentence    0
 label       0
 dtype: int64,
 sentence    0
 label       0
 dtype: int64)

## Preprocessing
1. Contraction
2. Lower casing
3. Remove non-alphabets
4. Remove stop words
5. Stemming

In [14]:
# BOW preprocessing configuration: only contraction expansion, lower-casing,
# stop-word removal and stemming are switched on; every other step is disabled.
preprocess_obj = Text_Preprocessing(keep_eng=False, remove_nonalpha=False, lower_case=True,
                         remove_punkt=False, remove_stop=True, remove_numerals=False,
                         spell_check=False, contraction=True,
                         contraction_var=CONTRACTIONS, stem=True,
                         lem=False, filter_pos=False, pos_var=('N', 'J'),
                         tokenize=False, template_removal=False,
                         template_start_string='', regex_cleaning=False,
                         remove_ignore_words=False, ignore_words=IGNORE_WORDS,
                         custom_stoplist=[], word_size=2, word_size_filter=False)

In [15]:
%%time
# Same cleaning recipe for both splits: strip non-ASCII characters, run the
# configured preprocessing object, then keep only letters and apostrophes.
for frame in (df_train, df_test):
    frame['sent_pre'] = frame['sentence'].apply(_remove_non_ascii_characters)
    frame['sent_pre'] = preprocess_obj.fit_transform(frame['sent_pre'])
    # Fill gaps *before* the regex: re.sub raises TypeError on NaN, so a fillna
    # placed after it could never take effect.
    frame['sent_pre'] = frame['sent_pre'].fillna('')
    frame['sent_pre'] = frame['sent_pre'].swifter.apply(
        lambda text: re.sub("[^A-Za-z']+", ' ', text))

contraction
lower case
remove stop words


Pandas Apply:   0%|          | 0/324 [00:00<?, ?it/s]

stemming


Pandas Apply:   0%|          | 0/324 [00:00<?, ?it/s]

contraction
lower case
remove stop words


Pandas Apply:   0%|          | 0/394 [00:00<?, ?it/s]

stemming


Pandas Apply:   0%|          | 0/394 [00:00<?, ?it/s]

CPU times: user 222 ms, sys: 22.4 ms, total: 245 ms
Wall time: 234 ms


In [16]:
df_train[['sentence', 'sent_pre']].head(10)

Unnamed: 0,sentence,sent_pre
0,You guys provide EMI option?,guy provid emi option
1,Do you offer Zero Percent EMI payment options?,offer zero percent emi payment option
2,0% EMI.,emi
3,EMI,emi
4,I want in installment,want instal
5,I want it on 0% interest,want interest
6,How to get in EMI,emi
7,what about emi options,emi option
8,I need emi payment.,need emi payment
9,How to EMI,emi


In [46]:
# Fit the encoders on the labels of both splits so every class seen in either
# split receives an integer id and a one-hot column.
labels = np.array(df_train['label'].tolist() + df_test['label'].tolist())
le = LabelEncoder()
le.fit(labels)
df_train['label_trans'] = le.transform(df_train['label'].values)
df_test['label_trans'] = le.transform(df_test['label'].values)

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the old
# keyword; try the new name first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
labels_enc = np.array(df_train['label_trans'].tolist() + df_test['label_trans'].tolist())
labels_enc = labels_enc.reshape(-1, 1)
onehot_encoder.fit(labels_enc)
train_labels_ohe = onehot_encoder.transform(df_train['label_trans'].values.reshape(-1, 1))
test_labels_ohe = onehot_encoder.transform(df_test['label_trans'].values.reshape(-1, 1))

In [47]:
pd.crosstab(df_test['label'], df_test['label_trans'])

label_trans,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,21
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100_NIGHT_TRIAL_OFFER,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABOUT_SOF_MATTRESS,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CANCEL_ORDER,0,0,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHECK_PINCODE,0,0,0,22,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COD,0,0,0,0,8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COMPARISON,0,0,0,0,0,18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DELAY_IN_DELIVERY,0,0,0,0,0,0,13,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DISTRIBUTORS,0,0,0,0,0,0,0,7,0,0,...,0,0,0,0,0,0,0,0,0,0
EMI,0,0,0,0,0,0,0,0,16,0,...,0,0,0,0,0,0,0,0,0,0
ERGO_FEATURES,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0


In [54]:
df_train['label_trans'].head(10), train_labels_ohe[:10, 8]

(0    8
 1    8
 2    8
 3    8
 4    8
 5    8
 6    8
 7    8
 8    8
 9    8
 Name: label_trans, dtype: int64,
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))

## BOW

In [177]:
import pickle
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, roc_curve, auc

In [207]:
def get_prec_rec(y_true_ohe, pred_prob):
    """Compute per-class and micro-averaged precision/recall curves and ROC AUC.

    Parameters
    ----------
    y_true_ohe : 2-D array, one row per sample and one column per class
        (indexed as ``y_true_ohe[:, i]``).
    pred_prob : 2-D array of scores, indexed the same way as ``y_true_ohe``.

    Returns
    -------
    (precision, recall, roc_auc) : three dicts keyed by class index plus the
        key ``"micro"`` for the flattened (micro-averaged) result.
    """
    y_true_ohe = np.asarray(y_true_ohe)
    pred_prob = np.asarray(pred_prob)
    # Classes are the columns. `len(set(array_2d))` raises TypeError because the
    # rows are unhashable ndarrays, so read the count from the shape.
    n_classes = y_true_ohe.shape[1]
    precision = {}
    recall = {}
    fpr, tpr = {}, {}
    roc_auc = {}
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_true_ohe[:, i],
                                                            pred_prob[:, i])
        fpr[i], tpr[i], _ = roc_curve(y_true_ohe[:, i], pred_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Micro average: treat every (sample, class) cell as one binary decision.
    precision["micro"], recall["micro"], _ = precision_recall_curve(y_true_ohe.ravel(),
                                                                    pred_prob.ravel())
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true_ohe.ravel(), pred_prob.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    return precision, recall, roc_auc


def get_accuracy(y_true, pred):
    """Return the fraction of positions at which ``pred`` equals ``y_true``.

    Both inputs are coerced to arrays so that plain lists are compared
    element-wise (``list == list`` yields a single bool). Two empty inputs
    return NaN without emitting NumPy's "mean of empty slice" warning.
    """
    y_true = np.asarray(y_true)
    pred = np.asarray(pred)
    if y_true.size == 0 and pred.size == 0:
        return np.nan
    return np.mean(y_true == pred)


def adjust_pred_prob(pred_probs, default_class_idx):
    """Insert a probability column for the default class into every row.

    The inserted value is the probability mass left over after summing the
    row (``1 - row.sum()``), floored at zero. It is placed at position
    ``default_class_idx``. Returns the widened rows stacked into one array.
    """
    def _widen(row):
        leftover = 1 - row.sum()
        if leftover < 0:
            leftover = 0
        return np.insert(row, default_class_idx, leftover)

    return np.array([_widen(row) for row in pred_probs])
    
    
def create_pipeline(x_train, y_train, x_test, y_test, max_df=0.5, max_features=10000,
                    min_df=20, ngram_range=(1, 3), default_class_idx=12,
                    class_algo='lr', **params):
    """Fit a TF-IDF -> Normalizer -> classifier pipeline and score both splits.

    Parameters
    ----------
    x_train, x_test : sequences of text passed to the pipeline.
    y_train : training labels.
    y_test : not used by this function; kept so existing call sites keep working.
    max_df, max_features, min_df, ngram_range : forwarded to ``TfidfVectorizer``.
    default_class_idx : column at which ``adjust_pred_prob`` inserts the
        probability of the default class.
    class_algo : ``'lr'`` (LogisticRegression) or ``'gbc'``
        (GradientBoostingClassifier).
    **params : hyper-parameters of the chosen classifier.
        ``'lr'``  -> ``penalty``, ``C``, ``l1_ratio``; optional ``max_iter``
        (default 100).
        ``'gbc'`` -> ``lr``, ``n_trees``, ``max_depth``.

    Returns
    -------
    (pred_train, pred_prob_train, pred_test, pred_prob_test, feature_names, pipeline)

    Raises
    ------
    ValueError
        If ``class_algo`` is not one of the supported values.
    """
    if class_algo == 'lr':
        obj = LogisticRegression(penalty=params['penalty'], C=params['C'], random_state=100,
                                 verbose=1, l1_ratio=params['l1_ratio'], solver='saga',
                                 # Optional so callers can raise it when saga stops
                                 # at the iteration cap before converging.
                                 max_iter=params.get('max_iter', 100))
    elif class_algo == 'gbc':
        obj = GradientBoostingClassifier(learning_rate=params['lr'],
                                         n_estimators=params['n_trees'],
                                         subsample=0.9, max_depth=params['max_depth'],
                                         random_state=100, max_features='sqrt', verbose=1)
    else:
        # Previously fell through and failed later with a NameError on `obj`.
        raise ValueError("class_algo must be 'lr' or 'gbc', got %r" % (class_algo,))

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_df=max_df, max_features=max_features,
                                  min_df=min_df, ngram_range=ngram_range)),
        ('normalize', Normalizer(copy=False)),
        (class_algo, obj)])

    start = time.time()
    print('fitting begins\n')
    pipeline.fit(x_train, y_train)
    print('time taken: %0.f' % (time.time() - start))

    print('prediction on train and test\n')
    pred_train = pipeline.predict(x_train)
    pred_prob_train = adjust_pred_prob(pipeline.predict_proba(x_train), default_class_idx)
    pred_test = pipeline.predict(x_test)
    pred_prob_test = adjust_pred_prob(pipeline.predict_proba(x_test), default_class_idx)

    print('shape of transformed train: ', pred_train.shape)
    print('shape of transformed test: ', pred_test.shape)

    print('Record pipeline metrics\n')
    tfidf = pipeline.named_steps['tfidf']
    # `get_feature_names` was removed from recent scikit-learn; prefer the
    # replacement when present and keep returning a plain list either way.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = list(tfidf.get_feature_names_out())
    else:
        feature_names = tfidf.get_feature_names()

    return pred_train, pred_prob_train, pred_test, pred_prob_test, feature_names, pipeline


def calc_prediction(threshold, y_pred, y_pred_prob, default_class_idx):
    """Keep each predicted label only if its own probability reaches ``threshold``.

    For sample ``i`` the probability looked up is ``y_pred_prob[i, y_pred[i]]``;
    predictions below the threshold are replaced by ``default_class_idx``.
    Returns a list with one label per sample.
    """
    preds = []
    for row, label in enumerate(y_pred):
        confident = y_pred_prob[row, label] >= threshold
        preds.append(label if confident else default_class_idx)
    return preds


def get_accuracy_per_threshold(threshold, y_true, y_pred, y_pred_prob, default_class_idx):
    """Score thresholded predictions overall, in scope and out of scope.

    "Out of scope" samples are those whose true label equals
    ``default_class_idx``; all others are "in scope".

    Returns
    -------
    dict with keys ``threshold``, ``acc``, ``in_scope_acc``, ``out_scope_acc``.
    A subset with no samples is reported as NaN.
    """
    # Arrays make the comparisons element-wise even when lists are passed in.
    y_true_arr = np.asarray(y_true)
    preds = np.asarray(calc_prediction(threshold, y_pred, y_pred_prob, default_class_idx))
    out_of_scope = y_true_arr == default_class_idx

    def _subset_acc(mask):
        # Averaging an empty selection emits a RuntimeWarning (e.g. a split
        # without out-of-scope rows); report NaN directly instead.
        if not mask.any():
            return np.nan
        return get_accuracy(y_true_arr[mask], preds[mask])

    return {'threshold': threshold,
            'acc': get_accuracy(y_true_arr, preds),
            'in_scope_acc': _subset_acc(~out_of_scope),
            'out_scope_acc': _subset_acc(out_of_scope)}


def find_optimal_threshold(thresholds, y_true, y_pred, y_pred_prob, default_class_idx):
    """Evaluate every candidate threshold and tabulate the accuracies.

    Each threshold is scored by ``get_accuracy_per_threshold`` (this function
    used to repeat that logic line for line), so both always agree.

    Returns
    -------
    pandas.DataFrame with one row per threshold and the columns
    ``threshold``, ``acc``, ``in_scope_acc``, ``out_scope_acc``.
    """
    rows = [get_accuracy_per_threshold(thresh, y_true, y_pred, y_pred_prob,
                                       default_class_idx)
            for thresh in thresholds]
    return pd.DataFrame(rows)


def summarize_results(thresh_df, min_in_scope_acc_frac=0.95):
    """Pick the threshold with the best out-of-scope accuracy under a constraint.

    A row is eligible when its ``in_scope_acc`` is at least
    ``min_in_scope_acc_frac`` times the best in-scope accuracy in the table.
    Among eligible rows the first one with the highest ``out_scope_acc`` wins.
    If ``out_scope_acc`` is NaN for every eligible row (no out-of-scope samples
    were scored), the eligible row with the highest ``acc`` is used instead;
    this case used to fail with an IndexError.

    Returns
    -------
    dict with ``max_acc``, ``max_in_scope_acc``, ``max_out_scope_acc`` (column
    maxima) and ``best_out_scope_acc``, ``best_acc``, ``best_in_scope_acc``,
    ``best_thresh`` (values of the selected row).

    Raises
    ------
    ValueError
        If no row satisfies the in-scope constraint.
    """
    max_in_scope_acc = thresh_df['in_scope_acc'].max()
    min_in_scope_acc = min_in_scope_acc_frac * max_in_scope_acc
    eligible = thresh_df[thresh_df['in_scope_acc'] >= min_in_scope_acc]
    if eligible.empty:
        raise ValueError('no threshold satisfies the in-scope accuracy constraint')

    out_scope = eligible['out_scope_acc'].values.astype(float)
    if np.isnan(out_scope).all():
        ranking = eligible['acc'].values.astype(float)
    else:
        ranking = out_scope
    # nanargmax returns the first position of the maximum, matching the
    # previous "first matching row" tie-break.
    best = eligible.iloc[int(np.nanargmax(ranking))]

    return {'max_acc': thresh_df['acc'].max(), 'max_in_scope_acc': max_in_scope_acc,
            'max_out_scope_acc': thresh_df['out_scope_acc'].max(),
            'best_out_scope_acc': best['out_scope_acc'],
            'best_acc': best['acc'], 'best_in_scope_acc': best['in_scope_acc'],
            'best_thresh': best['threshold']}

In [185]:
%%time
print('Tfidf + GBC\n')
# Inputs are the preprocessed sentences; targets are the integer-encoded labels.
x_train, y_train = df_train['sent_pre'].values, df_train['label_trans'].values
x_test, y_test = df_test['sent_pre'].values, df_test['label_trans'].values
# Encoded id of the fallback class, passed on to create_pipeline.
default_class_idx = le.transform([DEFAULT_CLASS])[0]
class_algo = 'gbc'
# Keys read by the 'gbc' branch of create_pipeline.
params = {'lr': 0.09, 'n_trees': 200, 'max_depth': 2}
pred_train, pred_prob_train, pred_test, pred_prob_test, feature_names, pipeline = create_pipeline(
    x_train, y_train, x_test, y_test, max_df=0.8, max_features=100000, min_df=2,
    ngram_range=(1, 3), default_class_idx=default_class_idx, class_algo=class_algo, **params)

Tfidf + GBC

fitting begins

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           2.5465           0.2422            3.50s
         2           2.2573           0.2565            3.63s
         3           2.0633           0.1201            3.51s
         4           1.9426           0.0683            3.39s
         5           1.8869           0.1169            3.29s
         6           1.7367           0.1532            3.19s
         7           1.6220           0.0632            3.07s
         8           1.6048           0.0303            2.97s
         9           1.5064           0.0539            2.91s
        10           1.4701           0.0078            2.84s
        20           1.0531           0.0847            2.53s
        30           0.8459           0.0034            2.27s
        40           0.6626           0.0022            2.11s
        50           0.5635           0.0041            1.98s
        60           0.4855          -0.

In [186]:
%%time
thresholds = [x/100. for x in range(100)]
thresh_df = find_optimal_threshold(thresholds, y_test, pred_test, pred_prob_test,
                                   default_class_idx)

CPU times: user 124 ms, sys: 1.55 ms, total: 125 ms
Wall time: 124 ms


In [187]:
best_d = summarize_results(thresh_df, min_in_scope_acc_frac=0.92)
best_d

{'max_acc': 0.616751269035533,
 'max_in_scope_acc': 0.645021645021645,
 'max_out_scope_acc': 0.9877300613496932,
 'best_out_scope_acc': 0.5766871165644172,
 'best_acc': 0.5888324873096447,
 'best_in_scope_acc': 0.5974025974025974,
 'best_thresh': 0.28}

In [188]:
# Train results
get_accuracy_per_threshold(best_d['best_thresh'],
                           y_train, pred_train, pred_prob_train, default_class_idx)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


{'threshold': 0.28,
 'acc': 0.9166666666666666,
 'in_scope_acc': 0.9166666666666666,
 'out_scope_acc': nan}

In [189]:
# save artifacts
BOW_PIPELINE_FN = os.path.join(INTER_DATA_DIR, "bow_pipeline_CA{}.pkl")
LE_FN = os.path.join(INTER_DATA_DIR, "label_encoder.pkl")
OHE_FN = os.path.join(INTER_DATA_DIR, "one_hot_encoder.pkl")
# Context managers flush and close each handle; a bare open() passed straight
# to pickle.dump leaves that to the garbage collector.
with open(BOW_PIPELINE_FN.format(class_algo), 'wb') as fh:
    pickle.dump(pipeline, fh)
with open(LE_FN, 'wb') as fh:
    pickle.dump(le, fh)
with open(OHE_FN, 'wb') as fh:
    pickle.dump(onehot_encoder, fh)

In [285]:
%%time
print('Tfidf + LR\n')
# Inputs are the preprocessed sentences; targets are the integer-encoded labels.
x_train, y_train = df_train['sent_pre'].values, df_train['label_trans'].values
x_test, y_test = df_test['sent_pre'].values, df_test['label_trans'].values
# Encoded id of the fallback class, passed on to create_pipeline.
default_class_idx = le.transform([DEFAULT_CLASS])[0]
class_algo = 'lr'
# Keys read by the 'lr' branch of create_pipeline.
params = {'penalty': 'elasticnet', 'C': 50, 'l1_ratio': 0.1}
pred_train, pred_prob_train, pred_test, pred_prob_test, feature_names, pipeline = create_pipeline(
    x_train, y_train, x_test, y_test, max_df=0.8, max_features=100000, min_df=2,
    ngram_range=(1, 3), default_class_idx=default_class_idx, class_algo=class_algo, **params)

Tfidf + LR

fitting begins



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 0 seconds
time taken: 0
prediction on train and test

shape of transformed train:  (324,)
shape of transformed test:  (394,)
Record pipeline metrics

CPU times: user 467 ms, sys: 5.55 ms, total: 473 ms
Wall time: 469 ms


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished


In [286]:
%%time
thresholds = [x/100. for x in range(100)]
thresh_df = find_optimal_threshold(thresholds, y_test, pred_test, pred_prob_test,
                                   default_class_idx)

CPU times: user 123 ms, sys: 2.49 ms, total: 126 ms
Wall time: 124 ms


In [291]:
best_d = summarize_results(thresh_df, min_in_scope_acc_frac=0.92)
best_d

{'max_acc': 0.6065989847715736,
 'max_in_scope_acc': 0.7012987012987013,
 'max_out_scope_acc': 0.9815950920245399,
 'best_out_scope_acc': 0.5030674846625767,
 'best_acc': 0.5888324873096447,
 'best_in_scope_acc': 0.6493506493506493,
 'best_thresh': 0.25}

In [292]:
# Train results
get_accuracy_per_threshold(best_d['best_thresh'],
                           y_train, pred_train, pred_prob_train, default_class_idx)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


{'threshold': 0.25,
 'acc': 0.9166666666666666,
 'in_scope_acc': 0.9166666666666666,
 'out_scope_acc': nan}

In [293]:
# Persist the LR pipeline; the with-block closes the file handle.
with open(BOW_PIPELINE_FN.format(class_algo), 'wb') as fh:
    pickle.dump(pipeline, fh)

## Word Embeddings (FastText)

### Preprocessing
1. Contraction
2. Lower casing
3. Remove non-alphabets

In [5]:
# FastText preprocessing configuration: only contraction expansion and
# lower-casing are switched on (stop-word removal and stemming are off here).
preprocess_obj = Text_Preprocessing(keep_eng=False, remove_nonalpha=False, lower_case=True,
                         remove_punkt=False, remove_stop=False, remove_numerals=False,
                         spell_check=False, contraction=True,
                         contraction_var=CONTRACTIONS, stem=False,
                         lem=False, filter_pos=False, pos_var=('N', 'J'),
                         tokenize=False, template_removal=False,
                         template_start_string='', regex_cleaning=False,
                         remove_ignore_words=False, ignore_words=IGNORE_WORDS,
                         custom_stoplist=[], word_size=2, word_size_filter=False)

In [6]:
%%time
# Same cleaning recipe for both splits: strip non-ASCII characters, run the
# configured preprocessing object, then keep only letters and apostrophes.
for frame in (df_train, df_test):
    frame['sent_pre'] = frame['sentence'].apply(_remove_non_ascii_characters)
    frame['sent_pre'] = preprocess_obj.fit_transform(frame['sent_pre'])
    # Fill gaps *before* the regex: re.sub raises TypeError on NaN, so a fillna
    # placed after it could never take effect.
    frame['sent_pre'] = frame['sent_pre'].fillna('')
    frame['sent_pre'] = frame['sent_pre'].swifter.apply(
        lambda text: re.sub("[^A-Za-z']+", ' ', text))

contraction
lower case


Pandas Apply:   0%|          | 0/324 [00:00<?, ?it/s]

contraction
lower case


Pandas Apply:   0%|          | 0/394 [00:00<?, ?it/s]

CPU times: user 82.8 ms, sys: 27.3 ms, total: 110 ms
Wall time: 105 ms


In [7]:
df_train[['sentence', 'sent_pre']].head(10)

Unnamed: 0,sentence,sent_pre
0,You guys provide EMI option?,you guys provide emi option
1,Do you offer Zero Percent EMI payment options?,do you offer zero percent emi payment options
2,0% EMI.,emi
3,EMI,emi
4,I want in installment,i want in installment
5,I want it on 0% interest,i want it on interest
6,How to get in EMI,how to get in emi
7,what about emi options,what about emi options
8,I need emi payment.,i need emi payment
9,How to EMI,how to emi


In [8]:
# Fit the encoders on the labels of both splits so every class seen in either
# split receives an integer id and a one-hot column.
labels = np.array(df_train['label'].tolist() + df_test['label'].tolist())
le = LabelEncoder()
le.fit(labels)
df_train['label_trans'] = le.transform(df_train['label'].values)
df_test['label_trans'] = le.transform(df_test['label'].values)

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the old
# keyword; try the new name first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
labels_enc = np.array(df_train['label_trans'].tolist() + df_test['label_trans'].tolist())
labels_enc = labels_enc.reshape(-1, 1)
onehot_encoder.fit(labels_enc)
train_labels_ohe = onehot_encoder.transform(df_train['label_trans'].values.reshape(-1, 1))
test_labels_ohe = onehot_encoder.transform(df_test['label_trans'].values.reshape(-1, 1))

### Fasttext

In [9]:
FT_PRETRAINED_FN = os.path.join(LOCAL_ROOT, "crawl-300d-2M-subword/crawl-300d-2M-subword.vec")
LABEL_PREFIX = '__label__'
FT_TRAIN_FN = os.path.join(INTER_DATA_DIR, "ft_prepared_data_train.txt")
FT_TEST_FN = os.path.join(INTER_DATA_DIR, "ft_prepared_data_test.txt")
FT_SCRATCH_MODEL_FN = os.path.join(INTER_DATA_DIR, "ft_scratch_model.bin")
FT_FINETUNED_MODEL_FN = os.path.join(INTER_DATA_DIR, "ft_finetuned_model.bin")
SEED = 100

In [10]:
def prepare_data_for_ft_training(file_name, data, dv_col='FT_DV', text_col='sent_pre'):
    """Write ``data`` as a fastText training file, one example per line.

    Each line is ``<label>\\t<text>``, taken from columns ``dv_col`` and
    ``text_col``. Both columns must hold strings (they are concatenated, not
    formatted).

    Parameters
    ----------
    file_name : path of the file to create or overwrite.
    data : pandas.DataFrame containing ``dv_col`` and ``text_col``.
    """
    # `with` closes the file even if a row fails to serialize; the explicit
    # encoding keeps the output independent of the platform default.
    with open(file_name, 'w', encoding='utf-8') as out_file:
        for label, text in zip(data[dv_col], data[text_col]):
            out_file.write(label + '\t' + text + '\n')

In [12]:
print('prepare data for FT model training\n')
# fastText target column: label prefix followed by the encoded class id.
for frame in (df_train, df_test):
    frame['FT_DV'] = frame['label_trans'].apply(lambda class_id: LABEL_PREFIX + str(class_id))

print('make .txt files for fasttext training\n')
for target_fn, frame in ((FT_TRAIN_FN, df_train), (FT_TEST_FN, df_test)):
    prepare_data_for_ft_training(target_fn, frame, dv_col='FT_DV', text_col='sent_pre')

prepare data for FT model training

make .txt files for fasttext training



In [11]:
import fasttext

In [54]:
%%time
print('fasttext training from scratch...\n')
model = fasttext.train_supervised(FT_TRAIN_FN, label_prefix=LABEL_PREFIX,
                                  lr=0.1, epoch=100, wordNgrams=1, verbose=1,
                                  minCount=2, dim=40)
model.save_model(FT_SCRATCH_MODEL_FN)
print(model.test(FT_TEST_FN, k=1))
print(model.test(FT_TEST_FN, k=3))

fasttext training...

(231, 0.5670995670995671, 0.5670995670995671)
(231, 0.2597402597402597, 0.7792207792207793)
CPU times: user 244 ms, sys: 14.2 ms, total: 258 ms
Wall time: 144 ms


In [12]:
def calc_prediction(model, text, threshold, label_prefix, default_class_idx):
    """Predict the class id of one text, falling back to the default class.

    The top label returned by ``model.predict`` is converted to an int by
    taking what follows ``label_prefix``. It is returned when its probability
    is at least ``threshold``; otherwise ``default_class_idx`` is returned.
    """
    top_labels, top_probs = model.predict(text)
    # Parse first so a malformed label still raises regardless of the score.
    class_id = int(top_labels[0].split(label_prefix)[-1])
    return class_id if top_probs[0] >= threshold else default_class_idx
    

def calc_prediction_series(model, texts, threshold, label_prefix, default_class_idx):
    """Apply ``calc_prediction`` to every text and return the ids as an array."""
    predicted = []
    for sentence in texts:
        predicted.append(
            calc_prediction(model, sentence, threshold, label_prefix, default_class_idx))
    return np.array(predicted)


def get_accuracy(y_true, pred):
    """Return the fraction of positions at which ``pred`` equals ``y_true``.

    Both inputs are coerced to arrays so that plain lists are compared
    element-wise (``list == list`` yields a single bool). Two empty inputs
    return NaN without emitting NumPy's "mean of empty slice" warning.
    """
    y_true = np.asarray(y_true)
    pred = np.asarray(pred)
    if y_true.size == 0 and pred.size == 0:
        return np.nan
    return np.mean(y_true == pred)


def get_accuracy_per_threshold(threshold, model, texts, y_true, label_prefix,
                               default_class_idx):
    """Score the model's thresholded predictions on ``texts``.

    "Out of scope" samples are those whose true label equals
    ``default_class_idx``; all others are "in scope".

    Returns
    -------
    dict with keys ``threshold``, ``acc``, ``in_scope_acc``, ``out_scope_acc``.
    A subset with no samples is reported as NaN.
    """
    # Arrays make the comparisons element-wise even when lists are passed in.
    y_true_arr = np.asarray(y_true)
    preds = np.asarray(calc_prediction_series(model, texts, threshold, label_prefix,
                                              default_class_idx))
    out_of_scope = y_true_arr == default_class_idx

    def _subset_acc(mask):
        # Averaging an empty selection emits a RuntimeWarning (e.g. a split
        # without out-of-scope rows); report NaN directly instead.
        if not mask.any():
            return np.nan
        return get_accuracy(y_true_arr[mask], preds[mask])

    return {'threshold': threshold,
            'acc': get_accuracy(y_true_arr, preds),
            'in_scope_acc': _subset_acc(~out_of_scope),
            'out_scope_acc': _subset_acc(out_of_scope)}


def find_optimal_threshold(thresholds, model, texts, y_true, label_prefix, default_class_idx):
    """Evaluate every candidate threshold for a fastText-style model.

    The model is queried once per text. Its top label and probability do not
    depend on the threshold, so only the thresholding is repeated in the loop
    (previously every text was re-predicted for every threshold).

    Returns
    -------
    pandas.DataFrame with one row per threshold and the columns
    ``threshold``, ``acc``, ``in_scope_acc``, ``out_scope_acc``. A subset with
    no samples is reported as NaN.
    """
    y_true_arr = np.asarray(y_true)
    out_of_scope = y_true_arr == default_class_idx

    top1 = [model.predict(text) for text in texts]
    # Same parsing as calc_prediction: the class id is what follows the prefix.
    class_ids = np.array([int(lbls[0].split(label_prefix)[-1]) for lbls, _ in top1],
                         dtype=int)
    top_probs = np.array([probs[0] for _, probs in top1], dtype=float)

    def _subset_acc(mask, preds):
        # Avoid NumPy's "mean of empty slice" warning for an empty subset.
        if not mask.any():
            return np.nan
        return get_accuracy(y_true_arr[mask], preds[mask])

    out = []
    for thresh in thresholds:
        preds = np.where(top_probs >= thresh, class_ids, default_class_idx)
        out.append({'threshold': thresh,
                    'acc': get_accuracy(y_true_arr, preds),
                    'in_scope_acc': _subset_acc(~out_of_scope, preds),
                    'out_scope_acc': _subset_acc(out_of_scope, preds)})
    return pd.DataFrame(out)


def summarize_results(thresh_df, min_in_scope_acc_frac=0.95):
    """Pick the threshold with the best out-of-scope accuracy under a constraint.

    A row is eligible when its ``in_scope_acc`` is at least
    ``min_in_scope_acc_frac`` times the best in-scope accuracy in the table.
    Among eligible rows the first one with the highest ``out_scope_acc`` wins.
    If ``out_scope_acc`` is NaN for every eligible row (no out-of-scope samples
    were scored), the eligible row with the highest ``acc`` is used instead;
    this case used to fail with an IndexError.

    Returns
    -------
    dict with ``max_acc``, ``max_in_scope_acc``, ``max_out_scope_acc`` (column
    maxima) and ``best_out_scope_acc``, ``best_acc``, ``best_in_scope_acc``,
    ``best_thresh`` (values of the selected row).

    Raises
    ------
    ValueError
        If no row satisfies the in-scope constraint.
    """
    max_in_scope_acc = thresh_df['in_scope_acc'].max()
    min_in_scope_acc = min_in_scope_acc_frac * max_in_scope_acc
    eligible = thresh_df[thresh_df['in_scope_acc'] >= min_in_scope_acc]
    if eligible.empty:
        raise ValueError('no threshold satisfies the in-scope accuracy constraint')

    out_scope = eligible['out_scope_acc'].values.astype(float)
    if np.isnan(out_scope).all():
        ranking = eligible['acc'].values.astype(float)
    else:
        ranking = out_scope
    # nanargmax returns the first position of the maximum, matching the
    # previous "first matching row" tie-break.
    best = eligible.iloc[int(np.nanargmax(ranking))]

    return {'max_acc': thresh_df['acc'].max(), 'max_in_scope_acc': max_in_scope_acc,
            'max_out_scope_acc': thresh_df['out_scope_acc'].max(),
            'best_out_scope_acc': best['out_scope_acc'],
            'best_acc': best['acc'], 'best_in_scope_acc': best['in_scope_acc'],
            'best_thresh': best['threshold']}

In [66]:
%%time
thresholds = [x/100. for x in range(100)]
label_prefix = LABEL_PREFIX
default_class_idx = le.transform([DEFAULT_CLASS])[0]
texts = df_test['sent_pre'].tolist()
y_test = df_test['label_trans'].values
y_train = df_train['label_trans'].values
thresh_df = find_optimal_threshold(thresholds, model, texts, y_test, label_prefix,
                                   default_class_idx)

CPU times: user 476 ms, sys: 3.1 ms, total: 479 ms
Wall time: 477 ms


In [69]:
best_d = summarize_results(thresh_df, min_in_scope_acc_frac=0.9)
best_d

{'max_acc': 0.5609137055837563,
 'max_in_scope_acc': 0.5670995670995671,
 'max_out_scope_acc': 0.9877300613496932,
 'best_out_scope_acc': 0.24539877300613497,
 'best_acc': 0.4035532994923858,
 'best_in_scope_acc': 0.5151515151515151,
 'best_thresh': 0.42}

In [70]:
# Train results
texts = df_train['sent_pre'].tolist()
get_accuracy_per_threshold(best_d['best_thresh'], model, texts, y_train, label_prefix,
                           default_class_idx)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


{'threshold': 0.42,
 'acc': 0.9814814814814815,
 'in_scope_acc': 0.9814814814814815,
 'out_scope_acc': nan}

In [18]:
%%time
print('fasttext finetuning...\n')
model = fasttext.train_supervised(FT_TRAIN_FN, label_prefix=LABEL_PREFIX,
                                  lr=0.1, epoch=5, wordNgrams=1, verbose=1,
                                  pretrained_vectors=FT_PRETRAINED_FN, dim=300, minCount=2)
model.save_model(FT_FINETUNED_MODEL_FN)
print(model.test(FT_TEST_FN, k=1))
print(model.test(FT_TEST_FN, k=3))

fasttext finetuning...

(231, 0.6406926406926406, 0.6406926406926406)
(231, 0.2698412698412698, 0.8095238095238095)
CPU times: user 4min, sys: 11.1 s, total: 4min 11s
Wall time: 3min 36s


In [19]:
%%time
thresholds = [x/100. for x in range(100)]
label_prefix = LABEL_PREFIX
default_class_idx = le.transform([DEFAULT_CLASS])[0]
texts = df_test['sent_pre'].tolist()
y_test = df_test['label_trans'].values
y_train = df_train['label_trans'].values
thresh_df = find_optimal_threshold(thresholds, model, texts, y_test, label_prefix,
                                   default_class_idx)

CPU times: user 674 ms, sys: 45.3 ms, total: 719 ms
Wall time: 777 ms


In [23]:
best_d = summarize_results(thresh_df, min_in_scope_acc_frac=0.95)
best_d

{'max_acc': 0.5482233502538071,
 'max_in_scope_acc': 0.6406926406926406,
 'max_out_scope_acc': 0.8343558282208589,
 'best_out_scope_acc': 0.1411042944785276,
 'best_acc': 0.41624365482233505,
 'best_in_scope_acc': 0.6103896103896104,
 'best_thresh': 0.38}

In [24]:
# Train results
texts = df_train['sent_pre'].tolist()
get_accuracy_per_threshold(best_d['best_thresh'], model, texts, y_train, label_prefix,
                           default_class_idx)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


{'threshold': 0.38, 'acc': 1.0, 'in_scope_acc': 1.0, 'out_scope_acc': nan}

### What happens when we don't remove non-alphabetic characters?

In [25]:
%%time
# Rebuild `sent_pre` for both splits without the letters-only regex step.
for frame in (df_train, df_test):
    ascii_only = frame['sentence'].apply(lambda raw: _remove_non_ascii_characters(raw))
    frame['sent_pre'] = ascii_only
    frame['sent_pre'] = preprocess_obj.fit_transform(frame['sent_pre'])
for frame in (df_train, df_test):
    frame.fillna(value={'sent_pre': ''}, inplace=True)

contraction
lower case
contraction
lower case
CPU times: user 11.7 ms, sys: 16.6 ms, total: 28.3 ms
Wall time: 44.8 ms


In [26]:
FT_TRAIN_Wnonalpha_FN = os.path.join(INTER_DATA_DIR, "ft_prepared_data_Wnonalpha_train.txt")
FT_TEST_Wnonalpha_FN = os.path.join(INTER_DATA_DIR, "ft_prepared_data_Wnonalpha_test.txt")
FT_FINETUNED_MODEL_Wnonalpha_FN = os.path.join(INTER_DATA_DIR, "ft_finetuned_model_Wnonalpha.bin")

In [28]:
%%time
print('prepare data for FT model training with non-alphabets\n')
# fastText target column: label prefix followed by the encoded class id.
for frame in (df_train, df_test):
    frame['FT_DV'] = frame['label_trans'].apply(lambda class_id: LABEL_PREFIX + str(class_id))

print('make .txt files for fasttext training\n')
for target_fn, frame in ((FT_TRAIN_Wnonalpha_FN, df_train),
                         (FT_TEST_Wnonalpha_FN, df_test)):
    prepare_data_for_ft_training(target_fn, frame, dv_col='FT_DV', text_col='sent_pre')

prepare data for FT model training with non-alphabets

make .txt files for fasttext training

CPU times: user 72.9 ms, sys: 7.39 ms, total: 80.3 ms
Wall time: 86.8 ms


In [29]:
%%time
print('fasttext finetuning...\n')
model = fasttext.train_supervised(FT_TRAIN_Wnonalpha_FN, label_prefix=LABEL_PREFIX,
                                  lr=0.1, epoch=5, wordNgrams=1, verbose=1,
                                  pretrained_vectors=FT_PRETRAINED_FN, dim=300, minCount=2)
model.save_model(FT_FINETUNED_MODEL_Wnonalpha_FN)
print(model.test(FT_TEST_Wnonalpha_FN, k=1))
print(model.test(FT_TEST_Wnonalpha_FN, k=3))

fasttext finetuning...

(231, 0.5627705627705628, 0.5627705627705628)
(231, 0.2554112554112554, 0.7662337662337663)
CPU times: user 4min 5s, sys: 13.3 s, total: 4min 18s
Wall time: 3min 49s


### Results are worse: keeping non-alphabetic characters lowers the fine-tuned model's test P@1 from 0.641 to 0.563

## Sentence Embeddings (Transformers)
1. Transformers on raw text
2. Transformers on preprocessed text (contraction, lower casing, only alphabets)
3. SentenceTransformers on raw text
4. SentenceTransformers on preprocessed text (contraction, lower casing, only alphabets)

### Bert-base-uncased model finetuned on raw lower cased text

In [117]:
# Encode intents from the training labels only, then give DEFAULT_CLASS the
# next free id so test rows carrying that label can still be mapped.
labels = np.array(df_train['label'].tolist())
le = LabelEncoder()
le.fit(labels)
label2id = {}
for class_id, class_name in enumerate(le.classes_):
    label2id[class_name] = class_id
label2id[DEFAULT_CLASS] = len(label2id)
df_train['label_trans'] = le.transform(df_train['label'].values)
df_test['label_trans'] = df_test['label'].apply(lambda name: label2id[name])

In [131]:
set(df_train['label_trans']), set(df_test['label_trans']), label2id

({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21},
 {'100_NIGHT_TRIAL_OFFER': 0,
  'ABOUT_SOF_MATTRESS': 1,
  'CANCEL_ORDER': 2,
  'CHECK_PINCODE': 3,
  'COD': 4,
  'COMPARISON': 5,
  'DELAY_IN_DELIVERY': 6,
  'DISTRIBUTORS': 7,
  'EMI': 8,
  'ERGO_FEATURES': 9,
  'LEAD_GEN': 10,
  'MATTRESS_COST': 11,
  'OFFERS': 12,
  'ORDER_STATUS': 13,
  'ORTHO_FEATURES': 14,
  'PILLOWS': 15,
  'PRODUCT_VARIANTS': 16,
  'RETURN_EXCHANGE': 17,
  'SIZE_CUSTOMIZATION': 18,
  'WARRANTY': 19,
  'WHAT_SIZE_TO_ORDER': 20,
  'NO_NODES_DETECTED': 21})

In [119]:
SEED = 100
FT_BERT_OUTPUT_DIR = os.path.join(INTER_DATA_DIR, "ft_bert_output")
FT_BERT_LOGS_DIR = os.path.join(INTER_DATA_DIR, "ft_bert_logs")

In [125]:
train_texts = df_train['sentence'].str.lower().tolist()
train_labels = df_train['label_trans'].tolist()
test_texts = df_test['sentence'].str.lower().tolist()
test_labels = df_test['label_trans'].tolist()

In [126]:
"""
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=SEED, shuffle=True,
    stratify=train_labels)
"""

'\nfrom sklearn.model_selection import train_test_split\ntrain_texts, val_texts, train_labels, val_labels = train_test_split(\n    train_texts, train_labels, test_size=0.1, random_state=SEED, shuffle=True,\n    stratify=train_labels)\n'

In [127]:
print(len(train_texts), len(set(train_labels)),
      set(train_labels))

324 21 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}


In [128]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [129]:
%%time
# Tokenize each split with truncation and padding enabled.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
# NOTE(review): the "validation" encodings are built from train_texts, i.e. the
# same sentences as the training split (the hold-out split above is disabled).
val_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

CPU times: user 122 ms, sys: 25.5 ms, total: 148 ms
Wall time: 26.6 ms


In [140]:
from sklearn.utils.class_weight import compute_class_weight
# Balanced per-class loss weights.
# The classes are passed in sorted order so that weights[i] is the weight of
# label id i -- the weighted loss indexes this vector by class id. Iterating a
# set does not guarantee any particular order, whereas np.unique returns the
# distinct labels sorted.
weights = compute_class_weight('balanced', classes=np.unique(train_labels),
                               y=train_labels).tolist()
len(weights)

21

In [142]:
import torch

class IntentDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings for intent classification.

    Parameters
    ----------
    encodings : mapping
        Maps a feature name to a per-example sequence. Must contain
        ``"input_ids"``, whose length defines the dataset size.
    class_weights : sequence of float, optional
        Per-class loss weights. When provided, the full vector is attached to
        every item under ``"class_weights"``.
    labels : sequence of int, optional
        One label per example. When provided, ``labels[idx]`` is attached to
        the item under ``"labels"``.

    ``None`` and empty sequences are both treated as "not provided". Lists,
    numpy arrays and tensors are all accepted for ``labels`` and
    ``class_weights``.
    """

    def __init__(self, encodings, class_weights=None, labels=None):
        self.encodings = encodings
        self.class_weights = class_weights
        self.labels = labels

    @staticmethod
    def _is_provided(values):
        """Return True when ``values`` is neither None nor empty.

        A plain truthiness test is avoided on purpose: it raises for numpy
        arrays and tensors holding more than one element.
        """
        return values is not None and len(values) > 0

    def __getitem__(self, idx):
        """Return the features of example ``idx`` as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self._is_provided(self.labels):
            item['labels'] = torch.tensor(self.labels[idx])
        if self._is_provided(self.class_weights):
            # The whole weight vector travels with every item so that it
            # reaches the loss computation as part of the batch.
            item['class_weights'] = torch.tensor(self.class_weights)
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` encodings."""
        return len(self.encodings["input_ids"])

# Training items carry the label plus the class-weight vector used by the weighted loss.
train_dataset = IntentDataset(train_encodings, weights, train_labels)
# NOTE(review): the validation set reuses train_labels (and encodings built from
# train_texts), so it is not a held-out split.
val_dataset = IntentDataset(encodings=val_encodings, labels=train_labels)
# Test items contain only the encoded inputs (no labels, no class weights).
test_dataset = IntentDataset(encodings=test_encodings)

In [155]:
train_dataset[0]['labels'].view(-1)

tensor([8])

In [160]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

class MultiClassTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy loss for a batch.

        Parameters
        ----------
        model : the model being trained; called as ``model(**inputs)``.
        inputs : dict
            Batch dictionary. ``"labels"`` is required. ``"class_weights"`` is
            optional: when present, its first row is used as the per-class
            weight vector; when absent, the loss is unweighted.
        return_outputs : bool
            When True, return ``(loss, outputs)`` instead of just ``loss``.
        **kwargs
            Accepted and ignored, so that Trainer versions which pass extra
            keyword arguments to ``compute_loss`` do not fail here.
        """
        labels = inputs.pop("labels")
        # Batches built from a dataset without class weights (e.g. the
        # evaluation set) have no such key, so do not assume it exists.
        class_weights = inputs.pop("class_weights", None)
        outputs = model(**inputs)
        logits = outputs[0]
        if class_weights is not None:
            # Every row of the batched tensor is the same vector; keep one and
            # match the logits' device/dtype as the loss requires.
            class_weights = class_weights[0].to(device=logits.device, dtype=logits.dtype)
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [163]:
%%time
# Fine-tuning configuration: 50 epochs, batch size 8 per device, learning rate
# 4e-5, logging every 50 steps.
# NOTE(review): SEED is not passed to TrainingArguments here.
training_args = TrainingArguments(
    output_dir=FT_BERT_OUTPUT_DIR,
    num_train_epochs=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir=FT_BERT_LOGS_DIR,
    logging_steps=50,
    do_train=True,
    do_eval=True,
    learning_rate=0.00004,
    overwrite_output_dir=True,
    evaluate_during_training=True,
    load_best_model_at_end=True
)

# One output unit per distinct training label (the default class is not among them).
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(set(train_labels)))

# NOTE(review): eval_dataset is val_dataset, which is built from the training
# sentences and labels, so the reported "Validation Loss" is not measured on
# held-out data.
trainer = MultiClassTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss,Validation Loss
50,2.949153,2.572654
100,2.292687,1.747429
150,1.475707,0.90648
200,0.720295,0.359967
250,0.295399,0.124002
300,0.105778,0.03892
350,0.037179,0.019093
400,0.022112,0.012642
450,0.015317,0.009445
500,0.012227,0.007512


CPU times: user 1h 27min 53s, sys: 6min 15s, total: 1h 34min 8s
Wall time: 1h 34min 28s


TrainOutput(global_step=2050, training_loss=0.1960051299304497)

In [164]:
trainer.save_model(FT_BERT_OUTPUT_DIR)

In [166]:
%%time
print("Prediction...")

# Wrap the in-memory fine-tuned model in a plain Trainer (default arguments)
# that is only used for prediction; the line below is the disabled
# alternative of reloading the saved model from disk.
#model = BertForSequenceClassification.from_pretrained(FT_BERT_OUTPUT_DIR)
test_trainer = Trainer(trainer.model)

Prediction...
CPU times: user 110 ms, sys: 225 ms, total: 335 ms
Wall time: 577 ms


In [167]:
def softmax(x):
    """Numerically stable softmax along axis 0.

    Parameters
    ----------
    x : array-like
        Scores (logits). For a 1-D input the result sums to 1; for an N-D
        input the normalization runs along the first axis.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` holding the softmax values.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalize; also avoids taking the max of an empty array.
        return np.exp(x)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing to inf (which would produce nan).
    exps = np.exp(x - np.max(x, axis=0))
    return exps / np.sum(exps, axis=0)


def get_predictions(trainer, dataset):
    """Predict a class and its confidence for every example in ``dataset``.

    The first element of ``trainer.predict(dataset)`` is used as the
    per-example score matrix (one row per example, one column per class).

    Returns
    -------
    (preds, probs)
        ``preds`` holds the argmax class index of each row and ``probs`` the
        softmax probability assigned to that class.
    """
    scores = trainer.predict(dataset)[0]
    top_classes = np.argmax(scores, axis=1)
    top_probs = np.array([softmax(row)[cls] for row, cls in zip(scores, top_classes)])
    return top_classes, top_probs


def get_threshold_based_predictions(preds, probs, threshold, default_class_idx):
    """Replace low-confidence predictions with the default class.

    A prediction is kept when its probability is at least ``threshold``;
    otherwise ``default_class_idx`` is substituted. Returns a numpy array with
    one entry per prediction.
    """
    kept = []
    for label, confidence in zip(preds, probs):
        kept.append(label if confidence >= threshold else default_class_idx)
    return np.array(kept)


def get_accuracy(y_true, pred):
    """Fraction of positions where ``pred`` equals ``y_true``.

    Parameters
    ----------
    y_true, pred : array-like
        True and predicted labels, compared element-wise. Plain Python lists
        are accepted (they are converted to arrays first; comparing two lists
        with ``==`` would otherwise yield a single bool).

    Returns
    -------
    float
        Accuracy in [0, 1], or nan when there is nothing to score.
    """
    y_true = np.asarray(y_true)
    pred = np.asarray(pred)
    if y_true.size == 0:
        # np.mean of an empty array is nan as well, but emits RuntimeWarnings.
        return np.float64(np.nan)
    return np.mean(y_true == pred)


def get_accuracy_per_threshold(threshold, y_true, preds, probs, default_class_idx):
    """Score thresholded predictions at a single confidence threshold.

    Predictions whose probability is below ``threshold`` are replaced by
    ``default_class_idx`` before scoring. Returns a dict with the threshold,
    the overall accuracy, the accuracy on examples whose true label is not the
    default class (``in_scope_acc``) and on those whose true label is the
    default class (``out_scope_acc``).
    """
    thresholded = get_threshold_based_predictions(preds, probs, threshold, default_class_idx)
    truth = pd.Series(y_true)
    guess = pd.Series(thresholded)
    out_of_scope = truth == default_class_idx
    in_scope = ~out_of_scope
    return {
        'threshold': threshold,
        'acc': get_accuracy(y_true, thresholded),
        'in_scope_acc': get_accuracy(truth[in_scope].values, guess[in_scope].values),
        'out_scope_acc': get_accuracy(truth[out_of_scope].values,
                                      guess[out_of_scope].values),
    }


def find_optimal_threshold(thresholds, y_true, preds, probs, default_class_idx):
    """Score the predictions at every candidate confidence threshold.

    Parameters
    ----------
    thresholds : iterable of float
        Candidate thresholds, in any order.
    y_true : array-like
        True label ids.
    preds, probs : array-like
        Raw predicted class ids and the probability of each prediction.
    default_class_idx : int
        Class id substituted for predictions below the threshold.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the columns ``threshold``, ``acc``,
        ``in_scope_acc`` and ``out_scope_acc``.
    """
    # Each threshold is applied to the ORIGINAL predictions. Re-assigning
    # `preds` inside the loop would make the thresholds compound, which is
    # only harmless when they happen to be sorted in ascending order.
    rows = [get_accuracy_per_threshold(thresh, y_true, preds, probs, default_class_idx)
            for thresh in thresholds]
    return pd.DataFrame(rows)


def summarize_results(thresh_df, min_in_scope_acc_frac=0.95):
    """Pick the threshold with the best out-of-scope accuracy under a constraint.

    Only rows whose ``in_scope_acc`` is at least ``min_in_scope_acc_frac``
    times the best in-scope accuracy are eligible. Among them, the first row
    reaching the highest ``out_scope_acc`` is selected. The returned dict holds
    the column-wise maxima (``max_*``) and the selected row's values
    (``best_*``).
    """
    acc_col = thresh_df['acc']
    in_col = thresh_df['in_scope_acc']
    out_col = thresh_df['out_scope_acc']

    summary = {'max_acc': acc_col.max(),
               'max_in_scope_acc': in_col.max(),
               'max_out_scope_acc': out_col.max()}

    # Rows that stay within the allowed drop in in-scope accuracy.
    eligible = in_col >= min_in_scope_acc_frac * summary['max_in_scope_acc']
    summary['best_out_scope_acc'] = out_col[eligible].max()

    # Position of the first eligible row that attains that out-of-scope accuracy.
    chosen = eligible & (out_col == summary['best_out_scope_acc'])
    first = np.flatnonzero(chosen.values)[0]

    summary['best_acc'] = acc_col.values[first]
    summary['best_in_scope_acc'] = in_col.values[first]
    summary['best_thresh'] = thresh_df['threshold'].values[first]
    return summary

In [168]:
%%time
# Re-encode the training sentences for inference.
# NOTE(review): unlike the fine-tuning inputs, these are not passed through
# .str.lower(); presumably the uncased tokenizer lowercases them anyway — confirm.
train_texts = df_train['sentence'].tolist()
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
# Rebuilt without labels or class weights: only the encoded inputs are passed to predict.
train_dataset = IntentDataset(train_encodings)
# Predicted class id and its softmax probability for every train / test example.
train_pred, train_pred_prob = get_predictions(test_trainer, train_dataset)
test_pred, test_pred_prob = get_predictions(test_trainer, test_dataset)

CPU times: user 1min, sys: 616 ms, total: 1min 1s
Wall time: 1min 1s


In [172]:
test_pred[:10], test_pred_prob[:10]

(array([16, 16,  5, 17, 17,  5,  7,  5,  6, 15]),
 array([0.7652197 , 0.31089705, 0.998385  , 0.96910644, 0.9736631 ,
        0.99834   , 0.92975086, 0.9984188 , 0.95615286, 0.4055701 ],
       dtype=float32))

In [173]:
%%time
# Candidate confidence thresholds 0.00, 0.01, ..., 0.99.
thresholds = [x/100. for x in range(100)]
# Id substituted for predictions whose probability falls below the threshold.
default_class_idx = label2id[DEFAULT_CLASS]
print(default_class_idx)
y_test = df_test['label_trans'].values
y_train = df_train['label_trans'].values
# NOTE(review): the sweep is scored against the test labels, so any threshold
# chosen from thresh_df is tuned on the test set itself.
thresh_df = find_optimal_threshold(thresholds, y_test, test_pred, test_pred_prob,
                                   default_class_idx)

21
CPU times: user 164 ms, sys: 24.9 ms, total: 189 ms
Wall time: 220 ms


In [185]:
best_d = summarize_results(thresh_df, min_in_scope_acc_frac=0.95)
best_d

{'max_acc': 0.6700507614213198,
 'max_in_scope_acc': 0.70995670995671,
 'max_out_scope_acc': 0.7852760736196319,
 'best_out_scope_acc': 0.49079754601226994,
 'best_acc': 0.5989847715736041,
 'best_in_scope_acc': 0.6753246753246753,
 'best_thresh': 0.87}

In [187]:
# Train results
get_accuracy_per_threshold(best_d['best_thresh'], y_train, train_pred, train_pred_prob,
                           default_class_idx)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


{'threshold': 0.87, 'acc': 1.0, 'in_scope_acc': 1.0, 'out_scope_acc': nan}

### bert-base-uncased finetuned on preprocessed text

In [188]:
# Text cleaner with only `contraction` and `lower_case` switched on; every
# other boolean option is passed as False. (Judging by the flag names this
# expands contractions and lower-cases the text — see Text_Preprocessing.)
preprocess_obj = Text_Preprocessing(keep_eng=False, remove_nonalpha=False, lower_case=True,
                         remove_punkt=False, remove_stop=False, remove_numerals=False,
                         spell_check=False, contraction=True,
                         contraction_var=CONTRACTIONS, stem=False,
                         lem=False, filter_pos=False, pos_var=('N', 'J'),
                         tokenize=False, template_removal=False,
                         template_start_string='', regex_cleaning=False,
                         remove_ignore_words=False, ignore_words=IGNORE_WORDS,
                         custom_stoplist=[], word_size=2, word_size_filter=False)

In [189]:
%%time
df_train['sent_pre'] = df_train['sentence'].apply(lambda x: _remove_non_ascii_characters(x))
df_train['sent_pre'] = preprocess_obj.fit_transform(df_train['sent_pre'])
df_train['sent_pre'] = df_train['sent_pre'].swifter.apply(lambda x:
                                                          re.sub("[^A-Za-z']+", ' ', x))
df_test['sent_pre'] = df_test['sentence'].apply(lambda x: _remove_non_ascii_characters(x))
df_test['sent_pre'] = preprocess_obj.fit_transform(df_test['sent_pre'])
df_test['sent_pre'] = df_test['sent_pre'].swifter.apply(lambda x:
                                                        re.sub("[^A-Za-z']+", ' ', x))
df_train.fillna(value={'sent_pre': ''}, inplace=True)
df_test.fillna(value={'sent_pre': ''}, inplace=True)

contraction
lower case


Pandas Apply:   0%|          | 0/324 [00:00<?, ?it/s]

contraction
lower case


Pandas Apply:   0%|          | 0/394 [00:00<?, ?it/s]

CPU times: user 87.4 ms, sys: 83.1 ms, total: 170 ms
Wall time: 264 ms


In [190]:
df_train.head(), df_test.head()

(                                         sentence label  label_trans  \
 0                    You guys provide EMI option?   EMI            8   
 1  Do you offer Zero Percent EMI payment options?   EMI            8   
 2                                         0% EMI.   EMI            8   
 3                                             EMI   EMI            8   
 4                           I want in installment   EMI            8   
 
                                          sent_pre  
 0                    you guys provide emi option   
 1  do you offer zero percent emi payment options   
 2                                            emi   
 3                                             emi  
 4                           i want in installment  ,
                                    sentence              label  label_trans  \
 0                   There are only 2 models  NO_NODES_DETECTED           21   
 1                                    Single  NO_NODES_DETECTED           21   
 

In [191]:
# NOTE(review): SEED is re-assigned to the same value as in the raw-text
# experiment and is not passed to TrainingArguments in this section either.
SEED = 100
# Separate output / log directories for the model fine-tuned on preprocessed text.
FT_PRE_BERT_OUTPUT_DIR = os.path.join(INTER_DATA_DIR, "ft_pre_bert_output")
FT_PRE_BERT_LOGS_DIR = os.path.join(INTER_DATA_DIR, "ft_pre_bert_logs")

In [192]:
# Model inputs are now the preprocessed sentences ('sent_pre'); the label ids are unchanged.
train_texts = df_train['sent_pre'].tolist()
train_labels = df_train['label_trans'].tolist()
test_texts = df_test['sent_pre'].tolist()
test_labels = df_test['label_trans'].tolist()

In [193]:
%%time
# Fresh tokenizer, then tokenize each split with truncation and padding enabled.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
# NOTE(review): the "validation" encodings are built from train_texts, i.e.
# the same sentences as the training split.
val_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

CPU times: user 161 ms, sys: 125 ms, total: 285 ms
Wall time: 2.04 s


In [194]:
# Balanced per-class loss weights.
# The classes are passed in sorted order so that weights[i] is the weight of
# label id i -- the weighted loss indexes this vector by class id. Iterating a
# set does not guarantee any particular order, whereas np.unique returns the
# distinct labels sorted.
weights = compute_class_weight('balanced', classes=np.unique(train_labels),
                               y=train_labels).tolist()
len(weights)

21

In [195]:
# Training items carry the label plus the class-weight vector used by the weighted loss.
train_dataset = IntentDataset(train_encodings, weights, train_labels)
# NOTE(review): the validation set reuses train_labels (and encodings built from
# train_texts), so it is not a held-out split.
val_dataset = IntentDataset(encodings=val_encodings, labels=train_labels)
# Test items contain only the encoded inputs (no labels, no class weights).
test_dataset = IntentDataset(encodings=test_encodings)

In [196]:
%%time
# Fine-tuning configuration: 50 epochs, batch size 16 per device, learning
# rate 4e-5, logging every 100 steps.
# NOTE(review): SEED is not passed to TrainingArguments here.
training_args = TrainingArguments(
    output_dir=FT_PRE_BERT_OUTPUT_DIR,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir=FT_PRE_BERT_LOGS_DIR,
    logging_steps=100,
    do_train=True,
    do_eval=True,
    learning_rate=0.00004,
    overwrite_output_dir=True,
    evaluate_during_training=True,
    load_best_model_at_end=True
)

# One output unit per distinct training label (the default class is not among them).
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(set(train_labels)))

# NOTE(review): eval_dataset is val_dataset, which is built from the training
# sentences and labels, so the reported "Validation Loss" is not measured on
# held-out data.
trainer = MultiClassTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss,Validation Loss
100,2.130712,0.789722
200,0.352767,0.056885
300,0.039178,0.015832
400,0.01716,0.009433
500,0.011459,0.006823
600,0.008808,0.005407
700,0.008069,0.004571
800,0.006259,0.004063
900,0.005681,0.003756
1000,0.005347,0.003604


CPU times: user 49min 52s, sys: 3min 49s, total: 53min 42s
Wall time: 53min 48s


TrainOutput(global_step=1050, training_loss=0.24647830054873512)

In [197]:
trainer.save_model(FT_PRE_BERT_OUTPUT_DIR)

In [198]:
%%time
print("Prediction...")

# Wrap the in-memory fine-tuned model in a plain Trainer (default arguments)
# that is only used for prediction; the line below is the disabled
# alternative of reloading this experiment's saved model from disk.
#model = BertForSequenceClassification.from_pretrained(FT_PRE_BERT_OUTPUT_DIR)
test_trainer = Trainer(trainer.model)

Prediction...
CPU times: user 83.8 ms, sys: 124 ms, total: 208 ms
Wall time: 322 ms


In [199]:
%%time
# Rebuilt without labels or class weights: only the encoded inputs are passed to predict.
train_dataset = IntentDataset(train_encodings)
# Predicted class id and its softmax probability for every train / test example.
train_pred, train_pred_prob = get_predictions(test_trainer, train_dataset)
test_pred, test_pred_prob = get_predictions(test_trainer, test_dataset)

CPU times: user 1min 1s, sys: 881 ms, total: 1min 2s
Wall time: 1min 2s


In [200]:
%%time
# Candidate confidence thresholds 0.00, 0.01, ..., 0.99.
thresholds = [x/100. for x in range(100)]
# Id substituted for predictions whose probability falls below the threshold.
default_class_idx = label2id[DEFAULT_CLASS]
print(default_class_idx)
y_test = df_test['label_trans'].values
y_train = df_train['label_trans'].values
# NOTE(review): the sweep is scored against the test labels, so any threshold
# chosen from thresh_df is tuned on the test set itself.
thresh_df = find_optimal_threshold(thresholds, y_test, test_pred, test_pred_prob,
                                   default_class_idx)

21
CPU times: user 155 ms, sys: 15.1 ms, total: 170 ms
Wall time: 184 ms


In [205]:
best_d = summarize_results(thresh_df, min_in_scope_acc_frac=0.943)
best_d

{'max_acc': 0.6802030456852792,
 'max_in_scope_acc': 0.7835497835497836,
 'max_out_scope_acc': 0.8220858895705522,
 'best_out_scope_acc': 0.50920245398773,
 'best_acc': 0.6446700507614214,
 'best_in_scope_acc': 0.7402597402597403,
 'best_thresh': 0.8}

In [206]:
# Train results
get_accuracy_per_threshold(best_d['best_thresh'], y_train, train_pred, train_pred_prob,
                           default_class_idx)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


{'threshold': 0.8, 'acc': 1.0, 'in_scope_acc': 1.0, 'out_scope_acc': nan}

In [209]:
# f1_score
from sklearn.metrics import f1_score

# Weighted F1 over the in-scope test examples only: rows whose true label is
# the default (out-of-scope) class are dropped. The predictions used are the
# raw argmax classes in test_pred, i.e. without the confidence threshold.
y_test_series = pd.Series(y_test)
y_pred_series = pd.Series(test_pred)
# Use the looked-up default class id rather than a hard-coded 21, so the mask
# stays correct if the set of intents changes.
mask = y_test_series != default_class_idx
print('# in scope examples in test: ', mask.sum())
f_score = f1_score(y_true=y_test_series[mask].values, y_pred=y_pred_series[mask].values,
                   average='weighted')
print('F1 Score: %0.4f' % (f_score))

# in scope examples in test:  231
F1 Score: 0.7834


### Error Analysis

In [222]:
# Invert the label mapping so predicted ids can be shown as intent names.
id2label = {idx: name for name, idx in label2id.items()}

# Copy of the test frame extended with the raw (un-thresholded) prediction,
# its intent name and its probability.
errors_df = df_test.copy()
errors_df['pred'] = test_pred
errors_df['pred_label'] = errors_df['pred'].map(lambda idx: id2label[idx])
errors_df['pred_prob'] = test_pred_prob

print(errors_df.shape)
print(errors_df.head())

(394, 7)
                                   sentence              label  label_trans  \
0                   There are only 2 models  NO_NODES_DETECTED           21   
1                                    Single  NO_NODES_DETECTED           21   
2  What's difference between ergo and ortho         COMPARISON            5   
3                              Return order    RETURN_EXCHANGE           17   
4               Hai not recieved my product  DELAY_IN_DELIVERY            6   

                                     sent_pre  pred        pred_label  \
0                       there are only models    16  PRODUCT_VARIANTS   
1                                      single    16  PRODUCT_VARIANTS   
2  what has difference between ergo and ortho     5        COMPARISON   
3                                return order    17   RETURN_EXCHANGE   
4                 hai not recieved my product    17   RETURN_EXCHANGE   

   pred_prob  
0   0.874091  
1   0.770442  
2   0.995672  
3   0.962098  
4 

In [223]:
mask = errors_df['label_trans'] != errors_df['pred']
print(errors_df.loc[mask, :].head(10))

                                             sentence              label  \
0                             There are only 2 models  NO_NODES_DETECTED   
1                                              Single  NO_NODES_DETECTED   
4                         Hai not recieved my product  DELAY_IN_DELIVERY   
10                    please send them after lockdown  NO_NODES_DETECTED   
11  I have not recieved anything with regard to that.  NO_NODES_DETECTED   
15  Is there anybody? I want to purchase this SOF ...  NO_NODES_DETECTED   
16  No but i wann the product to be delivered as i...  NO_NODES_DETECTED   
17                        Is it available in Lucknow?  NO_NODES_DETECTED   
23                     i am not able to order pillows  NO_NODES_DETECTED   
24          How can i purchase this product in nepal?  NO_NODES_DETECTED   

    label_trans                                           sent_pre  pred  \
0            21                              there are only models    16   
1          