## Experiments
1. SentenceTransformers on raw text
2. SentenceTransformers on preprocessed text (contraction, lower casing, only alphabets)

In [8]:
import pandas as pd
import numpy as np
import os, sys, swifter, re, random
from constants import *
from utility import *
from preprocess_utils import _remove_non_ascii_characters
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [3]:
# GLOBALS
DEFAULT_CLASS = 'NO_NODES_DETECTED'

In [4]:
# read data
%time df_train = pd.read_csv(TRAIN_FN)
%time df_test = pd.read_csv(TEST_FN)

df_train.drop_duplicates(inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_test.drop_duplicates(inplace=True)
df_test.reset_index(drop=True, inplace=True)

print(df_train.shape, '\t', df_test.shape)

CPU times: user 4.12 ms, sys: 1.79 ms, total: 5.91 ms
Wall time: 5.99 ms
CPU times: user 3.38 ms, sys: 3.49 ms, total: 6.87 ms
Wall time: 10.6 ms
(324, 2) 	 (394, 2)


## Sentence Transformers on raw lower cased text

In [7]:
df_train['sentence'] = df_train['sentence'].str.lower()
df_test['sentence'] = df_test['sentence'].str.lower()

In [10]:
from itertools import combinations, product

In [11]:
labels = list(set(df_train['label']))
num_negative_examples = 5
random.seed(100)
label_examples = {}
for label in labels:
    print(label)
    mask = df_train['label'] == label
    pos_lst = df_train.loc[mask, 'sentence'].tolist()
    neg_lst = random.sample(df_train.loc[~mask, 'sentence'].tolist(), num_negative_examples)
    pos_comb_lst = list(combinations(pos_lst, 2))
    pos_comb_lst = [x + (0.95,) for x in pos_comb_lst]
    pos_neg_comb_lst = list(product(pos_lst, neg_lst))
    pos_neg_comb_lst = [x + (0.05,) for x in pos_neg_comb_lst]
    label_examples[label] = pos_comb_lst + pos_neg_comb_lst

ABOUT_SOF_MATTRESS
ERGO_FEATURES
ORDER_STATUS
COMPARISON
COD
OFFERS
ORTHO_FEATURES
LEAD_GEN
PRODUCT_VARIANTS
DELAY_IN_DELIVERY
EMI
CHECK_PINCODE
MATTRESS_COST
WARRANTY
DISTRIBUTORS
CANCEL_ORDER
PILLOWS
100_NIGHT_TRIAL_OFFER
SIZE_CUSTOMIZATION
RETURN_EXCHANGE
WHAT_SIZE_TO_ORDER


In [12]:
for label in label_examples:
    print(label, len(label_examples[label]))

ABOUT_SOF_MATTRESS 110
ERGO_FEATURES 110
ORDER_STATUS 290
COMPARISON 110
COD 126
OFFERS 95
ORTHO_FEATURES 221
LEAD_GEN 315
PRODUCT_VARIANTS 315
DELAY_IN_DELIVERY 110
EMI 425
CHECK_PINCODE 95
MATTRESS_COST 315
WARRANTY 95
DISTRIBUTORS 693
CANCEL_ORDER 95
PILLOWS 95
100_NIGHT_TRIAL_OFFER 243
SIZE_CUSTOMIZATION 81
RETURN_EXCHANGE 161
WHAT_SIZE_TO_ORDER 266


In [13]:
label_examples['ABOUT_SOF_MATTRESS'][:2]

[('how is sof different from other mattress brands', 'why sof mattress', 0.95),
 ('how is sof different from other mattress brands',
  'about sof mattress',
  0.95)]

In [14]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

In [15]:
train_examples = []
for label, examples in label_examples.items():
    for example in examples:
        train_examples.append(InputExample(texts=list(example[:2]), label=example[2]))
print(len(train_examples), train_examples[:2])

4366 [<sentence_transformers.readers.InputExample.InputExample object at 0x176a4b250>, <sentence_transformers.readers.InputExample.InputExample object at 0x176a4b1f0>]


In [17]:
%%time
model = SentenceTransformer('stsb-roberta-base-v2')

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

You try to use a model that was created with version 1.1.0, however, your version is 0.4.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/273 [00:00<?, ?it/s]

CPU times: user 18min 13s, sys: 2min 52s, total: 21min 6s
Wall time: 21min 11s


In [77]:
%%time
# save
FT_SENTBERT_OUTPUT_DIR = os.path.join(INTER_DATA_DIR, "ft_sentbert_output")
model.save(FT_SENTBERT_OUTPUT_DIR)

CPU times: user 346 ms, sys: 554 ms, total: 899 ms
Wall time: 1.39 s


In [97]:
model = SentenceTransformer(FT_SENTBERT_OUTPUT_DIR)

In [27]:
import torch


def cos_sim(a, b):
    if not isinstance(a, torch.Tensor):
        a = torch.tensor(a)

    if not isinstance(b, torch.Tensor):
        b = torch.tensor(b)

    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(a_norm, b_norm.transpose(0, 1))


test_ex = df_test.loc[2, 'sentence']
train_ex = df_train.loc[69, 'sentence']
test_ex_emb = model.encode([test_ex])
train_ex_emb = model.encode([train_ex])
cosine_sim = cos_sim(test_ex_emb, train_ex_emb)
print('train example: ', train_ex, '\t', 'test example: ', test_ex, '\t',
      'cosine sim: ', cosine_sim)

train example:  product comparison 	 test example:  what's difference between ergo and ortho 	 cosine sim:  tensor([[0.9186]])


## Approach
1. Embed all examples in train
2. For each test example: embed -> find k closest train examples along with their scores -> apply threshold logic on scores and make predictions

In [98]:
def get_embeddings(model, docs, n_splits=8):
    docs1 = np.array_split(docs, n_splits)

    train_embeddings = []
    for i, doc_lst in enumerate(docs1):
        tmp_out = model.encode(doc_lst.tolist())
        train_embeddings.append(tmp_out)
    train_embeddings = np.concatenate(train_embeddings, axis=0)
    return train_embeddings


print('train embeddings\n')
docs = df_train['sentence'].tolist()
%time train_embeddings = get_embeddings(model, docs)

print('test embeddings\n')
docs = df_test['sentence'].tolist()
%time test_embeddings = get_embeddings(model, docs)

train embeddings

CPU times: user 5.68 s, sys: 46.3 ms, total: 5.73 s
Wall time: 5.73 s
test embeddings

CPU times: user 13.8 s, sys: 256 ms, total: 14 s
Wall time: 14.1 s


In [50]:
def get_top_examples(test_emb, train_embs, train_texts, train_labels, topk=3):
    out = []
    for i, train_emb in enumerate(train_embs):
        score = cos_sim(test_emb, train_emb).item()
        out.append((i, score, train_texts[i], train_labels[i]))
    out = sorted(out, key=lambda k: k[1], reverse=True)
    return out[:topk]

In [99]:
train_texts = df_train['sentence'].tolist()
train_labels = df_train['label'].tolist()
test_texts = df_test['sentence'].tolist()
test_labels = df_test['label'].tolist()

In [100]:
%%time
i = 0
print(test_texts[i])
get_top_examples(test_embeddings[i], train_embeddings, train_texts, train_labels, topk=3)

there are only 2 models
CPU times: user 28.7 ms, sys: 1.1 ms, total: 29.8 ms
Wall time: 29 ms


[(233, 0.8776043653488159, 'type of mattress', 'PRODUCT_VARIANTS'),
 (224, 0.8752570748329163, 'which product is best', 'PRODUCT_VARIANTS'),
 (68, 0.8676466941833496, 'compare the 2 mattresses', 'COMPARISON')]

In [101]:
%%time
test_preds = []
for j, test_emb in enumerate(test_embeddings):
    if j % 50 == 0:
        print(j)
    test_pred = get_top_examples(test_emb, train_embeddings, train_texts, train_labels, 1)
    d = {'test_text': test_texts[j], 'test_label': test_labels[j],
         'pred_idx': test_pred[0][0], 'pred_score': test_pred[0][1],
         'pred_text': test_pred[0][2], 'pred_label': test_pred[0][3]}
    test_preds.append(d)
    
test_preds = pd.DataFrame(test_preds)
print(test_preds.shape)
print(test_preds.head())

0
50
100
150
200
250
300
350
(394, 6)
                                  test_text         test_label  pred_idx  \
0                   there are only 2 models  NO_NODES_DETECTED       233   
1                                    single  NO_NODES_DETECTED       159   
2  what's difference between ergo and ortho         COMPARISON        66   
3                              return order    RETURN_EXCHANGE       284   
4               hai not recieved my product  DELAY_IN_DELIVERY       258   

   pred_score                                 pred_text         pred_label  
0    0.877604                          type of mattress   PRODUCT_VARIANTS  
1    0.552013                         do you deliver to      CHECK_PINCODE  
2    0.998691  difference between ergo & ortho mattress         COMPARISON  
3    0.956194             help me with exchange process    RETURN_EXCHANGE  
4    0.908211                       delivery is delayed  DELAY_IN_DELIVERY  
CPU times: user 10.3 s, sys: 21.3 ms, total

In [102]:
%%time
train_preds = []
for j, train_emb in enumerate(train_embeddings):
    if j % 50 == 0:
        print(j)
    train_pred = get_top_examples(train_emb, train_embeddings, train_texts, train_labels, 1)
    d = {'train_text': train_texts[j], 'train_label': train_labels[j],
         'pred_idx': train_pred[0][0], 'pred_score': train_pred[0][1],
         'pred_text': train_pred[0][2], 'pred_label': train_pred[0][3]}
    train_preds.append(d)
    
train_preds = pd.DataFrame(train_preds)
print(train_preds.shape)
print(train_preds.head())

0
50
100
150
200
250
300
(324, 6)
                                       train_text train_label  pred_idx  \
0                    you guys provide emi option?         EMI         0   
1  do you offer zero percent emi payment options?         EMI         1   
2                                         0% emi.         EMI         2   
3                                             emi         EMI         3   
4                           i want in installment         EMI         4   

   pred_score                                       pred_text pred_label  
0    1.000000                    you guys provide emi option?        EMI  
1    1.000000  do you offer zero percent emi payment options?        EMI  
2    0.999999                                         0% emi.        EMI  
3    1.000000                                             emi        EMI  
4    1.000000                           i want in installment        EMI  
CPU times: user 8.46 s, sys: 18.1 ms, total: 8.47 s
Wall time: 8.

In [103]:
assert (train_preds['train_label'] == train_preds['pred_label']).sum() == train_preds.shape[0]

In [104]:
def get_threshold_based_predictions(preds, probs, threshold, default_class):
    preds_n = np.array([pred if probs[i] >= threshold else default_class
                        for i, pred in enumerate(preds)])
    return preds_n


def get_accuracy(y_true, pred):
    return np.mean(y_true == pred)


def get_accuracy_per_threshold(threshold, y_true, preds, probs, default_class):
    y_true_series = pd.Series(y_true)
    preds = get_threshold_based_predictions(preds, probs, threshold, default_class)
    acc = get_accuracy(y_true, preds)
    pred_series = pd.Series(preds)
    mask = y_true_series == default_class
    in_scope_acc = get_accuracy(y_true_series[~mask].values, pred_series[~mask].values)
    out_scope_acc = get_accuracy(y_true_series[mask].values, pred_series[mask].values)
    d = {'threshold': threshold, 'acc': acc, 'in_scope_acc': in_scope_acc,
         'out_scope_acc': out_scope_acc}
    return d


def find_optimal_threshold(thresholds, y_true, preds, probs, default_class):
    y_true_series = pd.Series(y_true)
    
    out = []
    for thresh in thresholds:
        preds = get_threshold_based_predictions(preds, probs, thresh, default_class)
        acc = get_accuracy(y_true, preds)
        pred_series = pd.Series(preds)
        mask = y_true_series == default_class
        in_scope_acc = get_accuracy(y_true_series[~mask].values, pred_series[~mask].values)
        out_scope_acc = get_accuracy(y_true_series[mask].values, pred_series[mask].values)
        d = {'threshold': thresh, 'acc': acc, 'in_scope_acc': in_scope_acc,
             'out_scope_acc': out_scope_acc}
        out.append(d)
    out = pd.DataFrame(out)
    return out


def summarize_results(thresh_df, min_in_scope_acc_frac=0.95):
    max_acc = thresh_df['acc'].max()
    max_in_scope_acc = thresh_df['in_scope_acc'].max()
    max_out_scope_acc = thresh_df['out_scope_acc'].max()
    min_in_scope_acc = min_in_scope_acc_frac * max_in_scope_acc
    mask = thresh_df['in_scope_acc'] >= min_in_scope_acc
    best_out_scope_acc = thresh_df.loc[mask, 'out_scope_acc'].max()
    mask1 = thresh_df['out_scope_acc'] == best_out_scope_acc
    best_acc = thresh_df.loc[mask&mask1, 'acc'].values[0]
    best_in_scope_acc = thresh_df.loc[mask&mask1, 'in_scope_acc'].values[0]
    best_thresh = thresh_df.loc[mask&mask1, 'threshold'].values[0]
    return {'max_acc': max_acc, 'max_in_scope_acc': max_in_scope_acc,
            'max_out_scope_acc': max_out_scope_acc, 'best_out_scope_acc': best_out_scope_acc,
            'best_acc': best_acc, 'best_in_scope_acc': best_in_scope_acc,
            'best_thresh': best_thresh}

In [105]:
DEFAULT_CLASS = 'NO_NODES_DETECTED'

In [106]:
%%time
thresholds = [x/100. for x in range(100)]
y_test = df_test['label'].values
y_train = df_train['label'].values
test_pred = test_preds['pred_label'].values
test_pred_prob = test_preds['pred_score'].values
train_pred = train_preds['pred_label'].values
train_pred_prob = train_preds['pred_score'].values
thresh_df = find_optimal_threshold(thresholds, y_test, test_pred, test_pred_prob,
                                   DEFAULT_CLASS)

CPU times: user 161 ms, sys: 5.95 ms, total: 167 ms
Wall time: 163 ms


In [107]:
best_d = summarize_results(thresh_df, min_in_scope_acc_frac=0.94)
best_d

{'max_acc': 0.6802030456852792,
 'max_in_scope_acc': 0.7402597402597403,
 'max_out_scope_acc': 1.0,
 'best_out_scope_acc': 0.49693251533742333,
 'best_acc': 0.6142131979695431,
 'best_in_scope_acc': 0.696969696969697,
 'best_thresh': 0.82}

In [108]:
# Train results
get_accuracy_per_threshold(best_d['best_thresh'], y_train, train_pred, train_pred_prob,
                           DEFAULT_CLASS)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


{'threshold': 0.82, 'acc': 1.0, 'in_scope_acc': 1.0, 'out_scope_acc': nan}

In [109]:
# f1_score
from sklearn.metrics import f1_score

y_test_series = pd.Series(y_test)
y_pred_series = pd.Series(test_pred)
mask = y_test_series != DEFAULT_CLASS
print('# in scope examples in test: ', mask.sum())
f_score = f1_score(y_true=y_test_series[mask].values, y_pred=y_pred_series[mask].values,
                   average='weighted')
print('F1 Score: %0.4f' % (f_score))

# in scope examples in test:  231
F1 Score: 0.7392


## Sentence Transformers on preprocessed text

In [74]:
preprocess_obj = Text_Preprocessing(keep_eng=False, remove_nonalpha=False, lower_case=True,
                         remove_punkt=False, remove_stop=False, remove_numerals=False,
                         spell_check=False, contraction=True,
                         contraction_var=CONTRACTIONS, stem=False,
                         lem=False, filter_pos=False, pos_var=('N', 'J'),
                         tokenize=False, template_removal=False,
                         template_start_string='', regex_cleaning=False,
                         remove_ignore_words=False, ignore_words=IGNORE_WORDS,
                         custom_stoplist=[], word_size=2, word_size_filter=False)

In [75]:
%%time
df_train['sent_pre'] = df_train['sentence'].apply(lambda x: _remove_non_ascii_characters(x))
df_train['sent_pre'] = preprocess_obj.fit_transform(df_train['sent_pre'])
df_train['sent_pre'] = df_train['sent_pre'].swifter.apply(lambda x:
                                                          re.sub("[^A-Za-z']+", ' ', x))
df_test['sent_pre'] = df_test['sentence'].apply(lambda x: _remove_non_ascii_characters(x))
df_test['sent_pre'] = preprocess_obj.fit_transform(df_test['sent_pre'])
df_test['sent_pre'] = df_test['sent_pre'].swifter.apply(lambda x:
                                                        re.sub("[^A-Za-z']+", ' ', x))
df_train.fillna(value={'sent_pre': ''}, inplace=True)
df_test.fillna(value={'sent_pre': ''}, inplace=True)

contraction
lower case


Pandas Apply:   0%|          | 0/324 [00:00<?, ?it/s]

contraction
lower case


Pandas Apply:   0%|          | 0/394 [00:00<?, ?it/s]

CPU times: user 74.2 ms, sys: 11.5 ms, total: 85.7 ms
Wall time: 89 ms


In [81]:
labels = list(set(df_train['label']))
num_negative_examples = 15
random.seed(100)
label_examples = {}
for label in labels:
    print(label)
    mask = df_train['label'] == label
    pos_lst = df_train.loc[mask, 'sent_pre'].tolist()
    neg_lst = random.sample(df_train.loc[~mask, 'sent_pre'].tolist(), num_negative_examples)
    pos_comb_lst = list(combinations(pos_lst, 2))
    pos_comb_lst = [x + (0.95,) for x in pos_comb_lst]
    pos_neg_comb_lst = list(product(pos_lst, neg_lst))
    pos_neg_comb_lst = [x + (0.05,) for x in pos_neg_comb_lst]
    label_examples[label] = pos_comb_lst + pos_neg_comb_lst

ABOUT_SOF_MATTRESS
ERGO_FEATURES
ORDER_STATUS
COMPARISON
COD
OFFERS
ORTHO_FEATURES
LEAD_GEN
PRODUCT_VARIANTS
DELAY_IN_DELIVERY
EMI
CHECK_PINCODE
MATTRESS_COST
WARRANTY
DISTRIBUTORS
CANCEL_ORDER
PILLOWS
100_NIGHT_TRIAL_OFFER
SIZE_CUSTOMIZATION
RETURN_EXCHANGE
WHAT_SIZE_TO_ORDER


In [82]:
for label in label_examples:
    print(label, len(label_examples[label]))

ABOUT_SOF_MATTRESS 220
ERGO_FEATURES 220
ORDER_STATUS 490
COMPARISON 220
COD 246
OFFERS 195
ORTHO_FEATURES 391
LEAD_GEN 525
PRODUCT_VARIANTS 525
DELAY_IN_DELIVERY 220
EMI 675
CHECK_PINCODE 195
MATTRESS_COST 525
WARRANTY 195
DISTRIBUTORS 1023
CANCEL_ORDER 195
PILLOWS 195
100_NIGHT_TRIAL_OFFER 423
SIZE_CUSTOMIZATION 171
RETURN_EXCHANGE 301
WHAT_SIZE_TO_ORDER 456


In [83]:
%%time
train_examples = []
for label, examples in label_examples.items():
    for example in examples:
        train_examples.append(InputExample(texts=list(example[:2]), label=example[2]))
print(len(train_examples), train_examples[:2])

7606 [<sentence_transformers.readers.InputExample.InputExample object at 0x10f361fd0>, <sentence_transformers.readers.InputExample.InputExample object at 0x10f361f70>]
CPU times: user 10.6 ms, sys: 1.09 ms, total: 11.7 ms
Wall time: 11.1 ms


In [84]:
FT_PRE_SENTBERT_OUTPUT_DIR = os.path.join(INTER_DATA_DIR, "ft_pre_sentbert_output")

In [85]:
%%time
model = SentenceTransformer('stsb-roberta-base-v2')

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100,
          output_path=FT_PRE_SENTBERT_OUTPUT_DIR)

You try to use a model that was created with version 1.1.0, however, your version is 0.4.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/476 [00:00<?, ?it/s]

CPU times: user 26min 43s, sys: 4min 9s, total: 30min 53s
Wall time: 30min 59s


In [86]:
%%time
# save
model.save(FT_PRE_SENTBERT_OUTPUT_DIR)

CPU times: user 480 ms, sys: 418 ms, total: 898 ms
Wall time: 1.03 s


In [87]:
print('train embeddings\n')
docs = df_train['sent_pre'].tolist()
%time train_embeddings = get_embeddings(model, docs)

print('test embeddings\n')
docs = df_test['sent_pre'].tolist()
%time test_embeddings = get_embeddings(model, docs)

train embeddings

CPU times: user 5.46 s, sys: 42.8 ms, total: 5.5 s
Wall time: 5.5 s
test embeddings

CPU times: user 14.3 s, sys: 266 ms, total: 14.5 s
Wall time: 14.6 s


In [88]:
train_texts = df_train['sent_pre'].tolist()
train_labels = df_train['label'].tolist()
test_texts = df_test['sent_pre'].tolist()
test_labels = df_test['label'].tolist()

In [89]:
%%time
test_preds = []
for j, test_emb in enumerate(test_embeddings):
    if j % 50 == 0:
        print(j)
    test_pred = get_top_examples(test_emb, train_embeddings, train_texts, train_labels, 1)
    d = {'test_text': test_texts[j], 'test_label': test_labels[j],
         'pred_idx': test_pred[0][0], 'pred_score': test_pred[0][1],
         'pred_text': test_pred[0][2], 'pred_label': test_pred[0][3]}
    test_preds.append(d)
    
test_preds = pd.DataFrame(test_preds)
print(test_preds.shape)
print(test_preds.head())

0
50
100
150
200
250
300
350
(394, 6)
                                    test_text         test_label  pred_idx  \
0                       there are only models  NO_NODES_DETECTED       218   
1                                      single  NO_NODES_DETECTED       119   
2  what has difference between ergo and ortho         COMPARISON        66   
3                                return order    RETURN_EXCHANGE       273   
4                 hai not recieved my product  DELAY_IN_DELIVERY       255   

   pred_score                                      pred_text  \
0    0.857281                               product variants   
1    0.489505                                                  
2    0.957130         difference between ergo ortho mattress   
3    0.809019                                 present status   
4    0.964084  it has been days my product have not received   

           pred_label  
0    PRODUCT_VARIANTS  
1  WHAT_SIZE_TO_ORDER  
2          COMPARISON  
3        ORD

In [90]:
%%time
train_preds = []
for j, train_emb in enumerate(train_embeddings):
    if j % 50 == 0:
        print(j)
    train_pred = get_top_examples(train_emb, train_embeddings, train_texts, train_labels, 1)
    d = {'train_text': train_texts[j], 'train_label': train_labels[j],
         'pred_idx': train_pred[0][0], 'pred_score': train_pred[0][1],
         'pred_text': train_pred[0][2], 'pred_label': train_pred[0][3]}
    train_preds.append(d)
    
train_preds = pd.DataFrame(train_preds)
print(train_preds.shape)
print(train_preds.head())

0
50
100
150
200
250
300
(324, 6)
                                       train_text train_label  pred_idx  \
0                    you guys provide emi option          EMI         0   
1  do you offer zero percent emi payment options          EMI         1   
2                                            emi          EMI         2   
3                                             emi         EMI         3   
4                           i want in installment         EMI         4   

   pred_score                                       pred_text pred_label  
0    1.000000                    you guys provide emi option         EMI  
1    1.000000  do you offer zero percent emi payment options         EMI  
2    1.000000                                            emi         EMI  
3    1.000000                                             emi        EMI  
4    1.000001                           i want in installment        EMI  
CPU times: user 7.66 s, sys: 9.28 ms, total: 7.67 s
Wall time: 7.

In [91]:
assert (train_preds['train_label'] == train_preds['pred_label']).sum() == train_preds.shape[0]

In [92]:
%%time
thresholds = [x/100. for x in range(100)]
y_test = df_test['label'].values
y_train = df_train['label'].values
test_pred = test_preds['pred_label'].values
test_pred_prob = test_preds['pred_score'].values
train_pred = train_preds['pred_label'].values
train_pred_prob = train_preds['pred_score'].values
thresh_df = find_optimal_threshold(thresholds, y_test, test_pred, test_pred_prob,
                                   DEFAULT_CLASS)

CPU times: user 149 ms, sys: 6.1 ms, total: 155 ms
Wall time: 151 ms


In [94]:
best_d = summarize_results(thresh_df, min_in_scope_acc_frac=0.93)
best_d

{'max_acc': 0.6395939086294417,
 'max_in_scope_acc': 0.7186147186147186,
 'max_out_scope_acc': 0.901840490797546,
 'best_out_scope_acc': 0.5153374233128835,
 'best_acc': 0.6065989847715736,
 'best_in_scope_acc': 0.670995670995671,
 'best_thresh': 0.72}

In [95]:
# Train results
get_accuracy_per_threshold(best_d['best_thresh'], y_train, train_pred, train_pred_prob,
                           DEFAULT_CLASS)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


{'threshold': 0.72, 'acc': 1.0, 'in_scope_acc': 1.0, 'out_scope_acc': nan}

In [96]:
# f1_score
y_test_series = pd.Series(y_test)
y_pred_series = pd.Series(test_pred)
mask = y_test_series != DEFAULT_CLASS
print('# in scope examples in test: ', mask.sum())
f_score = f1_score(y_true=y_test_series[mask].values, y_pred=y_pred_series[mask].values,
                   average='weighted')
print('F1 Score: %0.4f' % (f_score))

# in scope examples in test:  231
F1 Score: 0.7250
