In [7]:
import pandas as pd
import numpy as np
import os, sys
from constants import *
from utility import *

In [8]:
# Read the train and test splits from CSV into DataFrames.
# `%time` is an IPython line magic; it is applied to each read separately, so
# two timing reports are emitted (one per file).
# NOTE(review): TRAIN_FN / TEST_FN are not defined in the visible cells —
# presumably path constants brought in by `from constants import *`; confirm.
%time df_train = pd.read_csv(TRAIN_FN)
%time df_test = pd.read_csv(TEST_FN)

CPU times: user 2.57 ms, sys: 807 µs, total: 3.37 ms
Wall time: 2.64 ms
CPU times: user 2.56 ms, sys: 576 µs, total: 3.14 ms
Wall time: 2.67 ms


In [9]:
# Report the raw shapes, drop exact duplicate rows, then report the shapes
# again and preview both frames.
print('shape: ', df_train.shape, '\t', df_test.shape)

# Rebind the names instead of using `inplace=True`: the result is the same, but
# the frames created by the loading cell are no longer mutated behind its back.
# The original index labels are deliberately kept (no reset_index /
# ignore_index) because a later cell looks up a row by label via
# `df_train.loc[252, ...]`.
df_train = df_train.drop_duplicates()
df_test = df_test.drop_duplicates()

print('shape after dropping duplicates: ', df_train.shape, '\t', df_test.shape)
print(df_train.head())
print(df_test.head())

shape:  (328, 2) 	 (397, 2)
shape after dropping duplicates:  (324, 2) 	 (394, 2)
                                         sentence label
0                    You guys provide EMI option?   EMI
1  Do you offer Zero Percent EMI payment options?   EMI
2                                         0% EMI.   EMI
3                                             EMI   EMI
4                           I want in installment   EMI
                                   sentence              label
0                   There are only 2 models  NO_NODES_DETECTED
1                                    Single  NO_NODES_DETECTED
2  What's difference between ergo and ortho         COMPARISON
3                              Return order    RETURN_EXCHANGE
4               Hai not recieved my product  DELAY_IN_DELIVERY


In [10]:
# Label frequencies for each split, separated by an 'XXX' marker line.
# A single print with sep='\n' emits exactly the same text as printing each
# piece on its own.
print(
    'class distribution in Train\n',
    df_train['label'].value_counts(),
    '\nXXX\n',
    'class distribution in Test\n',
    df_test['label'].value_counts(),
    sep='\n',
)

class distribution in Train

DISTRIBUTORS             33
EMI                      25
LEAD_GEN                 21
MATTRESS_COST            21
PRODUCT_VARIANTS         21
ORDER_STATUS             20
WHAT_SIZE_TO_ORDER       19
100_NIGHT_TRIAL_OFFER    18
ORTHO_FEATURES           17
RETURN_EXCHANGE          14
COD                      12
DELAY_IN_DELIVERY        11
ABOUT_SOF_MATTRESS       11
ERGO_FEATURES            11
COMPARISON               11
PILLOWS                  10
OFFERS                   10
CHECK_PINCODE            10
WARRANTY                 10
CANCEL_ORDER             10
SIZE_CUSTOMIZATION        9
Name: label, dtype: int64

XXX

class distribution in Test

NO_NODES_DETECTED        163
SIZE_CUSTOMIZATION        24
CHECK_PINCODE             22
MATTRESS_COST             21
COMPARISON                18
LEAD_GEN                  16
EMI                       16
DELAY_IN_DELIVERY         13
PILLOWS                   13
RETURN_EXCHANGE           12
ORTHO_FEATURES            11
WHAT

In [11]:
# Compare the label inventories of the two splits in both directions.
print('classes in test that are not in train')
train_labels = df_train['label'].unique().tolist()
test_labels = df_test['label'].unique().tolist()

# Build each set once and reuse it for both directions of the comparison.
train_label_set = set(train_labels)
test_label_set = set(test_labels)
print(test_label_set.difference(train_label_set))

print('\nclasses in train that are not in test')
print(train_label_set.difference(test_label_set))

classes in test that are not in train
{'NO_NODES_DETECTED'}

classes in train that are not in test
{'WARRANTY'}


In [18]:
# Options for the project-local Text_Preprocessing helper, grouped by whether
# the flag is switched on. The four enabled flags line up with the step names
# logged when the object is applied (contraction, lower case,
# remove non-alphabets, tokenization).
# NOTE(review): the exact meaning of each option lives in Text_Preprocessing,
# which is not visible here — confirm against its definition.
preprocessing_options = dict(
    # --- enabled ---
    contraction=True,
    contraction_var=CONTRACTIONS,
    lower_case=True,
    remove_nonalpha=True,
    tokenize=True,
    # --- disabled ---
    keep_eng=False,
    remove_punkt=False,
    remove_stop=False,
    remove_numerals=False,
    spell_check=False,
    stem=False,
    lem=False,
    filter_pos=False,
    template_removal=False,
    regex_cleaning=False,
    remove_ignore_words=False,
    word_size_filter=False,
    # --- values passed alongside the disabled flags above ---
    pos_var=('N', 'J'),
    template_start_string='',
    ignore_words=IGNORE_WORDS,
    custom_stoplist=[],
    word_size=2,
)

preprocess_obj = Text_Preprocessing(**preprocessing_options)

In [19]:
# Spot-check the preprocessing object on one training sentence.
# 252 is an index *label*, not a position: `.loc` is label-based.
a = df_train.loc[252, 'sentence']
print(a)

# fit_transform is fed a Series here, so wrap the single sentence in one and
# unwrap the single result afterwards.
single_sentence = pd.Series([a])
single_result = preprocess_obj.fit_transform(single_sentence)
a_pre = single_result.values[0]
print(a_pre)

It's been a month
contraction
lower case
remove non-alphabets
tokenization


Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

['it', 'has', 'been', 'a', 'month']


In [22]:
# Vocabulary overlap between the train and test splits.
# NOTE(review): fit_transform is called on the test sentences as well; whether
# Text_Preprocessing learns any state in `fit` is not visible here — confirm
# this cannot leak test information.
train_sentences = preprocess_obj.fit_transform(df_train['sentence']).tolist()
test_sentences = preprocess_obj.fit_transform(df_test['sentence']).tolist()

# NOTE(review): this rebinds `test_labels` to one label per test row; the
# earlier label-comparison cell bound the same name to the *unique* test labels.
test_labels = df_test['label'].tolist()

# Each vocabulary is the set of every element found in the preprocessed rows.
train_vocab = {token for sentence in train_sentences for token in sentence}
test_vocab = {token for sentence in test_sentences for token in sentence}

print('Train Vocab Size: ', len(train_vocab), '\t', 'Test Vocab Size: ', len(test_vocab))
print('# Words in test but not in train: ', len(test_vocab.difference(train_vocab)))

contraction
lower case
remove non-alphabets
tokenization


Pandas Apply:   0%|          | 0/324 [00:00<?, ?it/s]

contraction
lower case
remove non-alphabets
tokenization


Pandas Apply:   0%|          | 0/394 [00:00<?, ?it/s]

Train Vocab Size:  271 	 Test Vocab Size:  548
# Words in test but not in train:  375


In [24]:
# Attribute every test-only word to the label of the test sentence it occurs in.
# The list form is kept for the preview print and for any later cell that
# reads `unseen_test_words`.
unseen_test_words = list(test_vocab - train_vocab)
print(unseen_test_words[:10])

# Membership is tested once per token of every test sentence; a set makes each
# test O(1) instead of scanning the whole list every time.
unseen_test_word_set = set(unseen_test_words)

# out: label -> list of (position of the sentence in test_sentences, unseen token).
# A token repeated within a sentence is recorded once per occurrence.
out = {}
for i, test_sent in enumerate(test_sentences):
    for token in test_sent:
        if token in unseen_test_word_set:
            # test_labels is indexed by the same position as test_sentences.
            out.setdefault(test_labels[i], []).append((i, token))

# Number of distinct labels that contain at least one unseen word.
print(len(out))

['serviceable', 'recieved', 'queen', 'bareilly', 'support', 'chhattisgarh', 'per', 'karnataka', 'whether', 'locking']
21


In [25]:
# For each label: how many unseen-word hits it has, plus the first ten
# (sentence position, token) pairs, followed by an 'XXX' separator.
for label, hits in out.items():
    hit_count = len(hits)
    preview = hits[:10]
    print(label, '\t', hit_count, '\t', preview)
    print('\nXXX\n')

NO_NODES_DETECTED 	 349 	 [(0, 'only'), (0, 'models'), (1, 'single'), (10, 'send'), (10, 'them'), (10, 'after'), (10, 'lockdown'), (11, 'recieved'), (11, 'regard'), (15, 'purchase')]

XXX

DELAY_IN_DELIVERY 	 17 	 [(4, 'hai'), (4, 'recieved'), (8, 'completed'), (39, 'completed'), (105, 'ordered'), (153, 'shipped'), (207, 'weeks'), (240, 'since'), (240, 'ordered'), (240, 'but')]

XXX

CHECK_PINCODE 	 37 	 [(6, 'item'), (6, 'karnataka'), (69, 'code'), (89, 'code'), (138, 'u'), (155, 'u'), (155, 'at'), (155, 'kerala'), (175, 'u'), (175, 'at')]

XXX

PRODUCT_VARIANTS 	 12 	 [(9, 'double'), (73, 'hello'), (73, 'confusion'), (73, 'suggest'), (73, 'should'), (97, 'queen'), (97, 'pls'), (97, 'assist'), (168, 'variant'), (168, 'matters')]

XXX

ABOUT_SOF_MATTRESS 	 2 	 [(12, 'planning'), (12, 'purchase')]

XXX

SIZE_CUSTOMIZATION 	 26 	 [(19, 'customize'), (107, 'foot'), (154, 'queen'), (179, 'wany'), (179, 'custimize'), (199, 'inch'), (199, 'height'), (204, 'hi'), (204, 'queen'), (213, 'scaled

### Most of the unseen words come from the out-of-scope (OOS) class (`NO_NODES_DETECTED`). However, a good chunk of the unseen words do come from in-scope classes, and the majority of those words are misspellings.