In [1]:
!pip install transformers

You should consider upgrading via the '/Users/tarun/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import  GaussianNB
from sklearn.svm import  SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf

In [3]:
import pandas as pd  # NOTE(review): redundant; pandas is already imported as pd in the imports cell
# Read the dataset from a CSV in the working directory. Later cells read columns '0' and '1' from it.
df = pd.read_csv('Knowledge.vs.scenario.csv')

In [4]:
# Working subset: at most the first 1000 rows of df.
# NOTE(review): the recorded outputs further down report 820 rows, so this slice
# appears to keep the entire file; confirm the cap is still wanted.
batch_1 = df[:1000]

In [5]:
# Count how many rows carry each value of the label column '1'.
# NOTE(review): the recorded output lists a single class (1 -> 820 rows). With only
# one class present, any accuracy reported downstream is uninformative; confirm
# that the labels are being read as intended.
batch_1['1'].value_counts()

1    820
Name: 1, dtype: int64

In [6]:
# Pretrained encoder used in this notebook: BERT base (uncased).
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

## Want DistilBERT instead of BERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
num_classes=2

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)
# Second instance of the same checkpoint, configured with num_labels. It is built
# from pretrained_weights so it always matches the tokenizer/model chosen above.
# NOTE(review): no later cell visible here uses bert_model, and model_class is the
# bare encoder, so num_labels presumably only ends up in the config; confirm
# whether this extra load is needed at all.
bert_model = model_class.from_pretrained(pretrained_weights, num_labels=num_classes)

In [7]:
# Encode every question in column '0' as a list of token ids, with the
# tokenizer's special tokens added around each sequence.
tokenized = batch_1['0'].apply(
    lambda question: tokenizer.encode(question, add_special_tokens=True)
)

In [8]:
# Display the encoded questions (one list of token ids per row).
tokenized

0      [101, 2054, 2003, 1996, 2832, 1997, 6134, 2968...
1      [101, 2054, 2003, 2734, 2000, 2468, 1037, 2047...
2      [101, 8304, 6549, 3471, 2091, 1996, 2240, 2054...
3      [101, 2323, 1037, 2136, 10197, 8518, 2241, 200...
4      [101, 2323, 2057, 3443, 2019, 8680, 2005, 6627...
                             ...                        
815    [101, 2129, 2000, 3252, 1037, 8040, 6824, 2136...
816    [101, 2622, 2968, 2291, 2005, 3674, 16483, 393...
817    [101, 10552, 2007, 2060, 22283, 3160, 2000, 19...
818    [101, 2689, 5227, 2000, 2689, 9531, 2057, 1005...
819    [101, 2040, 4627, 1996, 3679, 8040, 6824, 3116...
Name: 0, Length: 820, dtype: object

In [9]:
# Longest encoded question; 0 when there are no rows.
max_len = max(map(len, tokenized.values), default=0)

# Right-pad every id list with zeros so all rows share the same length.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [10]:
# (rows, max_len) of the padded id matrix; padded is already an ndarray.
padded.shape

(820, 146)

In [11]:
# 1 where a real token id is present, 0 on the zero padding added above.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(820, 146)

In [12]:
# Wrap the padded ids and the mask as torch tensors. attention_mask is rebound
# from the NumPy array to its tensor form here.
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

# Single forward pass over all rows at once, with gradient tracking disabled
# because the encoder is only used to produce outputs, not trained.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [13]:
# Take the first element of the model output and keep, for every row, the vector
# at sequence position 0, presumably the [CLS] embedding (TODO confirm).
# NOTE(review): no later cell visible here reads this array; the comprehension
# that builds `questions` binds its own local `features`.
features = last_hidden_states[0][:,0,:].numpy()

In [14]:
# Target column: one label per row of batch_1, in the same order as `tokenized`.
labels = batch_1['1']

In [15]:
# One [token_ids, label, length] entry per question.
# tokenized and labels both come from batch_1, so they are paired by iteration
# order. Indexing labels with a positional counter (labels[i]) is a label-based
# lookup and is only correct when the index happens to be 0..n-1.
questions = [[token_ids, label, len(token_ids)]
             for token_ids, label in zip(tokenized, labels)]

In [16]:
# Sort in place by the third field (token count), shortest first. Presumably so
# that each padded batch groups sequences of similar length; confirm.
questions.sort(key=lambda x: x[2])

In [17]:
# Drop the length field, keeping (token_ids, label) pairs in sorted order.
sorted_questions = [(token_ids, label) for token_ids, label, _ in questions]

In [18]:
# Expose the (token_ids, label) pairs as a tf.data pipeline; both fields are int32.
processed_dataset = tf.data.Dataset.from_generator(
    generator=lambda: sorted_questions,
    output_types=(tf.int32, tf.int32),
)

In [19]:
BATCH_SIZE = 32
# Group examples into batches; token ids are padded to the longest sequence in
# each batch, labels are scalars and need no padding.
batched_dataset = processed_dataset.padded_batch(
    batch_size=BATCH_SIZE,
    padded_shapes=((None,), ()),
)

In [20]:
import math

# Number of batches in batched_dataset (the last one may be partial).
TOTAL_BATCHES = math.ceil(len(batch_1) / BATCH_SIZE)
# Hold out one tenth of the batches (rounded down) for evaluation.
TEST_BATCHES = TOTAL_BATCHES // 10
# Fixed seed so the train/test split is reproducible across runs.
SPLIT_SEED = 42

# Dataset.shuffle returns a NEW dataset and leaves the receiver untouched, so its
# result must be kept. Previously it was discarded, meaning no shuffling happened
# and the test split was always the first (shortest-sequence) batches.
# reshuffle_each_iteration=False together with the seed keeps the order identical
# on every pass, so take() and skip() below stay disjoint instead of being
# re-drawn on each iteration.
shuffled_dataset = batched_dataset.shuffle(
    TOTAL_BATCHES,
    seed=SPLIT_SEED,
    reshuffle_each_iteration=False,
)
test_data = shuffled_dataset.take(TEST_BATCHES)
train_data = shuffled_dataset.skip(TEST_BATCHES)

In [21]:
from tensorflow.keras import layers
class TEXT_MODEL(tf.keras.Model):
    """Text classifier built from three parallel 1-D convolutions over token embeddings.

    Token ids are embedded, passed through three Conv1D branches with kernel
    sizes 2, 3 and 4, each branch is global-max-pooled over the sequence axis,
    and the pooled vectors are concatenated and fed through a dense layer,
    dropout and an output layer.

    Args:
        vocabulary_size: Number of rows in the embedding table.
        embedding_dimensions: Width of each token embedding.
        cnn_filters: Number of filters in each of the three Conv1D branches.
        dnn_units: Units in the hidden dense layer.
        model_output_classes: 2 selects a single sigmoid unit; any other value
            selects a softmax layer with that many units.
        dropout_rate: Rate of the dropout applied after the hidden dense layer.
        training: Unused. Kept only so existing constructor calls stay valid;
            the training mode is taken from the argument of ``call``.
        name: Name given to the Keras model.
    """

    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        # Three branches that differ only in kernel width (2, 3 and 4 tokens).
        # With "valid" padding, the input sequence must be at least as long as
        # the widest kernel.
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        # One pooling layer (it has no weights) shared by all three branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            # Binary case: one probability from a sigmoid unit.
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token ids fed straight into the embedding layer;
                presumably shaped (batch_size, sequence_length). TODO confirm.
            training: Training-mode flag forwarded to the dropout layer. It
                defaults to None so the model can also be called directly
                without passing it.

        Returns:
            The output of the final dense layer: one sigmoid unit per example
            in the two-class configuration, otherwise a softmax over the classes.
        """
        embedded = self.embedding(inputs)
        # Convolve and pool each branch in the same order as the layers were declared.
        pooled_branches = [
            self.pool(conv(embedded))
            for conv in (self.cnn_layer1, self.cnn_layer2, self.cnn_layer3)
        ]

        merged = tf.concat(pooled_branches, axis=-1)  # (batch_size, 3 * cnn_filters)
        hidden = self.dense_1(merged)
        hidden = self.dropout(hidden, training=training)
        return self.last_dense(hidden)

In [22]:
# Hyperparameters handed to TEXT_MODEL in the next cell.
# Embedding table size: one row per entry in the tokenizer's vocabulary.
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200       # embedding width
CNN_FILTERS = 100   # filters per convolution branch
DNN_UNITS = 256     # units in the hidden dense layer
OUTPUT_CLASSES = 2  # 2 selects the sigmoid head and the binary loss

DROPOUT_RATE = 0.2

# NOTE(review): NB_EPOCHS is not referenced by the fit call in this notebook,
# which hard-codes epochs=10; confirm which value is intended.
NB_EPOCHS = 5

In [23]:
# Build the CNN classifier from the hyperparameters defined in the config cell.
text_model = TEXT_MODEL(
    VOCAB_LENGTH,
    embedding_dimensions=EMB_DIM,
    cnn_filters=CNN_FILTERS,
    dnn_units=DNN_UNITS,
    model_output_classes=OUTPUT_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

In [24]:
# Pick the loss/metric pair that matches the output head: the binary pair for the
# two-class (sigmoid) head, the sparse categorical pair for the softmax head.
loss_name, metric_name = (
    ("binary_crossentropy", "accuracy")
    if OUTPUT_CLASSES == 2
    else ("sparse_categorical_crossentropy", "sparse_categorical_accuracy")
)
text_model.compile(loss=loss_name,
                   optimizer="adam",
                   metrics=[metric_name])



In [25]:
# Train on the training split.
# NOTE(review): the epoch count is hard-coded to 10 here, while the config cell
# defines NB_EPOCHS = 5 and never passes it; confirm which value is intended.
text_model.fit(train_data,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff594f59c70>

In [27]:
# Evaluate on the held-out batches. results[1] is read as the metric passed to
# compile (accuracy in the two-class configuration); results[0] is presumably the loss.
# NOTE(review): the label counts recorded earlier show a single class, so a 100%
# accuracy here does not demonstrate that the model learned anything; confirm
# the data contains both classes before trusting this number.
results = text_model.evaluate(test_data)
print(f'Accuracy is {results[1]*100}%')

Accuracy is 100.0%
