<a href="https://colab.research.google.com/github/vikassinha167/Seldon/blob/master/nlp_ratings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers
!pip install seldon_deploy_sdk
!pip install alibi_detect==0.8.1
!pip install datasets
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import numpy as np
import pandas as pd 
import datasets
import nltk

In [6]:
import tensorflow as tf
# Set the level of TensorFlow's logger to INFO.
tf.get_logger().setLevel('INFO')

from transformers import AutoTokenizer, DefaultDataCollator, TFAutoModelForSequenceClassification

from sklearn.model_selection import train_test_split

from seldon_deploy_sdk import Configuration, ApiClient, SeldonDeploymentsApi, ModelMetadataServiceApi, DriftDetectorApi, BatchJobsApi, BatchJobDefinition
from seldon_deploy_sdk.auth import OIDCAuthenticator

from alibi_detect.cd import KSDrift
from alibi_detect.utils.saving import save_detector, load_detector

from google.cloud import storage

In [8]:
# Copy the review dataset from the Google Cloud Storage bucket into the working directory.
!gsutil cp gs://kelly-seldon/nlp-ratings/review_data.csv review_data.csv

Copying gs://kelly-seldon/nlp-ratings/review_data.csv...
- [1 files][ 11.3 MiB/ 11.3 MiB]                                                
Operation completed over 1 objects/11.3 MiB.                                     


In [9]:
# The file is semicolon-separated rather than comma-separated.
df = pd.read_csv("review_data.csv", sep=";")
df.head()

Unnamed: 0,product,user_id,rating,review,date_created
0,Product 65359,27604,5.0,"_product_ provided me with a pretty good, sec...",2020-04-05 15:51:09
1,Product 34804,152368,4.0,it protects our files and computer and very si...,2019-06-24 00:40:54
2,Product 18042,1212264,5.0,"like most businesses, we are always looking fo...",2016-11-24 13:29:07
3,Product 2179,1383,3.0,we believed _product_ was a great solution for...,2019-09-21 14:02:50
4,Product 90712,92494,4.0,i formerly used _product_ and was relieved to ...,2018-11-28 05:37:54


In [10]:
# Keep only the rating and the review text.
df = df.drop(columns=['product', 'user_id', 'date_created'])
df.head()

Unnamed: 0,rating,review
0,5.0,"_product_ provided me with a pretty good, sec..."
1,4.0,it protects our files and computer and very si...
2,5.0,"like most businesses, we are always looking fo..."
3,3.0,we believed _product_ was a great solution for...
4,4.0,i formerly used _product_ and was relieved to ...


In [11]:
# Flag each row that holds at least one missing value, then report a sample and the count.
is_NaN = df.isna()
row_has_NaN = is_NaN.any(axis="columns")
rows_with_NaN = df.loc[row_has_NaN]
print(f"{rows_with_NaN.head()} \n\n Number of rows with missing values: {len(rows_with_NaN)}")

      rating review
1804     5.0    NaN
5781     4.0    NaN
6334     5.0    NaN
6339     5.0    NaN
6363     4.0    NaN 

 Number of rows with missing values: 21


In [12]:
# Remove the incomplete rows and renumber the index from zero.
df = df.drop(index=rows_with_NaN.index).reset_index(drop=True)
df.head()

Unnamed: 0,rating,review
0,5.0,"_product_ provided me with a pretty good, sec..."
1,4.0,it protects our files and computer and very si...
2,5.0,"like most businesses, we are always looking fo..."
3,3.0,we believed _product_ was a great solution for...
4,4.0,i formerly used _product_ and was relieved to ...


In [13]:
# Both columns are handled as text from here on (ratings become e.g. '5.0').
for column in ('review', 'rating'):
    df[column] = df[column].astype(str)

In [14]:
# Each rating string is assigned an integer class id, in ascending order of rating.
rating_mapping = {
    rating: class_id
    for class_id, rating in enumerate(
        ['1.0', '1.5', '2.0', '2.5', '3.0', '3.5', '4.0', '4.5', '5.0']
    )
}

# A rating that is not in the mapping raises KeyError.
df['label'] = df['rating'].apply(lambda rating: rating_mapping[rating])

In [15]:
# The integer label replaces the original rating column.
df = df.drop(columns=["rating"])
df.head()

Unnamed: 0,review,label
0,"_product_ provided me with a pretty good, sec...",8
1,it protects our files and computer and very si...,6
2,"like most businesses, we are always looking fo...",8
3,we believed _product_ was a great solution for...,4
4,i formerly used _product_ and was relieved to ...,6


In [16]:
import string
# Display the set of characters that Python's string module classes as punctuation.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
# Work on an independent copy so that `df` keeps the unprocessed reviews.
df_proc = df.copy(deep=True)

In [18]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    All other characters, including whitespace, are kept in their original order.
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

# Store the punctuation-free text in a new column next to the raw review.
df_proc['processed_review'] = df_proc['review'].apply(remove_punctuation)
df_proc.head()

Unnamed: 0,review,label,processed_review
0,"_product_ provided me with a pretty good, sec...",8,product provided me with a pretty good secure...
1,it protects our files and computer and very si...,6,it protects our files and computer and very si...
2,"like most businesses, we are always looking fo...",8,like most businesses we are always looking for...
3,we believed _product_ was a great solution for...,4,we believed product was a great solution for s...
4,i formerly used _product_ and was relieved to ...,6,i formerly used product and was relieved to ha...


In [19]:
# Lower-case the text so that later word comparisons are case-insensitive.
df_proc['processed_review'] = df_proc['processed_review'].str.lower()
df_proc.head()

Unnamed: 0,review,label,processed_review
0,"_product_ provided me with a pretty good, sec...",8,product provided me with a pretty good secure...
1,it protects our files and computer and very si...,6,it protects our files and computer and very si...
2,"like most businesses, we are always looking fo...",8,like most businesses we are always looking for...
3,we believed _product_ was a great solution for...,4,we believed product was a great solution for s...
4,i formerly used _product_ and was relieved to ...,6,i formerly used product and was relieved to ha...


In [20]:
# Fetch NLTK's stopwords corpus (the recorded output shows it is skipped when already up to date).
nltk.download('stopwords')
# English stop words provided by the NLTK stopwords corpus
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
def remove_stopwords(text):
    """Return ``text`` without the words found in the module-level ``stopwords``.

    The text is split on whitespace and the surviving words are re-joined with
    single spaces.
    """
    kept_words = []
    for token in text.split():
        if token in stopwords:
            continue
        kept_words.append(token)
    return ' '.join(kept_words)

In [22]:
# Strip stop words from every processed review.
df_proc['processed_review'] = df_proc['processed_review'].apply(remove_stopwords)
df_proc.head()

Unnamed: 0,review,label,processed_review
0,"_product_ provided me with a pretty good, sec...",8,product provided pretty good secure logon expe...
1,it protects our files and computer and very si...,6,protects files computer simple use needs
2,"like most businesses, we are always looking fo...",8,like businesses always looking ways less using...
3,we believed _product_ was a great solution for...,4,believed product great solution scalable anima...
4,i formerly used _product_ and was relieved to ...,6,formerly used product relieved powerful produc...


In [23]:
# Download the WordNet and omw-1.4 NLTK resources before creating the lemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by the lemmatization step
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [24]:
def lemmatizer(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Every word is passed through the module-level ``wordnet_lemmatizer`` and the
    results are re-joined with single spaces.
    """
    lemmas = map(wordnet_lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

# Replace every word of the processed reviews with its lemma.
df_proc['processed_review'] = df_proc['processed_review'].apply(lemmatizer)
df_proc.head()

Unnamed: 0,review,label,processed_review
0,"_product_ provided me with a pretty good, sec...",8,product provided pretty good secure logon expe...
1,it protects our files and computer and very si...,6,protects file computer simple use need
2,"like most businesses, we are always looking fo...",8,like business always looking way le using two ...
3,we believed _product_ was a great solution for...,4,believed product great solution scalable anima...
4,i formerly used _product_ and was relieved to ...,6,formerly used product relieved powerful produc...


In [25]:
# Only the processed text and the label are needed from here on.
df_proc = df_proc.drop(columns=["review"])
df_proc.head()

Unnamed: 0,label,processed_review
0,8,product provided pretty good secure logon expe...
1,6,protects file computer simple use need
2,8,like business always looking way le using two ...
3,4,believed product great solution scalable anima...
4,6,formerly used product relieved powerful produc...


In [26]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
train, test = train_test_split(df_proc, test_size=0.3, random_state=42)

In [27]:
# Convert both pandas splits into Hugging Face datasets, without carrying the pandas index over.
train_ds = datasets.Dataset.from_pandas(train, preserve_index=False)
test_ds = datasets.Dataset.from_pandas(test, preserve_index=False)
# Bundle the splits so they can be processed together under the keys "train" and "test".
comp_ds = datasets.DatasetDict({"train":train_ds,"test":test_ds})

In [28]:
# Load the tokenizer published with the distilbert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [29]:
def preprocess_function(df):
    """Tokenize the ``processed_review`` entry of ``df`` with the module-level ``tokenizer``.

    Padding to ``max_length`` and truncation are both requested from the tokenizer.
    """
    reviews = df["processed_review"]
    return tokenizer(reviews, truncation=True, padding="max_length")

In [30]:
# Tokenize both splits; batched=True hands preprocess_function batches of rows instead of single rows.
tokenized_revs = comp_ds.map(preprocess_function, batched=True)

  0%|          | 0/35 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

In [31]:
# Collator configured to return TensorFlow tensors; passed as collate_fn when building tf datasets.
data_collator = DefaultDataCollator(return_tensors="tf")

In [32]:
# Build the TensorFlow datasets for both splits with identical settings;
# only the training split is shuffled.
tf_train_set, tf_test_set = (
    tokenized_revs[split].to_tf_dataset(
        columns=["attention_mask", "input_ids"],
        label_cols=["labels"],
        shuffle=(split == "train"),
        batch_size=16,
        collate_fn=data_collator,
    )
    for split in ("train", "test")
)

In [33]:
# Load distilbert-base-uncased with a sequence-classification head that has 9 output labels.
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=9)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

In [34]:
# Print the layer and parameter overview of the freshly initialised model.
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  6921      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,960,393
Trainable params: 66,960,393
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Freeze the model's first layer so its weights are excluded from training.
first_layer = model.layers[0]
first_layer.trainable = False

# Integer class labels and raw logits: sparse categorical loss with from_logits=True.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
crossentropy_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.metrics.SparseCategoricalAccuracy()

model.compile(optimizer=adam_optimizer, loss=crossentropy_loss, metrics=accuracy_metric)

In [36]:
# We will not run this and instead we will load a pre-trained model from a Google Storage bucket.
# model.fit(tf_train_set, validation_data=tf_test_set, epochs=5)

In [37]:
from pathlib import Path
# Create the local directory "1"; exist_ok=True keeps a re-run of this cell from failing.
Path("1").mkdir(parents=True, exist_ok=True)

In [38]:
# Anonymous client: no credentials are supplied for accessing the bucket.
client = storage.Client.create_anonymous_client()
bucket = client.bucket('kelly-seldon')

In [39]:
def load_model(bucket):
    """Download the saved classifier files from ``bucket`` and load the model.

    Every object listed under the prefix ``nlp-ratings/model/1/`` is copied into
    the local directory ``1`` under its base name (any sub-path is flattened).
    The directory is then loaded as a 9-label sequence-classification model.

    Args:
        bucket: Object exposing ``list_blobs(prefix=...)`` whose items provide
            ``name`` and ``download_to_filename(path)``.

    Returns:
        The model returned by
        ``TFAutoModelForSequenceClassification.from_pretrained``.
    """
    import os  # local import keeps the function self-contained

    target_dir = "1"
    # Create the target directory here instead of relying on another cell
    # having been run first.
    os.makedirs(target_dir, exist_ok=True)

    for blob in bucket.list_blobs(prefix="nlp-ratings/model/1/"):
        filename = blob.name.split('/')[-1]
        if not filename:
            # A name ending in "/" is a folder placeholder object; its empty
            # base name would turn the download target into the directory itself.
            continue
        blob.download_to_filename(os.path.join(target_dir, filename))

    return TFAutoModelForSequenceClassification.from_pretrained(target_dir, num_labels=9)

In [40]:
# Rebind `model` to the model loaded from the bucket.
model = load_model(bucket)

Some layers from the model checkpoint at 1 were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at 1 and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# Print the layer and parameter overview of the loaded model.
model.summary()

Model: "tf_distil_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  6921      
                                                                 
 dropout_39 (Dropout)        multiple                  0         
                                                                 
Total params: 66,960,393
Trainable params: 66,960,393
Non-trainable params: 0
_________________________________________________________________


In [42]:
# Compile the loaded model with the same optimizer, loss and metric settings as before,
# so that evaluation reports a loss and an accuracy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    # from_logits=True: the loss is told that the model outputs are unnormalised logits
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

In [43]:
# Evaluate on the first 200 rows of the held-out split only.
test_batch = test.iloc[:200]
test_batch_ds = datasets.Dataset.from_pandas(test_batch, preserve_index=False)
tokenized_revs_batch = test_batch_ds.map(preprocess_function, batched=True)

# Same dataset settings as the full test split: fixed order, batches of 16.
test_tf = tokenized_revs_batch.to_tf_dataset(
    collate_fn=data_collator,
    batch_size=16,
    shuffle=False,
    label_cols=["labels"],
    columns=["attention_mask", "input_ids"],
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [44]:
# evaluate() returns the loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(test_tf)

print(f"Model accuracy: {accuracy:2.2%}")
print(f"Model loss: {loss}")

Model accuracy: 60.50%
Model loss: 0.9071974754333496
