In [None]:
# Mount Google Drive at /content/drive so later cells can reach files stored there
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# install required packages
pip install transformers fast_ml

In [None]:
# import required packages
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from fast_ml.model_development import train_valid_test_split
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from sklearn.metrics import accuracy_score

In [None]:
# load the Twitter dataset CSV from the mounted Drive folder into a DataFrame
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Vivek_Customer_Satisfaction/Twitter_Data.csv"
)

In [None]:
# discard every row that contains a missing value, then display the
# per-column null counts that remain as a sanity check
df = df.dropna()
df.isna().sum()

In [None]:
# class balance of the raw labels: {-1 -> Negative, 0 -> Neutral, 1 -> Positive}
label_counts = df['category'].value_counts()
label_counts

In [None]:
# clean messages:
#   1. replace every character other than letters, digits, "(", "+", "*", ")", space,
#      newline and "." with a single space
#   2. collapse each run of whitespace into one space
# Raw string literals are used so that "\." and "\s" reach the regex engine unchanged
# instead of being flagged by Python as invalid string escape sequences.
disallowed_chars = re.compile(r'[^a-zA-Z0-9(+*) \n\.]')
whitespace_run = re.compile(r'\s+')
df['clean_text'] = df['clean_text'].apply(
    lambda x: whitespace_run.sub(' ', disallowed_chars.sub(' ', str(x)))
)

In [None]:
# remap the label classes to non-negative ids:
#   1 (Positive) -> 0, -1 (Negative) -> 1, 0 (Neutral) -> 2
# NOTE(review): running this cell a second time would remap the already-remapped values.
df['category'] = df['category'].replace({1: 0, -1: 1, 0: 2})

In [None]:
# partition the frame into train (70%), validation (20%) and test (10%) sets,
# with 'category' separated out as the target
# NOTE(review): no seed is passed, so the split is presumably different on each run — confirm
(X_train, y_train,
 X_valid, y_valid,
 X_test, y_test) = train_valid_test_split(
    df,
    target='category',
    train_size=0.7,
    valid_size=0.2,
    test_size=0.1,
)

In [None]:
# convert the split frames into the containers used downstream:
#   - texts       -> plain Python lists taken from the 'clean_text' column
#   - train/valid -> labels passed through to_categorical
#   - test        -> labels kept as a plain list
X_train, X_valid, X_test = (
    frame['clean_text'].tolist() for frame in (X_train, X_valid, X_test)
)
y_train, y_valid = to_categorical(y_train), to_categorical(y_valid)
y_test = y_test.tolist()

In [None]:
# load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased'
)

In [None]:
# define tokenize function that tokenizes sentences and converts them into integer arrays
def tokenize(sentences, tokenizer, max_length=128):
    """Encode each sentence to a fixed length and stack the results into arrays.

    Args:
        sentences: iterable of texts; each item is passed to ``tokenizer.encode_plus``.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'``.
        max_length: length every sequence is truncated / padded to (default 128,
            the value that was previously hard-coded).

    Returns:
        Tuple ``(input_ids, attention_masks, token_type_ids)`` of int32 numpy arrays,
        one row per sentence. For empty input each array has shape ``(0, max_length)``.
    """
    def _to_array(rows):
        # np.asarray([]) would yield shape (0,); keep a 2-D result for empty input
        if not rows:
            return np.empty((0, max_length), dtype='int32')
        return np.asarray(rows, dtype='int32')

    input_ids, input_masks, input_segments = [], [], []
    for sentence in sentences:
        inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            # replaces the deprecated pad_to_max_length=True
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return _to_array(input_ids), _to_array(input_masks), _to_array(input_segments)

In [None]:
# run the tokenizer over the train, validation and test texts (in that order);
# each result is the tuple of arrays returned by tokenize()
x_train, x_valid, x_test = (
    tokenize(texts, tokenizer) for texts in (X_train, X_valid, X_test)
)

In [None]:
# load the pretrained TensorFlow DistilBERT model for the 'distilbert-base-uncased' checkpoint
distilbert = TFDistilBertModel.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased'
)

In [None]:
# first two arrays returned by tokenize(): token ids and attention masks for the training set
train_input_ids, train_attention_mask = x_train[:2]

In [None]:
# first two arrays returned by tokenize(): token ids and attention masks for the validation set
valid_input_ids, valid_attention_mask = x_valid[:2]

In [None]:
# fine-tune DistilBERT by stacking a small classification head on top of it
max_len = 128  # must match the sequence length produced by tokenize()
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# [0] takes the first element of the DistilBERT output
# (presumably the per-token hidden states — confirm against the transformers docs)
embeddings = distilbert([input_ids, input_mask])[0]
# pool over the sequence, then classify with two dense layers and light dropout
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)
# Emit raw logits (no activation): the model is compiled with
# CategoricalCrossentropy(from_logits=True), which applies softmax itself.
# The previous sigmoid here fed already-squashed values to a loss expecting logits.
# Use tf.nn.softmax on the predictions if probabilities are needed.
y = Dense(3, activation=None)(out)
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# keep the DistilBERT weights trainable so the whole network is fine-tuned
# (referenced directly rather than through a positional index into model.layers)
distilbert.trainable = True

In [None]:
# configure training: categorical cross-entropy on logits, Adam optimizer, accuracy metric
# NOTE(review): from_logits=True expects the model's last layer to output unnormalised
# scores — check that the output layer's activation agrees with this.
loss = CategoricalCrossentropy(from_logits=True)

# NOTE(review): the `decay` argument may be rejected by newer Keras optimizers — verify
# against the installed TensorFlow version.
optimizer = Adam(decay=0.01, learning_rate=5e-05)

model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# training callbacks
# stop once the monitored quantity (library default, none is given here) has not
# improved for 3 consecutive epochs
early_stopping = EarlyStopping(patience=3)

# keep only the best weights seen so far (weights only, not the full model) on Drive
checkpoint_path = '/content/drive/MyDrive/Vivek_Customer_Satisfaction/Model_Checkpoints/'
checkpoint = ModelCheckpoint(
    save_best_only=True,
    save_weights_only=True,
    filepath=checkpoint_path,
)

# write TensorBoard logs (including the graph, without images) once per epoch
tensorboard = TensorBoard(
    update_freq='epoch',
    write_images=False,
    write_graph=True,
    log_dir='/content/drive/MyDrive/Vivek_Customer_Satisfaction/Model_Checkpoints/logs',
)

In [None]:
# train for up to 10 epochs in batches of 64, evaluating on the validation split each
# epoch; the object returned by fit() is kept in train_history
train_history = model.fit(
    x=[train_input_ids, train_attention_mask],
    y=np.asarray(y_train),
    batch_size=64,
    epochs=10,
    validation_data=(
        [valid_input_ids, valid_attention_mask],
        np.asarray(y_valid),
    ),
    callbacks=[early_stopping, checkpoint, tensorboard],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# persist the trained model to the checkpoint folder on Drive
# NOTE(review): the .h5 suffix presumably selects the HDF5 format; reloading a model that
# wraps the DistilBERT layer may need custom_objects — verify before relying on this file.
model.save(
    filepath='/content/drive/MyDrive/Vivek_Customer_Satisfaction/Model_Checkpoints/sentiment-analysis.h5'
)