In [2]:
 pip install transformers

In [3]:
pip install openpyxl

In [4]:
import os
# Recursively list every file under the Kaggle input directory so the
# dataset paths used later in the notebook can be checked by eye.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [5]:
import tensorflow as tf
import logging
from tensorflow.keras.layers import (
    Dense,
    Flatten,
    Conv1D,
    Dropout,
    Input,
    LSTM,
    Bidirectional
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Model
from tensorflow.keras import regularizers
from transformers import BertTokenizer, TFBertModel
from sklearn.linear_model import LogisticRegression
import os
import pandas as pd
import numpy as np
import keras
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm
tqdm.pandas()
import re
import random
import torch

In [6]:
# Choose the tf.distribute strategy that the model is later built under.
# The TPU path is tried first; if resolving the TPU raises ValueError the
# default strategy (CPU / single GPU) is used instead.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count for whichever strategy was selected. This line
# sits outside the try/except so it also runs when the TPU path succeeds.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [7]:
# --- Notebook-wide settings -------------------------------------------------
# Number of output units of the final softmax layer.
num_class = 2
# Token length every text is padded / truncated to; also the model input width.
max_length = 100
# Number of examples per batch in the tf.data pipelines.
batch_size = 32
# Hold-out fraction. NOTE(review): not referenced by the split code visible in
# this notebook — confirm whether it is still needed.
test_size = 0.1

In [8]:
# Hugging Face checkpoint identifier. The same id is used to load the
# tokenizer here and the TFBertModel encoder in bert_bangla_sentiment_model().
model_name = "sagorsarker/bangla-bert-base"
# Tokenizer loaded from that checkpoint; used by bert_encode() to produce input ids.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [9]:
# Read the review spreadsheet into a DataFrame and print its column /
# dtype / non-null summary. Later cells read the 'Text' and 'Label' columns.
dataset_path = '/kaggle/input/sa-bangla-binary-restaurant-review/Bert_Preprocessed_Restaurant_Review_Dataset.xlsx'
df = pd.read_excel(dataset_path)
df.info()


In [10]:
# Preview the first rows of the loaded DataFrame.
df.head()

In [11]:
#df = df.drop_duplicates(subset=['Text'])  # do not combine with inplace=True: that call returns None and would set df to None

In [12]:
#df = df.dropna(subset=['Label'])

In [13]:
from sklearn.model_selection import train_test_split
# Positional 80% / 10% / 10% split of the DataFrame rows into
# train / test / validation (in that row order).
# NOTE(review): rows are sliced in file order without shuffling, so this
# assumes the spreadsheet is not sorted by label — TODO confirm.
n_rows = len(df)
train_end = int(.8 * n_rows)
test_end = int(.9 * n_rows)

train = df[:train_end]
test = df[train_end:test_end]
validation = df[test_end:]

# Features are the raw review texts, targets are the class labels.
x_train, y_train = train['Text'], train['Label']
x_validation, y_validation = validation['Text'], validation['Label']
x_test, y_test = test['Text'], test['Label']

In [14]:
# Cast the three text splits to str before tokenization (non-string cells
# become their string representation). The label series are left unchanged.
x_train, x_validation, x_test = (
    texts.astype(str) for texts in (x_train, x_validation, x_test)
)
#y_test = y_test.astype(str)

In [15]:
def bert_encode(data):
    """Tokenize a collection of texts into fixed-length BERT input ids.

    Args:
        data: Iterable of strings, e.g. a list or a pandas Series of texts.

    Returns:
        A tf.Tensor built from the tokenizer's "input_ids": one row per
        text, each padded or truncated to ``max_length`` tokens.

    Note:
        Only the input ids are returned; the other fields of the tokenizer
        output are discarded.
    """
    # Materialise the input as a plain list so the call does not depend on the
    # tokenizer implementation being able to iterate a pandas Series.
    texts = list(data)
    tokens = tokenizer.batch_encode_plus(
        texts, max_length=max_length, padding="max_length", truncation=True
    )
    return tf.constant(tokens["input_ids"])
# Tokenize each split once; rows keep the order of the corresponding x_* series.
train_encoded = bert_encode(x_train)
validation_encoded = bert_encode(x_validation)
test_encoded = bert_encode(x_test)

# Training pipeline: cache the raw examples first and shuffle afterwards, so
# the cache holds the data rather than one frozen shuffled order and every
# epoch is reshuffled. The buffer spans the whole training set so that the
# positionally split rows are mixed properly.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_encoded, y_train))
    .cache()
    .shuffle(max(len(x_train), 1))
    .batch(batch_size)
)

# Evaluation pipelines are NOT shuffled: their batches must stay in the same
# row order as y_validation / y_test so predictions can be compared with the
# labels directly.
validation_dataset = (
    tf.data.Dataset.from_tensor_slices((validation_encoded, y_validation))
    .batch(batch_size)
    .cache()
)

test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_encoded, y_test))
    .batch(batch_size)
    .cache()
)


In [16]:
from keras.utils.vis_utils import plot_model

In [17]:
def bert_bangla_sentiment_model():
    """Build the BERT + LSTM classifier as an (uncompiled) Keras model.

    The model takes a ``(max_length,)`` int32 tensor named "input_ids",
    runs it through the pretrained ``model_name`` encoder, feeds the first
    element of the encoder output (the per-token hidden states) into a
    100-unit LSTM with dropout, and ends in a ``num_class``-way softmax.

    Returns:
        A ``tf.keras.Model`` mapping input ids to class probabilities.
    """
    encoder = TFBertModel.from_pretrained(model_name, output_attentions=True)
    token_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")

    # Index 0 of the encoder output is the sequence of hidden states.
    sequence_output = encoder(token_ids)[0]
    sequence_summary = LSTM(100, dropout=0.3, recurrent_dropout=0.3)(sequence_output)
    class_probabilities = Dense(num_class, activation="softmax")(sequence_summary)

    return Model(inputs=token_ids, outputs=class_probabilities)

In [18]:
# Create the optimizer and the model inside the distribution strategy scope,
# then compile for integer class labels (sparse categorical cross-entropy)
# and print the layer summary.
with strategy.scope():
    adam_optimizer = Adam(learning_rate=1e-5)
    model = bert_bangla_sentiment_model()
    model.compile(
        optimizer=adam_optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    model.summary()

In [19]:
# Render the model architecture diagram to model.png (top-to-bottom layout,
# layer names shown, shapes and dtypes hidden).
tf.keras.utils.plot_model(
    model,
    to_file='model.png',
    show_shapes=False,
    show_dtype=False,
    show_layer_names=True,
    rankdir='TB',
    expand_nested=False,
    dpi=96,
)

In [20]:
# Train for 30 epochs, evaluating on the validation set after each epoch.
# train_dataset / validation_dataset are already batched with batch_size
# when they are built, so batch_size is deliberately not passed to fit():
# Keras expects it to be left unset for tf.data.Dataset inputs.
history = model.fit(
    train_dataset,
    epochs=30,
    validation_data=validation_dataset,
    verbose=1,
)

In [21]:
# Persist the trained weights to weights.h5, replacing any existing file.
model.save_weights(filepath='weights.h5', overwrite=True)

In [22]:
#model.evaluate(x_test, y_test)

In [23]:
import matplotlib.pyplot as plt
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object with a ``history`` dict (as returned by
            ``model.fit``) containing the keys ``string`` and
            ``"val_" + string``.
        string: Name of the metric to plot, e.g. "accuracy" or "loss".
    """
    validation_key = "val_" + string
    # Training curve first, validation curve second — same order as the legend.
    for series_name in (string, validation_key):
        plt.plot(history.history[series_name])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, validation_key])
    plt.show()
# Draw the learning curves: accuracy first, then loss.
for metric in ("accuracy", "loss"):
    plot_graphs(history, metric)

In [24]:
from sklearn.metrics import classification_report
# Predict on the encoded test tensor itself rather than on test_dataset:
# test_encoded keeps the row order of x_test / y_test, so prediction i
# belongs to label i. A shuffled dataset would return the rows in a
# different order and make the metrics below meaningless.
y_prob = model.predict(test_encoded, batch_size=batch_size)
# Convert the per-class softmax outputs to the index of the most likely class.
y_pred = np.argmax(y_prob, axis=1)

In [25]:
# Reshape the predicted class ids into a single column, shape (n, 1).
y_pred = np.reshape(y_pred, (-1, 1))

In [26]:
# Display names for the two classes, in label order.
# NOTE(review): presumably label 0 = negative and 1 = positive — confirm
# against the dataset's Label column.
categories = ['negative', 'positive']
report = classification_report(y_test, y_pred, target_names=categories)
print(report)

In [27]:
import seaborn as sns
import matplotlib.pyplot as plt

### Confusion Matrix
from sklearn.metrics import confusion_matrix
#predictions = model.predict(x_test, steps=len(x_test), verbose=0)
#y_pred=model.predict(x_test)
#y_pred = np.round(y_pred)


# Confusion matrix of true test labels against predicted class ids.
cm = confusion_matrix(y_test, y_pred)

# Class display names used for the tick labels on both axes.
labels = 2
# NOTE(review): `labels` is not read anywhere in the visible code — confirm it can go.
class_names = categories 

# Draw the matrix as an annotated heatmap on one large figure.
fig = plt.figure(figsize=(16, 14))
ax= plt.subplot()
sns.heatmap(cm, annot=True, ax = ax, fmt = 'g'); # annot=True writes the count in each cell
# x axis: predicted class, label and ticks at the bottom, tick text rotated 90 degrees
ax.set_xlabel('Predicted', fontsize=20)
ax.xaxis.set_label_position('bottom')
plt.xticks(rotation=90)
ax.xaxis.set_ticklabels(class_names, fontsize = 10)
ax.xaxis.tick_bottom()

# y axis: true class, tick text kept horizontal
ax.set_ylabel('True', fontsize=20)
ax.yaxis.set_ticklabels(class_names, fontsize = 10)
plt.yticks(rotation=0)

plt.title('Refined Confusion Matrix', fontsize=20)

# Save the figure before show() so the image is written to ConMat24.png.
plt.savefig('ConMat24.png')
plt.show()

In [28]:
Influenced by the following Kaggle notebook:
https://www.kaggle.com/code/lmasca/disaster-tweets-using-bert-embeddings-and-lstm/notebook