https://www.kaggle.com/code/boscochanam/text-classification-bert-transfer-learning/notebook

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        continue

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split




Preprocessing

In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Download stopwords and tokenizer from NLTK
nltk.download('stopwords')
nltk.download('punkt')

# Initialize stopwords and Porter Stemmer
stop_words = stopwords.words('english')
prt = nltk.stem.PorterStemmer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Preprocessing function
def preprocess(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Convert to lowercase and remove punctuation
    tokens_pun_lower = [i.lower() for i in tokens if i.isalnum()]
    
    # Remove stopwords
    tokens_stop = [i for i in tokens_pun_lower if i not in stop_words]
    
    # Apply stemming
    terms = [prt.stem(i) for i in tokens_stop]
    
    # Return the processed text
    return " ".join(terms)

In [8]:
# Load the CSV into a DataFrame
import os 

os.chdir(r'E:\Python code\ag-news-classification-dataset')

csv_path = 'test.csv'
df = pd.read_csv(csv_path)

# Apply the preprocess function to the 'Description' column
df['Processed_Description'] = df['Description'].apply(preprocess)

# Display the first few rows of the DataFrame
print(df[['Title', 'Description', 'Processed_Description', 'Class Index']].head())

                                               Title  \
0                  Fears for T N pension after talks   
1  The Race is On: Second Private Team Sets Launc...   
2      Ky. Company Wins Grant to Study Peptides (AP)   
3      Prediction Unit Helps Forecast Wildfires (AP)   
4        Calif. Aims to Limit Farm-Related Smog (AP)   

                                         Description  \
0  Unions representing workers at Turner   Newall...   
1  SPACE.com - TORONTO, Canada -- A second\team o...   
2  AP - A company founded by a chemistry research...   
3  AP - It's barely dawn when Mike Fitzpatrick st...   
4  AP - Southern California's smog-fighting agenc...   

                               Processed_Description  Class Index  
0  union repres worker turner newal say talk stri...            3  
1  toronto canada rocket compet 36 10 million ans...            4  
2  ap compani found chemistri research univers lo...            4  
3  ap bare dawn mike fitzpatrick start shift blur...  

token

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
token = tokenizer.encode_plus(
    df['Processed_Description'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [11]:
token.input_ids

<tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[  101,  3779,  1231,  1643,  4894,  7589,  1885,  1200,  1207,
         1348,  1474,  2037, 18178,  6486,  3016,  7672,  1200,   182,
         8032,  4654,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [12]:
X_input_ids = np.zeros((len(df), 256))
X_attn_masks = np.zeros((len(df), 256))

In [13]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['Processed_Description'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [14]:
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [15]:
labels = np.zeros((len(df), 10))
labels.shape

(7600, 10)

In [16]:
df['Class'] = pd.factorize(df['Class Index'])[0]
df

Unnamed: 0,Class Index,Title,Description,Processed_Description,Class
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...,union repres worker turner newal say talk stri...,0
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o...",toronto canada rocket compet 36 10 million ans...,1
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...,ap compani found chemistri research univers lo...,1
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...,ap bare dawn mike fitzpatrick start shift blur...,1
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...,ap southern california agenc went emiss bovin ...,1
...,...,...,...,...,...
7595,1,Around the world,Ukrainian presidential candidate Viktor Yushch...,ukrainian presidenti candid viktor yushchenko ...,3
7596,2,Void is filled with Clement,With the supply of attractive pitching options...,suppli attract pitch option dwindl daili lost ...,2
7597,2,Martinez leaves bitter,Like Roger Clemens did almost exactly eight ye...,like roger clemen almost exactli eight year ea...,2
7598,3,5 of arthritis patients in Singapore take Bext...,SINGAPORE : Doctors in the United States have ...,singapor doctor unit state warn painkil bextra...,0


In [17]:
labels[np.arange(len(df)), df['Class Index'].values.astype(int)] = 1

In [18]:
# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1) # one sample data

<_TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(10,), dtype=tf.float64, name=None))>

In [19]:
def ModelDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [20]:
dataset = dataset.map(ModelDatasetMapFunction)

In [21]:
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(10,), dtype=tf.float64, name=None))>

In [22]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor

In [23]:
p = 0.8
train_size = int((len(df)//16)*p) # for each 16 batch of data we will have len(df)//16 samples, take 80% of that for train

In [24]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

# Creating Model

In [25]:
from transformers import TFBertModel

In [26]:
model = TFBertModel.from_pretrained('bert-base-cased')

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [27]:
from tensorflow.keras import regularizers
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')
bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer', kernel_regularizer=regularizers.l2(0.01))(bert_embds)
drop_out = tf.keras.layers.Dropout(0.2, name="dropout")(intermediate_layer)
output_layer = tf.keras.layers.Dense(10, activation='softmax', name='output_layer', kernel_regularizer=regularizers.l2(0.01))(drop_out) # softmax -> calcs probs of classes

model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 256)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 256)]                0         []                            
 )                                                                                                
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1083102   ['input_ids[0][0]',           
                             ngAndCrossAttentions(last_   72         'attention_mask[0][0]']      
                             hidden_state=(None, 256, 7                                       

In [28]:
learning_rate_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-5, decay_rate=1e-6, decay_steps=10000)

optim = tf.keras.optimizers.Adam(learning_rate=learning_rate_schedule)

loss_func = tf.keras.losses.CategoricalCrossentropy()

acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
precision = tf.keras.metrics.Precision(name='precision')
recall = tf.keras.metrics.Recall(name='recall')

model.compile(optimizer=optim, loss=loss_func, metrics=[acc, precision, recall])

In [None]:
hist =model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5
)

Epoch 1/5

 44/380 [==>...........................] - ETA: 45:29 - loss: 8.0207 - accuracy: 0.3707 - precision: 0.6571 - recall: 0.0327

In [None]:
model.save("textclassification.h5")

# Evaluation

In [None]:
model.evaluate(val_dataset)

In [None]:
import matplotlib.pyplot as plt
train_acc = hist.history['accuracy']
val_acc = hist.history['val_accuracy']

# Plot the epoch vs accuracy graph
plt.plot(range(1, len(train_acc) + 1), train_acc, label='Training Accuracy')
plt.plot(range(1, len(val_acc) + 1), val_acc, label='Validation Accuracy')
plt.title('Epoch vs Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
loss,accuracy,precision,recall=model.evaluate(val_dataset)
print()
print()
print()
print()
print("loss is : ", loss)

print("accuracy is: ", str(round(accuracy*100))+ "%")
print("recall is: ", str(round(recall*100)) + "%")
print("Precision is : ", str(round(precision*100)) + "%")

In [None]:
model.save('model')

In [None]:
# model = tf.keras.models.load_model('model')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def make_prediction(model, processed_data, classes=['business', 'Entertainment', 'food', 'Graphichs', 'historical','medical','politcis', 'space','sport','technology']):
    probs = model.predict(processed_data)[0]
    return np.argmax(probs)

#  Prediction

In [None]:
dict1 = {0:"Space", 1:"Politics",2:"Sport",3:"technology",4:"historical", 5:"Medical", 6:"Graphics",7:"Entertrainment",8:"Food",9:"business"}

In [None]:
text = """Bank voted 8-1 for no rate change

The decision to keep interest rates on hold at 4.75% earlier this month was passed 8-1 by the Bank of England's rate-setting body, minutes have shown.

One member of the Bank's Monetary Policy Committee (MPC) - Paul Tucker - voted to raise rates to 5%. The news surprised some analysts who had expected the latest minutes to show another unanimous decision. Worries over growth rates and consumer spending were behind the decision to freeze rates, the minutes showed. The Bank's latest inflation report, released last week, had noted that the main reason inflation might fall was weaker consumer spending.

However, MPC member Paul Tucker voted for a quarter point rise in interest rates to 5%. He argued that economic growth was picking up, and that the equity, credit and housing markets had been stronger than expected.

The Bank's minutes said that risks to the inflation forecast were "sufficiently to the downside" to keep rates on hold at its latest meeting. However, the minutes added: "Some members noted that an increase might be warranted in due course if the economy evolved in line with the central projection". Ross Walker, UK economist at Royal Bank of Scotland, said he was surprised that a dissenting vote had been made so soon. He said the minutes appeared to be "trying to get the market to focus on the possibility of a rise in rates". "If the economy pans out as they expect then they are probably going to have to hike rates." However, he added, any rate increase is not likely to happen until later this year, with MPC members likely to look for a more sustainable pick up in consumer spending before acting.
"""