In [97]:
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split

# Load the three input files: the labelled training tweets, the tweets to
# predict on, and the submission template that will receive the predictions.
train, test, sam_sub = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Data-Cleaning

In [98]:
import re
def clean_tweet(text):
  """Normalize a raw tweet into lowercase, space-separated alphabetic tokens.

  Args:
    text: Raw tweet text, possibly containing HTML markup. Non-string values
      (e.g. the NaN pandas uses for a missing cell, or None) are treated as
      an empty tweet.

  Returns:
    The cleaned tweet as a str: HTML tags stripped, every character that is
    not an ASCII letter replaced by a space, lowercased, and whitespace
    collapsed to single spaces. Returns "" for non-string input.
  """
  # Missing cells arrive as float NaN / None, which BeautifulSoup cannot parse.
  if not isinstance(text, str):
    return ""

  # Strip HTML tags. The parser is named explicitly so the result does not
  # depend on which optional parsers (lxml, html5lib) happen to be installed,
  # and so bs4 does not emit its "no parser was explicitly specified" warning.
  text = BeautifulSoup(text, "html.parser").get_text()

  # Keep ASCII letters only; digits, punctuation and emoji become spaces.
  text = re.sub("[^a-zA-Z]", " ", text)

  # Lowercase, then split/join to collapse runs of whitespace.
  return " ".join(text.lower().split())

In [99]:
# Add a "clean_tweet" column to both frames using the same cleaning function,
# so the training and test text go through identical preprocessing.
for frame in (train, test):
  frame["clean_tweet"] = frame['tweet'].apply(clean_tweet)

In [100]:
# Hold out 30% of the labelled tweets for validation; the fixed random_state
# keeps the split reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train["clean_tweet"],
    train["label"],
    test_size=0.3,
    random_state=42,
)

In [110]:
# Sanity check: the split labels are still a pandas Series (not a NumPy array).
type(train_labels)

pandas.core.series.Series

In [107]:
# Tokenize the cleaned tweets for BERT
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split

# Maximum number of tokens kept per tweet; longer inputs are truncated.
# Adjust as needed.
MAX_LENGTH = 60

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)


def encode_texts(texts, max_length=MAX_LENGTH):
  """Tokenize a list of strings into TensorFlow tensors for BERT.

  Args:
    texts: List of str to tokenize.
    max_length: Maximum sequence length; longer inputs are truncated.

  Returns:
    The tokenizer output for `texts`; the rest of the notebook reads its
    'input_ids', 'token_type_ids' and 'attention_mask' entries.
  """
  return tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='tf')


# Every split goes through the same helper so the tokenizer settings cannot drift.
train_encodings = encode_texts(train_texts.tolist())
val_encodings = encode_texts(val_texts.tolist())
test_encodings = encode_texts(test["clean_tweet"].tolist())

# Note: the labels are NOT converted to NumPy arrays here; train_labels and
# val_labels remain pandas Series.

In [134]:
# NOTE(review): this re-tokenizes the training texts with the same settings used
# for `train_encodings`; `sample_encodings` is not referenced again in the
# visible cells, so this cell looks like a leftover experiment.
sample_encodings = tokenizer(
    train_texts.tolist(),
    truncation=True,
    padding=True,
    max_length=60,
    return_tensors='tf',
)

In [140]:
# Inspect the padded token-id tensor for the training tweets (one row per tweet).
train_encodings['input_ids']

<tf.Tensor: shape=(5544, 60), dtype=int32, numpy=
array([[  101,  2293,  2026, ...,     0,     0,     0],
       [  101,  2026,  2611, ...,     0,     0,     0],
       [  101,  2047, 26381, ...,     0,     0,     0],
       ...,
       [  101,  2009,  2215, ...,     0,     0,     0],
       [  101,  6302, 23205, ...,     0,     0,     0],
       [  101,  2074,  2288, ...,     0,     0,     0]])>

In [112]:
# Inspect how a single training example (index k) is encoded
k = 0
example_ids = train_encodings['input_ids'][k]
example_mask = train_encodings['attention_mask'][k]

print('Training Comments -->>',train_texts.tolist()[k])
print('\nInput Ids -->>\n',example_ids)
print('\nDecoded Ids -->>\n',tokenizer.decode(example_ids))
print('\nAttention Mask -->>\n',example_mask)
print('\nLabels -->>',train_labels.tolist()[k])


Training Comments -->> love my apple watch apple watch mm excited christmas came early instagood https instagram com p r gadcn n

Input Ids -->>
 tf.Tensor(
[  101  2293  2026  6207  3422  6207  3422  3461  7568  4234  2234  2220
 16021 15900 17139 16770 16021 23091  4012  1052  1054 11721 16409  2078
  1050   102     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0], shape=(60,), dtype=int32)

Decoded Ids -->>
 [CLS] love my apple watch apple watch mm excited christmas came early instagood https instagram com p r gadcn n [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

Attention Mask -->>
 tf.Tensor(
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 

In [113]:
# Unused alternative, kept for reference: wrap the encodings and labels in
# tf.data.Dataset objects instead of passing the tensors directly to model.fit.
# import tensorflow as tf
# train_dataset = tf.data.Dataset.from_tensor_slices((
#     dict(train_encodings),
#     train_labels
# ))
# val_dataset = tf.data.Dataset.from_tensor_slices((
#     dict(val_encodings),
#     val_labels
# ))

# Load pretrained BERT with a 2-class sequence-classification head.
# from_pt=True initializes the TF model from the PyTorch checkpoint; the
# classifier weights are newly initialized (see the load message), so the
# model must be fine-tuned before its predictions are meaningful.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2,from_pt=True)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [114]:
# Loss and metric are the "sparse" variants (integer class-id labels), and
# from_logits=True because the model's outputs are read as raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

# Small learning rate for fine-tuning the pretrained weights.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

In [115]:
history = model.fit(
    [train_encodings['input_ids'], train_encodings['token_type_ids'], train_encodings['attention_mask']],
    train_labels,
    validation_data=(
      [val_encodings['input_ids'], val_encodings['token_type_ids'], val_encodings['attention_mask']],val_labels),
    batch_size=32,
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [116]:
val_loss, val_accuracy = model.evaluate(
    [val_encodings['input_ids'], val_encodings['token_type_ids'], val_encodings['attention_mask']],
    val_labels
)
print(f'Test loss: {val_loss}, Test accuracy: {val_accuracy}')

Test loss: 0.2240566909313202, Test accuracy: 0.9065656661987305


In [118]:
pred = model.predict(
    [test_encodings['input_ids'], test_encodings['token_type_ids'], test_encodings['attention_mask']])
 
# pred is of type TFSequenceClassifierOutput
logits = pred.logits



In [131]:
# Pick the index of the largest logit along axis 1 for every test tweet.
pred_labels = tf.argmax(logits, axis=1)
predictions_label = [label for label in pred_labels.numpy()]

# How many test tweets were predicted as label 1.
predictions_label.count(1)

654

In [132]:
# Put the predicted labels into the submission template and write it to disk
# without the index column.
submission_path = "sam_sub-BERT.csv"
sam_sub["label"] = predictions_label
sam_sub.to_csv(submission_path, index=False)