# DataMining - BERT

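Reference: Hugging Face text-classification (TensorFlow) example notebook — https://github.com/huggingface/notebooks/blob/master/examples/text_classification-tf.ipynb

Reference: TensorFlow tutorial "Classify text with BERT" — https://www.tensorflow.org/text/tutorials/classify_text_with_bert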



# Setup Kaggle

    # Install the Kaggle CLI used to download the dataset.
    !pip install kaggle

    # -p keeps this step re-runnable when ~/.kaggle already exists.
    !mkdir -p ~/.kaggle

    !cp "drive/MyDrive/Colab Notebooks/DataMining/kaggle.json" ~/.kaggle/

    # The Kaggle CLI expects the credentials file to be private to the user.
    !chmod 600 ~/.kaggle/kaggle.json

    !kaggle datasets download -d cosmos98/twitter-and-reddit-sentimental-analysis-dataset

    # Extract into data/ because the notebook later reads data/Reddit_Data.csv;
    # -o overwrites without prompting so a re-run does not hang on a question.
    !unzip -o /content/twitter-and-reddit-sentimental-analysis-dataset.zip -d data

# Import packages

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import os, time
#import tweepy as tw #for accessing Twitter API

# For Building the model
from sklearn.model_selection import train_test_split
#import tensorflow as tf
import seaborn as sns

#For data visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline

# Read and clean data

In [2]:
# Load the Reddit comments and switch to the generic column names
# ("text", "score") that the rest of the notebook relies on.
df = (
    pd.read_csv("data/Reddit_Data.csv")
    .rename(columns={"clean_comment": "text", "category": "score"})
)
df.head(10)

Unnamed: 0,text,score
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
5,you should all sit down together and watch the...,-1
6,was teens when discovered zen meditation was ...,1
7,jesus was zen meets jew,0
8,there are two varieties christians dogmatic th...,-1
9,dont worry about trying explain yourself just ...,1


In [9]:
def score2label(y):
  """Map sentiment scores in {-1, 0, 1} to class indices {0, 1, 2}.

  Applies the +1 shift that the train/test split cell performs inline,
  giving zero-based labels as required by a sparse categorical loss.

  Args:
    y: A scalar score or an array-like of scores (list, NumPy array,
      pandas Series).

  Returns:
    A NumPy value (0-d for scalar input) or array with every score
    shifted up by one.
  """
  # np.asarray lets plain Python lists be shifted element-wise as well.
  return np.asarray(y) + 1

In [3]:
# Row count from the summary statistics; the recorded output lists only the
# `score` column.
df.describe().loc["count"]

score    37249.0
Name: count, dtype: float64

In [4]:
df.isnull().sum() # 100 values are missing (all of them in the text column)

text     100
score      0
dtype: int64

In [5]:
# Discard every row that contains a missing value.
df = df.dropna(axis="index")

In [6]:
# (rows, columns) after dropping rows with missing values.
df.shape

(37149, 2)

# Import packages (TensorFlow and Transformers)

    !pip install transformers

In [7]:
# Deep-learning stack; the versions are printed so a run can be reproduced.
import tensorflow as tf
print("TensorFlow version: ", tf.__version__)
from tensorflow.data import Dataset
import transformers
print("Transformers version: ", transformers.__version__)
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from transformers import create_optimizer

# Report whether TensorFlow can see a GPU, then limit TF logging to errors.
print("\nNum GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
tf.get_logger().setLevel('ERROR')

ModuleNotFoundError: No module named 'tensorflow'

# Text preprocess

Load data: https://www.tensorflow.org/tutorials/load_data/numpy

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Checkpoint name shared by the tokenizer and the classification model.
BERT_MODEL = "bert-base-uncased"

In [18]:
# Tokenizer loaded from the same checkpoint name as the model (BERT_MODEL).
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)

In [19]:
def tokenize(text, max_length=250):
  """Encode raw text with the notebook-level `tokenizer`.

  Args:
    text: A string or a list of strings to encode.
    max_length: Token length every sequence is padded or truncated to.
      Defaults to 250, the value that used to be hard-coded; pass a
      smaller value to reduce memory use during training.

  Returns:
    The tokenizer's encoding with NumPy "input_ids" and "attention_mask"
    entries (token type ids are not requested).
  """
  return tokenizer(text,
          add_special_tokens=True,
          # Fixed-length sequences so all examples stack into one array.
          padding="max_length",
          max_length=max_length,
          truncation=True,
          return_tensors="np",
          return_token_type_ids=False,
          return_attention_mask=True)

In [20]:
# Shift scores -1/0/1 to zero-based class ids 0/1/2 for the sparse loss.
labels = df["score"].to_numpy() + 1
# A fixed seed makes the split reproducible across runs; stratifying keeps
# the class proportions equal in the train and test parts.
XX_tr, XX_te, y_train, y_test = train_test_split(
    df["text"].to_list(), labels, random_state=42, stratify=labels)

In [21]:
# Encode each split from its own texts. The test encodings previously reused
# the training texts (XX_tr), so they did not correspond to y_test.
X_tr_encoded = tokenize(XX_tr)
X_te_encoded = tokenize(XX_te)

In [22]:
# Model inputs as (input_ids, attention_mask) pairs, one pair per split.
X_train, X_test = (
    (encoded["input_ids"], encoded["attention_mask"])
    for encoded in (X_tr_encoded, X_te_encoded)
)

# Build model

In [23]:
# TensorFlow Addons is imported below as `tfa`; its only use in this notebook
# is a currently commented-out F1 metric.
!pip install tensorflow_addons



In [24]:
import tensorflow.keras as keras
import tensorflow_addons as tfa

 The versions of TensorFlow you are currently using is 2.7.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [25]:
# Training hyper-parameters.
# NOTE(review): the recorded fit() run ended in ResourceExhaustedError; a
# smaller batch size or sequence length may be needed — confirm on the target GPU.
NUM_LABELS = 3  # classes after shifting scores -1/0/1 to 0/1/2
EPOCHS = 5  # passed to fit() as the number of epochs
LR_RATE = 2e-5  # initial learning rate of the decay schedule
BATCHES = 32  # used as the batch size (samples per step), not a batch count

In [26]:
# Pretrained BERT encoder with a new classification head of NUM_LABELS outputs.
model = TFAutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL,
    num_labels=NUM_LABELS,
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# The model returns raw logits, so the loss applies the softmax itself.
criterion = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Exponential decay: the learning rate is multiplied by 0.9 once per
# `decay_steps` optimizer steps, starting from LR_RATE.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=LR_RATE,
    decay_rate=0.9,
    decay_steps=int(EPOCHS * len(y_train) / BATCHES),
)

# Plain SGD with gradient-norm clipping.
# NOTE(review): a transformers.create_optimizer(...) variant was left disabled
# here and referenced an undefined SAMPLE_SIZE — decide whether to restore it.
optimizer = keras.optimizers.SGD(clipnorm=1, learning_rate=lr_schedule)

In [28]:
# Extra metric objects appended to the string metrics at compile time.
# Both candidates below are disabled, so the list is currently empty.
metrics = [#keras.metrics.AUC(multi_label=True, num_labels=3, from_logits=True), 
           #tfa.metrics.F1Score(num_classes=3, average="macro")
           ]

In [29]:
# The string shortcut "ce" builds a cross-entropy metric that expects
# probabilities, but this model emits logits, so the metric is created
# explicitly with from_logits=True (keeping the name "ce"). The former "mse"
# metric compared integer class ids with logits and was dropped as meaningless.
model.compile(
    optimizer=optimizer,
    loss=criterion,
    metrics=[
        "acc",
        keras.metrics.SparseCategoricalCrossentropy(name="ce", from_logits=True),
    ] + metrics,
)

In [30]:
# Fine-tune the model; the last 20% of the training arrays is held out for
# validation by Keras.
# Docs: https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit
train_history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCHES,
    epochs=EPOCHS,
    validation_split=0.2,
    shuffle=True,
)

Epoch 1/5


ResourceExhaustedError: ignored

In [None]:
# Per-epoch loss and metric values recorded by fit().
train_history.history

In [None]:
# Score the fine-tuned model on the held-out test split. This cell previously
# evaluated on the training data, which does not measure generalisation.
test_history = model.evaluate(X_test, y_test,
               batch_size=BATCHES)