In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
from textblob import TextBlob
import advertools
import re
import spacy
import xgboost
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report,auc, 
                            roc_auc_score, precision_score,
                            recall_score,f1_score, accuracy_score)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import seaborn as sns
import gensim
import nltk
import nltk.util
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Dense, Dropout, SpatialDropout1D, LSTM, Conv1D, MaxPool1D, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import pad_sequences
from transformers import BertTokenizer, TFBertForSequenceClassification
import warnings
warnings.filterwarnings("ignore")
nltk.download("wordnet")
nltk.download("stopwords")

[nltk_data] Downloading package wordnet to C:\Users\Ali
[nltk_data]     Haider\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Ali
[nltk_data]     Haider\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import tensorflow as tf
tf.__version__

'2.10.1'

In [3]:
data_path = Path.cwd().parent/"Data"
final_data = pd.DataFrame()

In [32]:
final_data = pd.read_csv("control_data.csv")
final_data.rename(columns={"Disorder":"class"},inplace=True)
final_data.head()

Unnamed: 0,class,tweet
0,SCHIZOPHRENIA,"""@USER That feelingtake good care 💗xx"""
1,ADHD,"""@USER This had me belly laughing 😂"""
2,CONTROL,"""Solid tactics from Simeone."""
3,CONTROL,"3. ""Partey's control in the middle is unmatched."""
4,PTSD,"""Relationships where both people have conceale..."


In [36]:
final_data["class"].value_counts()

class
CONTROL            18761
ADHD                3034
SCHIZOPHRENIA       2970
OCD                 2905
ANXIETY             2729
PTSD                2466
DEPRESSION          2161
AUTISM              1425
EATING DISORDER      403
BIPOLAR              244
Name: count, dtype: int64

In [37]:
pipeline_data = final_data.copy()

# Data Preprocessing

In [38]:
final_data.head(1)

Unnamed: 0,class,tweet
0,SCHIZOPHRENIA,"""@USER That feelingtake good care 💗xx"""


In [39]:
def clean_text(text:str) -> str:
    tokens = advertools.word_tokenize(text,phrase_len=1)
    emoji_token = advertools.extract_emoji(tokens[0])
    emoji_token = emoji_token["emoji_text"]
    for tok in range(0,len(emoji_token)):
        if emoji_token[tok]:
            tokens[0][tok] = " ".join(emoji_token[tok])
    clean_text = " ".join(tokens[0])
    clean_pattern = r"@\w+|#\w+|\W+|x+|https\.\//www\.(\w+|\W+)\.com|http\w*|www\.(\w+|\W+)\.com|user\w*|\d+"
    clean_text = re.sub(clean_pattern," ",clean_text).strip()
    clean_text = re.sub(r"\s+"," ",clean_text).strip()
    return clean_text

In [40]:
def text_lemmatize(text: str) -> str:
    lemma = WordNetLemmatizer()
    tokens = advertools.word_tokenize(text,phrase_len=1)
    text = [lemma.lemmatize(tok) for tok in tokens[0]]
    return " ".join(text)

In [41]:
# def spell_check(text:str) -> str:
#     nlp = spacy.load("en_core_web_sm")
#     nlp.add_pipe("contextual spellchecker")
#     doc = nlp(text)
#     return doc._.outcome_spellCheck

In [42]:
def remove_stopword(text: str) -> str:
    tokens = advertools.word_tokenize(text,phrase_len=1)[0]
    stop_words = stopwords.words("english")
    clean_word = [tok for tok in tokens if tok not in stop_words]
    return " ".join(clean_word)

In [43]:
def generate_n_grams(text: str, n: list[int]) -> list[tuple]:
    tokens = advertools.word_tokenize(text,phrase_len=1)[0]
    if len(n)<=1:
        n_grams = list(nltk.ngrams(tokens,n[0],pad_right=True,right_pad_symbol="</s>"))
        bag_ngrams = " ".join(["_".join(c) for c in n_grams])
    else:
        bag_ngrams = " "
        for n_gram in range(n[0],n[1]+1):
            n_grams = list(nltk.ngrams(tokens,n_gram,pad_right=True,right_pad_symbol="</s>"))
            if n_gram==1:       
                n_grams = " ".join(["".join(c[0]) for c in n_grams])
                bag_ngrams+=n_grams
            else:
                n_grams = " ".join(["_".join(c) for c in n_grams])
                bag_ngrams+=" "+n_grams
    return bag_ngrams

In [44]:
def tfidf(train: pd.DataFrame, test:pd.DataFrame()):
    tf_idf = TfidfVectorizer()
    train_feat = tf_idf.fit_transform(train)
    test_feat = tf_idf.transform(test)
    return train_feat,test_feat

In [45]:
def classification_metrics(actuals: np.array, preds: np.array) -> pd.DataFrame:
    metrics_performance = pd.DataFrame()
    precision_0 = precision_score(actuals,preds, pos_label=0)
    precision_1 = precision_score(actuals,preds, pos_label=1)
    recall_0 = recall_score(actuals,preds,pos_label=0)
    recall_1 = recall_score(actuals,preds,pos_label=1)
    f1_0 = f1_score(actuals,preds,pos_label=0)
    f1_1 = f1_score(actuals,preds,pos_label=1)
    accuracy = accuracy_score(actuals,preds)
    metrics_performance["precision_0"] = [precision_0]
    metrics_performance["precision_1"] = [precision_1]
    metrics_performance["recall_0"] = [recall_0]
    metrics_performance["recall_1"] = [recall_1]
    metrics_performance["f1_0"] = [f1_0]
    metrics_performance["f1_1"] = [f1_1]
    metrics_performance["accuracy"] = [accuracy]
    return metrics_performance

In [46]:
def data_preprocess_pipeline(data_preprocess: pd.DataFrame,
                             # ngram_range: list,
                            filename: str,
                            ) -> pd.DataFrame:
    data_preprocess = data_preprocess[~(data_preprocess["tweet"].isnull())]
    data_preprocess = data_preprocess[["class","tweet"]]
    data_preprocess.rename(columns={"class":"Disorder"}, inplace=True)
    data_preprocess["tweet"] = data_preprocess["tweet"].apply(clean_text)
    data_preprocess["tweet"] = data_preprocess["tweet"].apply(text_lemmatize)
    data_preprocess["tweet"] = data_preprocess["tweet"].apply(remove_stopword)
    data_preprocess = data_preprocess[data_preprocess["tweet"]!=""]
    # data_preprocess["text_features"] = data_preprocess["tweet"].apply(lambda text: 
    #                                                                   generate_n_grams(text,n=ngram_range))
    print("saving preprocessed data")
    data_preprocess.to_csv(filename,index=False)
    print("data saved")
    return data_preprocess

# balaned Dataset training

In [47]:
pipeline_data.head(1)

Unnamed: 0,class,tweet
0,SCHIZOPHRENIA,"""@USER That feelingtake good care 💗xx"""


In [48]:
# ngram_range = [1,3]
data_preprocess = data_preprocess_pipeline(pipeline_data,
                                           # ngram_range=ngram_range,
                                           filename="dl_balanced_data_preprocess.csv",
                                           )
data_preprocess.head()

saving preprocessed data
data saved


Unnamed: 0,Disorder,tweet
0,SCHIZOPHRENIA,feelingtake good care growing heart
1,ADHD,belly laughing face tear joy
2,CONTROL,solid tactic simeone
3,CONTROL,partey control middle unmatched
4,PTSD,relationship people concealed motif aware othe...


In [50]:
data_preprocess.Disorder.value_counts()

Disorder
CONTROL            18640
ADHD                3018
SCHIZOPHRENIA       2959
OCD                 2888
ANXIETY             2719
PTSD                2458
DEPRESSION          2143
AUTISM              1402
EATING DISORDER      402
BIPOLAR              243
Name: count, dtype: int64

In [51]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_preprocess["tweet"].values.tolist())

In [52]:
WORD2VEC_DIM = 300
SG = {"CBOW":0,"SKIP_GRAM":1}
corpus = [advertools.word_tokenize(text,phrase_len=1)[0] for text in data_preprocess["tweet"].values]
word2vec = gensim.models.Word2Vec(corpus,window=5,vector_size=WORD2VEC_DIM,min_count=1,epochs=100,sg=SG["CBOW"])
WORD2VEC_VOCAB_SIZE = len(word2vec.wv.index_to_key)+1
word2vec_embedding = np.zeros((WORD2VEC_VOCAB_SIZE,WORD2VEC_DIM))
for word,index in tokenizer.word_index.items():
    if word in word2vec.wv.index_to_key:
        word2vec_embedding[index] = word2vec.wv[word]

2024-08-18 12:24:45,589 | INFO | word2vec.py:582 | scan_vocab | collecting all words and their counts
2024-08-18 12:24:45,637 | INFO | word2vec.py:565 | _scan_vocab | PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-08-18 12:24:45,798 | INFO | word2vec.py:565 | _scan_vocab | PROGRESS: at sentence #10000, processed 71168 words, keeping 10812 word types
2024-08-18 12:24:45,888 | INFO | word2vec.py:565 | _scan_vocab | PROGRESS: at sentence #20000, processed 140479 words, keeping 14763 word types
2024-08-18 12:24:45,904 | INFO | word2vec.py:565 | _scan_vocab | PROGRESS: at sentence #30000, processed 211155 words, keeping 17670 word types
2024-08-18 12:24:45,962 | INFO | word2vec.py:588 | scan_vocab | collected 19345 word types from a corpus of 260267 raw words and 36872 sentences
2024-08-18 12:24:45,969 | INFO | word2vec.py:637 | prepare_vocab | Creating a fresh vocabulary
2024-08-18 12:24:46,384 | INFO | utils.py:447 | add_lifecycle_event | Word2Vec lifecycle event {

In [54]:
word2vec_embedding.shape

(19346, 300)

In [26]:
GLOVE_VOCAB_SIZE = len(tokenizer.word_index)+1
EMBEDDINGS_DIMENSION = 300
glove_embeddings = np.zeros((GLOVE_VOCAB_SIZE,EMBEDDINGS_DIMENSION))
glove_model = spacy.load("en_core_web_lg")
for word,index in tokenizer.word_index.items():
    if word in glove_model.vocab.strings:
        glove_embeddings[index] = glove_model.vocab[word].vector

In [27]:
glove_embeddings.shape

(17559, 300)

In [29]:
import fasttext

In [30]:
with open("corpus.txt","w", encoding="utf-8") as file:
    for doc in data_preprocess["tweet"].values:
        file.write(doc + "\n")
MODEL_TYPE = "skipgram"
EMBEDDINGS_DIMENSION = 300
FASTTEXT_VOCAB_SIZE = len(tokenizer.word_index)+1
fasttext_embeddings = np.zeros((FASTTEXT_VOCAB_SIZE,EMBEDDINGS_DIMENSION))
fasttext_model = fasttext.train_unsupervised("corpus.txt",ws=5,minn=2,epoch=100,dim=300,model=MODEL_TYPE)
for word,index in tokenizer.word_index.items():
    if word in fasttext_model.words:
        fasttext_embeddings[index] = fasttext_model.get_word_vector(word)

In [31]:
fasttext_embeddings.shape

(17559, 300)

In [32]:
import tensorflow_hub as hub

In [47]:
ELMO_VOCAB_SIZE = len(data_preprocess)
EMBEDDINGS_DIMENSION = 1024

elmo_embeddings = np.zeros((ELMO_VOCAB_SIZE,EMBEDDINGS_DIMENSION))
elmo = hub.KerasLayer("https://tfhub.dev/google/elmo/3")
for index,text in enumerate(data_preprocess["tweet"].values):
    print("index",index)
    embeddings = elmo(tf.constant([text]))
    elmo_embeddings[index]=embeddings.numpy()
elmo_embeddings.shape

In [40]:
cls_embeddings.shape

TensorShape([1, 768])

In [49]:
BERT_VOCAB_SIZE = len(data_preprocess)
EMBEDDINGS_DIMENSION = 768

bert_embeddings = np.zeros((BERT_VOCAB_SIZE,EMBEDDINGS_DIMENSION))
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained("bert-base-uncased")
for index,text in enumerate(data_preprocess["tweet"].values):
    print("index",index)
    inputs = bert_tokenizer(text, padding=True,return_tensors='tf')
    outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state
    cls_embeddings = embeddings[:, 0, :]
    bert_embeddings[index]=cls_embeddings.numpy()

In [51]:
bert_embeddings.shape

(20758, 768)

In [55]:
max_length_sequence = max([len(doc) for doc in corpus])
max_length_sequence

186

In [56]:
sequence = tokenizer.texts_to_sequences(data_preprocess["tweet"].values)
padded_sequence = pad_sequences(sequence,maxlen=max_length_sequence,padding="pre")

In [57]:
padded_sequence

array([[   0,    0,    0, ...,  147,   77,    7],
       [   0,    0,    0, ...,    2,   53,   58],
       [   0,    0,    0, ...,   94,  179, 1138],
       ...,
       [   0,    0,    0, ...,    0, 1019,   25],
       [   0,    0,    0, ...,  668,  271,   18],
       [   0,    0,    0, ..., 2095, 1592, 2583]])

In [58]:
word2vec_embedding.shape

(19346, 300)

### LSTM

In [61]:
embedding = word2vec_embedding
Embedding_layer = Embedding(WORD2VEC_VOCAB_SIZE,WORD2VEC_DIM,weights = [embedding],input_length = max_length_sequence, trainable=False)
model = Sequential()
model.add(Embedding_layer)
model.add(Dropout(0.2))
model.add(LSTM(100,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(1,"sigmoid"))
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])



In [62]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 186, 300)          5803800   
                                                                 
 dropout (Dropout)           (None, 186, 300)          0         
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 5,964,301
Trainable params: 160,501
Non-trainable params: 5,803,800
_________________________________________________________________


In [63]:
data_preprocess.head(2)

Unnamed: 0,Disorder,tweet
0,SCHIZOPHRENIA,feelingtake good care growing heart
1,ADHD,belly laughing face tear joy


In [64]:
data_bin = data_preprocess[["tweet","Disorder"]]
diagnosed_group = data_bin[data_bin["Disorder"]!="CONTROL"]
diagnosed_group["Disorder"] = "DIAGNOSED"
control_group = data_bin[data_bin["Disorder"]=="CONTROL"]
data_bin = pd.concat([diagnosed_group,control_group],axis=0)
encode_target = {"DIAGNOSED":1,
                "CONTROL":0}
data_bin["Disorder"] = data_bin["Disorder"].map(encode_target)
data_bin.head()

Unnamed: 0,tweet,Disorder
0,feelingtake good care growing heart,1
1,belly laughing face tear joy,1
4,relationship people concealed motif aware othe...,1
9,guna honest brain though spongebob saw creeper,1
12,thanks,1


In [65]:
data_bin['Disorder'].value_counts()

Disorder
0    18640
1    18232
Name: count, dtype: int64

In [66]:
y = data_bin["Disorder"].values
X_train,X_temp,y_train,y_temp = train_test_split(padded_sequence, y, test_size=0.3, random_state=42,stratify=y)
X_val,X_test,y_val,y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42,stratify=y_temp)

In [67]:
X_train.shape,X_val.shape,X_test.shape

((25810, 186), (8849, 186), (2213, 186))

In [68]:
print(np.unique(y_train,return_counts=True))
print(np.unique(y_val,return_counts=True))
print(np.unique(y_test,return_counts=True))

(array([0, 1], dtype=int64), array([13048, 12762], dtype=int64))
(array([0, 1], dtype=int64), array([4473, 4376], dtype=int64))
(array([0, 1], dtype=int64), array([1119, 1094], dtype=int64))


In [69]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min', verbose=1)

In [36]:
history = model.fit(X_train,y_train,epochs=5,batch_size=32,validation_data=(X_val, y_val),verbose=1,callbacks=[early_stopping],
                   workers=3)

Epoch 1/10

KeyboardInterrupt: 

In [48]:
y_pred = model.predict(X_test)



In [52]:
y_pred = np.squeeze(y_pred)
y_pred

array([0.32711464, 0.9879332 , 0.9888914 , ..., 0.9932704 , 0.9756116 ,
       0.99673086], dtype=float32)

In [54]:
y_pred = np.where(y_pred<0.6,0,1)

In [57]:
model_performance_lstm = classification_metrics(y_test,y_pred)
model_performance_lstm

Unnamed: 0,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1,accuracy
0,0.527933,0.916711,0.374257,0.953661,0.438007,0.934821,0.883189


In [59]:
test_loss,accuracy = model.evaluate(X_test,y_test)



### CNN+LSTM

In [74]:
Embedding_layer = Embedding(WORD2VEC_VOCAB_SIZE,WORD2VEC_DIM,weights = [word2vec_embedding],input_length = max_length_sequence, trainable=False)
model = Sequential()
model.add(Embedding_layer)
model.add(Conv1D(filters=32,kernel_size=5,activation="relu",padding="same",strides=1))
model.add(MaxPool1D(pool_size=2))
model.add(LSTM(100,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(32,activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(1,"sigmoid"))
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])

In [75]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 186, 300)          5268600   
                                                                 
 conv1d_5 (Conv1D)           (None, 186, 32)           48032     
                                                                 
 max_pooling1d_5 (MaxPoolin  (None, 93, 32)            0         
 g1D)                                                            
                                                                 
 lstm_6 (LSTM)               (None, 100)               53200     
                                                                 
 dense_10 (Dense)            (None, 32)                3232      
                                                                 
 dropout_5 (Dropout)         (None, 32)                0         
                                                      

In [76]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min', verbose=1)
history = model.fit(X_train,y_train,epochs=5,batch_size=32,validation_split=0.1,verbose=1,callbacks=[early_stopping],
                   workers=3)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 4: early stopping


In [77]:
y_pred = model.predict(X_test)
y_pred = np.squeeze(y_pred)
y_pred = np.where(y_pred<0.6,0,1)
model_performance = classification_metrics(y_test,y_pred)
model_performance



Unnamed: 0,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1,accuracy
0,0.522026,0.927528,0.469307,0.940499,0.494265,0.933969,0.883189


In [78]:
test_loss,accuracy = model.evaluate(X_test,y_test)



### BERT

In [176]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [177]:
data_bert = data_bin.copy()
data_bert["tweet"] = "[CLS] " +data_bert['tweet'] + "[SEP]"
X = data_bert[["tweet"]]
y = data_bert[["Disorder"]]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

train_text = X_train["tweet"].values.tolist()
train_label = y_train["Disorder"].values.tolist()

test_text = X_test["tweet"].values.tolist()
test_label = y_test["Disorder"].values.tolist()

X_train.shape,X_test.shape

((16606, 1), (4152, 1))

In [178]:
len(train_text),len(train_label)

(16606, 16606)

In [179]:
len(test_text),len(test_label)

(4152, 4152)

In [180]:
y_train.Disorder.value_counts()

Disorder
1    14585
0     2021
Name: count, dtype: int64

In [181]:
y_test.Disorder.value_counts()

Disorder
1    3647
0     505
Name: count, dtype: int64

In [183]:
BERT_TOKENS_LENGTH = 512
max_length_sequence = min(max([len(doc) for doc in corpus]),BERT_TOKENS_LENGTH)
train_encoding = bert_tokenizer.batch_encode_plus(train_text,
                                              padding=True,
                                              truncation=True,
                                              max_length = max_length_sequence,
                                              return_tensors='tf')
test_encoding = bert_tokenizer.batch_encode_plus(test_text,
                                              padding=True, 
                                            truncation=True,
                                              max_length = max_length_sequence,
                                              return_tensors='tf')

In [184]:
test_encoding["input_ids"].shape

TensorShape([4152, 186])

In [185]:
train_text[0]

'[CLS] people telling e plaining behaviour based theory psychosis doe help undo thing e perienced wa real happened remember way happened im stuck sorry ive helped[SEP]'

In [130]:
# train_encoding = bert_tokenizer(train_text,padding=True)
# test_encoding = bert_tokenizer(test_text,padding=True)

In [186]:
# train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encoding),train_label))
# test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encoding),test_label))

In [187]:
# BERT_TOKENS_LENGTH = 512
# max_length_sequence = min(max([len(doc) for doc in corpus]),BERT_TOKENS_LENGTH)
# max_length_sequence

In [101]:
# model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [188]:
# Compile the model with an appropriate optimizer, loss function, and metrics
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
bert_model.compile(optimizer="adam", loss=loss, metrics=[metric])

In [189]:
bert_model.summary()

Model: "tf_bert_for_sequence_classification_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_673 (Dropout)       multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [191]:
history = bert_model.fit(
    [train_encoding['input_ids'], train_encoding['token_type_ids'], train_encoding['attention_mask']],
    tf.convert_to_tensor(train_label, dtype=tf.int32),
    validation_data=(
      [test_encoding['input_ids'], test_encoding['token_type_ids'], test_encoding['attention_mask']], tf.convert_to_tensor(test_label, dtype=tf.int32)),
    batch_size=32,
    epochs=3
)

Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported







Epoch 2/3
Epoch 3/3


In [194]:
test_loss, test_accuracy = bert_model.evaluate(
    [test_encoding['input_ids'], test_encoding['token_type_ids'], test_encoding['attention_mask']],
    tf.convert_to_tensor(test_label)
)



In [195]:
test_loss, test_accuracy

(0.3753882050514221, 0.8775259256362915)

In [196]:
pred = bert_model.predict(
    [test_encoding['input_ids'], test_encoding['token_type_ids'], test_encoding['attention_mask']])



In [197]:
pred

TFSequenceClassifierOutput(loss=None, logits=array([[-0.72870165,  1.5744926 ],
       [-0.72870165,  1.5744926 ],
       [-0.72870165,  1.5744926 ],
       ...,
       [-0.72870165,  1.5744926 ],
       [-0.72870165,  1.5744926 ],
       [-0.72870165,  1.5744926 ]], dtype=float32), hidden_states=None, attentions=None)

In [198]:
logits = pred.logits
logits

array([[-0.72870165,  1.5744926 ],
       [-0.72870165,  1.5744926 ],
       [-0.72870165,  1.5744926 ],
       ...,
       [-0.72870165,  1.5744926 ],
       [-0.72870165,  1.5744926 ],
       [-0.72870165,  1.5744926 ]], dtype=float32)

In [199]:
pred_labels = tf.argmax(logits, axis=1)
pred_labels

<tf.Tensor: shape=(4152,), dtype=int64, numpy=array([1, 1, 1, ..., 1, 1, 1], dtype=int64)>

In [200]:
pred_labels = pred_labels.numpy()
 
label = {
    1: 'positive',
    0: 'Negative'
}

In [202]:
np.unique(pred_labels)

array([1], dtype=int64)

## GAN-BERT