# Install Transformers Library

In [None]:
# Install the Hugging Face `transformers` package into the notebook runtime.
! pip install transformers

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so
# that anything later moved to `device` still works on CPU-only runtimes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Dataset

In [None]:
%%time

# Load the full dataset from data.csv in the working directory.
df = pd.read_csv("data.csv")
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was written into the CSV; consider index_col=0 — confirm.
df.head()

CPU times: user 910 ms, sys: 142 ms, total: 1.05 s
Wall time: 1.05 s


Unnamed: 0,label,text
0,1,WASHINGTON (Reuters) - The head of a conservat...
1,1,WASHINGTON (Reuters) - Transgender people will...
2,1,WASHINGTON (Reuters) - The special counsel inv...
3,1,WASHINGTON (Reuters) - Trump campaign adviser ...
4,1,SEATTLE/WASHINGTON (Reuters) - President Donal...


In [None]:
# Draw a reproducible sample of (up to) 500 *distinct* rows.
# np.random.randint samples with replacement, so the previous version could
# pick the same article several times, and it fed positional integers to the
# label-based .loc. DataFrame.sample avoids both and accepts a fixed seed.
df_small = df.sample(n=min(500, len(df)), random_state=42)

In [None]:
# Mean of the label column; with 0/1 labels this is the share of rows labelled 1.
df_small.loc[:, 'label'].mean()

0.51

## Acquire OpenAI API Key

In [None]:
# Install the OpenAI Python client used below to request embeddings.
! pip install openai

Collecting openai
  Downloading openai-1.35.14-py3-none-any.whl (328 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/328.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m286.7/328.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.5/328.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.

In [None]:
from google.colab import userdata
# Read the API key from Colab's `userdata` store rather than hard-coding it in the notebook.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

### Create `word_embedding` Function

In [None]:
from openai import OpenAI
# Module-level OpenAI client, authenticated with OPENAI_API_KEY; word_embedding() below uses it.
client = OpenAI(api_key=OPENAI_API_KEY)

def word_embedding(text: str, model: str = "text-embedding-ada-002"):
    """Return the OpenAI embedding vector for ``text``.

    Sends a single embeddings request through the module-level ``client``.

    Args:
        text: The text to embed.
        model: Name of the embedding model to request. Defaults to
            ``"text-embedding-ada-002"``, so existing one-argument calls
            behave exactly as before.

    Returns:
        The ``embedding`` attribute of the first (and only) item in the
        response, requested in ``"float"`` encoding.
    """
    output = client.embeddings.create(
        model=model,
        input=text,
        encoding_format="float"
    )

    # One input string was sent, so the response carries exactly one item.
    return output.data[0].embedding

In [None]:
# Draw the working sample of (up to) 500 *distinct* rows with a fixed seed.
# np.random.randint samples with replacement (duplicate articles, which would
# also be embedded and paid for twice) and fed positional integers to the
# label-based .loc. The fixed random_state makes the sample identical on every
# re-run of the notebook.
df_small = df.sample(n=min(500, len(df)), random_state=42)

In [None]:
%%time

# Embed each article's text with one word_embedding() call per row (one API
# request each) and store the resulting vectors in a new `embedding` column.
df_small['embedding'] = df_small['text'].map(word_embedding)

CPU times: user 8.45 s, sys: 254 ms, total: 8.7 s
Wall time: 4min 24s


In [None]:
# Preview the sample to confirm the new `embedding` column was populated.
df_small.head()

Unnamed: 0,label,text,embedding
13810,1,JAKARTA (Reuters) - The closure of I Gusti Ngu...,"[-0.014054704, -0.030279916, 0.006176567, 0.00..."
26837,0,Donald Trump s wife was the headline speaker o...,"[-0.03336349, -0.017549355, -0.018850768, -0.0..."
16302,1,MADRID (Reuters) - The regional parliament of ...,"[0.0071368287, 0.016498122, 0.014763569, -0.03..."
39047,0,"Houston, Texas trial lawyer and Democrat mega-...","[-0.020817684, 0.0014526903, 0.010799594, -0.0..."
20143,1,BERLIN (Reuters) - German Finance Minister Wol...,"[0.012651787, -0.021570904, 0.02550003, -0.013..."


In [None]:
# Features are the embedding vectors and targets are the labels; pull both
# columns out of the frame as arrays in one step.
X, y = (df_small[column].values for column in ('embedding', 'label'))

In [None]:
# X is an array whose elements are per-row embedding lists; convert each one
# and stack them into a single numeric array.
X = np.array([np.array(vector) for vector in X])

In [None]:
# Sanity check: X should be 2-D (samples x embedding size) and y 1-D with one label per sample.
X.shape, y.shape

((500, 1536), (500,))

## Build `tensorflow` Model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, ReLU, Input
from tensorflow.keras.models import Model

# Input layer: one 1536-dimensional embedding per sample
inputs = Input(shape=(1536,))

# Hidden stack: three Dense -> BatchNormalization -> ReLU stages of
# decreasing width (512, 256, 128)
x = inputs
for units in (512, 256, 128):
    x = Dense(units)(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)

# Output layer: a single sigmoid unit for the binary label
outputs = Dense(1, activation='sigmoid')(x)

# Assemble and compile the functional model
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Show the layer-by-layer summary
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1536)]            0         
                                                                 
 dense_12 (Dense)            (None, 512)               786944    
                                                                 
 batch_normalization_9 (Bat  (None, 512)               2048      
 chNormalization)                                                
                                                                 
 re_lu_9 (ReLU)              (None, 512)               0         
                                                                 
 dense_13 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_10 (Ba  (None, 256)               1024      
 tchNormalization)                                           

In [None]:
# Train for 10 epochs in mini-batches of 32, with 20% of the samples set aside
# by Keras for validation.
model.fit(
    X,
    y,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79e9446bb5e0>

## Performance Evaluation

In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Requires the trained `model` plus the X / y arrays built above.

# Score every sample; the sigmoid output layer yields one probability per row.
predictions = model.predict(X)

# Threshold the probabilities at 0.5 and flatten to match the shape of y.
predictions_binary = (predictions > 0.5).astype(int).flatten()

# model.fit() was called with validation_split=0.2, and Keras takes that
# validation data from the LAST samples of X / y (before shuffling). Those rows
# were never used for weight updates, so the report is restricted to them.
# Scoring all of X would mostly measure how well the training rows were
# memorised and inflate every metric.
held_out_start = int(len(X) * (1 - 0.2))
report = classification_report(
    y[held_out_start:],
    predictions_binary[held_out_start:],
)

# Display the classification report
print(report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       248
           1       0.99      1.00      1.00       252

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500



## Push to Hugging Face Hub

In [None]:
# Base name for the classifier and a folder name derived from it.
# NOTE(review): neither `nom` nor `path` is referenced by the other visible
# cells (the push below uses the literal "news-clf-model") — confirm whether
# they are still needed.
nom = "news_embedding_clf"
path = f"folder_{nom}"

In [None]:
%%capture
# Install huggingface_hub from the main branch of its GitHub repository.
! pip install git+https://github.com/huggingface/huggingface_hub.git@main
# Install git-lfs through apt.
! sudo apt -qq install git-lfs
# Configure git globally to use the `store` credential helper.
! git config --global credential.helper store

In [None]:
# Interactive Hugging Face login; prompts for an access token.
! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write)

In [None]:
from huggingface_hub import push_to_hub_keras

In [None]:
# Upload the trained Keras model to the "news-clf-model" repository on the Hub.
push_to_hub_keras(model, "news-clf-model")

fingerprint.pb:   0%|          | 0.00/56.0 [00:00<?, ?B/s]

saved_model.pb:   0%|          | 0.00/203k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

variables.data-00000-of-00001:   0%|          | 0.00/3.83M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/eagle0504/news-clf-model/commit/d270456bd3023640b3273204b1c8e662363bfcf1', commit_message='Push Keras model using huggingface_hub.', commit_description='', oid='d270456bd3023640b3273204b1c8e662363bfcf1', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import from_pretrained_keras

# Load the model back from the Hub to confirm the uploaded copy can be restored.
reloaded_model = from_pretrained_keras('eagle0504/news-clf-model')

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

## Inference: Load Model from Hugging Face Hub

In [None]:
from huggingface_hub import from_pretrained_keras
import numpy as np
import tensorflow as tf

# Load the published model from the Hub
reloaded_model = from_pretrained_keras('eagle0504/news-clf-model')

# Prepare input data
# Random float32 placeholder with the same layout as X (500 rows, 1536
# features); the next cell overwrites it with the real embeddings.
input_data = np.random.rand(500, 1536).astype(np.float32)  # placeholder only

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# Use the real embeddings as model input, cast to float32.
input_data = X.astype(np.float32)

In [None]:
# Call the reloaded model in inference mode (training=False) and convert the
# returned tensor straight to a NumPy array.
predictions = reloaded_model(input_data, training=False).numpy()


In [None]:
# Threshold the sigmoid outputs at 0.5 and flatten to a 1-D vector of 0/1
# predictions so it lines up with y.
predictions_binary = (predictions > 0.5).astype(int).flatten()

# Build and show the classification report for the reloaded model. The inputs
# are the same X the model was fitted on, so this checks that the save/load
# round trip reproduces the model; it is not a measure of generalisation.
report = classification_report(y, predictions_binary)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       248
           1       0.99      1.00      1.00       252

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500

