In [3]:
from datasets import load_dataset

# Load our data
data = load_dataset("rotten_tomatoes")
data

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 8530/8530 [00:00<00:00, 544267.33 examples/s]
Generating validation split: 100%|██████████| 1066/1066 [00:00<00:00, 256666.36 examples/s]
Generating test split: 100%|██████████| 1066/1066 [00:00<00:00, 418292.46 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [4]:
data["train"][0,1]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'],
 'label': [1, 1]}

### Text Classification with representation model

In [5]:
### Using task s
from transformers import pipeline

model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
pipe = pipeline(
    model=model_path,
    return_all_scores=True,
    device="cuda"
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

#run inference
y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "text"), batch_size=16)):
    # print(output)
    negative_score = output[0]["score"]
    positive_score = output[2]["score"]
    assignment = np.argmax([negative_score, positive_score])
    y_pred.append(assignment)


1066it [00:02, 497.91it/s]                      


In [14]:
y_pred[:10]


[1, 1, 0, 1, 1, 0, 1, 1, 1, 1]

In [15]:
from sklearn.metrics import classification_report

def evaluate_performance(y_true, y_pred):
    """Create and print the classification report"""
    performance = classification_report(
        y_true, y_pred,
        target_names=["Negative Review", "Positive Review"]
    )
    print(performance)

In [16]:
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.76      0.88      0.81       533
Positive Review       0.86      0.72      0.78       533

       accuracy                           0.80      1066
      macro avg       0.81      0.80      0.80      1066
   weighted avg       0.81      0.80      0.80      1066



### Classification Task that Leverage Embeddings

In [17]:
#supervised classification
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
train_embeddings = model.encode(data['train']['text'],show_progress_bar=True)
test_embeddings = model.encode(data['test']['text'],show_progress_bar=True)


loading configuration file config.json from cache at C:\Users\swati\.cache\huggingface\hub\models--sentence-transformers--all-mpnet-base-v2\snapshots\9a3225965996d404b775526de6dbfe85d3368642\config.json
Model config MPNetConfig {
  "_name_or_path": "sentence-transformers/all-mpnet-base-v2",
  "architectures": [
    "MPNetForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "mpnet",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.46.3",
  "vocab_size": 30527
}

loading weights file model.safetensors from cache at C:\Users\swati\.cache\huggingface\hub\models--sentence-transformers--all-mpnet-base-v2\snapshots\9a3225965996d404b775526de6dbfe

In [18]:
train_embeddings.shape


(8530, 768)

In [19]:
from sklearn.linear_model import LogisticRegression

# Train a Logistic Regression on our train embeddings
clf = LogisticRegression(random_state=42)
clf.fit(train_embeddings, data["train"]["label"])

In [20]:
# Predict previously unseen instances
y_pred = clf.predict(test_embeddings)
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.86      0.85       533
Positive Review       0.86      0.85      0.85       533

       accuracy                           0.85      1066
      macro avg       0.85      0.85      0.85      1066
   weighted avg       0.85      0.85      0.85      1066



### What if we don't use any classification model, instead we use cosine similarity!

In [22]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

df = pd.DataFrame(np.hstack([train_embeddings,np.array(data['train']['label']).reshape(-1,1)]))
df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,0.014930,-0.005548,0.011995,-0.014927,0.006273,-0.003671,-0.026792,0.009564,0.006486,0.019282,...,0.044446,-0.006567,0.004846,0.032993,-0.027030,0.026459,0.005665,-0.012956,0.001537,1.0
1,0.035829,-0.002350,-0.026249,0.025348,-0.011112,0.003088,-0.066249,-0.048875,-0.018407,-0.032934,...,0.029469,-0.034689,0.032908,0.009154,0.029719,0.033257,0.005511,-0.014471,-0.020907,1.0
2,0.040902,0.110522,0.024601,-0.000690,0.005234,0.001776,-0.054121,0.007338,0.000782,0.032753,...,0.002668,-0.029976,-0.031094,-0.004008,0.023225,-0.004077,0.084754,0.016156,0.025994,1.0
3,-0.003141,0.030397,-0.018153,-0.022295,0.021435,0.019211,0.045022,0.083038,0.044163,0.053213,...,0.015134,-0.002314,-0.008651,0.000362,-0.038151,-0.004815,0.003380,0.039602,-0.032613,1.0
4,0.006541,0.044168,0.029882,0.016410,0.003639,0.005672,-0.054883,0.011031,-0.038811,-0.015811,...,-0.032514,-0.028547,-0.006436,0.007190,-0.055310,-0.044386,0.055256,0.098378,-0.002131,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8525,0.000396,0.092961,0.013449,0.008073,0.047658,0.046298,-0.062199,0.002735,0.026924,0.050474,...,-0.003101,-0.005296,0.004556,-0.003797,-0.054100,0.014604,0.072304,0.019597,-0.007351,0.0
8526,0.061984,0.026447,0.011905,0.008043,-0.028330,0.030330,-0.045090,0.000994,-0.066630,-0.015674,...,0.027525,0.030700,-0.005522,-0.009416,-0.052891,0.003865,-0.015289,0.001440,-0.026430,0.0
8527,0.028653,0.026281,0.002346,-0.002681,0.035184,0.037812,-0.027434,-0.023547,0.000076,0.019272,...,0.023894,-0.058434,-0.059795,0.005341,-0.014020,-0.009184,-0.003212,0.031001,-0.014205,0.0
8528,-0.003424,0.046733,-0.002419,-0.061851,-0.026619,-0.003960,-0.034317,-0.018882,0.025307,-0.008848,...,-0.002240,-0.022505,-0.033220,-0.015093,-0.007377,-0.000089,-0.022067,-0.058857,-0.018474,0.0


In [24]:
# Average the embeddings of all documents in each target label
averaged_target_embeddiings = df.groupby(768).mean().values
averaged_target_embeddiings.shape

(2, 768)

In [25]:
# Find the best matching embeddings between evaluation documents and target embeddings
sim_matrix = cosine_similarity(test_embeddings,averaged_target_embeddiings)
y_pred = np.argmax(sim_matrix,axis=1)
evaluate_performance(data['test']['label'],y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.84      0.84       533
Positive Review       0.84      0.85      0.84       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066



### Zero-Shot Classification

In [26]:
# Create embeddings for our labels
label_embeddings = model.encode(["negative","positive"])

In [30]:
from sklearn.metrics.pairwise import cosine_similarity
sim_matrix = cosine_similarity(test_embeddings,label_embeddings)
y_pred = np.argmax(sim_matrix,axis=1)
y_pred.shape

(1066,)

In [31]:
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.62      0.71      0.66       533
Positive Review       0.66      0.56      0.61       533

       accuracy                           0.64      1066
      macro avg       0.64      0.64      0.64      1066
   weighted avg       0.64      0.64      0.64      1066



### Classification with Generative Models

In [32]:
#encoder-decoder model
pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device="cuda"
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
loading configuration file config.json from cache at C:\Users\swati\.cache\huggingface\hub\models--google--flan-t5-small\snapshots\0fc9ddf78a1e988dac52e2dac162b0ede4fd74ab\config.json
Model config T5Config {
  "_name_or_path": "google/flan-t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_

In [34]:
prompt = "Is the following sentence positive or negative?"
data = data.map(lambda x: {"t5": prompt + x["text"]})
data

Map: 100%|██████████| 8530/8530 [00:00<00:00, 30279.43 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 21336.71 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 19585.39 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
})

In [35]:
# Run inference
y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "t5")), total=len(data["test"])):
    text = output[0]["generated_text"]
    y_pred.append(0 if text == "negative" else 1)

100%|██████████| 1066/1066 [00:38<00:00, 27.40it/s]


In [36]:
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.83      0.84      0.83       533
Positive Review       0.84      0.83      0.83       533

       accuracy                           0.83      1066
      macro avg       0.83      0.83      0.83      1066
   weighted avg       0.83      0.83      0.83      1066

