# text_classification

## Import packages

In [None]:
! pip install datasets
! pip install sentence_transformers
! pip install transformers[torch]
! pip install evaluate



In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

from sklearn.metrics import accuracy_score
from sklearn import svm

## Data Preparation

In [None]:
# Fetch the "imdb" dataset through `datasets.load_dataset` and list the splits it contains.
imdb = load_dataset("imdb")
imdb.keys()

dict_keys(['train', 'test', 'unsupervised'])

In [None]:
# Peek at the first record of the test split to see the available fields.
imdb["test"][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

In [None]:
# Materialise the two labelled splits as pandas frames restricted to the
# 'text' and 'label' columns, report their shapes, then preview the training frame.
train_df, test_df = (
    pd.DataFrame(imdb[split_name], columns=['text', 'label'])
    for split_name in ("train", "test")
)
print(train_df.shape)
print(test_df.shape)
train_df.head()

(25000, 2)
(25000, 2)


Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


## Count / TF-IDF vectors

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

In [None]:
# Bag-of-words features: the vocabulary is fitted on the training text only and is
# capped via max_features=15000.
vectorizer = CountVectorizer(max_features=15000)
X = vectorizer.fit_transform(train_df['text'])
# Display the vocabulary terms the vectorizer kept.
vectorizer.get_feature_names_out()

array(['00', '000', '007', ..., 'zu', 'zucker', 'zuniga'], dtype=object)

In [None]:
# Densify the training count matrix into a frame with one column per vocabulary term.
train_X = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
print(train_X.shape)
train_X.head()

(25000, 15000)


Unnamed: 0,00,000,007,01,02,10,100,1000,101,102,...,zombie,zombies,zone,zoo,zoom,zooms,zorro,zu,zucker,zuniga
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Vectorise the test text with the vocabulary already fitted on the training split
# (`transform`, not `fit_transform`, so no test information reaches the vocabulary).
# The sparse result gets its own name: rebinding `X` here would make a later re-run
# of the cell that builds `train_X` from `X` silently use test features instead.
X_test = vectorizer.transform(test_df['text'])
test_X = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())
print(test_X.shape)
test_X.head()

(25000, 15000)


Unnamed: 0,00,000,007,01,02,10,100,1000,101,102,...,zombie,zombies,zone,zoo,zoom,zooms,zorro,zu,zucker,zuniga
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Train a Gaussian Naive Bayes classifier on the count features and store its
# predictions for the training rows next to the true labels.
gnb = GaussianNB().fit(train_X, train_df['label'])
train_df['pred'] = gnb.predict(train_X)
train_df

Unnamed: 0,text,label,pred
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0,0
2,If only to avoid making this type of film in t...,0,0
3,This film was probably inspired by Godard's Ma...,0,0
4,"Oh, brother...after hearing about this ridicul...",0,0
...,...,...,...
24995,A hit at the time but now better categorised a...,1,0
24996,I love this movie like no other. Another time ...,1,1
24997,This film and it's sequel Barry Mckenzie holds...,1,1
24998,'The Adventures Of Barry McKenzie' started lif...,1,1


In [None]:
# Confusion counts on the training split: number of rows per (true label, predicted label) pair.
train_df.groupby(['label', 'pred']).size()

label  pred
0      0       11653
       1         847
1      0        4422
       1        8078
dtype: int64

In [None]:
# NOTE(review): 11496 and 7693 do not match the correctly-classified counts in the
# training confusion table displayed above (11653 and 8078) — presumably left over
# from an earlier run; confirm before relying on this figure.
(11496+7693)/25000

0.76756

In [None]:
# Training accuracy of the Naive Bayes model, computed from the stored predictions
# instead of hand-copied confusion counts (which go stale when the model is re-fitted).
accuracy_score(y_true=train_df['label'], y_pred=train_df['pred'])

0.78924

In [None]:
# Predict the test split with the fitted Naive Bayes model and tabulate
# the number of rows per (true label, predicted label) pair.
test_df['pred'] = gnb.predict(test_X)
test_df.groupby(['label', 'pred']).size()

label  pred
0      0       10600
       1        1900
1      0        7090
       1        5410
dtype: int64

In [None]:
# NOTE(review): 10792 and 5452 do not match the correctly-classified counts in the
# test confusion table displayed above (10600 and 5410) — presumably left over
# from an earlier run; confirm before relying on this figure.
(10792+5452)/25000

0.64976

In [None]:
# Test accuracy of the Naive Bayes model, computed from the stored predictions
# instead of hand-copied confusion counts (which go stale when the model is re-fitted).
accuracy_score(y_true=test_df['label'], y_pred=test_df['pred'])

0.6404

## Sentence transformer

In [None]:
# Wrap the 'distilbert-base-uncased' checkpoint as a SentenceTransformer to obtain text embeddings.
# NOTE(review): the name `model` is rebound to the sequence-classification model in the
# Finetune section, so this cell must be re-run before the encode cell is re-run.
model = SentenceTransformer('distilbert-base-uncased')



In [None]:
# Embed every review. `encode` is given plain Python lists of strings: passing the
# pandas Series directly relies on integer label lookups and breaks as soon as the
# frame's index is not the default 0..n-1 range (e.g. after filtering or shuffling).
train_X = model.encode(train_df['text'].tolist())
test_X = model.encode(test_df['text'].tolist())

In [None]:
# Linear-kernel SVM trained on the sentence embeddings; the last line reports
# accuracy on the same training rows.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(X=train_X, y=train_df['label'].values)
accuracy_score(train_df['label'], clf_svm.predict(train_X))

0.8984

In [None]:
# Accuracy of the embedding + SVM classifier on the test split.
accuracy_score(test_df['label'], clf_svm.predict(test_X))

0.88412

## Fine-tuning

In [None]:
from transformers import AutoTokenizer
# Tokenizer for "distilbert-base-uncased", the same checkpoint name that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with the notebook-level `tokenizer`.

    `truncation=True` is forwarded to the tokenizer call; the tokenizer's
    result is returned unchanged.
    """
    encoded = tokenizer(examples["text"], truncation=True)
    return encoded

In [None]:
# Apply `preprocess_function` to the whole dataset object; batched=True asks `map`
# to call it on batches of examples rather than one example at a time.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding
# Padding collator built around the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate
# Load the "accuracy" metric; `compute_metrics` below calls its `compute` method.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a `(predictions, labels)` pair with the notebook-level `accuracy` metric.

    The first element is reduced with an argmax along axis 1 (assumed to be
    per-class scores of shape (n_examples, n_classes) — TODO confirm) to get
    predicted class ids, which are then compared against the labels.

    Returns the result of `accuracy.compute`.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Class-id <-> class-name lookup tables handed to the model config below.
# The reverse table is derived from the forward one so the two cannot drift apart.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load "distilbert-base-uncased" with a 2-class sequence-classification head and attach
# the label-name mappings defined above.
# NOTE: this rebinds `model`, which earlier in the notebook held the SentenceTransformer.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning configuration: evaluation and checkpointing both happen once per epoch,
# checkpoints go to ./results, and load_best_model_at_end=True asks the Trainer to
# reload the best checkpoint when training finishes (no metric_for_best_model is
# given, so the library default decides what "best" means).
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=40,
    per_device_eval_batch_size=40,
    # num_train_epochs=5,
    # NOTE(review): 20 epochs are requested, but the recorded training log below stops
    # at epoch 5 — presumably the run was interrupted or the log predates this value; confirm.
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): eval_dataset is the IMDB *test* split, so checkpoint selection and the
# test-set accuracy reported later in the notebook use the same data; a slice held out
# from the training split would keep the test estimate unbiased — confirm intent.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start fine-tuning with the configuration above.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2718,0.212322,0.917
2,0.1688,0.194908,0.92884
3,0.1253,0.236013,0.92764
4,0.0627,0.336985,0.92104


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2718,0.212322,0.917
2,0.1688,0.194908,0.92884
3,0.1253,0.236013,0.92764
4,0.0627,0.336985,0.92104
5,0.0379,0.334922,0.92804


In [None]:
! rm -r results.zip

In [None]:
# Archive one saved checkpoint directory into results.zip.
# NOTE(review): checkpoint-1250 is hard-coded; presumably the end of epoch 2
# (25000 training rows / batch size 40 = 625 steps per epoch on a single device) — confirm.
! zip results.zip -r results/checkpoint-1250/

  adding: results/checkpoint-1250/ (stored 0%)
  adding: results/checkpoint-1250/special_tokens_map.json (deflated 42%)
  adding: results/checkpoint-1250/model.safetensors (deflated 8%)
  adding: results/checkpoint-1250/config.json (deflated 46%)
  adding: results/checkpoint-1250/scheduler.pt (deflated 55%)
  adding: results/checkpoint-1250/trainer_state.json (deflated 63%)
  adding: results/checkpoint-1250/optimizer.pt (deflated 13%)
  adding: results/checkpoint-1250/tokenizer_config.json (deflated 76%)
  adding: results/checkpoint-1250/rng_state.pth (deflated 25%)
  adding: results/checkpoint-1250/tokenizer.json (deflated 71%)
  adding: results/checkpoint-1250/vocab.txt (deflated 53%)
  adding: results/checkpoint-1250/training_args.bin (deflated 50%)


## Inference

In [None]:
# Hand-written sample review used to smoke-test the fine-tuned classifier.
text = (
    "This was a masterpiece. "
    "Not completely faithful to the books, but enthralling from beginning to end. "
    "Might be my favorite of the three."
)

In [None]:
from transformers import pipeline

# Build a sentiment-analysis pipeline from the saved checkpoint directory and classify
# the sample review; truncation=True is passed so long inputs get truncated.
classifier = pipeline("sentiment-analysis", model="./results/checkpoint-1250", truncation=True)
classifier(text)

[{'label': 'POSITIVE', 'score': 0.9942052960395813}]

In [None]:
import torch
from transformers import pipeline

# Rebuild the pipeline for bulk inference. Use the first CUDA device when one is
# available and fall back to CPU (-1) otherwise, so the notebook still runs on a
# machine without a GPU instead of failing on a hard-coded device index.
classifier = pipeline(
    "sentiment-analysis",
    model="./results/checkpoint-1250",
    truncation=True,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Classify every test review with the fine-tuned pipeline; each entry stored in 'pred'
# is the pipeline's result dict for that review. This replaces the earlier 'pred' column.
test_df['pred'] = classifier(test_df['text'].tolist())
test_df

Unnamed: 0,text,label,pred
0,I love sci-fi and am willing to put up with a ...,0,"{'label': 'NEGATIVE', 'score': 0.9961985945701..."
1,"Worth the entertainment value of a rental, esp...",0,"{'label': 'NEGATIVE', 'score': 0.9887642860412..."
2,its a totally average film with a few semi-alr...,0,"{'label': 'NEGATIVE', 'score': 0.991312563419342}"
3,STAR RATING: ***** Saturday Night **** Friday ...,0,"{'label': 'NEGATIVE', 'score': 0.9966806769371..."
4,"First off let me say, If you haven't enjoyed a...",0,"{'label': 'POSITIVE', 'score': 0.987015962600708}"
...,...,...,...
24995,Just got around to seeing Monster Man yesterda...,1,"{'label': 'POSITIVE', 'score': 0.97499018907547}"
24996,I got this as part of a competition prize. I w...,1,"{'label': 'POSITIVE', 'score': 0.7180275917053..."
24997,I got Monster Man in a box set of three films ...,1,"{'label': 'NEGATIVE', 'score': 0.5887627601623..."
24998,"Five minutes in, i started to feel how naff th...",1,"{'label': 'POSITIVE', 'score': 0.961819052696228}"


In [None]:
# Pull the predicted class name out of each pipeline result dict.
test_df['pred_label'] = [result['label'] for result in test_df['pred']]

In [None]:
# Confusion counts for the fine-tuned model: rows per (true label id, predicted label name) pair.
test_df.groupby(['label', 'pred_label']).size()

label  pred_label
0      NEGATIVE      11624
       POSITIVE        876
1      NEGATIVE        837
       POSITIVE      11663
dtype: int64

In [None]:
# Test accuracy of the fine-tuned model, computed from the frame rather than from
# hand-copied error counts. The integer labels are mapped to class names with
# `id2label` so they can be compared with the pipeline's string predictions.
float((test_df['label'].map(id2label) == test_df['pred_label']).mean())

0.93148

## Result

In [None]:
# Classify the test review stored under index label 0 on its own.
classifier(test_df['text'].loc[0])

[{'label': 'NEGATIVE', 'score': 0.9977849721908569}]

In [None]:
# Run the Trainer's evaluation loop with the evaluation dataset it was constructed with.
# NOTE(review): the recorded output below reports only eval_loss and step numbers that
# differ from the training log above — presumably from a different run; confirm.
trainer.evaluate()

Step,Training Loss,Validation Loss
500,0.3177,
1000,0.2531,
1500,0.227,
2000,0.167,
2500,0.1535,
3000,0.1576,
3013,0.1576,0.246284


{'eval_loss': 0.24628369510173798}

In [None]:
# Disabled snippet: reloads the model and tokenizer straight from the saved checkpoint
# directory. Uncomment to use; note that it would rebind `model` and `tokenizer`.
# model_dir = './results/checkpoint-1250'
# model = AutoModelForSequenceClassification.from_pretrained(model_dir)
# tokenizer = AutoTokenizer.from_pretrained(model_dir)