In [1]:
!pip install transformers datasets

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/c1/bd/f64d67df4d3b05a460f281defe830ffab6d7940b7ca98ec085e94e024781/transformers-4.34.1-py3-none-any.whl.metadata
  Downloading transformers-4.34.1-py3-none-any.whl.metadata (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.5/121.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/7c/55/b3432f43d6d7fee999bb23a547820d74c48ec540f5f7842e41aa5d8d5f3a/datasets-2.14.6-py3-none-any.whl.metadata
  Downloading datasets-2.14.6-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from transformers)
  Obtaining dependency information for filelock from https://files.pythonhosted.org/packages/5e/5d/97afbafd9d584ff1b45fcb354a479a3609bd97f912f8f1f6c563cb1fae21/filelock-3.12.4-py3-none-any.whl.metadata
  Downloading filelock-3.12.4-py3-none-any

In [71]:
!pip install accelerate -U -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv("Reddit_Data.csv")
df

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
37244,jesus,0
37245,kya bhai pure saal chutiya banaya modi aur jab...,1
37246,downvote karna tha par upvote hogaya,0
37247,haha nice,1


In [5]:
label_data_mapping = {-1 : 0 , 0 : 1 , 1 : 2}
df['category'] = df['category'].map(label_data_mapping)

In [7]:
df.columns = ['sentence' , 'label']

In [9]:
df.head()

Unnamed: 0,sentence,label
0,family mormon have never tried explain them t...,2
1,buddhism has very much lot compatible with chr...,2
2,seriously don say thing first all they won get...,0
3,what you have learned yours and only yours wha...,1
4,for your own benefit you may want read living ...,2


In [47]:
df.drop_duplicates(inplace=True)

In [49]:
df.dropna(inplace=True)

In [50]:
df

Unnamed: 0,sentence,label
0,family mormon have never tried explain them t...,2
1,buddhism has very much lot compatible with chr...,2
2,seriously don say thing first all they won get...,0
3,what you have learned yours and only yours wha...,1
4,for your own benefit you may want read living ...,2
...,...,...
37244,jesus,1
37245,kya bhai pure saal chutiya banaya modi aur jab...,2
37246,downvote karna tha par upvote hogaya,1
37247,haha nice,2


In [51]:
df.to_csv("data.csv" , index=None)

In [2]:
from transformers import AutoTokenizer
from datasets import load_dataset

In [3]:
data = load_dataset("csv" , data_files="data.csv")

In [4]:
data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 36799
    })
})

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
split = data['train'].train_test_split(seed = 42 , test_size = 0.3)

In [7]:
split

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 25759
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 11040
    })
})

In [8]:
checkpoint = "distilbert-base-uncased"

In [9]:
tokenizer =  AutoTokenizer.from_pretrained(checkpoint)

In [10]:
def tokenizer_fn(batch):
    return tokenizer(batch['sentence'] , truncation=True , padding=True)

In [11]:
tokenized_dataset = split.map(tokenizer_fn , batched=True)

Map:   0%|          | 0/11040 [00:00<?, ? examples/s]

In [12]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25759
    })
    test: Dataset({
        features: ['sentence', 'label', 'input_ids', 'attention_mask'],
        num_rows: 11040
    })
})

In [13]:
from transformers import AutoModelForSequenceClassification , Trainer , TrainingArguments

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint , num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [16]:
training_args = TrainingArguments(
    output_dir="training_dir",
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=64 , 
    per_device_train_batch_size=16,
)

In [22]:
def compute_metric(logits_and_loss):
    logits , labels = logits_and_loss
    prediction = np.argmax(logits , axis=-1)
    accuracy = np.mean(prediction == labels)
    return {"acc" : accuracy}

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metric
)

Detected kernel version 5.4.254, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Acc
1,0.1437,0.228672,0.94221
2,0.103,0.2041,0.952264
3,0.0506,0.249234,0.953895


TrainOutput(global_step=4830, training_loss=0.10577863630053913, metrics={'train_runtime': 3897.0776, 'train_samples_per_second': 19.829, 'train_steps_per_second': 1.239, 'total_flos': 1.0236865722651648e+16, 'train_loss': 0.10577863630053913, 'epoch': 3.0})

In [25]:
trainer.save_model("model")

In [28]:
from transformers import pipeline

In [29]:
result = pipeline("text-classification" , device=0 , model="model")

In [30]:
result("I don't know if i should be happy or sad for my friend's birthday")

[{'label': 'LABEL_2', 'score': 0.9974657297134399}]

In [None]:
resule("i am happ")

In [1]:
import shutil

In [3]:
shutil.make_archive("model" , "zip" , "model")

'/home/jovyan/workspace/fine_tune_transofrmer/model.zip'