In [1]:
# !pip install transformers datasets evaluate scikit-learn pandas -q


In [2]:
# Imports
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np
import pickle
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load data
input_path = 'sample_data1.csv'
df = pd.read_csv(input_path)

In [None]:
df.head()

In [None]:



df.columns = ['text', 'label']
df.dropna(inplace=True)

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Save label mapping
with open('label_mapping.pkl', 'wb') as f:
    pickle.dump(dict(zip(range(len(label_encoder.classes_)), label_encoder.classes_)), f)

# Save preprocessed data
df.to_csv('preprocessed_data.csv', index=False)
print('Preprocessed data saved')


Preprocessed data saved


In [4]:
# !pip install accelerate>=0.26.0


In [5]:


# Load data
df = pd.read_csv('preprocessed_data.csv')
dataset = Dataset.from_pandas(df)

# Tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(example):
    return tokenizer(example['text'], truncation=True, padding='max_length')

tokenized_dataset = dataset.map(preprocess, batched=True)
tokenized_dataset.set_format('torch')
with open('tokenized_dataset.pkl', 'wb') as f:
    pickle.dump(tokenized_dataset, f)
print('Tokenized dataset saved')


Map: 100%|██████████| 964/964 [00:00<00:00, 7255.49 examples/s]

Tokenized dataset saved





In [6]:
with open('tokenized_dataset.pkl', 'rb') as f:
    tokenized_dataset = pickle.load(f)

split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_data = split_dataset['train']
eval_data = split_dataset['test']

with open('train_data.pkl', 'wb') as f:
    pickle.dump(train_data, f)
with open('eval_data.pkl', 'wb') as f:
    pickle.dump(eval_data, f)
print('Train and Eval split saved')


Train and Eval split saved


In [7]:
# import transformers
# print(transformers.__file__)


In [8]:
# !pip uninstall transformers -y
# !pip install transformers==4.53.2


In [9]:
# from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# print("TrainingArguments module:", TrainingArguments.__module__)

In [10]:
import accelerate
print(accelerate.__version__)


1.8.1


In [11]:


with open('train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)
with open('eval_data.pkl', 'rb') as f:
    eval_data = pickle.load(f)

num_labels = len(set(train_data['label']))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
)

trainer.train()
model.save_pretrained('model/')
tokenizer.save_pretrained('model/')
print('Model and tokenizer saved')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,6.5533
20,5.9945
30,5.6034
40,5.08
50,4.8046
60,4.3107
70,4.1852
80,3.6784
90,3.5057
100,3.2993


Model and tokenizer saved


In [12]:


predictions = trainer.predict(eval_data)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

print('Classification Report:\n')
print(classification_report(y_true, y_pred))




Classification Report:

              precision    recall  f1-score   support

           0       0.83      0.97      0.90        80
           1       0.00      0.00      0.00        34
           2       0.68      0.90      0.78        52
           3       0.87      0.96      0.91        27

    accuracy                           0.78       193
   macro avg       0.59      0.71      0.65       193
weighted avg       0.65      0.78      0.71       193



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [13]:
def predict(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=1)
    pred_label = probs.argmax().item()
    with open('label_mapping.pkl', 'rb') as f:
        label_map = pickle.load(f)
    return label_map[pred_label]

print('Sample Prediction:', predict("I love this product, it's amazing!"))


Sample Prediction: 1
