In [1]:
!pip install transformers datasets scikit-learn pandas matplotlib



In [2]:
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from datasets import Dataset
import re

In [3]:
df = pd.read_csv('Train.csv')

In [4]:
# Step 3: Data Preprocessing

# Text cleaning function
def clean_text(text):
    if not isinstance(text, str):
        return ''  # Handle non-string values (you can adjust this behavior if needed)
    
    text = re.sub(r'\n', ' ', text)  # Replace newlines with space
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.strip()  # Remove leading/trailing spaces
    return text

In [5]:
# Apply text cleaning
df['text_cleaned'] = df['text'].apply(clean_text)

In [6]:
# Tokenization using a smaller Hugging Face model for efficiency
model_name = "distilbert-base-uncased"  # DistilBERT is lighter and faster
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
def tokenize_function(examples):
    return tokenizer(examples['text_cleaned'], padding='max_length', truncation=True, max_length=128)

In [8]:
dataset = Dataset.from_pandas(df)

In [9]:
dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/290183 [00:00<?, ? examples/s]

In [10]:
# Step 4: Label Encoding
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label_model'])

In [11]:
print(dataset.column_names)

['Unnamed: 0', 'text', 'genre', 'label', 'label_model', 'text_cleaned', 'input_ids', 'attention_mask']


In [12]:
dataset = dataset.add_column("label_new", df['label_encoded'].values)

In [13]:
print(dataset.column_names)

['Unnamed: 0', 'text', 'genre', 'label', 'label_model', 'text_cleaned', 'input_ids', 'attention_mask', 'label_new']


In [14]:
# Step 5: Reduce dataset size for optimization
max_subset_size = 7500  # Use only 7,500 samples
dataset = dataset.select(range(min(len(dataset), max_subset_size)))

In [15]:
# Split the dataset into training and validation sets
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [16]:
pip install torch




In [17]:
pip install pytorch

Collecting pytorch
  Using cached pytorch-1.0.2.tar.gz (689 bytes)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pytorch
  Building wheel for pytorch (setup.py): started
  Building wheel for pytorch (setup.py): finished with status 'error'
  Running setup.py clean for pytorch
Failed to build pytorch
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  python setup.py bdist_wheel did not run successfully.
  exit code: 1
  
  [6 lines of output]
  Traceback (most recent call last):
    File "<string>", line 2, in <module>
    File "<pip-setuptools-caller>", line 34, in <module>
    File "C:\Users\tusha\AppData\Local\Temp\pip-install-7ngdua1g\pytorch_313d391f75a548f29bb1a40d92158800\setup.py", line 15, in <module>
      raise Exception(message)
  Exception: You tried to install "pytorch". The package named for PyTorch is "torch"
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for pytorch
ERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (pytorch)


In [18]:
# Step 6: Load the smaller pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.


In [20]:
pip install accelerate

Note: you may need to restart the kernel to use updated packages.


In [21]:
# Step 7: Training Arguments and Trainer Setup
training_args = TrainingArguments(
    output_dir='./results',               # Output directory
    num_train_epochs=3,                   # 3 epochs for better learning
    per_device_train_batch_size=32,       # Batch size reduced for 8GB RAM
    per_device_eval_batch_size=64,        # Evaluation batch size
    warmup_steps=50,                      # Few warmup steps
    weight_decay=0.01,                    # Regularization
    logging_dir='./logs',                 # Logging directory
    logging_steps=10,                     # Log every 10 steps
    evaluation_strategy="epoch",          # Evaluate at the end of each epoch
    save_strategy="epoch",                # Save model after each epoch
    gradient_accumulation_steps=2,        # Use gradient accumulation for larger effective batch size
    save_total_limit=2,                   # Limit saved checkpoints
    load_best_model_at_end=True           # Load the best model at the end
)



In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda p: {
        'accuracy': accuracy_score(np.argmax(p.predictions, axis=-1), p.label_ids)
    }  # Accuracy metric
)

In [23]:
# Step 8: Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.6046,0.839598,0.634667
2,1.2555,0.71346,0.685333
3,1.0111,0.713631,0.706667


TrainOutput(global_step=282, training_loss=1.6342965897093429, metrics={'train_runtime': 4515.2066, 'train_samples_per_second': 3.987, 'train_steps_per_second': 0.062, 'total_flos': 596188339200000.0, 'train_loss': 1.6342965897093429, 'epoch': 3.0})

In [24]:
# Step 9: Evaluate the model
trainer.evaluate()

{'eval_loss': 0.7134597897529602,
 'eval_accuracy': 0.6853333333333333,
 'eval_runtime': 81.6274,
 'eval_samples_per_second': 18.376,
 'eval_steps_per_second': 0.294,
 'epoch': 3.0}

In [25]:
# Specify model
model_name = "distilbert-base-uncased"

In [26]:
from transformers import pipeline

text_classifier = pipeline("text-classification", model=model_name, )


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


In [27]:
test_text = "I am well!"
print(text_classifier(test_text)[0]['label'])

LABEL_1


In [28]:
# Step 11: Saving the model 
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model\\tokenizer_config.json',
 './fine_tuned_model\\special_tokens_map.json',
 './fine_tuned_model\\vocab.txt',
 './fine_tuned_model\\added_tokens.json',
 './fine_tuned_model\\tokenizer.json')