In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import os
os.environ['WANDB_DISABLED'] = 'true'

In [8]:
train = pd.read_csv(r"../data/twitter_training.csv",names=['id','unknown','Category','Text'])
test = pd.read_csv(r"../data/twitter_validation.csv",names=['id','unknown','Category','Text'])
train.head()

Unnamed: 0,id,unknown,Category,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [9]:
test.Category.value_counts()

Category
Neutral       285
Positive      277
Negative      266
Irrelevant    172
Name: count, dtype: int64

In [10]:
train = train.dropna()
test = test.dropna()

In [11]:
len(train)
len(test)

1000

In [12]:
train.isna().sum()

id          0
unknown     0
Category    0
Text        0
dtype: int64

In [30]:
train_df = train.sample(n=6000, random_state=42)
test_df = test.sample(n=750, random_state=42)

In [31]:
train_df

Unnamed: 0,id,unknown,Category,Text
61413,4926,GrandTheftAuto(GTA),Irrelevant,Looks to me like he failed to check out the wa...
44887,11709,Verizon,Irrelevant,"Wow, it takes all sorts of crazy people out th..."
73662,9020,Nvidia,Neutral,Nvidia Unveils The World’s Fastest Gaming Moni...
36694,8295,Microsoft,Neutral,Huge radio play here. Reinvention / Corporate ...
2308,1604,CallOfDutyBlackopsColdWar,Negative,SO I HAPPY WHO ABOUT THIS.
...,...,...,...,...
11683,8408,NBA2K,Negative,2k games are never the same again.
30761,7284,LeagueOfLegends,Neutral,Check Back my video!
45375,11790,Verizon,Neutral,Verizon told US drop ‘most powerful 5G’ claim ...
20093,12639,WorldOfCraft,Neutral,me just earned the [1000 Honorable Kills] Achi...


In [32]:
from sklearn.preprocessing import LabelEncoder
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply LabelEncoder to the 'Category' column
train_df['Category'] = label_encoder.fit_transform(train_df['Category'])
test_df['Category'] = label_encoder.transform(test_df['Category'])

In [35]:
train_df

Unnamed: 0,id,unknown,Category,Text
61413,4926,GrandTheftAuto(GTA),0,Looks to me like he failed to check out the wa...
44887,11709,Verizon,0,"Wow, it takes all sorts of crazy people out th..."
73662,9020,Nvidia,2,Nvidia Unveils The World’s Fastest Gaming Moni...
36694,8295,Microsoft,2,Huge radio play here. Reinvention / Corporate ...
2308,1604,CallOfDutyBlackopsColdWar,1,SO I HAPPY WHO ABOUT THIS.
...,...,...,...,...
11683,8408,NBA2K,1,2k games are never the same again.
30761,7284,LeagueOfLegends,2,Check Back my video!
45375,11790,Verizon,2,Verizon told US drop ‘most powerful 5G’ claim ...
20093,12639,WorldOfCraft,2,me just earned the [1000 Honorable Kills] Achi...


In [51]:
import dill
preprocessor_obj_filepath = os.path.join('artifacts',"preprocessor.pkl")
def save_object(file_path, obj):
    try:
        dir_path =os.path.dirname(file_path)
        os.makedirs(dir_path, exist_ok=True)
        
        with open(file_path,"wb") as file_obj:
            dill.dump(obj, file_obj)
            
    except Exception as e:
        print(e)
        
save_object(preprocessor_obj_filepath,label_encoder)

In [4]:
import os

current_directory = os.getcwd()
print(f"Current Directory: {current_directory}")


Current Directory: d:\SL\End to End Projects\Text classification\notebooks


In [1]:
import dill
import os
def load_object(file_path):
    try:
        with open(file_path, "rb") as file_obj:
            return dill.load(file_obj)
    except Exception as e:
        print(e)

preprocessor_obj_filepath = os.path.join('artifacts','preprocessor', "preprocessor.pkl")
label_encoder = load_object(preprocessor_obj_filepath)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [2]:
input_category = 'Irrelevant'

# Transform the input category using the loaded label encoder
encoded_category = label_encoder.transform([input_category])[0]

print(f"Encoded category for '{input_category}': {encoded_category}")

Encoded category for 'Irrelevant': 0


In [6]:
# Example encoded category value
encoded_category = 3

# Decode the encoded category using the loaded label encoder
original_category = label_encoder.inverse_transform([encoded_category])[0]

print(f"Original category for encoded value '{encoded_category}': {original_category}")

Original category for encoded value '3': Positive


In [10]:
train_texts = train_df['Text'].astype(str).values.tolist()
train_labels = train_df['Category'].values.tolist()
test_texts = test_df['Text'].astype(str).values.tolist()

In [11]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2,random_state=42,stratify=train_labels)

In [39]:
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast,DistilBertForSequenceClassification
from transformers import Trainer,TrainingArguments

In [13]:
model_name  = 'distilbert/distilbert-base-uncased'


In [14]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert/distilbert-base-uncased',num_labels=4)

In [15]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True,return_tensors = 'pt')
val_encodings = tokenizer(val_texts, truncation=True, padding=True,return_tensors = 'pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True,return_tensors = 'pt')

In [16]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
## Test Dataset
class SentimentTestDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item
    def __len__(self):
        return len(self.encodings)

In [17]:
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentTestDataset(test_encodings)

In [18]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    #recall = recall_score(y_true=labels, y_pred=pred)
    #precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(labels, pred, average='weighted')

    return {"accuracy": accuracy,"f1_score":f1}

In [19]:
training_args = TrainingArguments(
    output_dir='results',          # output directory
    evaluation_strategy="steps",
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs4',            # directory for storing logs
    #logging_steps=10,
    load_best_model_at_end=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [20]:
model = DistilBertForSequenceClassification.from_pretrained(model_name,num_labels=4)

trainer = Trainer(
    model=model,# the instantiated 🤗 Transformers model to be trained
    args=training_args, # training arguments, defined above
    train_dataset=train_dataset,# training dataset
    eval_dataset=val_dataset , # evaluation dataset
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'train_runtime': 2736.5464, 'train_samples_per_second': 3.508, 'train_steps_per_second': 0.11, 'train_loss': 1.2173590087890624, 'epoch': 2.0}


TrainOutput(global_step=300, training_loss=1.2173590087890624, metrics={'train_runtime': 2736.5464, 'train_samples_per_second': 3.508, 'train_steps_per_second': 0.11, 'total_flos': 501738167347200.0, 'train_loss': 1.2173590087890624, 'epoch': 2.0})

In [21]:
trainer.evaluate()


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 1.0295817852020264,
 'eval_accuracy': 0.5733333333333334,
 'eval_f1_score': 0.5292642695398562,
 'eval_runtime': 100.9893,
 'eval_samples_per_second': 11.882,
 'eval_steps_per_second': 0.188,
 'epoch': 2.0}

In [22]:
test_df['Category'] = 0
test_texts = test_df['Text'].values.tolist()
test_labels = test_df['Category'].values.tolist() 
test_encodings = tokenizer(test_texts, truncation=True, padding=True,return_tensors = 'pt')
test_dataset = SentimentDataset(test_encodings, test_labels)
preds = trainer.predict(test_dataset=test_dataset)

  0%|          | 0/12 [00:00<?, ?it/s]

In [23]:
probs = torch.from_numpy(preds[0]).softmax(1)

predictions = probs.numpy()# convert tensors to numpy array

In [24]:
newdf = pd.DataFrame(predictions,columns=['Neutral','Positive','Negative','Irrelevant'])
newdf.head()


Unnamed: 0,Neutral,Positive,Negative,Irrelevant
0,0.071103,0.402284,0.453506,0.073107
1,0.098287,0.08123,0.0991,0.721382
2,0.077792,0.26209,0.606484,0.053634
3,0.200256,0.082093,0.157054,0.560596
4,0.116955,0.118665,0.692668,0.071711


In [25]:
def labels(x):
  if x == 0:
    return 'Neutral'
  elif x == 1:
    return 'Positive'
  elif x == 2:
    return 'Negative'
  else:
    return 'Irrelevant'

results = np.argmax(predictions,axis=1)
test_df['Sentiment'] = results
test_df['predicted'] = test_df['Sentiment'].map(labels)
test_df.head()

Unnamed: 0,id,unknown,Category,Text,Sentiment,predicted
521,11656,Verizon,0,Remote working and an increase in cloud-based ...,2,Negative
737,9769,PlayStation5(PS5),0,I actually quite like the design of the ps5. I...,3,Irrelevant
740,7021,johnson&johnson,0,New York charges Johnson & Johnson with insura...,2,Negative
660,2567,Borderlands,0,Chris loves me in borderlands one and two.,3,Irrelevant
411,7463,LeagueOfLegends,0,Check out my video! #LeagueofLegends | Capture...,2,Negative


In [37]:
model_path = 'artifacts\model'
# trainer.save_model(model_path)
# tokenizer.save_pretrained(model_path)

In [40]:
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)


In [36]:
train_df.reset_index(inplace=True)

In [54]:
test_df['Text'][737]

'I actually quite like the design of the ps5. It truly feels like the next generation of a console rather than just being a bulkier box with more power'

In [47]:
from transformers import pipeline
nlp = pipeline('sentiment-analysis', model=model,tokenizer=tokenizer)
res=nlp(test_df['Text'][737])

In [48]:
res

[{'label': 'LABEL_3', 'score': 0.7213824391365051}]

In [51]:
# Define the mapping dictionary
label_mapping = {
    'LABEL_0': 0,
    'LABEL_1': 1,
    'LABEL_2': 2,
    'LABEL_3': 3
}

# Iterate through the list and update the 'label' value
for item in res:
    item['label'] = label_mapping[item['label']]

print(res)

[{'label': 3, 'score': 0.7213824391365051}]


In [53]:
# Example encoded category value
encoded_category = res[0]['label']

score = res[0]['score']
# Decode the encoded category using the loaded label encoder
original_category = label_encoder.inverse_transform([encoded_category])[0]

# Create a dictionary containing the original category and the score
result_dict = {"result": original_category, "score": score}

print(result_dict)

{'result': 'Positive', 'score': 0.7213824391365051}


In [None]:
model = model
model = torch.load(PATH)


In [None]:
test_df['Category'] = 0
test_texts = test_df['Text'].values.tolist()
test_labels = test_df['Category'].values.tolist() 
test_encodings = tokenizer(test_texts, truncation=True, padding=True,return_tensors = 'pt')
test_dataset = SentimentDataset(test_encodings, test_labels)
preds = model.predict(test_dataset=test_dataset)

In [None]:
test_encodings = tokenizer(test_texts[0], truncation=True, padding=True,return_tensors = 'pt')


In [None]:
test_dataset = SentimentTestDataset(test_encodings)

In [None]:
test_dataset

In [None]:
trainer.predict(test_dataset)