In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.probability import FreqDist
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

In [3]:
stop_words = set(stopwords.words('english'))
wordnet_lem = WordNetLemmatizer()
eng_stems = SnowballStemmer('english')

In [174]:
reviews = pd.read_csv(r"path/to/Reviews.csv")
reviews_backup = reviews

In [175]:
reviews.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [176]:
reviews.shape

(568454, 10)

In [177]:
reviews.drop(["Id", "ProductId", "UserId", "ProfileName", "HelpfulnessNumerator", "HelpfulnessDenominator", "Time"], axis = 1, inplace = True)

In [178]:
reviews.isnull().sum()

Score       0
Summary    27
Text        0
dtype: int64

In [179]:
reviews.dropna(inplace= True)
reviews = reviews.reset_index(drop=True)

In [180]:
reviews.shape

(568427, 3)

In [181]:
reviews.head()

Unnamed: 0,Score,Summary,Text
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,2,Cough Medicine,If you are looking for the secret ingredient i...
4,5,Great taffy,Great taffy at a great price. There was a wid...


Function to replace all the numerical values by 2 classes viz positive and negative reviews. Postive reviews are returned as 1 and negative reviews are returned as 0

In [182]:
def label_to_score(score):
    if score >= 3:
        return 1
    else:
        return 0

In [183]:
reviews["Score"] = reviews["Score"].apply(lambda x: label_to_score(x))

In [184]:
reviews.head()

Unnamed: 0,Score,Summary,Text
0,1,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,0,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,1,"""Delight"" says it all",This is a confection that has been around a fe...
3,0,Cough Medicine,If you are looking for the secret ingredient i...
4,1,Great taffy,Great taffy at a great price. There was a wid...


In [185]:
reviews["Score"].value_counts()

Score
1    486415
0     82012
Name: count, dtype: int64

In [186]:
positive_data = reviews[reviews["Score"] == 1]
positive_data = positive_data.iloc[:82013, :]
positive_data.shape

(82013, 3)

In [187]:
negative_data = reviews[reviews["Score"] == 0]
negative_data.shape

(82012, 3)

In [188]:
reviews = pd.concat([positive_data, negative_data], axis = 0)
reviews.tail()

Unnamed: 0,Score,Summary,Text
568406,0,Tastes horrible!,I just bought this soup today at my local groc...
568407,0,Not so good,This soup is mostly broth. Although it has a k...
568408,0,Where's the tortellini?,"It is mostly broth, with the advertised 3/4 cu..."
568419,0,Mixed wrong,I had ordered some of these a few months back ...
568423,0,disappointed,I'm disappointed with the flavor. The chocolat...


In [189]:
reviews = reviews.sample(frac=1).reset_index(drop=True)
reviews.tail()

Unnamed: 0,Score,Summary,Text
164020,1,Best canned clams I've ever used,My local grocery store has stopped stocking Cr...
164021,1,Good product. High price,They are only two dollars a pop at speedway an...
164022,1,great oatmeal!,I loved this oatmeal. I have always been a fan...
164023,1,K-Cup Columbian Coffee,This product was purchased at a good price. Th...
164024,0,Not Good,I didn't find this product good for kids under...


In [190]:
reviews.drop(["Summary"], axis = 1, inplace = True)

In [144]:
reviews.to_csv("pre_processed.csv", index=False)

In [21]:
def Data_Cleaning(df, ColumnName):
    for i in range(len(df[ColumnName])):
        tokens = word_tokenize(df[ColumnName][i].lower())
        removed_stopwords = [word for word in tokens if (not word in stop_words) and (word.isalpha())]
        lemmas = [wordnet_lem.lemmatize(word) for word in removed_stopwords]
        stems = [eng_stems.stem(word) for word in lemmas]
        df[ColumnName][i] = stems
    return df

In [None]:
reviews = Data_Cleaning(reviews, "Text")

In [23]:
reviews["Text"] = reviews['Text'].apply(lambda y: " ".join(y))

In [24]:
reviews.head()

Unnamed: 0,Score,Text
0,1,sweet syrupi like u soda refresh calori per oz...
1,1,much like sunshin raisin biscuit longer manufa...
2,0,despit fact nutrit inform amazon state product...
3,1,popcorn good first time use use half salt real...
4,1,son love applesauc even almost mess eat spoon ...


In [126]:
reviews.to_csv("pre_processed.csv", index=False)

In [25]:
reviews.Score.value_counts()

Score
1    82013
0    82012
Name: count, dtype: int64

TFIDF

In [26]:
tfidf_vect = TfidfVectorizer()

In [27]:
x_train, x_test, y_train, y_test = train_test_split(reviews["Text"], reviews["Score"], test_size = 0.2, random_state=42)

In [28]:
x_train = tfidf_vect.fit_transform(x_train)
x_test = tfidf_vect.transform(x_test)

Naive Bayes with TF-IDF

In [29]:
nb = MultinomialNB()

In [30]:
nb_tfidf = nb.fit(x_train, y_train)

In [31]:
nb_tfidf_train_results = nb_tfidf.predict(x_train)

In [32]:
nb_tfidf_train_acc = accuracy_score(nb_tfidf_train_results, y_train)
nb_tfidf_train_acc*100

86.47081237616217

In [33]:
nb_tfidf_test_results = nb_tfidf.predict(x_test)

In [34]:
nb_tfidf_test_acc = accuracy_score(nb_tfidf_test_results, y_test)
nb_tfidf_test_acc*100

84.95046486816035

In [35]:
nb_tfidf_conf = confusion_matrix(nb_tfidf_test_results, y_test)
nb_tfidf_conf

array([[13739,  2441],
       [ 2496, 14129]], dtype=int64)

In [36]:
nb_tfidf_Report = classification_report(nb_tfidf_test_results, y_test)
print(nb_tfidf_Report)

              precision    recall  f1-score   support

           0       0.85      0.85      0.85     16180
           1       0.85      0.85      0.85     16625

    accuracy                           0.85     32805
   macro avg       0.85      0.85      0.85     32805
weighted avg       0.85      0.85      0.85     32805



Random Forest with TF-IDF

In [209]:
rf = RandomForestClassifier()

In [38]:
rf_tfidf = rf.fit(x_train, y_train)

In [39]:
rf_tfidf_train_results = rf_tfidf.predict(x_train)

In [40]:
rf_tfidf_train_acc = accuracy_score(rf_tfidf_train_results, y_train)
rf_tfidf_train_acc*100

99.9961896052431

In [41]:
rf_tfidf_train_conf = confusion_matrix(rf_tfidf_train_results, y_train)
rf_tfidf_train_conf

array([[65776,     4],
       [    1, 65439]], dtype=int64)

In [42]:
rf_tfidf_test_results = rf_tfidf.predict(x_test)

In [43]:
rf_tfidf_test_acc = accuracy_score(rf_tfidf_test_results, y_test)
rf_tfidf_test_acc*100

88.59625057155921

In [44]:
rf_tfidf_conf = confusion_matrix(rf_tfidf_test_results, y_test)
rf_tfidf_conf

array([[14236,  1742],
       [ 1999, 14828]], dtype=int64)

In [45]:
rf_tfidf_Report = classification_report(rf_tfidf_test_results, y_test)
print(rf_tfidf_Report)

              precision    recall  f1-score   support

           0       0.88      0.89      0.88     15978
           1       0.89      0.88      0.89     16827

    accuracy                           0.89     32805
   macro avg       0.89      0.89      0.89     32805
weighted avg       0.89      0.89      0.89     32805



Decision Tree with Tfidf

In [208]:
dt = DecisionTreeClassifier()

In [47]:
dt_tfidf = dt.fit(x_train, y_train)

In [48]:
dt_tfidf_train_results = dt_tfidf.predict(x_train)

In [49]:
dt_tfidf_train_acc = accuracy_score(dt_tfidf_train_results, y_train)
dt_tfidf_train_acc*100

99.9961896052431

In [50]:
dt_tfidf_train_conf = confusion_matrix(dt_tfidf_train_results, y_train)
dt_tfidf_train_conf

array([[65777,     5],
       [    0, 65438]], dtype=int64)

In [51]:
dt_tfidf_test_results = dt_tfidf.predict(x_test)

In [52]:
dt_tfidf_test_acc = accuracy_score(dt_tfidf_test_results, y_test)
dt_tfidf_test_acc*100

79.77747294619722

In [53]:
dt_tfidf_test_conf = confusion_matrix(dt_tfidf_test_results, y_test)
dt_tfidf_test_conf

array([[13083,  3482],
       [ 3152, 13088]], dtype=int64)

In [62]:
dt_tfidf_Report = classification_report(dt_tfidf_test_results, y_test)
print(dt_tfidf_Report)

              precision    recall  f1-score   support

           0       0.81      0.79      0.80     16565
           1       0.79      0.81      0.80     16240

    accuracy                           0.80     32805
   macro avg       0.80      0.80      0.80     32805
weighted avg       0.80      0.80      0.80     32805



Word2Vec

In [193]:
import gensim
import gensim.downloader as gensim_api
from gensim.models import Word2Vec

Word2Vec model

In [194]:
vector_size = 100  # Dimensionality of the word vectors
window = 5  # Context window size
min_count = 1  # Minimum frequency count of words to consider
workers = 4  # Number of CPU cores to use for training
processed_reviews = reviews["Text"]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=processed_reviews,
                          vector_size=vector_size,
                          window=window,
                          min_count=min_count,
                          workers=workers)

# Save trained Word2Vec model
word2vec_model.save("word2vec_amazon_fine_food_reviews.model")

In [195]:
word2vec_model = Word2Vec.load("word2vec_amazon_fine_food_reviews.model")

def compute_average_embedding(review_tokens):
    embeddings = []
    for token in review_tokens:
        if token in word2vec_model.wv:
            embeddings.append(word2vec_model.wv[token])
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)
# Evaluate classifier


In [196]:
X = np.array([compute_average_embedding(rev) for rev in processed_reviews])
y = np.array(reviews['Score'])

In [197]:
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X, y, test_size=0.2, random_state=42)

Naive Bayes with Word2Vec

In [198]:
gnb = GaussianNB()

In [199]:
nb_word2vec = gnb.fit(X_train_w2v, y_train_w2v)

In [200]:
nb_word2vec_train_results = nb_word2vec.predict(X_train_w2v)

In [201]:
nb_word2vec_train_acc = accuracy_score(nb_word2vec_train_results, y_train_w2v)
nb_word2vec_train_acc*100

51.746684956561495

In [202]:
nb_word2vec_train_conf = confusion_matrix(nb_word2vec_train_results, y_train_w2v)
nb_word2vec_train_conf

array([[ 6280,  3952],
       [59366, 61622]], dtype=int64)

In [203]:
nb_word2vec_test_results = nb_word2vec.predict(X_test_w2v)

In [204]:
nb_word2vec_test_acc = accuracy_score(nb_word2vec_test_results, y_test_w2v)
nb_word2vec_test_acc*100

51.81832037799116

In [205]:
nb_word2vec_test_conf = confusion_matrix(nb_word2vec_test_results, y_test_w2v)
nb_word2vec_test_conf

array([[ 1614,  1054],
       [14752, 15385]], dtype=int64)

In [206]:
nb_w2v_Report = classification_report(nb_word2vec_test_results, y_test_w2v)
print(nb_w2v_Report)

              precision    recall  f1-score   support

           0       0.10      0.60      0.17      2668
           1       0.94      0.51      0.66     30137

    accuracy                           0.52     32805
   macro avg       0.52      0.56      0.42     32805
weighted avg       0.87      0.52      0.62     32805



Random Forst with Word2Vec

In [210]:
rf_word2vec = rf.fit(X_train_w2v, y_train_w2v)

In [211]:
rf_word2vec_train_results = rf_word2vec.predict(X_train_w2v)

In [212]:
rf_word2vec_train_acc = accuracy_score(rf_word2vec_train_results, y_train_w2v)
rf_word2vec_train_acc*100

99.99771376314585

In [213]:
rf_word2vec_train_conf = confusion_matrix(rf_word2vec_train_results, y_train_w2v)
rf_word2vec_train_conf

array([[65646,     3],
       [    0, 65571]], dtype=int64)

In [214]:
rf_word2vec_test_results = rf_word2vec.predict(X_test_w2v)

In [215]:
rf_word2vec_test_acc = accuracy_score(rf_word2vec_test_results, y_test_w2v)
rf_word2vec_test_acc*100

74.9702789208962

In [216]:
rf_word2vec_test_conf = confusion_matrix(rf_word2vec_test_results, y_test_w2v)
rf_word2vec_test_conf

array([[11687,  3532],
       [ 4679, 12907]], dtype=int64)

In [217]:
rf_w2v_Report = classification_report(rf_word2vec_test_results, y_test_w2v)
print(rf_w2v_Report)

              precision    recall  f1-score   support

           0       0.71      0.77      0.74     15219
           1       0.79      0.73      0.76     17586

    accuracy                           0.75     32805
   macro avg       0.75      0.75      0.75     32805
weighted avg       0.75      0.75      0.75     32805



Decision Tree with Word2Vec

In [218]:
dt_word2vec = dt.fit(X_train_w2v, y_train_w2v)

In [219]:
dt_word2vec_train_results = dt_word2vec.predict(X_train_w2v)

In [220]:
dt_word2vec_train_acc = accuracy_score(dt_word2vec_train_results, y_train_w2v)
dt_word2vec_train_acc*100

99.99771376314585

In [221]:
dt_word2vec_train_conf = confusion_matrix(dt_word2vec_train_results, y_train_w2v)
dt_word2vec_train_conf

array([[65646,     3],
       [    0, 65571]], dtype=int64)

In [222]:
dt_word2vec_test_results = dt_word2vec.predict(X_test_w2v)

In [223]:
dt_word2vec_test_acc = accuracy_score(dt_word2vec_test_results, y_test_w2v)
dt_word2vec_test_acc*100

68.72732815119646

In [224]:
dt_word2vec_test_conf = confusion_matrix(dt_word2vec_test_results, y_test_w2v)
dt_word2vec_test_conf

array([[11505,  5398],
       [ 4861, 11041]], dtype=int64)

In [225]:
dt_w2v_Report = classification_report(dt_word2vec_test_results, y_test_w2v)
print(dt_w2v_Report)

              precision    recall  f1-score   support

           0       0.70      0.68      0.69     16903
           1       0.67      0.69      0.68     15902

    accuracy                           0.69     32805
   macro avg       0.69      0.69      0.69     32805
weighted avg       0.69      0.69      0.69     32805



BERT

In [None]:
reviews = pd.read_csv(r"path/to\pre_processed.csv")

In [6]:
backup_x, main_x, backup_y, main_y = train_test_split(reviews['Text'], reviews['Score'], test_size = 0.5, random_state = 42)

In [7]:
main_x.shape

(82013,)

In [8]:
main_y.value_counts()

Score
1    41072
0    40941
Name: count, dtype: int64

In [9]:
reviews_sub = pd.concat([main_x, main_y], axis = 1)
reviews_sub = reviews_sub.sample(frac=1).reset_index(drop=True)
reviews_sub.head()

Unnamed: 0,Text,Score
0,I tried some of this chocolate sugar free syru...,0
1,Seriously. This isn't one of those bars of cho...,1
2,I was thinking of buying from here but then I ...,0
3,These olives were introduced to me by a food f...,1
4,This coffee is just a great tasting coffee. I...,1


In [10]:
x_60, x_40, y_60, y_40 = train_test_split(reviews_sub['Text'], reviews_sub['Score'], test_size = 0.4, random_state = 42)

In [11]:
reviews_6 = pd.concat([x_40, y_40], axis = 1)
reviews_6 = reviews_6.sample(frac=1).reset_index(drop=True)
reviews_6.head()

Unnamed: 0,Text,Score
0,The tea is good and fresh. We enjoy it. The sh...,1
1,"Zero out of five cats, all ferals, said euck.<...",0
2,I did not like Truvia at all. I tried baking w...,0
3,This product contains non-organic MonSatan soy...,0
4,I was very disappointed that my Easter message...,0


In [48]:
reviews_6.shape

(32806, 2)

In [13]:
x_train_bert, x_test_bert, y_train_bert, y_test_bert = train_test_split(reviews_6['Text'], reviews_6['Score'], test_size = 0.2, random_state = 42)

In [None]:
from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [15]:
classifier = pipeline("sentiment-analysis", device=0)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
predictions = []
for text in x_test_bert:
    truncated_text = text[:512]
    prediction = classifier(truncated_text)[0]
    predictions.append(prediction)

In [18]:
predicted_labels = [prediction['label'] for prediction in predictions]
label_map = {"NEGATIVE": 0, "POSITIVE": 1}
predicted_labels_numeric = [label_map[label] for label in predicted_labels]
y_true_filtered = y_test_bert[:len(predicted_labels_numeric)]

In [19]:
accuracy_bert = accuracy_score(y_true_filtered, predicted_labels_numeric)
precision_bert = precision_score(y_true_filtered, predicted_labels_numeric)
recall_bert = recall_score(y_true_filtered, predicted_labels_numeric)
f1_bert = f1_score(y_true_filtered, predicted_labels_numeric)

In [21]:
print("Accuracy=", accuracy_bert)
print("Precision=", precision_bert)
print('Recall=', recall_bert)
print('F1 Score=', f1_bert)

Accuracy= 0.8241389820176775
Precision= 0.8717504332755632
Recall= 0.762352227947863
F1 Score= 0.8133893919793014


bert fine tune

In [130]:
texts = reviews_6['Text'].tolist()
labels = reviews_6['Score'].tolist()

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification,  AdamW

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")


In [125]:
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True) for text in texts]
attention_masks = [[1] * len(text) + [0] * (512 - len(text)) for text in tokenized_texts]

max_len = max(len(token) for token in tokenized_texts)
padded_tokenized_texts = [token + [0] * (max_len - len(token)) for token in tokenized_texts]


In [132]:
from transformers import AdamW

print(len(padded_tokenized_texts))
print(len(attention_masks))

o = np.array(padded_tokenized_texts)
a = np.array(attention_masks)

print(o.shape, a.shape)

np.array(labels).shape

32806
32806
(32806, 512) (32806, 512)


(32806,)

In [None]:
input_ids = torch.tensor(padded_tokenized_texts)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(labels)


# print(f"input_ids = {input_ids.shape}, attention masks = {attention_masks.shape}, labels = {labels.shape} ")

In [134]:
train_inputs, val_inputs, train_attention_masks, val_attention_masks, train_labels, val_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42)

In [89]:
train_inputs.shape, val_inputs.shape, train_attention_masks.shape, val_attention_masks.shape, train_labels.shape, val_labels.shape

(torch.Size([26244, 512]),
 torch.Size([6562, 512]),
 torch.Size([26244, 512]),
 torch.Size([6562, 512]),
 torch.Size([26244]),
 torch.Size([6562]))

In [135]:
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(train_inputs, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_attention_masks, val_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

In [None]:
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

In [137]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [138]:
from tqdm import tqdm

num_epochs = 2

for epoch in range(num_epochs):
    model.train()
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    for batch in train_loader_tqdm:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        train_loader_tqdm.set_postfix(loss=loss.item())  

Epoch 1/2: 100%|██████████| 3281/3281 [21:18<00:00,  2.57it/s, loss=0.0848] 
Epoch 2/2: 100%|██████████| 3281/3281 [20:58<00:00,  2.61it/s, loss=0.381]  


In [144]:
model.eval()

total_val_loss = 0
correct = 0
total = 0

with torch.no_grad():  
    val_loader_tqdm = tqdm(val_loader, desc="Validation")
    lbls = []
    preds = []
    for batch in val_loader_tqdm:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_val_loss += loss.item()
        
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        lbls.append(labels.detach().cpu().numpy())
        preds.append(predicted.detach().cpu().numpy())
lbls = np.concatenate(lbls)
preds = np.concatenate(preds)
avg_val_loss = total_val_loss / len(val_loader)
accuracy = correct / total# prec
precision1 = precision_score(lbls, preds)
recall1 = recall_score(lbls, preds)
f1_1 = f1_score(lbls, preds)

print(f'Validation Loss: {avg_val_loss}, Accuracy: {accuracy}')
print(f'Precision: {precision1}, Recall: {recall1}, F1 Score: {f1_1}')


Validation: 100%|██████████| 821/821 [01:33<00:00,  8.79it/s]

Validation Loss: 0.21977859265091584, Accuracy: 0.9198415117342273
Precision: 0.9479806138933764, Recall: 0.8893604122461352, F1 Score: 0.9177353769158586





Bert with Lora

In [156]:
from transformers import BertForSequenceClassification

In [157]:
from peft import LoraConfig, get_peft_model

In [158]:
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [159]:
lora_config = LoraConfig( r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",)

In [160]:
lorabert = get_peft_model(bert_model, lora_config)

In [None]:
optimizer = AdamW(model.parameters(), lr=5e-5)

In [162]:
criterion = torch.nn.CrossEntropyLoss()

In [163]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lorabert.to(device)

PeftModel(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, ou

In [167]:
num_epochs = 2

for epoch in range(num_epochs):
    lorabert.train()
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    for batch in train_loader_tqdm:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs1 = lorabert(input_ids, attention_mask=attention_mask, labels=labels)
        loss1 = outputs1.loss
        loss1.backward()
        optimizer.step()
        
        train_loader_tqdm.set_postfix(loss1=loss1.item())

Epoch 1/2: 100%|██████████| 3281/3281 [43:01<00:00,  1.27it/s, loss1=0.827]
Epoch 2/2: 100%|██████████| 3281/3281 [43:04<00:00,  1.27it/s, loss1=0.614]


In [171]:
lorabert.eval()

total_val_loss1 = 0
correct1 = 0
total1 = 0
lbls1 = []
preds1 = []
with torch.no_grad():  
    val_loader_tqdm = tqdm(val_loader, desc="Validation")
    for batch in val_loader_tqdm:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        outputs1 = lorabert(input_ids, attention_mask=attention_mask, labels=labels)
        loss1 = outputs1.loss
        total_val_loss1 += loss1.item()
        
        _, predicted1 = torch.max(outputs1.logits, 1)
        total1 += labels.size(0)
        correct1 += (predicted1 == labels).sum().item()
        lbls1.append(labels.detach().cpu().numpy())
        preds1.append(predicted1.detach().cpu().numpy())
print(len(lbls1), len(preds1))

Validation: 100%|██████████| 821/821 [03:11<00:00,  4.30it/s]

821 821





In [173]:
lbls1 = np.concatenate(lbls1)
preds1 = np.concatenate(preds1)
avg_val_loss1 = total_val_loss / len(val_loader)
accuracy1 = correct1 / total1
precision2 = precision_score(lbls1, preds1)
recall2 = recall_score(lbls1, preds1)
f1_2 = f1_score(lbls1, preds1)

print(f'Validation Loss: {avg_val_loss1}, Accuracy: {accuracy1}')
print(f'Precision: {precision2}, Recall: {recall2}, F1 Score: {f1_2}')

Validation Loss: 0.21977859265091584, Accuracy: 0.5022858884486437
Precision: 0.5029940119760479, Recall: 0.840254622612913, F1 Score: 0.6292849035187287
