# ****Sentiment Analysis/Rating****

In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Step 1: Load the dataset
# Assuming your dataset has two columns: 'Comments' and 'Rating'
df = pd.read_csv("/kaggle/input/charity-reviews-extended/charity_comments_dataset_large.csv")  # Replace with your actual dataset path

# Step 2: Preprocess and tokenize the data
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenizing the text data
def tokenize_function(examples):
    return tokenizer(examples['Review'], padding='max_length', truncation=True, max_length=128)

# Rename columns to match the format expected by the dataset
df = df.rename(columns={"Comment": "Review"})
df['Rating'] = df['Rating'] - 1
# df.head()
# Split the dataset into training and testing (80-20 split)
train_data, val_data = train_test_split(df, test_size=0.2)

# Tokenize the dataset using the tokenizer function
train_encodings = tokenizer(list(train_data['Review']), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_data['Review']), truncation=True, padding=True, max_length=128)

# Convert the ratings to tensor format
train_labels = torch.tensor(list(train_data['Rating']))
val_labels = torch.tensor(list(val_data['Rating']))

# Step 3: Create a custom Dataset class for PyTorch
class CharityReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CharityReviewDataset(train_encodings, train_labels)
val_dataset = CharityReviewDataset(val_encodings, val_labels)

# Step 4: Initialize the BERT model for classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

# Step 5: Set up training arguments
training_args = TrainingArguments(
    output_dir='./sentiment_results',              # Output directory for the model checkpoints
    evaluation_strategy="epoch",         # Evaluate after each epoch
    per_device_train_batch_size=8,       # Batch size for training
    per_device_eval_batch_size=8,        # Batch size for evaluation
    num_train_epochs=3,                  # Number of training epochs
    weight_decay=0.01,                   # Weight decay strength
    logging_dir='./sentiment_logs',                # Directory for logs
    logging_steps=10,                    # Logging frequency
)

# Step 6: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Step 7: Train the model
trainer.train()

# Step 8: Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Step 9: Save the model
model.save_pretrained('./sentiment_charity_review_model')
tokenizer.save_pretrained('./sentiment_charity_review_model')

#7b084bd9b24acdf199fcd590a4235f52abad2a1c

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,0.0003,0.000208
2,0.0001,0.000101
3,0.0001,8e-05


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Evaluation results: {'eval_loss': 8.043373964028433e-05, 'eval_runtime': 11.5781, 'eval_samples_per_second': 259.11, 'eval_steps_per_second': 16.238, 'epoch': 3.0}


('./sentiment_charity_review_model/tokenizer_config.json',
 './sentiment_charity_review_model/special_tokens_map.json',
 './sentiment_charity_review_model/vocab.txt',
 './sentiment_charity_review_model/added_tokens.json')

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the model and tokenizer from the saved directory
model = BertForSequenceClassification.from_pretrained('./sentiment_charity_review_model')
tokenizer = BertTokenizer.from_pretrained('./sentiment_charity_review_model')


In [4]:
def predict_sentiment(text, model, tokenizer, max_length=128):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=max_length)
    
    # Run the model to get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        
    # Convert logits to predicted label
    prediction = torch.argmax(logits, dim=-1).item()  # Get the class with the highest score
    return prediction


In [5]:
# Example review
review = "i was with the performance"

# Predict the rating
predicted_rating = predict_sentiment(review, model, tokenizer)
print(f"The predicted rating for the review is: {predicted_rating + 1}")  # Adding 1 to get the original rating (1-5)


The predicted rating for the review is: 2


# ****Review Classification(General/Charity)****

In [8]:
import pandas as pd

# Step 1: Load the two CSV files
charity = pd.read_csv('/kaggle/input/charity-reviews-extended/charity_comments_dataset_large.csv')  # Replace with your actual file paths
general = pd.read_csv('/kaggle/input/amazon-reviews/train.csv')

In [9]:
general.head()

Unnamed: 0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...


In [10]:
general.rename(columns={ 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^': 'Review'}, inplace=True)
# Dropping both '2' and 'Great CD' columns
general = general.drop(['2', 'Stuning even for the non-gamer'], axis=1)
general["label"]=0
# Confirming the columns are dropped
print(general.columns)

Index(['Review', 'label'], dtype='object')


In [11]:
charity.head()

Unnamed: 0,Comment,Rating
0,This organization seems like a scam. I hope th...,1
1,The volunteers were very rude.,1
2,No transparency in how funds are utilized. Suc...,1
3,The volunteers were very rude. I hope they con...,1
4,This organization seems like a scam.,1


In [12]:
charity.rename(columns={ 'Comment': 'Review'}, inplace=True)
# Dropping both '2' and 'Great CD' columns
charity = charity.drop(['Rating'], axis=1)
charity["label"]=1
# Confirming the columns are dropped
print(charity.columns)

Index(['Review', 'label'], dtype='object')


In [13]:
print(len(charity))
print(len(general))

15000
3599999


In [16]:
general = general.head(len(charity))

In [17]:
print(len(charity))
print(len(general))

15000
15000


In [18]:
combined_reviews = pd.concat([charity[['Review', 'label']], general[['Review', 'label']]], axis=0).reset_index(drop=True)

# Print the first few rows of the combined DataFrame
print(combined_reviews.head())
print(len(combined_reviews))

                                              Review  label
0  This organization seems like a scam. I hope th...      1
1                     The volunteers were very rude.      1
2  No transparency in how funds are utilized. Suc...      1
3  The volunteers were very rude. I hope they con...      1
4               This organization seems like a scam.      1
30000


In [27]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Step 1: Split the dataset into training and testing (80-20 split)
# Assuming 'combined_reviews' is the DataFrame containing your reviews and labels
train_data, val_data = train_test_split(combined_reviews, test_size=0.2)

# Step 2: Preprocess and tokenize the data
# Load the BERT tokenizer
tokenizer_new = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenizing function
def tokenize_function_new(examples):
    return tokenizer_new(examples['Review'], padding='max_length', truncation=True, max_length=128)

# Tokenize the dataset
train_encodings_new = tokenizer_new(list(train_data['Review']), truncation=True, padding=True, max_length=128)
val_encodings_new = tokenizer_new(list(val_data['Review']), truncation=True, padding=True, max_length=128)

# Convert the labels to tensor format
train_labels_new = torch.tensor(list(train_data['label']))
val_labels_new = torch.tensor(list(val_data['label']))

# Step 3: Create a custom Dataset class for PyTorch
class ReviewDatasetNew(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset_new = ReviewDatasetNew(train_encodings_new, train_labels_new)
val_dataset_new = ReviewDatasetNew(val_encodings_new, val_labels_new)

# Step 4: Initialize the BERT model for binary classification (2 labels: 0 and 1)
model_new = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Step 5: Set up training arguments
training_args_new = TrainingArguments(
    output_dir='./Category_new_results',              # Output directory for the model checkpoints
    evaluation_strategy="epoch",             # Evaluate after each epoch
    per_device_train_batch_size=8,           # Batch size for training
    per_device_eval_batch_size=8,            # Batch size for evaluation
    num_train_epochs=4,                      # Number of training epochs
    weight_decay=0.01,                       # Weight decay strength
    # logging_dir='./Category_new_logs',                # Directory for logs
    logging_steps=10,                        # Logging frequency
)

# Step 6: Initialize the Trainer
trainer_new = Trainer(
    model=model_new,
    args=training_args_new,
    train_dataset=train_dataset_new,
    eval_dataset=val_dataset_new,
)

# Step 7: Train the model
trainer_new.train()

# Step 8: Evaluate the model
eval_results_new = trainer_new.evaluate()
print(f"Evaluation results: {eval_results_new}")

# Step 9: Save the model and tokenizer
model_new.save_pretrained('./Category_new_charity_review_model')
tokenizer_new.save_pretrained('./Category_new_charity_review_model')

# Optionally, you can also save the evaluation results in a file
# eval_results_df_new = pd.DataFrame([eval_results_new])
# eval_results_df_new.to_csv('./new_evaluation_results.csv', index=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,0.0,1e-05
2,0.0,8e-06
3,0.0,7e-06
4,0.0,1e-06


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(devic

Evaluation results: {'eval_loss': 7.532453878411616e-07, 'eval_runtime': 32.736, 'eval_samples_per_second': 183.284, 'eval_steps_per_second': 11.455, 'epoch': 4.0}


('./Category_new_charity_review_model/tokenizer_config.json',
 './Category_new_charity_review_model/special_tokens_map.json',
 './Category_new_charity_review_model/vocab.txt',
 './Category_new_charity_review_model/added_tokens.json')

In [28]:
# Tokenize the test data
test_encodings_new = tokenizer_new(list(val_data['Review']), truncation=True, padding=True, max_length=128)

# Convert the labels to tensor format for test data
test_labels_new = torch.tensor(list(val_data['label']))

# Create a dataset for the test data
test_dataset_new = ReviewDatasetNew(test_encodings_new, test_labels_new)


In [29]:
# Perform prediction on the test data
test_results_new = trainer_new.predict(test_dataset_new)

# test_results_new.predictions contains the logits from the model
# Apply softmax to convert logits to probabilities and get predicted labels
predicted_labels_new = torch.argmax(torch.tensor(test_results_new.predictions), dim=1)

# Print predicted labels and actual labels
print(f"Predicted labels: {predicted_labels_new}")
print(f"Actual labels: {test_labels_new}")

# If you want to see the test metrics (accuracy, loss, etc.)
print(f"Test metrics: {test_results_new.metrics}")


Predicted labels: tensor([1, 0, 0,  ..., 0, 0, 0])
Actual labels: tensor([1, 0, 0,  ..., 0, 0, 0])
Test metrics: {'test_loss': 7.532453878411616e-07, 'test_runtime': 33.3594, 'test_samples_per_second': 179.859, 'test_steps_per_second': 11.241}


In [30]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
accuracy_new = accuracy_score(test_labels_new.numpy(), predicted_labels_new.numpy())
print(f"Test Accuracy: {accuracy_new}")


Test Accuracy: 1.0


# Predicting Review Category

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the model and tokenizer from the saved directory
model_category = BertForSequenceClassification.from_pretrained('./Category_new_charity_review_model')
tokenizer_category = BertTokenizer.from_pretrained('./Category_new_charity_review_model')

In [5]:
def predict_category(text, model, tokenizer, max_length=128):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=max_length)
    
    # Run the model to get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        
    # Convert logits to predicted label
    prediction = torch.argmax(logits, dim=-1).item()  # Get the class with the highest score
    return prediction


In [21]:
# Example review
labels = ['General','Charity']
review = "He is a good sports man"

# Predict the rating
predicted_category = predict_category(review, model_category, tokenizer_category)
print(f"The review is: {labels[predicted_category]}")  # Adding 1 to get the original rating (1-5)


The review is: Charity


In [2]:
import shutil

# Path to the charity_review_model folder
model_folder = './sentiment_charity_review_model'  # Replace with your actual folder path
zip_filename = 'sentiment_charity_review_model.zip'  # Name of the zip file to be created

# Create a zip archive of the charity_review_model folder
shutil.make_archive('sentiment_charity_review_model', 'zip', model_folder)

print(f"Model folder has been zipped as {zip_filename}.")


Model folder has been zipped as sentiment_charity_review_model.zip.


In [2]:
def predict_sentiment(text, model, tokenizer, max_length=128):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=max_length)
    
    # Run the model to get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        
    # Convert logits to predicted label
    prediction = torch.argmax(logits, dim=-1).item()  # Get the class with the highest score
    return prediction


In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

review = "That donation helped a lot of students"

category_model = BertForSequenceClassification.from_pretrained('./Category_new_charity_review_model')
category_tokenizer = BertTokenizer.from_pretrained('./Category_new_charity_review_model')

rating_model = BertForSequenceClassification.from_pretrained('./sentiment_charity_review_model')
rating_tokenizer = BertTokenizer.from_pretrained('./sentiment_charity_review_model')

predicted_category = predict_sentiment(review, category_model, category_tokenizer)
if(predicted_category):
    predicted_review = predict_sentiment(review, rating_model, rating_tokenizer)  
    print("Rating: ", predicted_review)
else:
    print("This is a general review")


Rating:  4


In [26]:
import shutil

# Path to the folder you want to delete
folder_path = '/kaggle/working/Category_new_results'  # replace with the actual folder path

# Delete the folder and its contents
shutil.rmtree(folder_path)

print(f"Deleted folder: {folder_path}")


Deleted folder: /kaggle/working/Category_new_results


In [25]:
import os

# Specify the path to the zip file you want to remove
zip_file_path = '/kaggle/working/new_evaluation_results.csv'  # replace with the actual file path

# Remove the zip file
if os.path.exists(zip_file_path):
    os.remove(zip_file_path)
    print(f"{zip_file_path} has been deleted.")
else:
    print(f"{zip_file_path} does not exist.")


IsADirectoryError: [Errno 21] Is a directory: '/kaggle/working/Category_new_results'