<a href="https://colab.research.google.com/github/yinon2592/DL_Project_046211/blob/main/main_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
from google.colab import drive
# drive.mount('/content/drive/my-drive/project_calculations')
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **picking sentiment classifier**

In [15]:
# !pip install transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the pre-trained sentiment classification model and tokenizer
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
sentiment_classifier_model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_classifier_tokenizer = AutoTokenizer.from_pretrained(model_name)

# Text input for sentiment classification
text = "I really enjoyed the movie. It was fantastic!"

# Tokenize the text
encoded_input = sentiment_classifier_tokenizer(text, truncation=True, padding=True, return_tensors='pt')

# Perform sentiment classification
output = sentiment_classifier_model(**encoded_input)

# Retrieve the predicted label and associated probabilities
predicted_label = output.logits.argmax().item()
predicted_probabilities = output.logits.softmax(dim=1).tolist()[0]

# Map the predicted label to sentiment class
sentiment_classes = ["Negative", "Positive"]
predicted_sentiment = sentiment_classes[predicted_label]

# Print the predicted sentiment and associated probabilities
print("Predicted Sentiment:", predicted_sentiment)
print("Sentiment Probabilities:", predicted_probabilities)

Predicted Sentiment: Positive
Sentiment Probabilities: [0.00012409620103426278, 0.9998759031295776]


# **Load 'sentiment140' Test Data**




In [18]:
import pandas as pd
import re
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# load test data (data already cleaned)
test_data_path = '/content/drive/My Drive/project_dataset/test_data.csv'
df = pd.read_csv(test_data_path)
df = df.sample(10)
df['generated_sentence'] = ""

# **prompt engineer gpt2 to rephrase sentenses with opposite sentiment**

In [19]:
# Step 3: Model Loading
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Step 4: Sentence Generation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use GPU if available
model.to(device)
model.eval()

with torch.no_grad():
    for i, row in df.iterrows():
        sentiment = row['label']
        opposite_sentiment = 'positive' if sentiment == 'negative' else 'negative'  # Determine the opposite sentiment
        input_prompt = f"rephrase the sentence to {opposite_sentiment} sentiment: {row['text']}"
        input_ids = tokenizer.encode(input_prompt, add_special_tokens=True, return_tensors='pt').to(device)

        # Check if input_ids is not None and has a valid shape
        if input_ids is not None and input_ids.shape[-1] > 0:
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=input_ids.ne(0),  # Pass attention mask to the model (assuming pad_token_id is 0)
                max_length=100,
                num_return_sequences=1,
                pad_token_id=0  # Set pad_token_id to 0
            )
            generated_sentence = tokenizer.decode(generated_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)

            df.at[i, 'generated_sentence'] = generated_sentence
        else:
            df.at[i, 'generated_sentence'] = ""  # Assign an empty string if input_ids is None or has shape (0, )

# Print the generated sentences
# print(df['generated_sentence'])
print(df)


        label                                               text  \
338331      1                         take good care n rest well   
442301      0  my ipod died so i cant finish my movie umm so ...   
403006      0               elena lost today after a great fight   
348506      1                     haha you make me laugh so much   
213289      0  i m going back to bed cus i don t feel well ma...   
99391       1                             i thought you d dig it   
199485      1  watching ace ventura with the sis quality bond...   
288263      1  that is really cute i thought about doing that...   
126371      1  has a interview for an internship wed and one ...   
51296       0  incredibly upset eddie s attic just announced ...   

                                       generated_sentence  
338331  .\n\nThe following is a list of the most commo...  
442301   is over umm so i guess i dont have to sleep u...  
403006   with her husband.\n\nThe woman, who is not na...  
348506  .\n

# **check accuracy with sentiment classifier**

In [21]:
# Create a list to store the accuracy values
accuracies = []

# Iterate through the DataFrame
for i, row in df.iterrows():
    original_sentence = row['text']
    opposite_sentence = row['generated_sentence']

    # Encode the original and opposite sentences
    original_input = sentiment_classifier_tokenizer(original_sentence, truncation=True, padding=True, return_tensors='pt')
    opposite_input = sentiment_classifier_tokenizer(opposite_sentence, truncation=True, padding=True, return_tensors='pt')

    # Get the input IDs and attention mask tensors
    original_input_ids = original_input['input_ids'].to(device)
    original_attention_mask = original_input['attention_mask'].to(device)
    opposite_input_ids = opposite_input['input_ids'].to(device)
    opposite_attention_mask = opposite_input['attention_mask'].to(device)

    # Move the sentiment classification model to the desired device
    sentiment_classifier_model = sentiment_classifier_model.to(device)

    # Make predictions for the original and opposite sentences
    with torch.no_grad():
        original_outputs = sentiment_classifier_model(original_input_ids, attention_mask=original_attention_mask)
        opposite_outputs = sentiment_classifier_model(opposite_input_ids, attention_mask=opposite_attention_mask)

    # Get the predicted labels
    original_predicted_label = torch.argmax(original_outputs.logits).item()
    opposite_predicted_label = torch.argmax(opposite_outputs.logits).item()

    # Check if the opposite rephrase has the opposite sentiment label
    is_opposite = original_predicted_label != opposite_predicted_label

    # Calculate accuracy and add it to the list
    accuracy = 1 if is_opposite else 0
    accuracies.append(accuracy)

# Calculate the overall accuracy
overall_accuracy = sum(accuracies) / len(accuracies)

# Print the overall accuracy
print(f"Accuracy: {overall_accuracy}")

Accuracy: 0.4
