<a href="https://colab.research.google.com/github/yinon2592/DL_Project_046211/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Picking a sentiment classifier**

In [None]:
# !pip install transformers
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the pre-trained sentiment classification model and tokenizer.
# Both objects are reused by later cells, so their names must stay stable.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
sentiment_classifier_model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_classifier_tokenizer = AutoTokenizer.from_pretrained(model_name)

# Text input for sentiment classification
text = "I really enjoyed the movie. It was fantastic!"

# Tokenize the text into a batch of size one
encoded_input = sentiment_classifier_tokenizer(text, truncation=True, padding=True, return_tensors='pt')

# Perform sentiment classification.
# no_grad: this is pure inference, so skip building the autograd graph.
with torch.no_grad():
    output = sentiment_classifier_model(**encoded_input)

# Retrieve the predicted label and associated probabilities.
# Reduce over the class dimension explicitly; a bare argmax() would index into
# the flattened logits and only be correct for a single-sentence batch.
predicted_label = output.logits.argmax(dim=-1).item()
predicted_probabilities = output.logits.softmax(dim=-1).tolist()[0]

# Map the predicted label to sentiment class (index 0 -> Negative, 1 -> Positive)
sentiment_classes = ["Negative", "Positive"]
predicted_sentiment = sentiment_classes[predicted_label]

# Print the predicted sentiment and associated probabilities
print("Predicted Sentiment:", predicted_sentiment)
print("Sentiment Probabilities:", predicted_probabilities)

Predicted Sentiment: Positive
Sentiment Probabilities: [0.00012409620103426278, 0.9998759031295776]


# **Download 'sentiment140' Data and clean it**




In [None]:
# Fetch the Sentiment140 training archive into ./data and extract it there.
# wget -nc and unzip -n both skip files that already exist, so re-running
# this cell does not download or overwrite anything a second time.
!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip -P data
!unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

File ‘data/training.1600000.processed.noemoticon.csv.zip’ already there; not retrieving.

Archive:  data/training.1600000.processed.noemoticon.csv.zip


In [None]:
import pandas as pd
import re
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Step 1: Dataset Preparation
# Keep only the label column (0) and the tweet text column (5), draw a fixed
# 100-row sample, then turn the numeric labels into readable names.
file_path = "data/training.1600000.processed.noemoticon.csv"
df = (
    pd.read_csv(file_path, encoding='ISO-8859-1', header=None)[[0, 5]]
    .rename(columns={0: 'label', 5: 'text'})
    .sample(100, random_state=1)
)
df['label'] = df['label'].replace({0: 'negative', 2: 'neutral', 4: 'positive'})

# Neutral rows are discarded: only negative/positive tweets are used downstream
is_polar = df['label'] != 'neutral'
df = df[is_polar]

# Quick sanity check: class balance plus a handful of random rows
print(df.label.value_counts())
print(df.sample(5))

# Step 2: Data Preprocessing
def clean_text(text):
    """Normalise a raw tweet into lowercase, space-separated alphanumeric words.

    Steps, in order:
      1. drop @mentions (handles may contain underscores),
      2. drop http/https URLs up to the next whitespace,
      3. collapse every run of non-alphanumeric characters (punctuation,
         newlines, repeated spaces) into a single space,
      4. lowercase and trim.

    Args:
        text: the raw tweet as a ``str``.

    Returns:
        The cleaned string; empty if nothing alphanumeric remains.
    """
    # Underscore is part of a handle; without it "@user_name" leaves "name" behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # Remove usernames
    # Match to the next whitespace so query strings, dashes etc. go with the URL.
    text = re.sub(r'https?://\S+', '', text)  # Remove URLs
    # One pass covers special characters, newlines and repeated whitespace.
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)

    return text.lower().strip()

# Clean every tweet and add an empty column that the generation step fills in
cleaned_tweets = df['text'].map(clean_text)
df['text'] = cleaned_tweets
df['generated_sentence'] = ""

negative    57
positive    43
Name: label, dtype: int64
            label                                               text
26208    negative                                @drummrboy I can't 
1505267  positive                             DONE! Now I can relax 
16105    negative  Ugh... Feeling like death warmed up. All of us...
1244923  positive  @serenebabe Well as long as it's used for good...
657012   negative               @supercoolkp In Oxford that month.  


# **Prompt-engineer GPT-2 to rephrase sentences with the opposite sentiment**

In [None]:
# Step 3: Model Loading
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Step 4: Sentence Generation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use GPU if available
model.to(device)
model.eval()

with torch.no_grad():
    for i, row in df.iterrows():
        sentiment = row['label']
        opposite_sentiment = 'positive' if sentiment == 'negative' else 'negative'  # Determine the opposite sentiment
        input_prompt = f"rephrase the sentence to {opposite_sentiment} sentiment: {row['text']}"
        input_ids = tokenizer.encode(input_prompt, add_special_tokens=True, return_tensors='pt').to(device)
        prompt_length = input_ids.shape[-1]

        # Nothing to condition on: leave the generated sentence empty
        if prompt_length == 0:
            df.at[i, 'generated_sentence'] = ""
            continue

        generated_ids = model.generate(
            input_ids=input_ids,
            # A single, unpadded prompt: every position is a real token.
            # (Token id 0 is "!" in GPT-2's vocabulary, so masking on `!= 0`
            # would hide genuine exclamation marks from the model.)
            attention_mask=torch.ones_like(input_ids),
            max_length=100,  # total length budget, prompt tokens included
            num_return_sequences=1,
            # GPT-2 has no dedicated pad token; pad with EOS so padding is
            # removed by skip_special_tokens instead of decoding as "!".
            pad_token_id=tokenizer.eos_token_id,
        )

        # Keep only the continuation, i.e. drop the echoed prompt tokens
        continuation_ids = generated_ids[0, prompt_length:]
        df.at[i, 'generated_sentence'] = tokenizer.decode(continuation_ids, skip_special_tokens=True)

# Print the generated sentences
print(df)


           label                                               text  \
514293  negative  i miss nikki nu nu already shes always there w...   
142282  negative  so i had a dream last night i remember a sign ...   
403727  negative  ohh poor sickly you hugs hope you feel a littl...   
649503  negative                                it is raining again   
610789  negative                         wish i was in la right now   
...          ...                                                ...   
505068  negative             gumbo love and alirenco has boy issues   
471797  negative             must get into scc basketball next term   
460065  negative  had a great afternoon at the lake with the gom...   
129162  negative                               i feel neglected sad   
202644  negative  how can we enjoy our 2 yr together maybe we wi...   

                                       generated_sentence  
514293  \n\nI'm not sure if this is a good idea or not...  
142282  . i was so excited 

# **check accuracy with sentiment classifier**

In [None]:
# Create a list to store the accuracy values (1 = prediction flipped, 0 = unchanged)
accuracies = []

# Move the sentiment classification model to the desired device once,
# rather than on every loop iteration.
sentiment_classifier_model = sentiment_classifier_model.to(device)


def _predict_sentiment_label(sentence):
    """Return the classifier's predicted class index for a single sentence."""
    encoded = sentiment_classifier_tokenizer(sentence, truncation=True, padding=True, return_tensors='pt')
    with torch.no_grad():
        outputs = sentiment_classifier_model(
            encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
    return torch.argmax(outputs.logits).item()


# Iterate through the DataFrame.
# Note: the comparison is between the classifier's predictions for the original
# and the generated sentence; the dataset's own `label` column is not consulted.
for i, row in df.iterrows():
    original_predicted_label = _predict_sentiment_label(row['text'])
    opposite_predicted_label = _predict_sentiment_label(row['generated_sentence'])

    # Check if the opposite rephrase has the opposite sentiment label
    is_opposite = original_predicted_label != opposite_predicted_label
    accuracies.append(1 if is_opposite else 0)

# Calculate the overall accuracy; an empty DataFrame yields NaN instead of
# raising ZeroDivisionError.
overall_accuracy = sum(accuracies) / len(accuracies) if accuracies else float('nan')

# Print the overall accuracy
print(f"Accuracy: {overall_accuracy}")

Accuracy: 0.39
