In [None]:
# Install necessary libraries
!pip install nltk pandas

# Import libraries
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK resources this notebook relies on: the two punkt tokenizer
# packages and the stop-word corpus.
for nltk_resource in ('punkt_tab', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)


# Raw training data, read from the mounted Google Drive.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI project/training.csv')

# Normalize text function
def normalize_text(text):
    """Lowercase ``text`` and mask URLs, user mentions and hashtags.

    Args:
        text: Raw text. ``None`` and float NaN (what pandas yields for an empty
            CSV cell) are treated as empty text; any other non-string value is
            converted with ``str()``.

    Returns:
        The lowercased string in which every link is replaced by ``<url>``,
        every ``@name`` by ``<usermention>`` and every ``#tag`` by ``<hashtag>``.
    """
    # NaN is the only float that differs from itself; neither it nor None has .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # A link must start at a word boundary with "http://", "https://" or "www.".
    # Without the boundary and the dot, interjections such as "awww" were
    # rewritten to "a<url>".
    text = re.sub(r'\b(?:https?://|www\.)\S+', '<url>', text)  # URLs
    text = re.sub(r'@\w+', '<usermention>', text)  # User mentions
    text = re.sub(r'#\w+', '<hashtag>', text)  # Hashtags
    return text

# Store a normalized copy of every row's raw text next to the original
df['normalized_text'] = df['text'].map(normalize_text)

# English stop words from NLTK, kept in a set for the membership test below
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def tokenize_and_remove_stopwords(text):
    """Split ``text`` with NLTK's ``word_tokenize`` and drop stop words.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the tokens, in their original order, that are not members
        of the module-level ``stop_words`` set.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Token list (stop words removed) for every normalized text
df['tokens'] = df['normalized_text'].map(tokenize_and_remove_stopwords)

# Persist the raw text, its normalized form and its tokens to Drive
preprocessed_columns = ['text', 'normalized_text', 'tokens']
df[preprocessed_columns].to_csv(
    '/content/drive/MyDrive/Colab Notebooks/AI project/preprocessed_data.csv',
    index=False,
)

# Preview the first rows of the saved columns
df[preprocessed_columns].head()




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,text,normalized_text,tokens
0,i didnt feel humiliated,i didnt feel humiliated,"[didnt, feel, humiliated]"
1,i can go from feeling so hopeless to so damned...,i can go from feeling so hopeless to so damned...,"[go, feeling, hopeless, damned, hopeful, aroun..."
2,im grabbing a minute to post i feel greedy wrong,im grabbing a minute to post i feel greedy wrong,"[im, grabbing, minute, post, feel, greedy, wrong]"
3,i am ever feeling nostalgic about the fireplac...,i am ever feeling nostalgic about the fireplac...,"[ever, feeling, nostalgic, fireplace, know, st..."
4,i am feeling grouchy,i am feeling grouchy,"[feeling, grouchy]"


In [None]:
# Mount Google Drive at /content/drive; the other cells read and write files
# under /content/drive/MyDrive/.
# NOTE(review): the first cell already reads training.csv from /content/drive,
# so on a fresh runtime this mount presumably has to run before it — confirm
# the intended cell order.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install networkx

import networkx as nx
from collections import defaultdict
import os
from google.colab import drive


# Path on the mounted Drive where the word co-occurrence graph is cached as a
# GML file; the code below reloads the graph from here when the file exists.
graph_save_path = '/content/drive/MyDrive/Colab Notebooks/AI project/emotion_text_graph.gml'

# Function to build a word co-occurrence graph
def build_graph(tokens, window_size=2):
    """Build an undirected co-occurrence graph from a token sequence.

    A window of ``window_size`` consecutive tokens slides over ``tokens``;
    every pair of tokens inside a window becomes an edge.

    Args:
        tokens: Sliceable sequence of hashable tokens.
        window_size: Number of consecutive tokens per window.

    Returns:
        An ``nx.Graph`` whose nodes are the tokens that appear in at least
        one pair. A sequence shorter than ``window_size`` gives an empty graph.
    """
    cooccurrence = nx.Graph()
    last_start = len(tokens) - window_size

    for start in range(last_start + 1):
        window = tokens[start:start + window_size]
        # Link each token to every later token of the same window.
        for offset, left in enumerate(window):
            for right in window[offset + 1:]:
                cooccurrence.add_edge(left, right)

    return cooccurrence

# Check if the graph already exists in Google Drive
if os.path.exists(graph_save_path):
    # NOTE: a cached file built by an earlier version of this cell is loaded
    # as-is; delete it from Drive to force a rebuild.
    print("Loading graph from Google Drive...")
    graph = nx.read_gml(graph_save_path)  # Load the saved graph
else:
    print("Building the graph...")
    # Build the graph one text at a time and merge the edges. Concatenating
    # every token list first (df['tokens'].sum()) is quadratic in the number
    # of rows and also links the last word of each text to the first word of
    # the next one, which is not a real co-occurrence.
    graph = nx.Graph()
    for row_tokens in df['tokens']:
        graph.add_edges_from(build_graph(row_tokens).edges())

    # Save the graph to Google Drive
    nx.write_gml(graph, graph_save_path)
    print(f"Graph saved to: {graph_save_path}")

# Visualizing the graph (Optional)
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 10))
# nx.draw(graph, with_labels=True, font_weight='bold')
# plt.title("Emotion-rich Text Graph")
# plt.show()


Loading graph from Google Drive...


In [None]:
# Size of the co-occurrence graph
print("Number of nodes:", graph.number_of_nodes())
print("Number of edges:", graph.number_of_edges())

# Show the neighbours of the first five nodes only
for node in list(graph.nodes)[:5]:
    print(f"Node {node}: connected to {list(graph.neighbors(node))}")


Number of nodes: 15061
Number of edges: 100704
Node didnt: connected to ['feel', 'side', 'really', 'wear', 'using', 'care', 'disturbed', 'want', 'costs', 'kerry', 'supporting', 'cues', 'say', 'abused', 'writing', 'determined', 'take', 'know', 'get', 'surprisingly', 'blessed', 'tell', 'online', 'hugely', 'headed', 'bodyworks', 'invest', 'loud', 'better', 'together', 'avoid', 'regretful', 'accomplish', 'im', 'picture', 'hurt', 'start', 'clear', 'surprised', 'expect', 'remorseful', 'break', 'showed', 'bitch', 'share', 'red', 'strange', 'cause', 'encountered', 'normal', 'though', 'glad', 'last', 'age', 'many', 'humiliated', 'wondering', 'helpless', 'think', 'discomfort', 'often', 'world', 'day', 'like', 'numb', 'thaw', 'usual', 'beaten', 'since', 'sleep', 'horrid', 'smoke', 'thing', 'orci', 'co', 'days', 'use', 'behind', 'dance', 'family', 'terrible', 'let', 'aesthetics', 'hair', 'relationship', 'work', 'babies', 'group', 'expected', 'jon', 'canteen', 'need', 'variants', 'saying', 'cry', '

In [None]:
# Degree centrality of every word in the graph
degree_centrality = nx.degree_centrality(graph)

# Words ordered from the most to the least central (ties keep graph order)
sorted_words = sorted(degree_centrality.items(), key=lambda item: -item[1])

# Centrality cut-offs: above connector_threshold a word is a connector word
# (CW); from subject_threshold up to and including connector_threshold it is
# a subject word (SW); anything lower is ignored.
connector_threshold = 0.05
subject_threshold = 0.01

# Single pass over the ranked words, filling both categories
connector_words = []
subject_words = []
for word, centrality in sorted_words:
    if centrality > connector_threshold:
        connector_words.append(word)
    elif centrality >= subject_threshold:
        subject_words.append(word)

# Show the ten most central words of each category
print("Top 10 Connector Words (CW):", connector_words[:10])
print("Top 10 Subject Words (SW):", subject_words[:10])


Top 10 Connector Words (CW): ['feel', 'feeling', 'im', 'like', 'know', 'really']
Top 10 Subject Words (SW): ['get', 'time', 'one', 'would', 'think', 'people', 'even', 'want', 'ive', 'still']


In [None]:
from itertools import combinations

# Function to generate patterns from tokens
def generate_patterns(tokens, connector_words, subject_words, pattern_size=3):
    """Extract connector/subject patterns from a token sequence.

    A window of ``pattern_size`` consecutive tokens slides over ``tokens``.
    A window that contains at least one connector word and at least one
    subject word yields one pattern: the window's tokens joined by spaces,
    with every subject word replaced by ``'*'``.

    Args:
        tokens: Sliceable sequence of token strings.
        connector_words: Iterable of connector words.
        subject_words: Iterable of subject words.
        pattern_size: Number of tokens per window.

    Returns:
        List of pattern strings, in window order.
    """
    # The caller passes lists; convert once so each membership test is O(1)
    # instead of a scan of the whole word list for every token.
    connectors = connector_words if isinstance(connector_words, (set, frozenset)) else set(connector_words)
    subjects = subject_words if isinstance(subject_words, (set, frozenset)) else set(subject_words)

    patterns = []
    for start in range(len(tokens) - pattern_size + 1):
        window = tokens[start:start + pattern_size]
        has_connector = any(token in connectors for token in window)
        has_subject = any(token in subjects for token in window)

        if has_connector and has_subject:
            # Subject words become the '*' placeholder; all other tokens are kept.
            patterns.append(" ".join('*' if token in subjects else token for token in window))

    return patterns

# Patterns for every row, computed from that row's token list
df['patterns'] = df['tokens'].map(
    lambda row_tokens: generate_patterns(row_tokens, connector_words, subject_words)
)

# Preview the patterns next to their source text
print(df[['text', 'patterns']].head())

# One flat list holding the patterns of all rows, in row order
all_patterns = []
for row_patterns in df['patterns']:
    all_patterns.extend(row_patterns)

# Show the first ten entries of the flat list
print("Top 10 patterns:", all_patterns[:10])


                                                text  \
0                            i didnt feel humiliated   
1  i can go from feeling so hopeless to so damned...   
2   im grabbing a minute to post i feel greedy wrong   
3  i am ever feeling nostalgic about the fireplac...   
4                               i am feeling grouchy   

                                            patterns  
0                                [* feel humiliated]  
1                               [* feeling hopeless]  
2                     [minute * feel, * feel greedy]  
3  [* feeling nostalgic, fireplace know *, know *...  
4                                                 []  
Top 10 patterns: ['* feel humiliated', '* feeling hopeless', 'minute * feel', '* feel greedy', '* feeling nostalgic', 'fireplace know *', 'know * property', '* feeling *', 'feeling * burdened', 'faster * feel']


In [None]:
!pip install gensim

import gensim.downloader as api
import os
from google.colab import drive
import pandas as pd


# Define the path for saving/loading enriched patterns and word vectors
import ast

from gensim.models import KeyedVectors  # `gensim` itself is never imported by this cell

enriched_patterns_path = '/content/drive/MyDrive/Colab Notebooks/AI project/enriched_patterns.csv'
word_vectors_path = '/content/drive/MyDrive/Colab Notebooks/AI project/glove_word_vectors.kv'


def _parse_pattern_list(cell):
    """Turn a CSV cell such as "['a b', 'c d']" back into a Python list."""
    return ast.literal_eval(cell) if cell else []


# Check if enriched patterns already exist
if os.path.exists(enriched_patterns_path):
    print("Loading enriched patterns from Google Drive...")
    # to_csv stores each list as its string repr. Without converters the
    # columns come back as plain strings and iterating them yields single
    # characters instead of patterns.
    # NOTE: this replaces df with a frame holding only the three saved columns.
    df = pd.read_csv(
        enriched_patterns_path,
        converters={'patterns': _parse_pattern_list, 'enriched_patterns': _parse_pattern_list},
    )
else:
    print("Enriched patterns not found, proceeding with enrichment...")

    # Load the word vectors (either from disk or download if not saved)
    if os.path.exists(word_vectors_path):
        print("Loading GloVe word vectors from Google Drive...")
        word_vectors = KeyedVectors.load(word_vectors_path)
    else:
        print("Downloading GloVe word vectors...")
        word_vectors = api.load("glove-twitter-25")  # Smaller Twitter version with 25 dimensions

        # Save word vectors to Google Drive for future use
        word_vectors.save(word_vectors_path)
        print(f"Word vectors saved to: {word_vectors_path}")

    # The same words recur across many patterns; remember each word's nearest
    # neighbour so most_similar runs once per distinct word.
    nearest_word_cache = {}

    # Function to enrich patterns using word embeddings
    def enrich_patterns_with_embeddings(patterns):
        """Replace each in-vocabulary word of every pattern by its nearest neighbour.

        Args:
            patterns: Iterable of space-separated pattern strings.

        Returns:
            List of pattern strings in the same order. The '*' placeholder and
            words missing from ``word_vectors`` are kept unchanged.
        """
        enriched_patterns = []

        for pattern in patterns:
            enriched_pattern = []

            for token in pattern.split():
                if token == '*':
                    enriched_pattern.append(token)  # Keep the placeholder
                elif token in word_vectors:
                    if token not in nearest_word_cache:
                        # most_similar returns (word, score) pairs; keep the top word
                        nearest_word_cache[token] = word_vectors.most_similar(token, topn=1)[0][0]
                    enriched_pattern.append(nearest_word_cache[token])
                else:
                    enriched_pattern.append(token)  # If not in vocab, keep original

            enriched_patterns.append(" ".join(enriched_pattern))

        return enriched_patterns

    # Apply the enrichment to the patterns
    df['enriched_patterns'] = df['patterns'].apply(enrich_patterns_with_embeddings)

    # Save enriched patterns to Google Drive
    df[['text', 'patterns', 'enriched_patterns']].to_csv(enriched_patterns_path, index=False)
    print(f"Enriched patterns saved to: {enriched_patterns_path}")

# Display the enriched patterns
print(df[['text', 'patterns', 'enriched_patterns']].head())

# Flatten the enriched patterns for easier analysis
all_enriched_patterns = [pattern for sublist in df['enriched_patterns'] for pattern in sublist]

# Display top 10 enriched patterns
print("Top 10 Enriched Patterns:", all_enriched_patterns[:10])


Loading enriched patterns from Google Drive...
                                                text  \
0                            i didnt feel humiliated   
1  i can go from feeling so hopeless to so damned...   
2   im grabbing a minute to post i feel greedy wrong   
3  i am ever feeling nostalgic about the fireplac...   
4                               i am feeling grouchy   

                                            patterns  \
0                              ['* feel humiliated']   
1                             ['* feeling hopeless']   
2                 ['minute * feel', '* feel greedy']   
3  ['* feeling nostalgic', 'fireplace know *', 'k...   
4                                                 []   

                                   enriched_patterns  
0                             ['* nothing harassed']  
1                               ['* feel emotional']  
2          ['end * nothing', '* nothing ungrateful']  
3  ['* feel exhilarating', 'hammock think *', 'th...  
4   

In [None]:
import pandas as pd

# Load the training data
training_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI project/training.csv')  # Update the path as necessary

# Check if the 'label' column already exists in df
if 'label' not in df.columns:
    print("Merging label column from training data...")

    # Keep one label per distinct text. A left merge against a table that
    # repeats a text would emit one output row per repetition and silently
    # grow df. When a repeated text carries different labels, the first one wins.
    text_labels = training_df[['text', 'label']].drop_duplicates(subset='text')

    # Merge the two DataFrames on the 'text' column; validate makes pandas
    # raise if the right-hand side is ever not unique per text.
    df = df.merge(text_labels, on='text', how='left', validate='many_to_one')

    # Display the merged DataFrame to check if the label column has been added
    print("Label column added. Here are the first few rows:")
    print(df.head())
else:
    print("Label column already exists in the DataFrame.")

Merging label column from training data...
Label column added. Here are the first few rows:
                                                text  \
0                            i didnt feel humiliated   
1  i can go from feeling so hopeless to so damned...   
2   im grabbing a minute to post i feel greedy wrong   
3  i am ever feeling nostalgic about the fireplac...   
4                               i am feeling grouchy   

                                            patterns  \
0                              ['* feel humiliated']   
1                             ['* feeling hopeless']   
2                 ['minute * feel', '* feel greedy']   
3  ['* feeling nostalgic', 'fireplace know *', 'k...   
4                                                 []   

                                   enriched_patterns  label  
0                             ['* nothing harassed']      0  
1                               ['* feel emotional']      0  
2          ['end * nothing', '* nothing ungratef

In [None]:
from collections import Counter
import numpy as np

# Function to calculate Pattern Frequency (PF)
def calculate_pattern_frequency(patterns):
    """Count how often each pattern occurs.

    Args:
        patterns: Iterable of pattern strings.

    Returns:
        ``Counter`` mapping each pattern to its number of occurrences.
    """
    return Counter(patterns)


def _as_pattern_list(cell):
    """Return one 'enriched_patterns' cell as a list of pattern strings.

    A column that went through a CSV file holds strings such as
    "['a b', 'c d']" rather than lists; iterating such a string yields single
    characters. Strings are parsed back into lists here, real lists and tuples
    pass through, and empty or missing cells give an empty list.
    """
    if isinstance(cell, str):
        import ast  # local import: used only for CSV-encoded cells

        stripped = cell.strip()
        if not stripped:
            return []
        try:
            parsed = ast.literal_eval(stripped)
        except (ValueError, SyntaxError):
            return [cell]  # a plain string is treated as a single pattern
        return list(parsed) if isinstance(parsed, (list, tuple)) else [cell]
    if cell is None or isinstance(cell, float):  # None or the NaN of an empty cell
        return []
    return list(cell)


# Function to calculate Inverse Emotion Frequency (IEF)
def calculate_inverse_emotion_frequency(df, patterns):
    """Compute log(number of labels / labels a pattern occurs with) per pattern.

    Args:
        df: DataFrame with an 'enriched_patterns' column (a list of patterns
            per row, or its CSV string form) and a 'label' column.
        patterns: Unused; kept so existing callers keep working.

    Returns:
        Dict mapping each pattern to its inverse emotion frequency.
    """
    emotion_labels = df['label'].unique()  # Assuming 'label' column has emotion labels

    # For each pattern, the set of labels of the rows it occurs in.
    pattern_to_emotion = {}
    for cell, label in zip(df['enriched_patterns'], df['label']):
        for pattern in _as_pattern_list(cell):
            pattern_to_emotion.setdefault(pattern, set()).add(label)

    return {
        pattern: np.log(len(emotion_labels) / len(emotions))  # Logarithmic scale
        for pattern, emotions in pattern_to_emotion.items()
    }


# Function to calculate PF-IEF for each pattern
def calculate_pf_ief(df):
    """Weight every pattern by its frequency times its inverse emotion frequency.

    Args:
        df: DataFrame with 'enriched_patterns' and 'label' columns.

    Returns:
        Dict mapping each pattern to its PF-IEF weight.
    """
    # Flatten the per-row pattern lists into one list
    all_patterns = [
        pattern
        for cell in df['enriched_patterns']
        for pattern in _as_pattern_list(cell)
    ]

    pattern_frequency = calculate_pattern_frequency(all_patterns)
    inverse_emotion_frequency = calculate_inverse_emotion_frequency(df, all_patterns)

    return {
        pattern: freq * inverse_emotion_frequency.get(pattern, 0)
        for pattern, freq in pattern_frequency.items()
    }

# PF-IEF weight of every enriched pattern in the dataset
pattern_weights = calculate_pf_ief(df)

# Patterns ordered from the highest to the lowest weight (ties keep insertion order)
sorted_pattern_weights = sorted(pattern_weights.items(), key=lambda entry: -entry[1])
print("Top 10 Weighted Patterns:", sorted_pattern_weights[:10])


Top 10 Weighted Patterns: [('ô', 16.125835223052494), ('é', 9.010913347279288), ('ï', 5.375278407684165), ('.', 5.271046405406137), ('ن', 3.58351893845611), ('国', 3.58351893845611), ('内', 3.58351893845611), ('シ', 3.58351893845611), ('ー', 3.58351893845611), ('ズ', 3.58351893845611)]


Combining preprocessed_data.csv and training.csv

In [None]:
import pandas as pd

# Load both files
preprocessed_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI project/preprocessed_data.csv')  # Replace with actual path
training_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI project/training.csv')  # Replace with actual path

# Ensure both have the same number of rows
assert len(preprocessed_df) == len(training_df), "Mismatch in number of rows between files."

# The labels below are copied by row position, so equal length is not enough:
# both files must hold the same texts in the same order.
assert preprocessed_df['text'].equals(training_df['text']), "Row order/text mismatch between files."

# Add the 'label' column from training_df to preprocessed_df
preprocessed_df['label'] = training_df['label']

# Define the mapping from label to emotion
# NOTE(review): the label semantics are not visible in this notebook. The
# public six-class "emotion" dataset with the same first five classes uses
# 'surprise' for label 5 — confirm that 'disgust' is right for training.csv.
label_to_emotion = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'disgust'
}

# Map the label to emotion and create a new 'emotion' column
preprocessed_df['emotion'] = preprocessed_df['label'].map(label_to_emotion)

# Save the updated dataframe to a new CSV file
preprocessed_df.to_csv('/content/drive/MyDrive/Colab Notebooks/AI project/preprocessed_data_with_emotions.csv', index=False)

# Display the first few rows to confirm the changes
preprocessed_df.head()


Unnamed: 0,text,normalized_text,tokens,label,emotion
0,i didnt feel humiliated,i didnt feel humiliated,"['didnt', 'feel', 'humiliated']",0,sadness
1,i can go from feeling so hopeless to so damned...,i can go from feeling so hopeless to so damned...,"['go', 'feeling', 'hopeless', 'damned', 'hopef...",0,sadness
2,im grabbing a minute to post i feel greedy wrong,im grabbing a minute to post i feel greedy wrong,"['im', 'grabbing', 'minute', 'post', 'feel', '...",3,anger
3,i am ever feeling nostalgic about the fireplac...,i am ever feeling nostalgic about the fireplac...,"['ever', 'feeling', 'nostalgic', 'fireplace', ...",2,love
4,i am feeling grouchy,i am feeling grouchy,"['feeling', 'grouchy']",3,anger


Model classifier

In [None]:
!pip install transformers




In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import numpy as np



In [None]:
 preprocessed_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI project/preprocessed_data_with_emotions.csv')

# Load preprocessed text and emotion data
texts = preprocessed_df['normalized_text'].tolist()
emotions = preprocessed_df['label'].tolist()
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the text and pad to a fixed length
max_len = 128  # BERT supports max length up to 512
input_ids = []
attention_masks = []

for text in texts:
    encoded_dict = tokenizer.encode_plus(
        text,                      # Sentence to encode
        add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
        max_length=max_len,         # Pad & truncate all sentences
        padding='max_length',       # Pad to max length (BERT tokenizer update)
        truncation=True,            # Request truncation explicitly; every row must be
                                    # exactly max_len long for torch.cat below
        return_attention_mask=True, # Construct attention masks
        return_tensors='pt',        # Return pytorch tensors
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Encode the labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(emotions)
labels = torch.tensor(labels)

# Train-test split. Ids, masks and labels go through ONE call so they share a
# single permutation by construction, instead of two calls that only agree
# because they use the same seed.
(train_inputs, test_inputs,
 train_masks, test_masks,
 train_labels, test_labels) = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42
)

# Create DataLoader for training and testing
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Uncased BERT base checkpoint with a classification head that has one output
# per class known to the label encoder; attention weights and hidden states
# are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_encoder.classes_),
    output_attentions=False,
    output_hidden_states=False,
)

# Place the model on the chosen device
model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
from torch.optim import AdamW

# AdamW over every model parameter (learning rate 2e-5, epsilon 1e-8)
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
)



In [None]:
import os

# Define model save path
epochs = 3  # You can adjust this based on performance
checkpoint_template = '/content/drive/MyDrive/Colab Notebooks/AI project/bert_emotion_model_epoch_{}.pth'

# Look for the most advanced checkpoint that exists (epoch `epochs` down to 1).
# The path used to be built from `epoch`, which is undefined on a fresh kernel
# and only had a value when a training loop had run earlier in the session.
model_save_path = None
for finished_epochs in range(epochs, 0, -1):
    candidate_path = checkpoint_template.format(finished_epochs)
    if os.path.exists(candidate_path):
        model_save_path = candidate_path
        break

# Check if the model file already exists
if model_save_path is not None:
    print(f"Pretrained model found at {model_save_path}. Loading model...")
    # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime;
    # weights_only=True restricts unpickling to tensors, which is all a
    # state_dict contains.
    model.load_state_dict(torch.load(model_save_path, map_location=device, weights_only=True))
    model.eval()  # Put the model into evaluation mode
    print("Model loaded successfully.")
else:
    print("No pretrained model found. Starting training...")

    for epoch in range(epochs):
        print(f'Epoch {epoch+1}/{epochs}')
        model.train()  # Put the model into training mode

        total_loss = 0

        for step, batch in enumerate(train_dataloader):
            batch_input_ids, batch_input_mask, batch_labels = tuple(b.to(device) for b in batch)

            # Clear gradients left over from the previous step
            model.zero_grad()

            outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        print(f'Training Loss: {total_loss / len(train_dataloader)}')

    # Save the trained model under the number of epochs it was trained for
    model_save_path = checkpoint_template.format(epochs)
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}.")


Pretrained model found at /content/drive/MyDrive/Colab Notebooks/AI project/bert_emotion_model_epoch_1.pth. Loading model...


  model.load_state_dict(torch.load(model_save_path))


Model loaded successfully.


Run this cell only if you want to train the model on its own.

In [None]:
epochs = 3  # Number of passes over the training set; adjust based on performance

for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')

    # Training mode
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Each batch holds (input ids, attention mask, labels); move all to the device
        batch_input_ids, batch_input_mask, batch_labels = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Passing labels makes the model return the loss
        loss = model(
            batch_input_ids,
            token_type_ids=None,
            attention_mask=batch_input_mask,
            labels=batch_labels,
        ).loss
        total_loss += loss.item()

        # Backpropagate, then update the weights
        loss.backward()
        optimizer.step()

    # Mean loss over the batches of this epoch
    print(f'Training Loss: {total_loss / len(train_dataloader)}')

    # Write a checkpoint named after the epoch that just finished
    model_save_path = f'/content/drive/MyDrive/Colab Notebooks/AI project/bert_emotion_model_epoch_{epoch+1}.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f'Model saved to {model_save_path}')


Epoch 1/3


KeyboardInterrupt: 

In [None]:
# Evaluation mode
model.eval()

batch_predictions = []
batch_truths = []

# Gradients are not needed anywhere in the evaluation pass
with torch.no_grad():
    for batch in test_dataloader:
        batch_input_ids, batch_input_mask, batch_labels = (tensor.to(device) for tensor in batch)

        logits = model(
            batch_input_ids,
            token_type_ids=None,
            attention_mask=batch_input_mask,
        ).logits

        # Predicted class = index of the largest logit in each row
        batch_predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
        batch_truths.append(batch_labels.cpu().numpy())

# One flat array each for predictions and ground truth
predictions = np.concatenate(batch_predictions)
true_labels = np.concatenate(batch_truths)

# Map encoder indices back to the original label values
predicted_emotions = label_encoder.inverse_transform(predictions)
true_emotions = label_encoder.inverse_transform(true_labels)

# Per-class metrics followed by overall accuracy
print(classification_report(true_emotions, predicted_emotions))
print(f"Test Accuracy: {accuracy_score(true_emotions, predicted_emotions)}")


              precision    recall  f1-score   support

           0       0.97      0.94      0.96       946
           1       0.93      0.95      0.94      1021
           2       0.89      0.81      0.85       296
           3       0.89      0.95      0.92       427
           4       0.87      0.90      0.88       397
           5       0.83      0.81      0.82       113

    accuracy                           0.92      3200
   macro avg       0.90      0.89      0.89      3200
weighted avg       0.92      0.92      0.92      3200

Test Accuracy: 0.9234375


In [None]:
import torch
import pandas as pd
from transformers import BertTokenizer

# Rebuild the label -> emotion lookup from the saved, labelled dataset
preprocessed_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI project/preprocessed_data_with_emotions.csv')
label_emotion_pairs = preprocessed_df[['label', 'emotion']].drop_duplicates()
label_to_emotion = label_emotion_pairs.set_index('label')['emotion'].to_dict()

# Function to preprocess input text
def preprocess_input_text(text, tokenizer, max_len=128):
    """Encode one text into fixed-length model inputs.

    Args:
        text: The sentence to encode.
        tokenizer: Object exposing ``encode_plus`` (here a ``BertTokenizer``).
        max_len: Length every encoding is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=max_len,
        padding='max_length',     # Pad to max length
        truncation=True,          # Truncate to max length
        return_attention_mask=True,
        return_tensors='pt',      # Return PyTorch tensors
    )
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

# Function to predict emotion from input text
def predict_emotion(text, model, tokenizer, label_to_emotion):
    """Predict the emotion name for a single text.

    Args:
        text: The sentence to classify.
        model: Classification model whose output holds the logits at index 0.
        tokenizer: Tokenizer passed on to ``preprocess_input_text``.
        label_to_emotion: Mapping from predicted class index to emotion name.

    Returns:
        The emotion name stored for the class with the largest logit.

    Raises:
        KeyError: If the predicted index is missing from ``label_to_emotion``.
    """
    # Preprocess the input text
    input_ids, attention_mask = preprocess_input_text(text, tokenizer)

    # The tokenizer returns CPU tensors. Move them to wherever the model's
    # weights live, otherwise a model on CUDA fails with a device-mismatch error.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
        attention_mask = attention_mask.to(first_param.device)

    # Put the model in evaluation mode
    model.eval()

    # No gradient calculations needed during inference
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs[0]
        prediction = torch.argmax(logits, dim=1).item()

    # Decode the predicted label using the label_to_emotion mapping
    return label_to_emotion[prediction]

# Tokenizer for inference; `model` is expected to be in memory already
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Keep classifying sentences typed by the user until they enter 'exit'
while True:
    new_text = input("Enter a sentence to detect the emotion (or type 'exit' to quit): ")

    if new_text.lower() != 'exit':
        # Classify the sentence and report the result
        predicted_emotion = predict_emotion(new_text, model, tokenizer, label_to_emotion)
        print(f"Predicted Emotion: {predicted_emotion}\n")
        continue

    print("Exiting the program.")
    break


Predicted Emotion: anger

Predicted Emotion: love

Predicted Emotion: anger

Predicted Emotion: joy

Predicted Emotion: sadness

Predicted Emotion: joy

Predicted Emotion: disgust

Predicted Emotion: fear

Predicted Emotion: joy

Predicted Emotion: sadness

Predicted Emotion: sadness

Predicted Emotion: joy

Predicted Emotion: joy

