# Activity 1: Emoji-Based Sentiment Analysis
### Group 7: Claire and Willard

This notebook addresses the requirements for Activity 1, an emoji-based sentiment analysis project. It is divided into two main parts: Question A, which focuses on training a machine learning model, and Question B, which involves building a real-time sentiment analyzer.

## Question A: Sentiment Analysis using a Machine Learning Algorithm

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [28]:
# Load the main dataset for Question A
file_path_q_a = "1k_data_emoji_tweets_senti_posneg.csv"
df_main = pd.read_csv(file_path_q_a)

# Load the reference dataset for feature engineering
file_path_emoticons = "15_emoticon_data.csv"
df_emoticons = pd.read_csv(file_path_emoticons)

# Display the first 5 rows of each DataFrame to verify they loaded correctly
print("Main Dataset Head:")
print(df_main.head())
print("\nEmoticons Dataset Head:")
print(df_emoticons.head())

Main Dataset Head:
   Unnamed: 0  sentiment                                               post
0           0          1                             Good morning every one
1           1          0  TW: S AssaultActually horrified how many frien...
2           2          1  Thanks by has notice of me Greetings : Jossett...
3           3          0                      its ending soon aah unhappy 😧
4           4          1                               My real time happy 😊

Emoticons Dataset Head:
   Unnamed: 0 Emoji Unicode codepoint                         Unicode name
0           0     😍           0x1f60d  SMILING FACE WITH HEART-SHAPED EYES
1           1     😭           0x1f62d                   LOUDLY CRYING FACE
2           2     😘           0x1f618                 FACE THROWING A KISS
3           3     😊           0x1f60a       SMILING FACE WITH SMILING EYES
4           4     😁           0x1f601      GRINNING FACE WITH SMILING EYES


### Exploratory Data Analysis

In [29]:
# Get a concise summary of the DataFrame
df_main.info()

# Display descriptive statistics for numerical columns
print("\nDescriptive Statistics:")
print(df_main.describe(include='all'))

# Check for any missing values
print("\nMissing Values:")
print(df_main.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1000 non-null   int64 
 1   sentiment   1000 non-null   int64 
 2   post        1000 non-null   object
dtypes: int64(2), object(1)
memory usage: 23.6+ KB

Descriptive Statistics:
         Unnamed: 0   sentiment    post
count   1000.000000  1000.00000    1000
unique          NaN         NaN     999
top             NaN         NaN  #NAME?
freq            NaN         NaN       2
mean     499.500000     0.50000     NaN
std      288.819436     0.50025     NaN
min        0.000000     0.00000     NaN
25%      249.750000     0.00000     NaN
50%      499.500000     0.50000     NaN
75%      749.250000     1.00000     NaN
max      999.000000     1.00000     NaN

Missing Values:
Unnamed: 0    0
sentiment     0
post          0
dtype: int64


### Data Cleaning

In [30]:
# Drop the redundant 'Unnamed: 0' column
df_main = df_main.drop(columns=['Unnamed: 0'])

# Clean the 'post' column by replacing '#NAME?' with a null value and then dropping the row
df_main['post'].replace('#NAME?', np.nan, inplace=True)
df_main.dropna(subset=['post'], inplace=True)

# Verify the changes
print("--- After Cleaning ---")
df_main.info()
print("\nMissing Values:")
print(df_main.isnull().sum())

--- After Cleaning ---
<class 'pandas.core.frame.DataFrame'>
Index: 998 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  998 non-null    int64 
 1   post       998 non-null    object
dtypes: int64(1), object(1)
memory usage: 23.4+ KB

Missing Values:
sentiment    0
post         0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_main['post'].replace('#NAME?', np.nan, inplace=True)


### Feature Engineering

In [31]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download necessary NLTK data (if not already downloaded)
# nltk.download('punkt')
# nltk.download('stopwords')

negation_words = {"no", "not", "don't", "don't"}
stop_words = set(stopwords.words('english'))

def handle_negation(text):
    tokens = word_tokenize(text)
    negation_found = False
    processed_tokens = []
    
    for word in tokens:
        # Check if the word is a punctuation mark to stop negation
        if word in ['.', ',', '!', '?']:
            negation_found = False
            processed_tokens.append(word)
        elif word in negation_words:
            negation_found = True
            processed_tokens.append(word)
        elif negation_found and word not in stop_words:
            processed_tokens.append(f"{word}_NEG")
        else:
            processed_tokens.append(word)
            
    return ' '.join(processed_tokens)

# Your original function to replace emojis
def replace_emojis(text):
    # Create a list of all emojis from the emoticons DataFrame
    emojis = df_emoticons['Emoji'].tolist()
    # Create a regular expression pattern to find any of the emojis
    emoji_pattern = re.compile('|'.join(re.escape(e) for e in emojis))
    return emoji_pattern.sub('_EMOJI_', str(text))

# Combine both functions in your preprocessing
df_main['post_cleaned'] = df_main['post'].apply(replace_emojis)
df_main['post_cleaned'] = df_main['post_cleaned'].apply(handle_negation)

ModuleNotFoundError: No module named 'nltk'

### Model Training and Evaluation

#### Pre-Processing

##### Data Splitting

In [None]:
# Define features (X) and target (y)
X = df_main['post_cleaned']
y = df_main['sentiment']

# Split the data into 70% training and 30% for validation + testing
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the remaining 30% into 15% validation and 15% testing
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


##### Text Vectorization

In [None]:
# Vectorize the text data using TF-IDF. The vectorizer is fit only on the training data.
# Add n-grams (sequences of words).
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)
X_test_vectorized = vectorizer.transform(X_test)

#### Model Selection

In [None]:
# Import additional models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# --- Model 1: Multinomial Naive Bayes ---
nb_model = MultinomialNB()
nb_model.fit(X_train_vectorized, y_train)
nb_val_pred = nb_model.predict(X_val_vectorized)
nb_accuracy = accuracy_score(y_val, nb_val_pred)
print(f"Multinomial Naive Bayes Validation Accuracy: {nb_accuracy:.4f}")

# --- Model 2: Logistic Regression ---
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_vectorized, y_train)
lr_val_pred = lr_model.predict(X_val_vectorized)
lr_accuracy = accuracy_score(y_val, lr_val_pred)
print(f"Logistic Regression Validation Accuracy: {lr_accuracy:.4f}")

# --- Model 3: Support Vector Machine (SVC) ---
svc_model = SVC()
svc_model.fit(X_train_vectorized, y_train)
svc_val_pred = svc_model.predict(X_val_vectorized)
svc_accuracy = accuracy_score(y_val, svc_val_pred)
print(f"Support Vector Machine (SVC) Validation Accuracy: {svc_accuracy:.4f}")

# Now, based on these results, select the best-performing model to evaluate on the final test set.

Multinomial Naive Bayes Validation Accuracy: 0.7800
Logistic Regression Validation Accuracy: 0.7400
Support Vector Machine (SVC) Validation Accuracy: 0.7800


#### Chosen Model/s

In [None]:
# --- Evaluate Multinomial Naive Bayes on the Test Set ---
nb_test_pred = nb_model.predict(X_test_vectorized)
print("Multinomial Naive Bayes Test Set Accuracy:", accuracy_score(y_test, nb_test_pred))
print("Multinomial Naive Bayes Classification Report:")
print(classification_report(y_test, nb_test_pred))

# --- Evaluate Support Vector Machine (SVC) on the Test Set ---
svc_test_pred = svc_model.predict(X_test_vectorized)
print("\nSupport Vector Machine (SVC) Test Set Accuracy:", accuracy_score(y_test, svc_test_pred))
print("Support Vector Machine (SVC) Classification Report:")
print(classification_report(y_test, svc_test_pred))

Multinomial Naive Bayes Test Set Accuracy: 0.8133333333333334
Multinomial Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.84      0.82        74
           1       0.83      0.79      0.81        76

    accuracy                           0.81       150
   macro avg       0.81      0.81      0.81       150
weighted avg       0.81      0.81      0.81       150


Support Vector Machine (SVC) Test Set Accuracy: 0.8066666666666666
Support Vector Machine (SVC) Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.76      0.79        74
           1       0.78      0.86      0.82        76

    accuracy                           0.81       150
   macro avg       0.81      0.81      0.81       150
weighted avg       0.81      0.81      0.81       150



## Question B: Real-Time Tweet Sentiment Analyzer

In [None]:
# Import the ipywidgets library
import ipywidgets as widgets
from IPython.display import display

# Define a function to predict sentiment
def predict_sentiment(text):
    # Use the same emoji replacement logic
    processed_text = replace_emojis(text)
    
    # Vectorize the new text
    vectorized_text = vectorizer.transform([processed_text])
    
    # Make a prediction using the best-performing model (MultinomialNB)
    prediction = nb_model.predict(vectorized_text)[0]
    
    # Convert prediction to a readable sentiment label
    sentiment = "POSITIVE" if prediction == 1 else "NEGATIVE"
    
    # Display the result
    output.clear_output()
    with output:
        print(f"Your input is: \"{text}\"")
        print(f"Your input is of \"{sentiment} SENTIMENT\"")

# Create the interactive widgets
text_input = widgets.Text(
    value='',
    placeholder='Type your sentence here',
    description='Tweet:',
    disabled=False
)

button = widgets.Button(
    description='Say your Sentiment',
    button_style='info',
    tooltip='Click to analyze sentiment'
)

output = widgets.Output()

# Link the button to the prediction function
def on_button_clicked(b):
    predict_sentiment(text_input.value)

button.on_click(on_button_clicked)

# Display the widgets
display(text_input, button, output)

Text(value='', description='Tweet:', placeholder='Type your sentence here')

Button(button_style='info', description='Say your Sentiment', style=ButtonStyle(), tooltip='Click to analyze s…

Output()