In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import re
import spacy
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, BaggingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImblearnPipeline
import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

In [None]:
csv_file_path = 'sentiment-emotion-labelled_Dell_tweets.csv'
df = pd.read_csv(csv_file_path)
nlp = spacy.load('en_core_web_sm')
# Define a text preprocessing function
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove usernames, hashtags, and URLs
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'http\S+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Use SpaCy for tokenization and lemmatization, and remove stop words and punctuation
    doc = nlp(text)
    filtered_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(filtered_tokens)
df.drop(columns=['Unnamed: 0', 'Datetime', 'Tweet Id', 'Username'], axis=1, inplace=True)
print(df['emotion'].unique())
# Handle missing values
df.dropna(subset=['Text', 'sentiment', 'emotion'], inplace=True)
df['processed_text'] = df['Text'].apply(preprocess_text)
print(df.head())

['anticipation' 'joy' 'anger' 'sadness' 'fear' 'optimism' 'disgust'
 'surprise']
                                                Text sentiment  \
0  @Logitech @apple @Google @Microsoft @Dell @Len...   neutral   
1  @MK_habit_addict @official_stier @MortalKombat...   neutral   
2  As @CRN celebrates its 40th anniversary, Bob F...  positive   
3  @dell your customer service is horrible especi...  negative   
4  @zacokalo @Dell @DellCares @Dell give the man ...   neutral   

   sentiment_score       emotion  emotion_score  \
0         0.853283  anticipation       0.587121   
1         0.519470           joy       0.886913   
2         0.763791           joy       0.960347   
3         0.954023         anger       0.983203   
4         0.529170         anger       0.776124   

                                      processed_text  
0          qwerty modify programmer thing like br...  
1       s get new      year old   triple price   ...  
2     celebrate th anniversary   bob faletra     .

In [7]:
def batch_preprocess(texts, batch_size=500):
    preprocessed_texts = []
    # Use n_process=-1 to use all available CPU cores for parallel processing
    for doc in nlp.pipe(texts, batch_size=batch_size, n_process=-1):
        filtered_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
        preprocessed_texts.append(' '.join(filtered_tokens))
    return preprocessed_texts
df2 = pd.read_csv('go_emotions_dataset.csv')
df2['processed_text'] = df2['text'].apply(preprocess_text)
print(df2.head())

        id                                               text  \
0  eew5j0j                                    That game hurt.   
1  eemcysk   >sexuality shouldn’t be a grouping category I...   
2  ed2mah1     You do right, if you don't care then fuck 'em!   
3  eeibobj                                 Man I love reddit.   
4  eda6yn6  [NAME] was nowhere near them, he was by the Fa...   

   example_very_unclear  admiration  amusement  anger  annoyance  approval  \
0                 False           0          0      0          0         0   
1                  True           0          0      0          0         0   
2                 False           0          0      0          0         0   
3                 False           0          0      0          0         0   
4                 False           0          0      0          0         0   

   caring  confusion  ...  nervousness  optimism  pride  realization  relief  \
0       0          0  ...            0         0      0     

In [8]:
# Updated mapping to consolidate 'positive' into 'joy' and 'negative' into 'sad'
emotion_cols = ['admiration', 'amusement',
       'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
       'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
       'sadness', 'surprise', 'neutral'
]
df2['emotion'] = df2[emotion_cols].idxmax(axis=1)
updated_emotion_mapping = {
    'anger': 'anger',
    'surprise': 'surprise',
    'disgust': 'sad',  # Map to 'sad'
    'joy': 'joy',
    'sadness': 'sad',
    'admiration': 'love',  # Map to 'joy'
    'amusement': 'joy',  # Map to 'joy'
    'annoyance': 'sad',  # Map to 'sad'
    'approval': 'joy',  # Map to 'joy'
    'caring': 'joy',  # Map to 'joy'
    'confusion': 'neutral',
    'curiosity': 'joy',  # Map to 'joy'
    'desire': 'love',  # Map to 'joy'
    'disappointment': 'sad',  # Map to 'sad'
    'disapproval': 'sad',  # Map to 'sad'
    'embarrassment': 'sad',  # Map to 'sad'
    'excitement': 'joy',  # Map to 'joy'
    'gratitude': 'joy',  # Map to 'joy'
    'grief': 'sad',  # Map to 'sad'
    'nervousness': 'fear',  # Map to 'sad'
    'optimism': 'joy',  # Map to 'joy'
    'pride': 'joy',  # Map to 'joy'
    'realization': 'surprise',
    'relief': 'joy',  # Map to 'joy'
    'remorse': 'sad',  # Map to 'sad'
    'neutral': 'neutral'  # Optional: remove if not needed
}

# Apply the updated mapping
df['mapped_emotion'] = df['emotion'].map(updated_emotion_mapping).fillna('unmapped')
df2['mapped_emotion'] = df2['emotion'].map(updated_emotion_mapping).fillna('unmapped')

# Remove rows with 'unmapped' emotions
df = df[df['mapped_emotion'] != 'unmapped']
df2 = df2[df2['mapped_emotion'] != 'unmapped']
# df.drop(columns=emotion_cols, axis=1, inplace=True)
# Concatenate the two dataframes
combined_df = pd.concat([
    df[['processed_text', 'mapped_emotion']],
    df2[['processed_text', 'mapped_emotion']]
], ignore_index=True)

# Check the first few rows of the combined dataframe
print(combined_df.head())


                                      processed_text mapped_emotion
0       s get new      year old   triple price   ...            joy
1     celebrate th anniversary   bob faletra     ...            joy
2    customer service horrible especially agent s...          anger
3                                            man pay          anger
4  screenshot act website     latitude   laptop k...            sad


In [9]:
file_path = '/content/val.txt'  # Update with the actual path to your text file

# Read and parse the text file
data = []
with open(file_path, 'r') as file:
    for line in file:
        text, emotion = line.strip().split(';')
        processed_text = preprocess_text(text)  # Apply preprocessing to the text
        data.append({'processed_text': processed_text, 'mapped_emotion': emotion})
new_data_df = pd.DataFrame(data)
print(new_data_df.columns)
# Concatenate this new DataFrame with the existing combined_df
combined_df = pd.concat([combined_df, new_data_df], ignore_index=True)
print(combined_df.head)

Index(['processed_text', 'mapped_emotion'], dtype='object')
<bound method NDFrame.head of                                            processed_text mapped_emotion
0            s get new      year old   triple price   ...            joy
1          celebrate th anniversary   bob faletra     ...            joy
2         customer service horrible especially agent s...          anger
3                                                 man pay          anger
4       screenshot act website     latitude   laptop k...            sad
...                                                   ...            ...
242829          feel like unkind wrong think people close          anger
242830   m feel little cranky negative doctor appointment          anger
242831  feel useful people give great feeling achievement            joy
242832     m feel comfortable derby feel start step shell            joy
242833  feel weird meet w people text like not talk fa...           fear

[242834 rows x 2 columns]>


In [10]:
X = combined_df['processed_text']
y = combined_df['mapped_emotion']
# Encode the labels numerically
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("NaN in features:", X.isna().any())
print("NaN in labels:", pd.isna(y_encoded).any())

# Remove rows with NaN values in features or labels
combined_df = combined_df.dropna(subset=['processed_text', 'mapped_emotion'])

# Update X and y_encoded after removing NaN values
X = combined_df['processed_text']
y_encoded = label_encoder.fit_transform(combined_df['mapped_emotion'])
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Create a pipeline with TfidfVectorizer and MultinomialNB
pipeline = ImblearnPipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.5, ngram_range=(1, 3))),
    ('sampling', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])


NaN in features: False
NaN in labels: False


In [11]:
base_models = [
    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
    ('lr', LogisticRegression(max_iter=500, random_state=42))
]
final_estimator = LogisticRegression(max_iter=500, random_state=42)
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=final_estimator, cv=3, n_jobs=-1)

# Create a pipeline with the StackingClassifier
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', stacking_clf)
])

# Adjusted parameter grid for RandomizedSearchCV
param_dist = {
    'vectorizer__max_df': np.linspace(0.4, 0.7, 4),
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__rf__n_estimators': [50, 100],  # Specify RandomForest parameters
    'classifier__rf__max_depth': [10, 15]
}

# Create a RandomizedSearchCV object
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=8,
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Perform hyperparameter tuning on a subset of data
subset_size = 2000  # Adjust as needed
X_subset = X[:subset_size]
y_subset = y[:subset_size]

random_search.fit(X_subset, y_subset)

# Output the best parameters and score
best_parameters = random_search.best_params_
best_score = random_search.best_score_
print("Best Parameters:", best_parameters)
print("Best Score:", best_score)

Fitting 3 folds for each of 8 candidates, totalling 24 fits




Best Parameters: {'vectorizer__ngram_range': (1, 2), 'vectorizer__max_df': 0.6, 'classifier__rf__n_estimators': 100, 'classifier__rf__max_depth': 15}
Best Score: 0.7074991033012022


In [12]:
# Create the base models for the StackingClassifier with the tuned parameters
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=15, class_weight='balanced', random_state=42)),
    ('lr', LogisticRegression(max_iter=500, random_state=42))  # Logistic Regression as another base model
]

# Final estimator for the StackingClassifier
final_estimator = LogisticRegression(max_iter=500, random_state=42)

# Create the StackingClassifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=final_estimator, cv=3, n_jobs=-1)

# Create a pipeline with TfidfVectorizer and the StackingClassifier
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.6, ngram_range=(1, 2))),
    ('classifier', stacking_clf)
])

# Assuming X and y are your dataset features and labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert all labels to string type
y_train_str = y_train.astype(str)
y_test_str = y_test.astype(str)

# Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_str)

# Train the pipeline with the training data
pipeline.fit(X_train, y_train_encoded)

# Prepare target names for the classification report
target_names = label_encoder.classes_

# Evaluate the pipeline on the test data
y_pred = pipeline.predict(X_test)
y_test_encoded = label_encoder.transform(y_test_str)
report = classification_report(y_test_encoded, y_pred, target_names=target_names)

# Print the classification report
print(report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

       anger       0.66      0.52      0.58      3617
        fear       0.78      0.57      0.66       646
         joy       0.58      0.66      0.62     15274
        love       0.59      0.34      0.43      5039
     neutral       0.48      0.60      0.53     12433
         sad       0.49      0.45      0.47      8564
     sadness       0.77      0.82      0.80      1126
    surprise       0.42      0.13      0.20      1868

    accuracy                           0.54     48567
   macro avg       0.60      0.51      0.54     48567
weighted avg       0.55      0.54      0.54     48567



In [20]:
# New text to analyze
# new_text = "We bought this for our nieces birthday. She loved it! Thank you so much. We will definitely shop with this vender in the future! Fast shipping. Just as described. Would recommend highly."
new_text = "I ordered my coffee, like i have at least 5 or 6 other times this year. When it arrived it was in a bag, not the normal shipping box that it usually comes in. When I opened it, immediately, coffee grounds fell out of bag. Upon further inspection, all the boxes in the bag are damaged, and not one of them has undamaged K-cups. I love the coffee, regular drinker, but a good portion of this delivery is destroyed or highly damaged."
# Preprocess the text (make sure this matches the preprocessing used during training)
# For example, if you used certain tokenization or cleaning steps, apply them here
# Since this is an example, I'm directly using the text as is

# Use the trained pipeline to predict the emotion
predicted_emotion = pipeline.predict([new_text])[0]

# Get the emotion label from the encoded prediction
emotion_label = label_encoder.inverse_transform([predicted_emotion])[0]

# Format and print the output statement
print(f"Your feelings of {emotion_label} are valid.")


Your feelings of joy are valid.
