In [2]:
# models/statistical/random_forest.py
#----------------------------------------------------------
# vishnuam300@gmail.com
# VISHNU A M

### Data Collection and Preprocessing

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
#Load data set function 

def load_data(filename, label):
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return pd.DataFrame({'text': lines, 'label': label})
#Load data
sadness = load_data('sadness-ratings-0to1.train.txt', 'sadness')
anger = load_data('anger-ratings-0to1.train.txt', 'anger')
joy = load_data('joy-ratings-0to1.train.txt', 'joy')
fear = load_data('fear-ratings-0to1.train.txt', 'fear')

In [5]:
#Combine data
data = pd.concat([sadness, anger, joy, fear], ignore_index=True)

In [6]:
#Shuffle data
data = data.sample(frac=1).reset_index(drop=True)

### Exploratory Data Analysis (EDA)

In [7]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#Text cleaning function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^a-z\s]', '', text) 
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]  
    return ' '.join(tokens)

#Apply cleaning
data['text'] = data['text'].apply(clean_text)

### Train-test split

In [8]:
#Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

#Encode labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

### Model Training  and Evaluation

###    -- Naive_bayes
###    -- RandomForestClassifier
###    -- SVC

### Naivebayes

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

#Feature extraction
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train model
model_nb = MultinomialNB()
model_nb.fit(X_train_tfidf, y_train)

#Evaluate model
y_pred = model_nb.predict(X_test_tfidf)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9543568464730291
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       160
           1       0.90      0.99      0.94       230
           2       0.98      0.96      0.97       169
           3       0.99      0.88      0.93       164

    accuracy                           0.95       723
   macro avg       0.96      0.95      0.96       723
weighted avg       0.96      0.95      0.95       723



### RandomForestClassifier

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

#Train Random Forest model
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train_tfidf, y_train)

#Evaluate model
y_pred_Rfc = model_rf.predict(X_test_tfidf)
print('Accuracy:', accuracy_score(y_test, y_pred_Rfc))
print(classification_report(y_test, y_pred_Rfc))

Accuracy: 0.9972337482710927
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       160
           1       1.00      1.00      1.00       230
           2       0.99      1.00      1.00       169
           3       1.00      0.99      0.99       164

    accuracy                           1.00       723
   macro avg       1.00      1.00      1.00       723
weighted avg       1.00      1.00      1.00       723



### SVM

In [11]:

from sklearn import svm
model_Lr = svm.SVC()
model_Lr.fit(X_train_tfidf, y_train)

#Evaluate model
y_pred_svm = model_Lr.predict(X_test_tfidf)
print('Accuracy:', accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

Accuracy: 0.9972337482710927
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       160
           1       1.00      1.00      1.00       230
           2       0.99      1.00      1.00       169
           3       1.00      0.99      0.99       164

    accuracy                           1.00       723
   macro avg       1.00      1.00      1.00       723
weighted avg       1.00      1.00      1.00       723



## Final Results, Evaluation and comparision of all the models

In [12]:
print('Accuracy Svm:', accuracy_score(y_test, y_pred_svm))
print('Accuracy Random Forest:', accuracy_score(y_test, y_pred_Rfc))
print('Accuracy Naives:', accuracy_score(y_test, y_pred))

Accuracy Svm: 0.9972337482710927
Accuracy Random Forest: 0.9972337482710927
Accuracy Naives: 0.9543568464730291


## To check Sample Inputs.....

In [13]:
sample_texts = [
    "I am feeling very sad and down today.",
    "I am so angry about what happened!",
    "I am absolutely overjoyed with the news!",
    "I am really scared about the future."
]

#Clean the sample inputs
cleaned_samples = [clean_text(text) for text in sample_texts]
sample_tfidf = vectorizer.transform(cleaned_samples)
#Predict the emotion labels
sample_predictions = model_rf.predict(sample_tfidf)
#Decode the predicted labels
decoded_predictions = le.inverse_transform(sample_predictions)

#Displays
for text, emotion in zip(sample_texts, decoded_predictions):
    print(f"Text: {text} => Predicted Emotion: {emotion}")


Text: I am feeling very sad and down today. => Predicted Emotion: sadness
Text: I am so angry about what happened! => Predicted Emotion: anger
Text: I am absolutely overjoyed with the news! => Predicted Emotion: joy
Text: I am really scared about the future. => Predicted Emotion: sadness
