In [None]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier



import warnings
# NOTE(review): this silences every warning for the whole kernel session,
# including deprecation notices from pandas/scikit-learn.
warnings.filterwarnings('ignore')

# Fetch every NLTK resource the notebook relies on:
#   stopwords - stop-word list used when filtering tokens
#   punkt     - tokenizer models used by word_tokenize (older NLTK releases)
#   punkt_tab - tokenizer tables used by word_tokenize (recent NLTK releases)
#   wordnet   - corpus required by WordNetLemmatizer; without it the
#               lemmatization step raises LookupError on a fresh machine
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(_nltk_resource)

print("Libraries imported")


In [None]:
# Raw string: the Windows backslashes must be taken literally. Without the
# r-prefix, sequences such as "\w" or "\S" are invalid escapes (a SyntaxWarning
# on recent Python versions), and a folder starting with e.g. "n" or "t" would
# silently turn into a newline/tab character.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
file_path = r'c:\work\learnai\SentimentAnalysis\Emotion_classify_Data.csv'
# Read the CSV file
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print(data.head())

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Count how many rows carry each distinct value of the 'Emotion' column.
data['Emotion'].value_counts()

In [23]:
"use this function to get preprocessed text"
# Cache for the NLTK helpers used by preprocess_text. They are built once and
# reused, because the function runs once per row under DataFrame.apply.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached stop-word set, stemmer and lemmatizer, building them on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build everything first so a failure (e.g. missing corpus) leaves the
        # cache empty and the next call retries cleanly.
        built = {
            'stop_words': frozenset(stopwords.words('english')),
            'stemmer': PorterStemmer(),
            'lemmatizer': WordNetLemmatizer(),
        }
        _PREPROCESS_RESOURCES.update(built)
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Normalize a piece of raw text for vectorization.

    Steps: lowercase and tokenize, drop English stop words, apply Porter
    stemming, then WordNet lemmatization, and join the tokens with spaces.

    Args:
        text: The raw string to clean.

    Returns:
        A single space-separated string of the processed tokens.
    """
    resources = _get_preprocess_resources()
    stop_words = resources['stop_words']
    stemmer = resources['stemmer']
    lemmatizer = resources['lemmatizer']

    # Tokenize the lowercased text into words
    tokens = word_tokenize(text.lower())

    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Stem, then lemmatize each remaining token.
    # NOTE(review): lemmatizing an already-stemmed token rarely changes it; the
    # step is kept so the produced features stay the same as before.
    normalized_tokens = [
        lemmatizer.lemmatize(stemmer.stem(token)) for token in filtered_tokens
    ]

    # Join the tokens back into a single string
    return ' '.join(normalized_tokens)

In [None]:
# Run preprocess_text on every value of the 'Comment' column and store the
# cleaned text in a new 'preprocessed_comment' column.
data['preprocessed_comment'] = data['Comment'].apply(preprocess_text)

In [None]:
# Preview the first rows, which now include the 'preprocessed_comment' column.
data.head()

In [31]:
# Map each emotion label to an integer id. The fitted encoder is kept so
# predictions can be turned back into label names with inverse_transform.
encoder = LabelEncoder()
data['emotion_num'] = encoder.fit_transform(data['Emotion'])

In [None]:
# Preview the first rows, which now include the 'emotion_num' column.
data.head()

In [34]:
# Features are the cleaned comments; the target is the integer-encoded emotion.
X, y = data['preprocessed_comment'], data['emotion_num']

In [35]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable. x_train / x_test still hold raw (preprocessed) text here.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text so both share the same feature space.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

In [None]:
# Train a random forest on the TF-IDF features and evaluate it on the held-out split.
# random_state is fixed so a fresh Restart & Run All builds the same forest and
# reports the same metrics; 42 matches the seed used for the train/test split.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Rows of the report are the integer ids produced by the label encoder.
print(classification_report(y_test, y_pred))


In [None]:
# Smoke test: push one hand-written sentence through the same pipeline used
# for training (preprocess -> TF-IDF transform -> classifier).
test_text = "i am so angry and will hit you"
test_text_preprocessed = preprocess_text(test_text)
print("Preprocessed text:", test_text_preprocessed)
# transform() expects an iterable of documents, hence the one-element list.
test_text_vectorized = vectorizer.transform([test_text_preprocessed])
print("Vectorized text:", test_text_vectorized)
prediction = classifier.predict(test_text_vectorized)
# Convert the predicted integer id back to the original emotion label.
print("Predicted emotion:", encoder.inverse_transform(prediction))