# Import Needed Modules

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import spacy

# EDA

In [None]:
# Read the training CSV of the Twitter entity sentiment dataset into df.
# Column names are supplied explicitly via `names=columns`, so the first line
# of the file is read as data rather than as a header.
# NOTE(review): the second column is named 'country', but given the dataset
# name it presumably holds the entity/topic of the tweet — confirm.
columns = ['id','country','Label','Text']
df = pd.read_csv("/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv", names=columns)

# Print the shape of dataframe
print(df.shape)

# Print top 5 rows
df.head(5)

In [None]:
# Summarize the columns of df (dtypes and non-null counts)
df.info()

In [None]:
# Check the distribution of the 'Label' column (number of rows per class)
df['Label'].value_counts()

In [None]:
# Show the first five samples as "<n>: <text> -> <label>".
# Rows are taken positionally with head(5) instead of label lookups such as
# df['Text'][i]: those raise KeyError when the index has gaps (for example if
# this cell is re-run after rows were dropped) or when df has fewer than 5 rows.
for i, (text, label) in enumerate(
    df[['Text', 'Label']].head(5).itertuples(index=False, name=None), start=1
):
    print(f"{i}: {text} -> {label}")

# Preprocessing

### Drop nan values

In [None]:
# Remove every row that has a missing value in any column, modifying df in place
df.dropna(inplace=True)

### Preprocess Function

In [None]:
# Load spaCy's small English pipeline once; `nlp` is reused by preprocess()
# for every text, so it is created at notebook level rather than per call.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Utility that turns raw text into a cleaned, lemmatized string
def preprocess(text):
    """Return *text* without stop words and punctuation, with tokens lemmatized.

    The text is run through the notebook-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or punctuation are dropped, and the lemmas of the
    remaining tokens are joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

Apply preprocess function on dataframe

In [None]:
# Run preprocess() on every entry of the 'Text' column and keep the result in
# a new column, leaving the original text untouched.
df['Preprocessed Text'] = df['Text'].apply(preprocess)

In [None]:
# Display the dataframe, now including the 'Preprocessed Text' column
df

Encoding target column

In [None]:
# Encode the string labels as integers. The fitted encoder is kept in le_model
# so the codes can be mapped back to names later (le_model.classes_ /
# le_model.inverse_transform).
# NOTE: this overwrites the 'Label' column in place, so re-running this cell
# would fit the encoder on the already-encoded integers.
le_model = LabelEncoder()
df['Label'] = le_model.fit_transform(df['Label'])

In [None]:
# Inspect the first rows after label encoding
df.head(5)

Split data into train and test

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# column and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['Preprocessed Text'],
    df['Label'],
    test_size=0.2,
    stratify=df['Label'],
    random_state=42,
)

In [None]:
# Report how many samples ended up in each split
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

# Machine Learning Model

### Naive Bayes Model

In [None]:
# Create classifier: TF-IDF features fed into a multinomial Naive Bayes model.
# The vectorizer runs with its default settings, so the step is named for what
# it is; the previous name 'vectorizer_tri_grams' suggested an n-gram setup
# that is not configured here.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('naive_bayes', MultinomialNB()),
])

In [None]:
# Model training: fit the whole pipeline (vectorizer + classifier) on the
# training split only
clf.fit(X_train, y_train)

In [None]:
# Get predictions for the held-out test split
y_pred = clf.predict(X_test)

In [None]:
# Print the accuracy of the predictions on the test split
print(accuracy_score(y_test, y_pred))

In [None]:
# Print the classification report for the test split
# (rows are the integer label codes produced by le_model)
print(classification_report(y_test, y_pred))

Random Forest

In [None]:
# Create classifier: TF-IDF features fed into a random forest.
# Step names now describe the actual components (the classifier step used to
# be called 'naive_bayes', a leftover from the previous model), and the forest
# is seeded with the same value as the train/test split so results are
# reproducible between runs.
# NOTE: this rebinds `clf`, replacing the Naive Bayes pipeline.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('random_forest', RandomForestClassifier(random_state=42)),
])

In [None]:
# Fit the random forest pipeline on the training split
clf.fit(X_train, y_train)

In [None]:
# Get the predictions for X_test and store them in y_pred
# (this overwrites the Naive Bayes predictions)
y_pred = clf.predict(X_test)

In [None]:
# Print the accuracy of the predictions on the test split
print(accuracy_score(y_test, y_pred))

In [None]:
# Print the classification report for the test split
print(classification_report(y_test, y_pred))

# Test Model

Get text

In [None]:
# Read the validation CSV with the same explicit column names as the training
# data (`columns` is defined in the data-loading cell), then preview it.
test_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv', names=columns)
test_df.head()

In [None]:
# Pick the validation row with index label 10 as a single example and show its
# text together with its true (string) label.
test_text = test_df['Text'][10]
print(f"{test_text} ===> {test_df['Label'][10]}")

Apply preprocess

In [None]:
# Apply the same preprocessing used for training; the result is wrapped in a
# list because the pipeline's predict() is called on a collection of texts.
test_text_processed = [preprocess(test_text)]
test_text_processed

Get Prediction

In [None]:
# Predict the encoded label for the preprocessed sample.
# NOTE(review): this rebinds test_text from the raw text to the prediction
# array; the output cell reads the predicted code via test_text[0].
test_text = clf.predict(test_text_processed)

Output

In [None]:
# Map the predicted integer code back to its class name using the fitted
# encoder instead of a hand-written list. LabelEncoder stores its classes in
# sorted order, so for the labels Irrelevant/Negative/Neutral/Positive code 1
# is Negative and code 2 is Neutral; the previous hard-coded list had those two
# swapped (and spelled Neutral as "Natural"), which mislabelled predictions.
classes = le_model.classes_

print(f"True Label: {test_df['Label'][10]}")
print(f'Predict Label: {classes[test_text[0]]}')

Irrelevant : 0
Negative : 1
Neutral : 2
Positive : 3