In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the preprocessed tweet sentiment data; the first CSV column becomes the row index.
df = pd.read_csv(
    '/kaggle/input/amerix-sa-nlp-deep-cleaned-data/sentiment_adv_preprocessed_data.csv',
    index_col=0,
)

In [3]:
# Preview the first rows to confirm the expected columns (text_clean, Sentiment) were loaded.
df.head()

Unnamed: 0,tweet_created_at,text_clean,Sentiment
0,2022-05-20 16:01:49+00:00,actor amp cookbook author recently joined the ...,1
1,2022-08-23 15:26:31+00:00,sprinkled a little kindness to brighten someon...,1
2,2021-10-15 17:23:25+00:00,the body is not stupid it has an efficient hom...,1
3,2022-11-19 16:26:01+00:00,food for the thought masculinitysaturday,2
4,2022-12-12 08:59:22+00:00,why though,2


In [4]:
# Balance the classes by randomly duplicating rows of the under-represented classes.
#
# random_state is fixed (same seed as the train/test splits further down) so the
# set of duplicated rows, and therefore every downstream number, is reproducible
# under Restart & Run All.
#
# NOTE(review): oversampling is done on the full dataset, before the train/test
# split. Duplicated rows can therefore end up in both the training and the test
# partition, which tends to inflate the reported test metrics. Consider splitting
# first and oversampling only the training portion.
ros = RandomOverSampler(random_state=42)

# The sampler expects a 2-D feature matrix, so the single text column is reshaped
# to (n_samples, 1). The target is passed as a plain 1-D array.
texts_2d = df['text_clean'].to_numpy().reshape(-1, 1)
labels_1d = df['Sentiment'].to_numpy()

train_x, train_y = ros.fit_resample(texts_2d, labels_1d)

# Rebuild a two-column frame from the resampled arrays (text back to 1-D).
ros_df = pd.DataFrame({'text_clean': train_x[:, 0], 'Sentiment': train_y})

In [5]:
# Sanity check: after oversampling, every sentiment class should have the same row count.
ros_df['Sentiment'].value_counts()

1    129038
2    129038
0    129038
Name: Sentiment, dtype: int64

In [6]:
# Extract the cleaned text (features) and sentiment codes (labels) from the
# balanced frame as NumPy arrays.
X = ros_df['text_clean'].to_numpy()
y = ros_df['Sentiment'].to_numpy()

In [7]:
# Features and labels must have the same length (one label per text).
print(X.shape, y.shape, sep='\n')

(387114,)
(387114,)


In [8]:
# Hold out 30% of the samples as the test set; stratify so each class keeps
# the same proportion in both partitions. Seeded for reproducibility.
x_, x_test, y_, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [9]:
# Size of the non-test remainder that is split into train/validation next.
print(x_.shape, y_.shape, sep='\n')

(270979,)
(270979,)


In [10]:
# Split the non-test remainder again: 20% of it becomes the validation set
# (i.e. 14% of all samples), the rest is used for training. Stratified and seeded.
x_train, x_val, y_train, y_val = train_test_split(
    x_,
    y_,
    test_size=0.2,
    stratify=y_,
    random_state=42,
)

In [11]:
# Keep integer-coded copies of the labels. y_train / y_val / y_test are rebound
# to one-hot arrays in the following cell, while the classifier and the report
# further down work on these integer versions.
y_train_c, y_valid_c, y_test_c = y_train.copy(), y_val.copy(), y_test.copy()

In [12]:
# One-hot encode the sentiment labels for each partition.
#
# The encoder is fitted on the training labels only and then reused (transform,
# not fit_transform) for validation and test. That guarantees all three arrays
# share the same column order, and with the encoder's default settings an
# unseen label in val/test raises instead of silently producing a different layout.
#
# The inputs are the integer-coded copies (y_train_c / y_valid_c / y_test_c)
# rather than y_train / y_val / y_test themselves, so re-running this cell does
# not try to one-hot encode arrays that are already one-hot encoded.
encoder = preprocessing.OneHotEncoder()

y_train = encoder.fit_transform(np.asarray(y_train_c).reshape(-1, 1)).toarray()
y_val = encoder.transform(np.asarray(y_valid_c).reshape(-1, 1)).toarray()
y_test = encoder.transform(np.asarray(y_test_c).reshape(-1, 1)).toarray()

In [13]:
# Report how many samples ended up in each partition.
print(
    f"Training Data: {x_train.shape[0]}\nValidation Data: {x_val.shape[0]}\nTesting Data: {x_test.shape[0]}"
)

Training Data: 216783
Validation Data: 54196
Testing Data: 116135


In [14]:
# Bag-of-words token counts. The vocabulary is learned from the training texts
# only and then applied unchanged to the test texts.
# NOTE(review): despite its name, `clf` is the vectorizer, not a classifier.
# NOTE(review): x_val is not vectorized in this cell.
clf = CountVectorizer()

X_train_cv = clf.fit_transform(x_train)
X_test_cv = clf.transform(x_test)

In [15]:
# Re-weight the raw counts with TF-IDF. The IDF statistics are estimated on the
# training matrix only and reused for the test matrix.
tf_transformer = TfidfTransformer(use_idf=True)
tf_transformer.fit(X_train_cv)

X_train_tf = tf_transformer.transform(X_train_cv)
X_test_tf = tf_transformer.transform(X_test_cv)

In [16]:
# Multinomial Naive Bayes classifier, constructed with its default hyperparameters.
naive_classifier = MultinomialNB()

In [17]:
# Train on the TF-IDF features using the integer-coded labels (y_train_c),
# not the one-hot encoded y_train.
naive_classifier.fit(X_train_tf, y_train_c)

MultinomialNB()

In [18]:
# Predict a sentiment code for every TF-IDF-transformed test sample.
naive_prediction = naive_classifier.predict(X_test_tf)

In [19]:
# Per-class precision / recall / F1 on the held-out test set, computed from the
# integer-coded labels.
#
# `labels` is passed explicitly so each display name is pinned to its label code
# instead of depending on the sorted order of whatever labels happen to occur
# in y_test_c / naive_prediction.
# NOTE(review): the 0 -> Negative, 1 -> Positive, 2 -> Neutral mapping is taken
# from the order of the original target_names; confirm it against the dataset.
# NOTE(review): the data was oversampled before the train/test split, so these
# scores may be optimistic.
print('\tNaive Bayesian Classification Report:\n\n',
      classification_report(y_test_c, naive_prediction,
                            labels=[0, 1, 2],
                            target_names=['Negative', 'Positive', 'Neutral']))

	Naive Bayesian Classification Report:

               precision    recall  f1-score   support

    Negative       0.63      0.91      0.74     38711
    Positive       0.77      0.70      0.73     38712
     Neutral       0.87      0.57      0.69     38712

    accuracy                           0.73    116135
   macro avg       0.76      0.73      0.72    116135
weighted avg       0.76      0.73      0.72    116135

