In [1]:
import pandas as pd 

import catboost as cb

import numpy as np

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [3]:
df = pd.read_csv('/kaggle/input/amerix-sa-nlp-deep-cleaned-data/sentiment_adv_preprocessed_data.csv', index_col=0)

In [4]:
df.head()

Unnamed: 0,tweet_created_at,text_clean,Sentiment
0,2022-05-20 16:01:49+00:00,actor amp cookbook author recently joined the ...,1
1,2022-08-23 15:26:31+00:00,sprinkled a little kindness to brighten someon...,1
2,2021-10-15 17:23:25+00:00,the body is not stupid it has an efficient hom...,1
3,2022-11-19 16:26:01+00:00,food for the thought masculinitysaturday,2
4,2022-12-12 08:59:22+00:00,why though,2


In [5]:
ros = RandomOverSampler()

train_x, train_y = ros.fit_resample(np.array(df['text_clean']).reshape(-1, 1), np.array(df['Sentiment']).reshape(-1, 1));

ros_df = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['text_clean', 'Sentiment']);

In [6]:
ros_df['Sentiment'].value_counts()

1    129038
2    129038
0    129038
Name: Sentiment, dtype: int64

In [7]:
X = ros_df['text_clean'].values
y = ros_df['Sentiment'].values

In [8]:
print(X.shape)
print(y.shape)

(387114,)
(387114,)


In [9]:
x_, x_test, y_, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [10]:
x_train, x_val, y_train, y_val = train_test_split(x_, y_, test_size=0.2, stratify=y_, random_state=42)

In [11]:
y_train_c = y_train.copy()
y_valid_c = y_val.copy()
y_test_c = y_test.copy()

In [12]:
encoder = preprocessing.OneHotEncoder()

y_train = encoder.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_val = encoder.fit_transform(np.array(y_val).reshape(-1, 1)).toarray()
y_test = encoder.fit_transform(np.array(y_test).reshape(-1, 1)).toarray()

In [13]:
print(f"Training Data: {x_train.shape[0]}\nValidation Data: {x_val.shape[0]}\nTesting Data: {x_test.shape[0]}" )

Training Data: 216783
Validation Data: 54196
Testing Data: 116135


In [14]:
clf = CountVectorizer()

X_train_cv =  clf.fit_transform(x_train)
X_test_cv = clf.transform(x_test)

In [15]:
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_cv)

X_train_tf = tf_transformer.transform(X_train_cv)
X_test_tf = tf_transformer.transform(X_test_cv)

In [16]:
cb_classifier = cb.CatBoostClassifier()

In [17]:
cb_classifier.fit(X_train_tf, y_train_c)

Learning rate set to 0.104102
0:	learn: 1.0725530	total: 3.54s	remaining: 59m 2s
1:	learn: 1.0524012	total: 6.49s	remaining: 53m 58s
2:	learn: 1.0360748	total: 9.22s	remaining: 51m 5s
3:	learn: 1.0221652	total: 12.2s	remaining: 50m 32s
4:	learn: 1.0099578	total: 15.3s	remaining: 50m 46s
5:	learn: 0.9999002	total: 18.1s	remaining: 50m 6s
6:	learn: 0.9908772	total: 21.2s	remaining: 50m 7s
7:	learn: 0.9823320	total: 24.1s	remaining: 49m 44s
8:	learn: 0.9745865	total: 26.5s	remaining: 48m 38s
9:	learn: 0.9680720	total: 28.7s	remaining: 47m 22s
10:	learn: 0.9613729	total: 31.2s	remaining: 46m 43s
11:	learn: 0.9555031	total: 33.6s	remaining: 46m 10s
12:	learn: 0.9500827	total: 36.1s	remaining: 45m 41s
13:	learn: 0.9447756	total: 38.6s	remaining: 45m 21s
14:	learn: 0.9397063	total: 41.1s	remaining: 45m
15:	learn: 0.9344291	total: 43.5s	remaining: 44m 35s
16:	learn: 0.9301361	total: 45.9s	remaining: 44m 13s
17:	learn: 0.9258005	total: 48.4s	remaining: 44m
18:	learn: 0.9216894	total: 50.4s	rema

<catboost.core.CatBoostClassifier at 0x7f412893d810>

In [18]:
cb_prediction = cb_classifier.predict(X_test_tf)

In [19]:
print('\tExtreme Gradient Boosters Classification Report:\n\n',
      classification_report(y_test_c,cb_prediction, target_names=['Negative', 'Positive', 'Neutral']))

	Extreme Gradient Boosters Classification Report:

               precision    recall  f1-score   support

    Negative       0.84      0.78      0.81     38711
    Positive       0.87      0.78      0.82     38712
     Neutral       0.78      0.92      0.84     38712

    accuracy                           0.83    116135
   macro avg       0.83      0.83      0.83    116135
weighted avg       0.83      0.83      0.83    116135

