In [1]:
import pandas as pd 

import catboost as cb

import numpy as np

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('/kaggle/input/amerix-sa-nlp-deep-cleaned-data/sentiment_adv_preprocessed_data.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,tweet_created_at,text_clean,Sentiment
0,2022-05-20 16:01:49+00:00,actor amp cookbook author recently joined the ...,1
1,2022-08-23 15:26:31+00:00,sprinkled a little kindness to brighten someon...,1
2,2021-10-15 17:23:25+00:00,the body is not stupid it has an efficient hom...,1
3,2022-11-19 16:26:01+00:00,food for the thought masculinitysaturday,2
4,2022-12-12 08:59:22+00:00,why though,2


In [4]:
ros = RandomOverSampler()

train_x, train_y = ros.fit_resample(np.array(df['text_clean']).reshape(-1, 1), np.array(df['Sentiment']).reshape(-1, 1));

ros_df = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['text_clean', 'Sentiment']);

In [5]:
ros_df['Sentiment'].value_counts()

1    129038
2    129038
0    129038
Name: Sentiment, dtype: int64

In [6]:
X = ros_df['text_clean'].values
y = ros_df['Sentiment'].values

In [7]:
print(X.shape)
print(y.shape)

(387114,)
(387114,)


In [8]:
x_, x_test, y_, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [9]:
x_train, x_val, y_train, y_val = train_test_split(x_, y_, test_size=0.2, stratify=y_, random_state=42)

In [10]:
y_train_c = y_train.copy()
y_valid_c = y_val.copy()
y_test_c = y_test.copy()

In [11]:
encoder = preprocessing.OneHotEncoder()

y_train = encoder.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_val = encoder.fit_transform(np.array(y_val).reshape(-1, 1)).toarray()
y_test = encoder.fit_transform(np.array(y_test).reshape(-1, 1)).toarray()

In [12]:
print(f"Training Data: {x_train.shape[0]}\nValidation Data: {x_val.shape[0]}\nTesting Data: {x_test.shape[0]}" )

Training Data: 216783
Validation Data: 54196
Testing Data: 116135


In [13]:
clf = CountVectorizer()

X_train_cv =  clf.fit_transform(x_train)
X_test_cv = clf.transform(x_test)

In [14]:
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_cv)

X_train_tf = tf_transformer.transform(X_train_cv)
X_test_tf = tf_transformer.transform(X_test_cv)

In [15]:
cb_classifier = cb.CatBoostClassifier()

In [16]:
cb_classifier.fit(X_train_tf, y_train_c)

Learning rate set to 0.104102
0:	learn: 1.0729814	total: 4.35s	remaining: 1h 12m 26s
1:	learn: 1.0527387	total: 8.14s	remaining: 1h 7m 40s
2:	learn: 1.0358546	total: 11.3s	remaining: 1h 2m 42s
3:	learn: 1.0218419	total: 14.9s	remaining: 1h 2m
4:	learn: 1.0096503	total: 18.4s	remaining: 1h 54s
5:	learn: 0.9992191	total: 21.7s	remaining: 59m 51s
6:	learn: 0.9901375	total: 25.4s	remaining: 1h 2s
7:	learn: 0.9820780	total: 28.6s	remaining: 59m 10s
8:	learn: 0.9741166	total: 31.6s	remaining: 58m 1s
9:	learn: 0.9667601	total: 34.5s	remaining: 56m 57s
10:	learn: 0.9601612	total: 37.5s	remaining: 56m 8s
11:	learn: 0.9540531	total: 40.7s	remaining: 55m 48s
12:	learn: 0.9482543	total: 43.6s	remaining: 55m 10s
13:	learn: 0.9429532	total: 46.5s	remaining: 54m 38s
14:	learn: 0.9380122	total: 49.8s	remaining: 54m 27s
15:	learn: 0.9334610	total: 52.6s	remaining: 53m 54s
16:	learn: 0.9289702	total: 55.4s	remaining: 53m 22s
17:	learn: 0.9248575	total: 58s	remaining: 52m 45s
18:	learn: 0.9206008	total: 

<catboost.core.CatBoostClassifier at 0x7f8308ff8ed0>

In [17]:
cb_prediction = cb_classifier.predict(X_test_tf)

In [18]:
print('\tExtreme Gradient Boosters Classification Report:\n\n',
      classification_report(y_test_c,cb_prediction, target_names=['Negative', 'Positive', 'Neutral']))

	Extreme Gradient Boosters Classification Report:

               precision    recall  f1-score   support

    Negative       0.84      0.78      0.81     38711
    Positive       0.87      0.78      0.82     38712
     Neutral       0.78      0.92      0.84     38712

    accuracy                           0.83    116135
   macro avg       0.83      0.83      0.83    116135
weighted avg       0.83      0.83      0.83    116135

