In [1]:
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import tensorflow as tf
from tensorflow import keras

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

# Single source of truth for the random seed used across the notebook.
seed = 42
# Seed NumPy's global RNG so that components created without an explicit
# random_state (which fall back to this global generator) are repeatable.
# NOTE(review): TensorFlow is not seeded here; call tf.random.set_seed(seed)
# before building a model if training itself must be reproducible.
np.random.seed(seed)

In [2]:
from transformers import BertTokenizerFast
from transformers import TFBertModel

In [3]:
# Global plot styling shared by every figure in the notebook.
# (No sns.despine() here: with no axes drawn yet it only spawns an empty figure.)
sns.set_style("whitegrid")
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so pick whichever one exists.
mpl_style = "seaborn-v0_8-whitegrid"
if mpl_style not in plt.style.available:
    mpl_style = "seaborn-whitegrid"
plt.style.use(mpl_style)
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

<Figure size 640x480 with 0 Axes>

In [4]:
def conf_matrix(y, y_pred, title):
    """Plot a 3x3 confusion-matrix heatmap for the sentiment classes.

    Parameters
    ----------
    y : array-like
        True class ids.
    y_pred : array-like
        Predicted class ids, same length as ``y``.
    title : str
        Text shown above the heatmap.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # NOTE(review): the tick order assumes class ids 0, 1, 2 correspond to
    # these names in this order -- confirm against the Sentiment encoding.
    labels = ['Negative', 'Positive', 'Neutral']
    # Pin the class order so the matrix is always len(labels) x len(labels),
    # even when a class is absent from both y and y_pred; otherwise the
    # matrix shrinks and the tick labels below no longer line up.
    matrix = confusion_matrix(y, y_pred, labels=list(range(len(labels))))

    _, ax = plt.subplots(figsize=(5, 5))
    # Draw on the axes created above instead of relying on the implicit
    # "current axes" lookup.
    sns.heatmap(matrix, annot=True, cmap="Blues", fmt='g', cbar=False,
                annot_kws={"size": 25}, ax=ax)
    ax.set_title(title, fontsize=20)
    ax.xaxis.set_ticklabels(labels, fontsize=17)
    ax.yaxis.set_ticklabels(labels, fontsize=17)
    ax.set_ylabel('Test', fontsize=20)
    ax.set_xlabel('Predicted', fontsize=20)
    plt.show()

In [5]:
# Path of the input CSV inside the Kaggle dataset mount.
DATA_PATH = '/kaggle/input/smsa-tokenized-emoji-dataset/amerix_smsa_adv_emoji_ecoded_data.csv'
# Use the first CSV column as the row index.
df = pd.read_csv(DATA_PATH, index_col=0)

In [6]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,tweet_created_at,text_clean_deeper,Sentiment
154462,2022-06-08 11:44:34+00:00,wishing the team the best today the team has i...,1
98658,2022-11-07 22:49:01+00:00,all humans are bettertogether voteblue resist ...,2
73057,2022-10-29 05:34:53+00:00,lemme retweet share screenshot and tag this fe...,1
124877,2022-08-21 23:58:59+00:00,keep pushing multiple aeds are so important fo...,1
99068,2022-11-06 02:45:54+00:00,this is my only points slip this was 80k this ...,1


In [7]:
# (rows, columns) of the loaded frame.
df.shape

(233687, 3)

In [8]:
# Per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233687 entries, 154462 to 17279
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   tweet_created_at   233687 non-null  object
 1   text_clean_deeper  233687 non-null  object
 2   Sentiment          233687 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 7.1+ MB


In [9]:
# Balance the sentiment classes by randomly duplicating minority-class rows.
# NOTE(review): oversampling runs before the train/validation/test split, so
# copies of the same tweet can end up in several splits and inflate held-out
# scores. Consider splitting first and resampling only the training portion.
ros = RandomOverSampler(random_state=seed)

# The sampler needs a 2-D feature matrix, hence the (n_samples, 1) reshape of
# the text column; the target is passed 1-D.
train_x, train_y = ros.fit_resample(
    df['text_clean_deeper'].to_numpy().reshape(-1, 1),
    df['Sentiment'].to_numpy(),
)

# Rebuild a two-column frame from the resampled arrays.
ros_df = pd.DataFrame({
    'text_clean_deeper': train_x[:, 0],
    'Sentiment': train_y,
})

In [10]:
# Rows per class after oversampling.
ros_df['Sentiment'].value_counts()

1    129190
2    129190
0    129190
Name: Sentiment, dtype: int64

In [11]:
# Pull the text and label columns out of the resampled frame as NumPy arrays.
X = ros_df['text_clean_deeper'].to_numpy()
y = ros_df['Sentiment'].to_numpy()

In [12]:
# Hold out 30% of the data as the test set, stratified by class. The shared
# `seed` constant replaces the repeated literal so all splits stay in sync.
x_, x_test, y_, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=seed
)

In [13]:
# Carve 20% of the remaining (non-test) data off as the validation set,
# stratified by class and driven by the shared `seed` constant.
x_train, x_val, y_train, y_val = train_test_split(
    x_, y_, test_size=0.2, stratify=y_, random_state=seed
)

In [14]:
# Keep independent copies of the label arrays for each split.
# NOTE(review): presumably so the originals can be re-encoded later without
# losing the integer class ids -- confirm against later cells.
y_train_c, y_valid_c, y_test_c = (
    labels.copy() for labels in (y_train, y_val, y_test)
)

In [15]:
# Report how many samples ended up in each split.
n_train, n_val, n_test = x_train.shape[0], x_val.shape[0], x_test.shape[0]
print(f"Training Data: {n_train}\nValidation Data: {n_val}\nTesting Data: {n_test}")

Training Data: 217039
Validation Data: 54260
Testing Data: 116271


In [16]:
# Name of the pretrained checkpoint whose tokenizer files are loaded.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(BERT_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [17]:
# Default number of tokens every encoded text is padded or truncated to.
MAX_LEN = 128

def tokenize(data, max_len=MAX_LEN):
    """Encode texts into fixed-length input ids and attention masks.

    Uses the module-level ``tokenizer``.

    Parameters
    ----------
    data : iterable of str
        Texts to encode (e.g. a list, NumPy array or pandas Series).
    max_len : int, optional
        Length every sequence is padded or truncated to. Defaults to
        ``MAX_LEN``.

    Returns
    -------
    tuple of np.ndarray
        ``(input_ids, attention_masks)``; for non-empty ``data`` each array
        has shape ``(len(data), max_len)``.
    """
    input_ids = []
    attention_masks = []

    # Iterate over the values rather than positions so that a pandas Series
    # with a non-default index is handled the same way as a list or array.
    for text in data:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            # Honour the caller's max_len instead of the global constant.
            max_length=max_len,
            padding='max_length',
            # Without truncation, texts longer than max_len keep their full
            # length and the rows can no longer be stacked into one array.
            truncation=True,
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.array(input_ids), np.array(attention_masks)