In [185]:
# For preprocessing
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from imblearn.over_sampling import SMOTE
import nltk
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
import emoji
from urllib.parse import urlparse

# For VaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# For RNN model 
from gensim.models import FastText
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# 1. Data Preprocessing

In [2]:
train_df = pd.read_parquet('datasets/btc_tweets_train.parquet.gzip')
test_df = pd.read_parquet('datasets/btc_tweets_test.parquet.gzip')

In [3]:
train_df = train_df.reset_index()
test_df = test_df.reset_index()

In [None]:
def df_info(df):
    return df.shape, df.isnull().sum().sum(), df.info(), df.head()

df_info(train_df)

In [4]:
train_df = train_df.drop(['tweet ID', 'user_displayname', 'hashtags'], axis=1)
test_df = test_df.drop(['tweet ID', 'user_displayname', 'hashtags'], axis=1)

In [None]:
# Converting the sentiment labels from bool to int
train_df['sentiment'] = train_df['sentiment'].astype(int)
test_df['sentiment'] = test_df['sentiment'].astype(int)

In [5]:
class DataPreprocessor:
    def __init__(self):
        """
        Initialize the DataPreprocessor class.
        
        Args commonly used in the Methods defined below:
        
        - df (pd.DataFrame): The DataFrame containing the dataset.
        
        - content_column (str): The name of the column containing the text content to analyze.
        
        - sentiment_column (str): The name of the column containing the sentiment labels.
            
        (other Args specific to defined methods are explained in that method's docstring)
        """
    
    def remove_spam_content(self, df, content_column, similarity_threshold=0.65):
        """
        Removes extremly similar samples using cosine similarity.
        
        Args:
        - similarity_threshold (float): (default=0.65) The threshold for cosine similarity above which samples are considered similar.
        """
        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(df[content_column])
        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

        indices_to_remove = set()

        for i in range(len(cosine_sim)):
            for j in range(i + 1, len(cosine_sim)):
                if cosine_sim[i, j] > similarity_threshold:
                    indices_to_remove.add(i)
                    indices_to_remove.add(j)

        df = df.drop(index=indices_to_remove).reset_index(drop=True)
        return df

    def remove_hashtags(self, df, content_column, top_n=30):
        """
        Remove non-whitelisted hashtags from the tweet.
        
        Args:
        - top_n (int): Retain top n number of hashtags and remove others from the tweets.
        """
        
        def extract_hashtags(tweet):
            """Extract all hashtags from the text."""
            return re.findall(r'#\w+', tweet)
        
        def create_hashtag_whitelist(top_n):
            """Create a whitelist of the top N most frequent hashtags."""
            # Extract all hashtags from the dataset
            all_hashtags = sum(df[content_column].apply(extract_hashtags), [])
            # Count the occurrences of each hashtag
            hashtag_counts = Counter(all_hashtags)
            # Get the top N hashtags
            top_hashtags = [hashtag for hashtag, count in hashtag_counts.most_common(top_n)]
            return top_hashtags
        
        # Create the whitelist of top N hashtags
        whitelist = create_hashtag_whitelist(top_n=top_n)
        
        def clean_hashtags(tweet):
            hashtags = extract_hashtags(tweet)
            # Retain only hashtags in the whitelist
            for hashtag in hashtags:
                if hashtag not in whitelist:
                    tweet = tweet.replace(hashtag, '')
            return tweet
        
        # Apply the cleaning function to the dataframe
        df['hashtag_removed'] = df[content_column].apply(clean_hashtags)
        return df
    
    def remove_link(self, df, content_column):
        """
        Removes the links from the tweet
        """
        def remove_links(tweet):
            return re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
        df['links_removed'] = df[content_column].apply(remove_links)
        return df
    
    def remove_whitespace_html(self, df, content_column):
        """
        Removes unnecessary whitespace and html markups from the text
        """
        def remove_whitesp(tweet):
            tweet = BeautifulSoup(tweet, 'html.parser').get_text()
            tweet = ' '.join(tweet.split())
            return tweet
        df['whitespace_html_removed'] = df[content_column].apply(remove_whitesp)
        return df

    def clean_text(self, df, content_column):
        """
        Cleans text data by removing punctuation, stopwords, and applying lemmatization.
        """
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

        def clean(tweet):
             # Remove links
            tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
            # Convert emojis to text # Converts emojis to text, e.g., "😊" becomes ":smiling_face:"
            tweet = emoji.demojize(tweet, delimiters=(" ", " "))
            # Removes unnecessary whitespace and html markups from the text
            tweet = BeautifulSoup(tweet, 'html.parser').get_text()
            tweet = ' '.join(tweet.split())
            # Remove punctuation and numbers
            tweet = re.sub(r'[^a-zA-Z\s]', '', tweet)
            # Tokenize
            words = nltk.word_tokenize(tweet)
            # Remove stop words
            words = [word for word in words if word.lower() not in stop_words]
            # Lemmatize words
            lemmatizer = nltk.WordNetLemmatizer()
            words = [lemmatizer.lemmatize(word.lower()) for word in words]
            return ' '.join(words)
        
        df['cleaned_content'] = df[content_column].apply(clean)
        return df
    
    def handle_class_imbalance_with_SMOTE(self):
        """
        Handles class imbalance using SMOTE (Synthetic Minority Over-sampling Technique).
        """
        
        tfidf = TfidfVectorizer(stop_words='english')
        X = tfidf.fit_transform(self.df[self.content_column])
        y = self.df[self.sentiment_column]
        
        # Apply SMOTE to the vectorized text
        smote = SMOTE(random_state=21)
        X_resampled, y_resampled = smote.fit_resample(X, y)

        self.df = pd.DataFrame(X_resampled.toarray(), columns=tfidf.get_feature_names_out())
        self.df[self.sentiment_column] = y_resampled
        return self.df

In [6]:
datapreprocessor = DataPreprocessor()

**NOTE: both the cleaned datasets above are still imbalanced with True values largly outnumbered than False. The imbalance needs to be taken care of by assigning class weights dring model training.**

# 2. Benchmark: vaderSentiment Sentiment Dictionary

For sentiment analysis specifically using VADER, it's best to apply VADER to the raw, uncleaned text to leverage its strengths in handling informal language, punctuation, and emojis. However, when it comes to Links, it is best to remove them. Links are irrelevent to the sentiment and could add unnecessary noise, potentially influencing the sentiment.

The preprocessing flow for vaderSentiment
- Remove spam tweets
- Remove links
- Remove unnecessary hashtags
- Then apply vader sentiment dictionary on the cleaned tweets

### 2.1. Getting the dataset ready for VaderSentiment

In [None]:
# Asign a 
vader_test_df = train_df.copy()

In [None]:
# Remove spam and bot samples
vader_test_df = datapreprocessor.remove_spam_content(df=vader_test_df, content_column='content')

In [None]:
# Remove links from the tweets
vader_test_df = datapreprocessor.remove_link(df=vader_test_df, content_column='content')

In [None]:
# Remove unnecessary hashtags from the tweets
vader_test_df = datapreprocessor.remove_hashtags(df=vader_test_df, content_column='links_removed', top_n = 25)

In [None]:
vader_test_df = datapreprocessor.remove_whitespace_html(df=vader_test_df, content_column='hashtag_removed')

In [None]:
vader_test_df.head()

In [None]:
# look over on example to see if the links have been removed
index = 4
vader_test_df['content'].iloc[index], vader_test_df['links_removed'].iloc[index], vader_test_df['hashtag_removed'].iloc[index], vader_test_df['whitespace_html_removed'].iloc[index]

### 2.2. Fitting the VaderSentiment on processed data

In [None]:
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to apply VADER sentiment analysis and get the compound score
def get_sentiment_score(text):
    sentiment_dict = analyzer.polarity_scores(text)
    return sentiment_dict['compound']  # 'compound' score is a normalized score between -1 (negative) and +1 (positive)

# Apply sentiment analysis to the 'content' column and create a new column for the sentiment score
vader_test_df['vader_sentiment'] = vader_test_df['whitespace_html_removed'].apply(get_sentiment_score)

# Optional: Classify the sentiment based on the compound score
def classify_sentiment(score):
    if score >= -0.03:
        return True
    else:
        return False

# Apply the classification and create a new column for the sentiment label
vader_test_df['vader_sentiment_label'] = vader_test_df['vader_sentiment'].apply(classify_sentiment)


In [None]:
#vader_test_df.head()

### 2.3. Evaluating the vaderSentiment classification performance

In [None]:
y_true = vader_test_df['sentiment']
y_pred = vader_test_df['vader_sentiment_label']

# Generate classification report
print('Classification report:\n')
print(classification_report(y_true, y_pred))

# Generate a confusion matrix
vader_conf_matrix = confusion_matrix(y_true, y_pred)

# Plot the confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(vader_conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative (0)', 'Positive(1)'], yticklabels=['Negative (0)', 'Positive (1)'])
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrx')
plt.show()

# 3. RNN 

### 3.1. Getting the dataset ready for generating the embeddings

In [7]:
# Remove spam samples
unspammed_train_df = datapreprocessor.remove_spam_content(df=train_df, content_column='content')
unspammed_test_df = datapreprocessor.remove_spam_content(df=test_df, content_column='content')

In [8]:
# Remove unnecessary hashtags
unhashtag_train_df = datapreprocessor.remove_hashtags(df=unspammed_train_df, content_column='content')
unhashtag_test_df = datapreprocessor.remove_hashtags(df=unspammed_test_df, content_column='content')

In [9]:
# Cleaned tweets (Free from links, emojis, whitespace, html markup and then Lemmetized)
cleaned_train_df = datapreprocessor.clean_text(df=unhashtag_train_df, content_column='hashtag_removed')
cleaned_test_df = datapreprocessor.clean_text(df=unhashtag_test_df, content_column='hashtag_removed')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  tweet = BeautifulSoup(tweet, 'html.parser').get_text()
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  tweet = BeautifulSoup(tweet, 'html.parser').get_text()


### 3.2. Training the FastText embeddings

In [None]:
nltk.download('punkt')
nltk.download('wordnet')

In [130]:
# Tokenize the cleaned tweets (split by spaces)
tokenized_tweets = [tweet.split() for tweet in cleaned_train_df['cleaned_content']]

# Train FastText model using Gensim's implementation
fasttext_model = FastText(sentences=tokenized_tweets, vector_size=75, window=5, min_count=1, sg=1, epochs=10)

# Save the model
#fasttext_model.save("fasttext_vs50.model")

# Load the model (for future use)
#fasttext_model = FastText.load("fasttext_vs50.model")

In [94]:
# Example: Get vector for a word 
# print(f"Vector for 'bitcoin': {fasttext_model.wv['bitcoin']}")

# Example: Get most similar words
#print(f"Words similar to 'bitcoin': {fasttext_model.wv.most_similar('bitcoin')}")

### 3.3. Building and Training the RNN model

In [131]:
# Parameters
max_sequence_length = 40
embedding_dim = 75
embedding_matrix = fasttext_model.wv.vectors # Create embedding matrix
vocab_size = len(fasttext_model.wv) #len(tokenizer.word_index)+1

In [68]:
#fasttext_model.wv.vector_size, len(tokenizer.word_counts), vocab_size, tokenizer.word_index.items()

In [132]:
# Tokenize the tweets
tokenizer = Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(cleaned_train_df['cleaned_content'])

train_sequences = tokenizer.texts_to_sequences(cleaned_train_df['cleaned_content']) # sequences are arrays where the each word is replaced by number which corresponds to the position of that word in the vocabulary
padded_train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')

test_sequences = tokenizer.texts_to_sequences(cleaned_test_df['cleaned_content'])
padded_test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

In [133]:
# Dimension check
padded_train_sequences.shape, cleaned_train_df['sentiment'].shape

((1399, 40), (1399,))

In [134]:
# Split the data to training and validation
X_train, X_val, y_train, y_val = train_test_split(padded_train_sequences, cleaned_train_df['sentiment'], test_size = 0.2, random_state = 9)
# Changing the dtype from series to array
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()

# Preparing the test data
X_test = padded_test_sequences
y_test = cleaned_test_df['sentiment'].to_numpy()

In [135]:
# Calculate the class weights to handle imbalance
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weights))
class_weights

{0: 2.6023255813953488, 1: 0.6189159292035398}

Study about kernel_regularizer, learning_rate

In [186]:
# Build the RNN model
def build_rnn_model(embd_dim, rnn_type='LSTM'):
    model = Sequential()
    model.add(Input(shape=(max_sequence_length,)))
    model.add(Embedding(input_dim = vocab_size, 
                        output_dim = embd_dim, 
                        weights=[embedding_matrix],  
                        trainable=False))
    
    if rnn_type == 'LSTM':
        model.add(LSTM(units = 128, return_sequences = False))
    elif rnn_type == 'GRU':
        model.add(GRU(units = 128, return_sequences= False))
        
    model.add(Dropout(0.5))
    model.add(Dense(1, activation = 'sigmoid', kernel_regularizer=l2(0.02)))
    
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [187]:
# Initialize the model
model = build_rnn_model(embd_dim=embedding_dim, rnn_type='LSTM')
model.summary()

In [188]:
# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_val, y_val), class_weight=class_weights, callbacks=[early_stopping])

Epoch 1/15
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 30ms/step - accuracy: 0.5922 - loss: 0.7407 - val_accuracy: 0.2179 - val_loss: 0.7346
Epoch 2/15
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.4297 - loss: 0.7535 - val_accuracy: 0.7893 - val_loss: 0.7277
Epoch 3/15
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.4637 - loss: 0.7595 - val_accuracy: 0.7893 - val_loss: 0.7223
Epoch 4/15
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.5396 - loss: 0.7421 - val_accuracy: 0.7893 - val_loss: 0.7206
Epoch 5/15
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.5661 - loss: 0.7253 - val_accuracy: 0.2107 - val_loss: 0.7330
Epoch 6/15
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.3892 - loss: 0.7567 - val_accuracy: 0.7893 - val_loss: 0.7270
Epoch 7/15
[1m35/35[0m [32m━━━━

In [189]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7923 - loss: 0.7167
Test Loss: 0.7164530158042908
Test Accuracy: 0.7978947162628174


In [190]:
# Predicting sentiments on new tweets
predictions = model.predict(X_test)
# Convert probabilities to binary predictions
#predictions = (predictions > 0.612).astype(int)  

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


In [191]:
y_test, predictions.flatten()

(array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
        1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
        1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
        0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
        0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 

In [146]:
predictions.max()-predictions.min()

0.0049908757

In [179]:
print(classification_report(y_test, predictions))
confusion_matrix(y_test, predictions)

              precision    recall  f1-score   support

           0       0.27      0.11      0.16        96
           1       0.80      0.92      0.86       379

    accuracy                           0.76       475
   macro avg       0.54      0.52      0.51       475
weighted avg       0.70      0.76      0.72       475



array([[ 11,  85],
       [ 30, 349]], dtype=int64)

In [91]:
cleaned_test_df['sentiment'].value_counts()

sentiment
1    379
0     96
Name: count, dtype: int64

In [None]:
len(X_train[0])

In [None]:
invalid_roe = [index for index, row in enumerate(X_val) if len(row) > 50]

print(invalid_roe)

In [None]:
# Tokenize the text
tokenizer = Tokenizer(num_words=5000, oov_token=1)
# The oov_token is a placeholder token that replaces any OOV words during the text_to_sequence calls.
#This ensures that your model can handle new, unseen words gracefully.
tokenizer.fit_on_texts(X_train)

In [None]:
# On how many tweets did we train?
print(tokenizer.document_count)

In [None]:
# How many unique words?
len(tokenizer.word_counts)

In [None]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

In [None]:
# Pad the sequences
max_length = 28  # Set max length for padding
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_length, padding='post')

In [None]:
import numpy as np

# Create an embedding matrix
embedding_dim = 100  # Must match the dimension of FastText vectors
vocab_size = len(tokenizer.word_index) + 1   #it's the total number of unique words in your tokenizer’s vocabulary plus one. The +1 accounts for the padding token (index 0).
embedding_matrix = np.zeros((vocab_size, embedding_dim)) #embedding_matrix is initialized as a matrix of zeros with shape (vocab_size, embedding_dim). 
                                                        #This matrix will eventually hold the FastText vectors for each word in your vocabulary.

for word, i in tokenizer.word_index.items():
    if word in fasttext_model.wv:
        embedding_matrix[i] = fasttext_model.wv[word]

print(f"Embedding matrix shape: {embedding_matrix.shape}")

In [None]:
embedding_matrix

### RNN Model

In [None]:
#import keras as kp

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(input_dim=vocab_size, 
                    output_dim=embedding_dim, 
                    weights=[embedding_matrix], 
                    input_length=max_length, 
                    trainable=False))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
import gensim
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

# Assuming 'contents' is the column with the tweets and 'sentiment' is the label (0 or 1)
texts = cleaned_train_df['cleaned_contents'].astype(str).tolist() 
labels = cleaned_train_df['sentiment'].values

# Load pre-trained FastText embeddings
fasttext.util.download_model('en', if_exists='ignore')  # Download the model if not already present
ft = fasttext.load_model('cc.en.300.bin')  # Load the pre-trained FastText model

# Tokenize the texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences to ensure uniform input length
max_sequence_length = 50  # You can adjust this based on your data
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Create an embedding matrix using the FastText embeddings
embedding_dim = 300  # FastText embeddings typically have 300 dimensions
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = ft.get_word_vector(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, labels, test_size=0.2, random_state=42)

# Compute class weights to handle imbalance
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))

# Build the RNN model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, 
                    output_dim=embedding_dim, 
                    weights=[embedding_matrix], 
                    input_length=max_sequence_length, 
                    trainable=False))  # Set trainable to False to keep FastText embeddings static
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, 
          epochs=10, 
          batch_size=32, 
          validation_data=(X_val, y_val), 
          class_weight=class_weights_dict)

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss}, Validation Accuracy: {accuracy}')

