In [12]:
import numpy as np
import pandas as pd
import gensim.downloader as api
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Load pre-trained GloVe model (from Gensim API)
word2vec_model = api.load("glove-wiki-gigaword-50")  # Automatically downloads 50-dimensional vectors

# Step 2: Load your dataset
df = pd.read_csv("Features_For_Traditional_ML_Techniques.csv")
df = df.drop('Unnamed: 0', axis=1)  # Drop unnecessary index column

# Convert 'majority_target' to integers (0 for false, 1 for true)
df['majority_target'] = df['majority_target'].astype(int)

# Define features (X) and labels (y)
X = df['tweet']
y = df['majority_target']

# Step 3: Tokenize and pad the sequences
max_words = 10000  # Maximum number of words to keep in the vocabulary
max_seq_length = 50  # Reduced maximum sequence length for padding

# Tokenizer to convert tweets to sequences of integers
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to ensure equal length
X_padded = pad_sequences(sequences, maxlen=max_seq_length, padding='post', truncating='post')

# Step 4: Prepare the embedding matrix using GloVe
embedding_dim = 50  # GloVe model's dimensionality
word_index = tokenizer.word_index

# Initialize the embedding matrix with zeros
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

# Populate the embedding matrix with GloVe vectors for words in the dataset
for word, i in word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]
    else:
        # Randomly initialize words not found in GloVe model
        embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

# Step 5: Define the Sequential model
model = Sequential()

# Add the embedding layer (with pre-trained GloVe embeddings)
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim,
                    weights=[embedding_matrix], input_length=max_seq_length, trainable=False))

# Step 6: Add the LSTM layers and dropout
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(32, return_sequences=False))
model.add(Dropout(0.5))

# Step 7: Add a fully connected dense layer with ReLU
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))

# Step 8: Add the output layer (for binary classification)
model.add(Dense(1, activation='sigmoid'))

# Step 9: Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 10: Implement EarlyStopping and ReduceLROnPlateau
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001)

# Step 11: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Step 12: Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=16,
                    validation_data=(X_test, y_test), callbacks=[early_stopping, reduce_lr])

# Step 13: Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9272


Test Accuracy: 0.9272

In [11]:
import numpy as np
import pandas as pd
from gensim.models import FastText
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Load your dataset
df = pd.read_csv("Features_For_Traditional_ML_Techniques.csv")  # Replace with your file path
tweets = df['tweet'].values
y = df['majority_target'].values  # Replace with the actual target column

# Step 2: Tokenize the 'tweet' column
tokenizer = Tokenizer(num_words=5000)  # Limit the vocabulary to the 5000 most frequent words
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)
word_index = tokenizer.word_index  # Dictionary mapping words to their indices

# Step 3: Pad sequences to ensure uniform input length
max_len = 100  # You can adjust this based on your dataset
X_padded = pad_sequences(sequences, maxlen=max_len)

# Step 4: Train a FastText model on your dataset
# FastText learns subword information, making it robust for rare and unseen words
fasttext_model = FastText(sentences=[tweet.split() for tweet in tweets], vector_size=100, window=5, min_count=1, workers=4)

# Step 5: Create the embedding matrix from FastText model
embedding_dim = 100  # FastText embedding dimension
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
    if word in fasttext_model.wv:
        embedding_matrix[i] = fasttext_model.wv[word]

# Step 6: Define the Sequential model
model = Sequential()

# Add the embedding layer (with FastText embeddings)
model.add(Embedding(input_dim=len(word_index) + 1,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=False))  # Freeze the embedding layer

# Step 7: Add the LSTM layers and dropout
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(32, return_sequences=False))
model.add(Dropout(0.5))

# Step 8: Add a fully connected dense layer with ReLU
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))

# Step 9: Add the output layer (for binary classification)
model.add(Dense(1, activation='sigmoid'))

# Step 10: Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 11: Implement EarlyStopping and ReduceLROnPlateau
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001)

# Step 12: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Step 13: Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=16,
                    validation_data=(X_test, y_test), callbacks=[early_stopping, reduce_lr])

# Step 14: Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9280


Test Accuracy: 0.9280

In [14]:
import numpy as np
import pandas as pd
import gensim.downloader as api
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Load pre-trained Word2Vec model (from Gensim API)
word2vec_model = api.load("word2vec-google-news-300")  # Automatically downloads 300-dimensional vectors

# Step 2: Load your dataset
df = pd.read_csv("Features_For_Traditional_ML_Techniques.csv")
df = df.drop('Unnamed: 0', axis=1)  # Drop unnecessary index column

# Convert 'majority_target' to integers (0 for false, 1 for true)
df['majority_target'] = df['majority_target'].astype(int)

# Define features (X) and labels (y)
X = df['tweet']
y = df['majority_target']

# Step 3: Tokenize and pad the sequences
max_words = 10000  # Maximum number of words to keep in the vocabulary
max_seq_length = 50  # Reduced maximum sequence length for padding

# Tokenizer to convert tweets to sequences of integers
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to ensure equal length
X_padded = pad_sequences(sequences, maxlen=max_seq_length, padding='post', truncating='post')

# Step 4: Prepare the embedding matrix using Word2Vec
embedding_dim = 300  # Word2Vec model's dimensionality
word_index = tokenizer.word_index

# Initialize the embedding matrix with zeros
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

# Populate the embedding matrix with Word2Vec vectors for words in the dataset
for word, i in word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]
    else:
        # Randomly initialize words not found in Word2Vec model
        embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

# Step 5: Define the Sequential model
model = Sequential()

# Add the embedding layer (with pre-trained Word2Vec embeddings)
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim,
                    weights=[embedding_matrix], input_length=max_seq_length, trainable=False))

# Step 6: Add the LSTM layers and dropout
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(32, return_sequences=False))
model.add(Dropout(0.5))

# Step 7: Add a fully connected dense layer with ReLU
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))

# Step 8: Add the output layer (for binary classification)
model.add(Dense(1, activation='sigmoid'))

# Step 9: Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 10: Implement EarlyStopping and ReduceLROnPlateau
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001)

# Step 11: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Step 12: Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=16,
                    validation_data=(X_test, y_test), callbacks=[early_stopping, reduce_lr])

# Step 13: Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9322


Test Accuracy: 0.9322