In [4]:
# Load libraries
import pandas as pd
import joblib
import re
import nltk
import os
import tensorflow as tf


from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import mixed_precision

# Check for GPU availability
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# Optionally, use mixed precision for faster training on GPU
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# Manually specify the path to NLTK data
nltk_data_path = '/usr/share/nltk_data'  # This is the default location for Kaggle
os.environ['NLTK_DATA'] = nltk_data_path
# `nltk` was imported at the top of this cell, and NLTK only consults the
# NLTK_DATA environment variable while it is being imported. Setting the
# variable here is therefore too late to change the search path, so the
# directory is also registered explicitly.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Check where NLTK is looking for its resources
print("NLTK data path:", nltk.data.path)

# Download NLTK resources before anything tries to use them
for resource in ('wordnet', 'omw-1.4', 'stopwords'):  # omw-1.4: wordnet language support if needed
    nltk.download(resource)

# Verify that the WordNet corpus can really be read. Importing
# `nltk.corpus.wordnet` alone only creates a lazy loader and succeeds even when
# the data is missing; `ensure_loaded()` forces the actual lookup.
try:
    wordnet.ensure_loaded()
    print("WordNet is successfully loaded!")
except LookupError as e:
    print("Error loading WordNet:", e)

# Shared text-processing helpers used by the preprocessing functions below
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

Num GPUs Available:  2
NLTK data path: ['/root/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']
WordNet is successfully loaded!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Diagnostic cell: list the directories NLTK searches for corpora.
# (`nltk` is already imported in the setup cell; importing it again is harmless.)
import nltk
print(nltk.data.path)


['/root/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [6]:
# Load the Sentiment140 dataset
#
# The CSV has no header row; these are the names given to its six columns.
# Only `polarity` and `text` are used by the rest of the notebook, so only
# those two are parsed (the file has 1.6M rows, so skipping the other four
# columns saves time and memory).
SENTIMENT140_COLUMNS = ['polarity', 'id', 'date', 'query', 'user', 'text']
data_path = '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv'

# Fallback so that `df` always exists, even if loading fails below.
df = pd.DataFrame()

try:
    # Build the final frame in a single expression: `df` is either the fully
    # prepared dataset or the empty fallback, never a half-processed frame.
    df = (
        pd.read_csv(
            data_path,
            encoding='latin-1',
            header=None,
            names=SENTIMENT140_COLUMNS,
            usecols=['polarity', 'text'],
        )
        # Filter out neutral polarity
        .loc[lambda frame: frame['polarity'] != 2]
        # Map polarity to binary labels: 0 = negative, 1 = positive
        .assign(polarity=lambda frame: frame['polarity'].map({0: 0, 4: 1}))
        # Fix the column order expected downstream
        [['polarity', 'text']]
    )

    print(df.head())  # Preview dataset

except Exception as e:
    # Best-effort: report the problem and leave `df` as the empty fallback.
    print(f"Error loading dataset: {e}")

   polarity                                               text
0         0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1         0  is upset that he can't update his Facebook by ...
2         0  @Kenichan I dived many times for the ball. Man...
3         0    my whole body feels itchy and like its on fire 
4         0  @nationwideclass no, it's not behaving at all....


In [7]:
# Preprocessing text

# Build the emoji -> sentiment lookup from a CSV file
def load_emoji_sentiment(csv_file):
    """Read an emoji/sentiment table and return it as a dictionary.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of a CSV that
            has an ``Emoji`` column and a ``Sentiment`` column.

    Returns:
        dict: One entry per distinct ``Emoji`` value, mapped to the
        ``Sentiment`` value of the same row. When an emoji appears on several
        rows, the last row wins.
    """
    table = pd.read_csv(csv_file)
    return {symbol: label for symbol, label in zip(table['Emoji'], table['Sentiment'])}

# Load the emoji sentiment dictionary at runtime.
# This module-level mapping is the lookup table that replace_emojis() iterates over.
emoji_sentiment_dict = load_emoji_sentiment('/kaggle/input/emoji-with-sentiments/emoji_sentiment.csv')

def replace_emojis(text):
    """Substitute every known emoji in ``text`` with its mapped replacement.

    The substitutions come from the module-level ``emoji_sentiment_dict`` and
    are applied one after another in the dictionary's iteration order, each on
    the result of the previous one.

    Args:
        text: The string to process.

    Returns:
        The string with all occurrences of each dictionary key replaced.
    """
    result = text
    for symbol in emoji_sentiment_dict:
        result = result.replace(symbol, emoji_sentiment_dict[symbol])
    return result

def remove_short_words(text):
    """Drop whitespace-separated tokens that are shorter than two characters.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    kept = filter(lambda token: len(token) > 1, text.split())
    return ' '.join(kept)

def remove_stopwords(text):
    """Drop every whitespace-separated token found in the global ``stop_words``.

    The comparison is exact (case-sensitive, no normalisation of the token).

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def apply_stemming(text):
    """Stem each whitespace-separated token with the global ``stemmer``.

    Args:
        text: The string to process.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

def apply_lemmatization(text):
    """Lemmatize each whitespace-separated token with the global ``lemmatizer``.

    Args:
        text: The string to process.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    tokens = text.split()
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    return ' '.join(lemmas)


def clean_text(text):
    """Normalise one tweet into a lowercase, space-separated token string.

    Steps, in order: lowercase, emoji replacement, a fixed sequence of regex
    substitutions, removal of one-character tokens, removal of stopwords, and
    whitespace normalisation. Stemming and lemmatization are not applied.

    Args:
        text: The raw tweet (a ``str``).

    Returns:
        The cleaned string.
    """
    # Ordered (pattern, replacement) pairs. Each substitution runs on the
    # output of the previous one, so the order is significant: URLs and
    # e-mail-like tokens are rewritten before '@' mentions, and all of them
    # before punctuation is stripped.
    substitutions = (
        (r'http\S+', 'url'),              # URLs -> 'url'
        (r'\b\w*@\w*\.\w*\b', 'email'),   # e-mail-like tokens -> 'email'
        (r'@\w+', 'user'),                # user mentions -> 'user'
        (r'#', ''),                       # drop the hashtag symbol, keep the word
        (r'\d+', ''),                     # drop digits
        (r'[^\w\s]', ''),                 # drop punctuation
        (r'(.)\1{2,}', r'\1\1'),          # 3+ repeats of a character -> 2
        (r'\b(\w+)( \1\b)+', r'\1'),      # "word word word" -> "word"
        # The next two patterns collapse a chunk at the start of a word that
        # is immediately repeated (e.g. 'hahaha' -> 'ha'). They make no
        # distinction for ordinary words, so 'mama' also becomes 'ma'.
        # The second pattern acts as another pass over the first's output.
        (r'(\b\w+)\1+', r'\1'),
        (r'(\b\w+)(\1)+', r'\1'),
    )

    cleaned = replace_emojis(text.lower())
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Token-level filters: short words first, then stopwords.
    cleaned = remove_stopwords(remove_short_words(cleaned))

    # Collapse any remaining runs of whitespace and trim the ends.
    return re.sub(r'\s+', ' ', cleaned).strip()


# Clean every tweet. This overwrites the raw `text` column in place, so
# re-running this cell would clean already-cleaned text; re-run the dataset
# loading cell first to get the raw tweets back.
df['text'] = df['text'].apply(clean_text)

# Preview the cleaned tweets
df.head()

Unnamed: 0,polarity,text
0,0,user url aww thats bummer shoulda got david ca...
1,0,upset cant update facebook texting might cry r...
2,0,user dived many times ball managed save rest g...
3,0,whole body feels itchy like fire
4,0,user behaving im mad cant see


In [8]:
# Prepare text data for neural network
X = df['text']  # Text data
y = df['polarity']  # Labels (0 = negative, 1 = positive)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad the sequences to ensure uniform input size
X_train_padded = pad_sequences(X_train_sequences, maxlen=100)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100)

# Build the neural network model
model = Sequential()

# Add embedding layer (turns text into dense vectors)
# model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(Embedding(input_dim=5001, output_dim=128))

# Add LSTM layer
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

# Add dense layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))


# Check GPU status at the start  (Optional)
!nvidia-smi

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Use GPU if available and mixed precision (faster computation on GPU)
with tf.device('/GPU:0'):  # Use '/TPU:0' for TPU
    # Train the model with larger batch size to utilize GPU/TPU efficiently
    model.fit(X_train_padded, y_train, epochs=5, batch_size=128, validation_data=(X_test_padded, y_test), verbose=2)

    # Monitor GPU usage after each epoch (Optional)
    !nvidia-smi

# Check GPU status after training (Optional)
!nvidia-smi

  pid, fd = os.forkpty()


Wed Nov  6 23:24:04 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   65C    P0             31W /   70W |   14047MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [9]:
# Sanity check on the padded training matrix produced by pad_sequences above
print(X_train_padded.shape)  # Should be (num_samples, max_length)


(1280000, 100)


In [10]:
# Evaluate the model
accuracy = model.evaluate(X_test_padded, y_test, verbose=2)
print(f"Model Accuracy: {accuracy[1]:.2f}")

# Save the model for later use
# model.save('models/sentiment_analysis_model.h5')
model.save('models/lstm_deep_learning_model.h5')


# Save the tokenizer for later use (e.g., to process new text data)
# joblib.dump(tokenizer, 'vectorizers/tokenizer.pkl')
joblib.dump(tokenizer, 'lstm_deep_learning_tokenizer.pkl')

# Monitor GPU usage after evaluation (Optional)
!nvidia-smi

10000/10000 - 413s - 41ms/step - accuracy: 0.7942 - loss: 0.4388
Model Accuracy: 0.79
Thu Nov  7 01:12:45 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   77C    P0             34W /   70W |   14061MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+-----------

In [11]:
# Prerequisite for the hyperparameter-tuning cell below; run once if the package is missing:
# pip install optuna-integration[tfkeras]

In [1]:
import tensorflow as tf
import optuna
from optuna.integration import TFKerasPruningCallback
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import mixed_precision  # Correct import without `experimental`

# Enable mixed precision to boost performance on GPU.
# This sets the same global 'mixed_float16' policy as the setup cell;
# presumably it is repeated so this cell also works in a fresh kernel.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# Define the objective function for Optuna
def objective(trial):
    """Train one candidate LSTM classifier and report its validation accuracy.

    Hyperparameters are sampled from ``trial``; the model is trained for five
    epochs on the notebook-level ``X_train_padded`` / ``y_train`` arrays and
    validated on ``X_test_padded`` / ``y_test``. A pruning callback reports
    ``val_accuracy`` to Optuna during training.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The ``val_accuracy`` recorded for the last epoch that was run.
    """
    # Search space (the sampling order matches the parameter names below)
    emb_size = trial.suggest_int('embedding_dim', 64, 256)
    n_units = trial.suggest_int('lstm_units', 64, 256)
    input_drop = trial.suggest_float('dropout_rate', 0.1, 0.5)
    state_drop = trial.suggest_float('recurrent_dropout_rate', 0.1, 0.5)
    step_size = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    samples_per_batch = trial.suggest_categorical('batch_size', [64, 128, 256])

    # Embedding -> LSTM -> single sigmoid unit; the output layer is kept in
    # float32 explicitly.
    model = Sequential([
        Embedding(input_dim=5001, output_dim=emb_size),
        LSTM(n_units, dropout=input_drop, recurrent_dropout=state_drop),
        Dense(1, activation='sigmoid', dtype='float32'),
    ])
    model.compile(
        optimizer=Adam(learning_rate=step_size),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    # Train on the first GPU device
    with tf.device('/GPU:0'):
        pruning_callback = TFKerasPruningCallback(trial, 'val_accuracy')
        history = model.fit(
            X_train_padded,
            y_train,
            validation_data=(X_test_padded, y_test),
            epochs=5,  # Fewer epochs during tuning to save time
            batch_size=samples_per_batch,
            callbacks=[pruning_callback],
            verbose=2,
        )

    # Score of the trial: validation accuracy of the final epoch
    return history.history['val_accuracy'][-1]

# Run the Optuna study.
# direction='maximize' because objective() returns a validation accuracy.
# objective() reads X_train_padded, y_train, X_test_padded and y_test from the
# notebook namespace, so the data-preparation cells must have been run in this
# kernel session before this cell.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # Adjust `n_trials` for a balance between speed and thoroughness

# Show best parameters
print("Best hyperparameters: ", study.best_params)


ModuleNotFoundError: 
Could not find `optuna-integration` for `tfkeras`.
Please run `pip install optuna-integration[tfkeras]`.