In [18]:
import zipfile
import os

# Define the path to the zip file
zip_file_path = '/content/dataset_emotion.zip'

# Define the directory to extract the files to
extracted_path = '/content/'

# Create the extraction directory if it doesn't exist
os.makedirs(extracted_path, exist_ok=True)

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)

print(f"Dataset extracted to: {extracted_path}")

Dataset extracted to: /content/


In [19]:
import pandas as pd

# Load the datasets
train_df = pd.read_csv('/content/train.txt', sep=';', names=['text', 'emotion'])
test_df = pd.read_csv('/content/test.txt', sep=';', names=['text', 'emotion'])
val_df = pd.read_csv('/content/val.txt', sep=';', names=['text', 'emotion'])

# Display the head of each DataFrame
print("Train DataFrame Head:")
display(train_df.head())

print("\nTest DataFrame Head:")
display(test_df.head())

print("\nValidation DataFrame Head:")
display(val_df.head())

Train DataFrame Head:


Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger



Test DataFrame Head:


Unnamed: 0,text,emotion
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness



Validation DataFrame Head:


Unnamed: 0,text,emotion
0,im feeling quite sad and sorry for myself but ...,sadness
1,i feel like i am still looking at a blank canv...,sadness
2,i feel like a faithful servant,love
3,i am just feeling cranky and blue,anger
4,i can have for a treat or if i am feeling festive,joy


## Data Preprocessing

### Subtask:
Clean and prepare the text data for model training. This may include tokenization, removing stop words, and converting text to numerical representations.

**Reasoning**:
Define a preprocessing function to clean the text data by removing special characters and converting to lowercase. Apply this function to the text column of each DataFrame.

In [20]:
import re

def preprocess_text(text):
    # Remove special characters and lowercase the text
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)
test_df['cleaned_text'] = test_df['text'].apply(preprocess_text)
val_df['cleaned_text'] = val_df['text'].apply(preprocess_text)

print("Train DataFrame with cleaned text:")
display(train_df.head())

Train DataFrame with cleaned text:


Unnamed: 0,text,emotion,cleaned_text
0,i didnt feel humiliated,sadness,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong,anger,im grabbing a minute to post i feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy,anger,i am feeling grouchy


**Reasoning**:
Tokenize the cleaned text data using TensorFlow's Tokenizer, convert the text to sequences, and pad the sequences to a fixed length. Also, encode the emotion labels into numerical format.

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Tokenize the text
max_words = 10000 # based on common practice in text classification
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['cleaned_text'])

# Convert text to sequences
X_train_sequences = tokenizer.texts_to_sequences(train_df['cleaned_text'])
X_test_sequences = tokenizer.texts_to_sequences(test_df['cleaned_text'])
X_val_sequences = tokenizer.texts_to_sequences(val_df['cleaned_text'])

# Pad sequences
max_len = max([len(x) for x in X_train_sequences]) # based on the maximum length of sequences in the training data
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len, padding='post', truncating='post')
X_val_padded = pad_sequences(X_val_sequences, maxlen=max_len, padding='post', truncating='post')

# Encode emotion labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['emotion'])
y_test = label_encoder.transform(test_df['emotion'])
y_val = label_encoder.transform(val_df['emotion'])

print("Text tokenization and padding complete.")
print("Emotion labels encoded.")

Text tokenization and padding complete.
Emotion labels encoded.


## Model Selection

### Subtask:
Choose a suitable model for text classification, such as a recurrent neural network (RNN) or a transformer-based model.

**Reasoning**:
Define a Bidirectional LSTM model using TensorFlow/Keras for text classification.

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l2

embedding_dim = 64 # based on common practice in text classification
lstm_units = 64 # based on common practice in text classification
dropout_rate = 0.5 # based on common practice to prevent overfitting
vocab_size = len(tokenizer.word_index) + 1 # based on the size of the tokenizer's vocabulary
num_classes = len(label_encoder.classes_) # based on the number of unique emotion labels

model = Sequential([
    Embedding(vocab_size, embedding_dim), # Removed input_length
    Bidirectional(LSTM(lstm_units, return_sequences=True)), # Added return_sequences=True
    Dropout(dropout_rate), # Added dropout layer
    Bidirectional(LSTM(lstm_units)), # Added another Bidirectional LSTM layer
    Dropout(dropout_rate), # Added dropout layer
    Dense(lstm_units, activation='relu', kernel_regularizer=l2(0.01)), # Added L2 regularization
    Dropout(dropout_rate), # Added dropout layer
    Dense(num_classes, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

## Model Training

### Subtask:
Train the selected model on the preprocessed data.

**Reasoning**:
Train the Bidirectional LSTM model using the training data and validate it using the validation data.

In [24]:
epochs = 10 # based on common practice in similar tasks
batch_size = 32 # based on common practice in similar tasks

history = model.fit(X_train_padded, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val_padded, y_val))

print("Model training complete.")

Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 155ms/step - accuracy: 0.3296 - loss: 1.9072 - val_accuracy: 0.5325 - val_loss: 1.1287
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 151ms/step - accuracy: 0.6451 - loss: 0.9205 - val_accuracy: 0.7540 - val_loss: 0.6380
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 155ms/step - accuracy: 0.7724 - loss: 0.5788 - val_accuracy: 0.7580 - val_loss: 0.5578
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 154ms/step - accuracy: 0.8002 - loss: 0.4854 - val_accuracy: 0.8075 - val_loss: 0.4876
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 152ms/step - accuracy: 0.8307 - loss: 0.3951 - val_accuracy: 0.8225 - val_loss: 0.4640
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 152ms/step - accuracy: 0.8595 - loss: 0.3389 - val_accuracy: 0.8665 - val_loss: 0.4230
Epoch 7/10

## Model Evaluation

### Subtask:
Evaluate the performance of the trained model using appropriate metrics.

**Reasoning**:
Evaluate the trained model on the test set to assess its performance and display the accuracy and loss.

In [25]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.9172 - loss: 0.3592
Test Loss: 0.3603
Test Accuracy: 0.9110


## Prediction

### Subtask:
Use the trained model to predict emotions on new text data.

**Reasoning**:
Preprocess the new text input using the same steps as the training data, use the trained model to predict the emotion, and then display the predicted emotion label.

In [26]:
import numpy as np

# Example new text input
new_text = "I feel a sense of accomplishment after finishing this task."

# Preprocess the new text
cleaned_new_text = preprocess_text(new_text)
new_text_sequence = tokenizer.texts_to_sequences([cleaned_new_text])
new_text_padded = pad_sequences(new_text_sequence, maxlen=max_len, padding='post', truncating='post')

# Predict the emotion
prediction = model.predict(new_text_padded)
predicted_emotion_encoded = np.argmax(prediction, axis=1)[0]
predicted_emotion = label_encoder.inverse_transform([predicted_emotion_encoded])[0]

print(f"The predicted emotion for the text '{new_text}' is: {predicted_emotion}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 887ms/step
The predicted emotion for the text 'I feel a sense of accomplishment after finishing this task.' is: joy


# Task
Deploy the trained emotion classification model as a Flask application. The application should accept text input, predict the emotion (joy, anger, fear, or sadness), and return the prediction. The model and necessary preprocessing objects should be saved and loaded within the Flask app.

## Save model and preprocessing objects

### Subtask:
Save the trained Keras model, the tokenizer, and the label encoder to files so they can be loaded later in the Flask app.


**Reasoning**:
Save the trained Keras model, the tokenizer, and the label encoder to files using the specified names and directory.



In [27]:
import pickle

# Save the Keras model
model.save('/content/emotion_model.h5')

# Save the tokenizer
with open('/content/tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the label encoder
with open('/content/label_encoder.pkl', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Model, tokenizer, and label encoder saved.")



Model, tokenizer, and label encoder saved.


## Create flask app structure

### Subtask:
Set up the necessary files and directories for a basic Flask application, including a Python file for the application and a directory for templates (if needed).


**Reasoning**:
Create the necessary directories and copy the saved files into the flask_app directory.



In [28]:
import os
import shutil

# Create the main application directory
app_dir = 'flask_app'
os.makedirs(app_dir, exist_ok=True)

# Create the templates directory (optional, but good practice)
templates_dir = os.path.join(app_dir, 'templates')
os.makedirs(templates_dir, exist_ok=True)

# Create the app.py file
app_file_path = os.path.join(app_dir, 'app.py')
with open(app_file_path, 'w') as f:
    f.write("# This is the main Flask application file\n")
    f.write("from flask import Flask, request, jsonify, render_template\n")
    f.write("import numpy as np\n")
    f.write("import pickle\n")
    f.write("import tensorflow as tf\n")
    f.write("from tensorflow.keras.preprocessing.sequence import pad_sequences\n")
    f.write("import re\n")
    f.write("\n")
    f.write("# Define preprocessing function\n")
    f.write("def preprocess_text(text):\n")
    f.write("    text = re.sub(r'[^a-zA-Z\\s]', '', text)\n")
    f.write("    text = text.lower()\n")
    f.write("    return text\n")
    f.write("\n")
    f.write("app = Flask(__name__)\n")
    f.write("\n")
    f.write("# Load the model and preprocessing objects\n")
    f.write("model = tf.keras.models.load_model('emotion_model.h5')\n")
    f.write("with open('tokenizer.pkl', 'rb') as handle:\n")
    f.write("    tokenizer = pickle.load(handle)\n")
    f.write("with open('label_encoder.pkl', 'rb') as handle:\n")
    f.write("    label_encoder = pickle.load(handle)\n")
    f.write("\n")
    f.write("# Get max_len from padding during training\n")
    f.write("max_len = model.layers[0].input_shape[1] # This assumes the Embedding layer has input_length defined\n")
    f.write("if max_len is None:\n")
    f.write("    # Fallback if input_length is not defined in the Embedding layer\n")
    f.write("    # In this case, we need to know the max_len from the training phase\n")
    f.write("    # Assuming max_len was saved or is a known constant from the training notebook\n")
    f.write("    # Replace with the actual max_len value used during training if needed\n")
    f.write("    max_len = 66 # Replace with the actual max_len used in the notebook\n")
    f.write("\n")
    f.write("@app.route('/')\n")
    f.write("def index():\n")
    f.write("    return render_template('index.html')\n")
    f.write("\n")
    f.write("@app.route('/predict', methods=['POST'])\n")
    f.write("def predict():\n")
    f.write("    if request.method == 'POST':\n")
    f.write("        data = request.get_json()\n")
    f.write("        text = data['text']\n")
    f.write("\n")
    f.write("        # Preprocess the input text\n")
    f.write("        cleaned_text = preprocess_text(text)\n")
    f.write("        sequence = tokenizer.texts_to_sequences([cleaned_text])\n")
    f.write("        padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')\n")
    f.write("\n")
    f.write("        # Predict the emotion\n")
    f.write("        prediction = model.predict(padded_sequence)\n")
    f.write("        predicted_emotion_encoded = np.argmax(prediction, axis=1)[0]\n")
    f.write("        predicted_emotion = label_encoder.inverse_transform([predicted_emotion_encoded])[0]\n")
    f.write("\n")
    f.write("        return jsonify({'emotion': predicted_emotion})\n")
    f.write("\n")
    f.write("if __name__ == '__main__':\n")
    f.write("    app.run(debug=True)\n")


# Copy the saved files to the flask_app directory
shutil.copy('/content/emotion_model.h5', app_dir)
shutil.copy('/content/tokenizer.pkl', app_dir)
shutil.copy('/content/label_encoder.pkl', app_dir)

print(f"Directory structure created: {app_dir}/")
print(f"Files copied to {app_dir}: emotion_model.h5, tokenizer.pkl, label_encoder.pkl")


Directory structure created: flask_app/
Files copied to flask_app: emotion_model.h5, tokenizer.pkl, label_encoder.pkl


## Load model and preprocessing objects in flask app

### Subtask:
Write code in the Flask application to load the saved model, tokenizer, and label encoder when the application starts.


**Reasoning**:
Load the saved model, tokenizer, and label encoder within the Flask application's `app.py` file.



In [30]:
from flask import Flask, request, jsonify, render_template
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import os

# Define preprocessing function
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\\s]', '', text)
    text = text.lower()
    return text

app = Flask(__name__)

# Define paths to saved files relative to the app.py file
model_path = os.path.join(app.root_path, 'emotion_model.h5')
tokenizer_path = os.path.join(app.root_path, 'tokenizer.pkl')
label_encoder_path = os.path.join(app.root_path, 'label_encoder.pkl')

# Load the model and preprocessing objects when the app starts
try:
    # Disable eager execution to avoid potential issues with Flask and TensorFlow
    tf.compat.v1.disable_eager_execution()
    model = tf.keras.models.load_model(model_path)
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = pickle.load(handle)
    with open(label_encoder_path, 'rb') as handle:
        label_encoder = pickle.load(handle)
    print("Model, tokenizer, and label encoder loaded successfully.")
except Exception as e:
    print(f"Error loading model or preprocessing objects: {e}")
    model = None
    tokenizer = None
    label_encoder = None

# Get max_len from padding during training
# In this case, we need to know the max_len from the training phase
# Replacing with the actual max_len value used during training
max_len = 66 # This value comes from the training notebook

@app.route('/')
def index():
    return "Flask app is running. Go to /predict for predictions."

@app.route('/predict', methods=['POST'])
def predict():
    if model is None or tokenizer is None or label_encoder is None:
        return jsonify({'error': 'Model or preprocessing objects not loaded.'}), 500

    if request.method == 'POST':
        try:
            data = request.get_json()
            text = data['text']

            # Preprocess the input text
            cleaned_text = preprocess_text(text)
            sequence = tokenizer.texts_to_sequences([cleaned_text])
            padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')

            # Predict the emotion
            # Use model.predict with the loaded model
            with app.app_context():
              prediction = model.predict(padded_sequence)
            predicted_emotion_encoded = np.argmax(prediction, axis=1)[0]
            predicted_emotion = label_encoder.inverse_transform([predicted_emotion_encoded])[0]

            return jsonify({'emotion': predicted_emotion})
        except Exception as e:
            return jsonify({'error': str(e)}), 400

if __name__ == '__main__':
    # This is for running the app directly, typically used for development
    # In a deployment environment, a WSGI server like Gunicorn would be used
    app.run(debug=True, host='0.0.0.0') # Added host='0.0.0.0' for Colab compatibility



Model, tokenizer, and label encoder loaded successfully.
 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with watchdog (inotify)
