In [2]:
# Load Packages
import os
import pandas as pd
import numpy as np
import torch
# Load Dataset
train = pd.read_csv("/kaggle/input/cs-3780-5780-how-do-you-feel/train.csv")
train_text = train["text"]
train_label = train["label"]

test = pd.read_csv("/kaggle/input/cs-3780-5780-how-do-you-feel/test.csv")
test_id = test["id"]
test_text = test["text"]

In [3]:
print(test.head)

<bound method NDFrame.head of           id                                               text
0          0             im feeling like a hot potato right now
1          1  i feel that are becoming impressed upon my lit...
2          2  id ever held any girls hand but boy did i sure...
3          3  i feel thats when i feel my grief over the bra...
4          4  i feel will never been resolved in a way to ke...
...      ...                                                ...
14995  14995  i feel greedy in that im looking forward to th...
14996  14996                                 i was feeling cold
14997  14997  Yeah deffo seen mom do that trick with a Hotdo...
14998  14998                       i go to bed feeling defeated
14999  14999  i have gained more confidence within myself as...

[15000 rows x 2 columns]>


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer

text_col = "text"  # Replace with the actual column name for text
label_col = "label"  # Replace with the actual column name for labels in training data

# Step 1: Preprocess data (if needed)
# Example: Lowercase the text
train[text_col] = train[text_col].str.lower()
test[text_col] = test[text_col].str.lower()

y_train = to_categorical(train_label, num_classes=28)

In [5]:
# Bag-of-Words Vectorization
vectorizer = CountVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train[text_col]).toarray()
X_test = vectorizer.transform(test[text_col]).toarray()

In [6]:
y_train = train[label_col]

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42  # 30% validation set
)

# One-hot encode the labels
y_train_split = to_categorical(y_train_split, num_classes=28)
y_val_split = to_categorical(y_val_split, num_classes=28)


Build the CNN Model

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Embedding, Dropout
from tensorflow.keras.optimizers import Adam

model = Sequential()
# Embedding layer 
model.add(Embedding(input_dim=5000, output_dim=128)) 
# 1D Convolutional layer
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
# MaxPooling layer to down-sample the feature maps
model.add(MaxPooling1D(pool_size=2))

# Flatten the feature maps to 1D
model.add(Flatten())
# Fully connected layer
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))  # Dropout to avoid overfitting

# Output layer (28 classes for emotion classification)
model.add(Dense(28, activation='softmax'))# Compile the model
optimizer= Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()

Train the model with the training data

In [8]:
history = model.fit(
    X_train_split,           # Training data
    y_train_split,           # Training labels
    epochs=7,                # Number of training epochs
    batch_size=68,           # Batch size
    validation_data=(X_val_split, y_val_split)  # Validation data
)

Epoch 1/7
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 1s/step - accuracy: 0.1968 - loss: 3.0269 - val_accuracy: 0.5110 - val_loss: 1.6681
Epoch 2/7
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 1s/step - accuracy: 0.5405 - loss: 1.5298 - val_accuracy: 0.6407 - val_loss: 1.3061
Epoch 3/7
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 1s/step - accuracy: 0.7283 - loss: 0.9389 - val_accuracy: 0.6637 - val_loss: 1.2446
Epoch 4/7
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 1s/step - accuracy: 0.7996 - loss: 0.6788 - val_accuracy: 0.6657 - val_loss: 1.3400
Epoch 5/7
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 1s/step - accuracy: 0.8395 - loss: 0.5157 - val_accuracy: 0.6637 - val_loss: 1.4243
Epoch 6/7
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 1s/step - accuracy: 0.8780 - loss: 0.3971 - val_accuracy: 0.6577 - val_loss: 1.5710
Epoch 7/7
[1m103/103[0m [

For predicting y label

In [9]:
# Predict on the test set (since you don't have labels for the test set)
y_pred = model.predict(X_test)

# Convert the predictions to class labels (use argmax to get the highest probability class)
y_pred_classes = np.argmax(y_pred, axis=1)

# Create the DataFrame for submission
submission = pd.DataFrame({
    'id': test_id,  # Use the 'id' from the test set
    'label': y_pred_classes  # The predicted class labels (from your model)
})

# Save the submission DataFrame as a CSV in the correct location
submission.to_csv('/kaggle/working/submission.csv', index=False)

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 138ms/step
