In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from nltk.tokenize import sent_tokenize
from keras import layers, models, optimizers
import csv



In [2]:
import pandas as pd
import numpy as np

In [3]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import random

[nltk_data] Downloading package punkt to /Users/james/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
csv.field_size_limit(999999)
train = pd.read_csv('raw_data/fulltrain.csv', header = None, names=['class','text'])
test = pd.read_csv("raw_data/balancedtest.csv", header = None, names=['class','text'])

In [5]:
train['class'].value_counts()

class
3    17870
1    14047
4     9995
2     6942
Name: count, dtype: int64

In [6]:
one = train.loc[train['class'] == 1]
two = train.loc[train['class'] == 2]
three = train.loc[train['class'] == 3]
four = train.loc[train['class'] == 4]

In [7]:
train = pd.concat([train, one.sample(n=3953), two.sample(n=6000), two.sample(n=5058), three.sample(n=130), four.sample(n=8005)], ignore_index=True)


In [8]:
train['class'].value_counts()

class
1    18000
2    18000
3    18000
4    18000
Name: count, dtype: int64

In [9]:
x_train = train['text']
x_test = test['text']

In [10]:
tokenizer = Tokenizer(num_words=700)
tokenizer.fit_on_texts(x_train.values)
word_index = tokenizer.word_index

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

# Padding
padding_type = 'post'
truncating_type = 'post'

In [11]:
vocab_size = len(word_index) + 1
max_length = 1000

X_train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)
X_test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)


In [12]:
vocab_size = len(word_index) + 1                          

maxlen = 1000

X_train = pad_sequences(train_sequences, padding='post', maxlen=maxlen)
X_test = pad_sequences(test_sequences, padding='post', maxlen=maxlen)

In [13]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tensorflow.keras.layers import Embedding

from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Concatenate




In [14]:
def create_embedding_matrix(word_vectors, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word_index.items():
        if word in word_vectors:
            embedding_matrix[idx] = word_vectors[word]
    return embedding_matrix

# Load Word2Vec KeyedVectors
word2vec_path = 'word2vec-google-news-300.model'  # Adjust as needed
word2vec = KeyedVectors.load(word2vec_path)

# Create the embedding matrix for your tokenizer's vocabulary
embedding_dim = 300  # or the dimensions of your word vectors
embedding_matrix = create_embedding_matrix(word2vec, tokenizer.word_index, embedding_dim)



In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # Adjust the number of max_features as needed
tfidf_matrix = tfidf_vectorizer.fit_transform(x_train)  # Replace 'your_text_data' with your actual text data
tfidf_feature_length = tfidf_matrix.shape[1]  # Number of TF-IDF features

In [17]:
y_train = train['class'] - 1
y_test = test['class'] - 1

In [18]:
num_classes = 4
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)

In [19]:
y_test_categorical

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [20]:
sequence_input = Input(shape=(max_length,), dtype='int32')
tfidf_input = Input(shape=(tfidf_feature_length,))

In [21]:
import tensorflow as tf
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],  # Vocabulary size
                            output_dim=embedding_matrix.shape[1],  # Embedding dimensionality
                            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),  # Use pre-trained weights
                            input_length=max_length,  # Length of input sequences
                            trainable=False)(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedding_layer)
x = GlobalMaxPooling1D()(x)
x = Dense(32, activation='relu')(x)
cnn_path = Dropout(0.7)(x)




In [22]:
# TF-IDF path
y = Dense(128, activation='relu')(tfidf_input)
tfidf_path = Dropout(0.3)(y)


In [23]:
from tensorflow.keras.layers import Concatenate

# Assuming cnn_path and tfidf_path are defined in your model as outputs from previous layers
combined = Concatenate()([cnn_path, tfidf_path])

In [24]:
# Output layer
output = Dense(4, activation='softmax')(combined)

# Create and compile the model
model = Model(inputs=[sequence_input, tfidf_input], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=1e-3), metrics=['accuracy'])


In [25]:
model.summary()

In [26]:
X_train_sequences = tokenizer.texts_to_sequences(x_train)  # Replace with your training text data
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length)
X_train_tfidf = tfidf_vectorizer.transform(x_train).toarray()  # Replace with your training text data


In [27]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min', restore_best_weights=True)


In [28]:
X_test_sequences = tokenizer.texts_to_sequences(x_test)  # Replace with your training text data
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length)
X_test_tfidf = tfidf_vectorizer.transform(x_test).toarray() 

In [29]:
model1 = model

In [30]:
model.fit([X_train_padded, X_train_tfidf], y_train_categorical, batch_size=128, epochs=30, validation_data=([X_test_padded, X_test_tfidf], y_test_categorical), callbacks=[early_stopping])


Epoch 1/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 297ms/step - accuracy: 0.8395 - loss: 0.5049 - val_accuracy: 0.7283 - val_loss: 0.7656
Epoch 2/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 308ms/step - accuracy: 0.9903 - loss: 0.0427 - val_accuracy: 0.7190 - val_loss: 0.9220
Epoch 3/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 304ms/step - accuracy: 0.9976 - loss: 0.0155 - val_accuracy: 0.7183 - val_loss: 1.0168
Epoch 4/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 305ms/step - accuracy: 0.9990 - loss: 0.0085 - val_accuracy: 0.7267 - val_loss: 1.0523
Epoch 5/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 309ms/step - accuracy: 0.9994 - loss: 0.0051 - val_accuracy: 0.7297 - val_loss: 1.0879
Epoch 6/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 305ms/step - accuracy: 0.9996 - loss: 0.0032 - val_accuracy: 0.7210 - val_loss: 1.2092
Epoc

<keras.src.callbacks.history.History at 0x3c0eb0a30>

In [31]:
model1.fit([X_train_padded, X_train_tfidf], y_train_categorical, batch_size=128, epochs=30, validation_split=0.1, callbacks=[early_stopping])

Epoch 1/30
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 304ms/step - accuracy: 0.9887 - loss: 0.0426 - val_accuracy: 0.9981 - val_loss: 0.0151
Epoch 2/30
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 302ms/step - accuracy: 0.9975 - loss: 0.0114 - val_accuracy: 0.9997 - val_loss: 0.0047
Epoch 3/30
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 307ms/step - accuracy: 0.9993 - loss: 0.0049 - val_accuracy: 1.0000 - val_loss: 0.0030
Epoch 4/30
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 318ms/step - accuracy: 0.9996 - loss: 0.0032 - val_accuracy: 1.0000 - val_loss: 0.0011
Epoch 5/30
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 321ms/step - accuracy: 0.9998 - loss: 0.0019 - val_accuracy: 1.0000 - val_loss: 6.4642e-04
Epoch 6/30
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 322ms/step - accuracy: 0.9997 - loss: 0.0019 - val_accuracy: 1.0000 - val_loss: 7.1059e

<keras.src.callbacks.history.History at 0x3be85b790>

In [32]:
y_pred = model.predict([X_test_padded, X_test_tfidf])
y_pred_classes = y_pred.argmax(axis=1)

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step


In [33]:
y_pred1 = model1.predict([X_test_padded, X_test_tfidf])
y_pred1_classes = y_pred1.argmax(axis=1)

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step


In [34]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [35]:
score = f1_score(y_test, y_pred_classes, average='macro')
print('Score on validation = {}'.format(score))

Score on validation = 0.6977421389630287


In [36]:
print('Classification report on test data:')
# Make sure y_test is in the correct format for comparison
print(classification_report(y_test, y_pred_classes))

Classification report on test data:
              precision    recall  f1-score   support

           0       0.88      0.73      0.80       750
           1       0.76      0.42      0.54       750
           2       0.56      0.73      0.63       750
           3       0.72      0.94      0.82       750

    accuracy                           0.71      3000
   macro avg       0.73      0.71      0.70      3000
weighted avg       0.73      0.71      0.70      3000



In [37]:
score = f1_score(y_test, y_pred1_classes, average='macro')
print('Score on validation = {}'.format(score))

Score on validation = 0.6977421389630287


In [38]:
print('Classification report on test data:')
# Make sure y_test is in the correct format for comparison
print(classification_report(y_test, y_pred1_classes))

Classification report on test data:
              precision    recall  f1-score   support

           0       0.88      0.73      0.80       750
           1       0.76      0.42      0.54       750
           2       0.56      0.73      0.63       750
           3       0.72      0.94      0.82       750

    accuracy                           0.71      3000
   macro avg       0.73      0.71      0.70      3000
weighted avg       0.73      0.71      0.70      3000

