In [161]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from nltk.tokenize import sent_tokenize
from keras import layers, models, optimizers
import csv



In [162]:
import pandas as pd
import numpy as np

In [163]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import random

[nltk_data] Downloading package punkt to /Users/james/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [164]:
csv.field_size_limit(999999)
train = pd.read_csv('raw_data/fulltrain.csv', header = None, names=['class','text'])
test = pd.read_csv("raw_data/balancedtest.csv", header = None, names=['class','text'])

In [165]:
train['class'].value_counts()

class
3    17870
1    14047
4     9995
2     6942
Name: count, dtype: int64

In [166]:
one = train.loc[train['class'] == 1]
two = train.loc[train['class'] == 2]
three = train.loc[train['class'] == 3]
four = train.loc[train['class'] == 4]

In [167]:
train = pd.concat([train, one.sample(n=3953), two.sample(n=6000), two.sample(n=5058), three.sample(n=130), four.sample(n=8005)], ignore_index=True)


In [168]:
train['class'].value_counts()

class
1    18000
2    18000
3    18000
4    18000
Name: count, dtype: int64

In [169]:
x_train = train['text']
x_test = test['text']

In [170]:
tokenizer = Tokenizer(num_words=700)
tokenizer.fit_on_texts(x_train.values)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

padding_type = 'post'
truncating_type = 'post'

In [171]:
vocab_size = len(word_index) + 1
max_length = 1000

X_train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)
X_test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)


In [172]:
vocab_size = len(word_index) + 1                          

maxlen = 1000

X_train = pad_sequences(train_sequences, padding='post', maxlen=maxlen)
X_test = pad_sequences(test_sequences, padding='post', maxlen=maxlen)

In [173]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tensorflow.keras.layers import Embedding

from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Concatenate




In [174]:
def create_embedding_matrix(word_vectors, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word_index.items():
        if word in word_vectors:
            embedding_matrix[idx] = word_vectors[word]
    return embedding_matrix

word2vec_path = 'word2vec-google-news-300.model'  # Adjust as needed
word2vec = KeyedVectors.load(word2vec_path)

embedding_dim = 300 
embedding_matrix = create_embedding_matrix(word2vec, tokenizer.word_index, embedding_dim)



In [175]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [176]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000) 
tfidf_matrix = tfidf_vectorizer.fit_transform(x_train) 
tfidf_feature_length = tfidf_matrix.shape[1]

In [177]:
y_train = train['class'] - 1
y_test = test['class'] - 1

In [178]:
num_classes = 4
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)

In [179]:
y_test_categorical

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [180]:
sequence_input = Input(shape=(max_length,), dtype='int32')
tfidf_input = Input(shape=(tfidf_feature_length,))

In [181]:
import tensorflow as tf
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0], 
                            output_dim=embedding_matrix.shape[1], 
                            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix), 
                            input_length=max_length, 
                            trainable=False)(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedding_layer)
x = GlobalMaxPooling1D()(x)
x = Dense(32, activation='relu')(x)
cnn_path = Dropout(0.3)(x)




In [182]:
y = Dense(128, activation='relu')(tfidf_input)
tfidf_path = Dropout(0.7)(y)


In [183]:
from tensorflow.keras.layers import Concatenate

combined = Concatenate()([cnn_path, tfidf_path])

In [184]:
output = Dense(4, activation='softmax')(combined)

model = Model(inputs=[sequence_input, tfidf_input], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=1e-3), metrics=['accuracy'])


In [185]:
model.summary()

In [186]:
X_train_sequences = tokenizer.texts_to_sequences(x_train) 
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length)
X_train_tfidf = tfidf_vectorizer.transform(x_train).toarray() 


In [187]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min', restore_best_weights=True)


In [188]:
X_test_sequences = tokenizer.texts_to_sequences(x_test) 
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length)
X_test_tfidf = tfidf_vectorizer.transform(x_test).toarray() 

In [189]:
model1 = model

In [190]:
model.fit([X_train_padded, X_train_tfidf], y_train_categorical, batch_size=128, epochs=30, validation_data=([X_test_padded, X_test_tfidf], y_test_categorical), callbacks=[early_stopping])


Epoch 1/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 339ms/step - accuracy: 0.8439 - loss: 0.4725 - val_accuracy: 0.6470 - val_loss: 0.9724
Epoch 2/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 342ms/step - accuracy: 0.9881 - loss: 0.0485 - val_accuracy: 0.6650 - val_loss: 1.0853
Epoch 3/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 342ms/step - accuracy: 0.9958 - loss: 0.0200 - val_accuracy: 0.6613 - val_loss: 1.2334
Epoch 4/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 336ms/step - accuracy: 0.9979 - loss: 0.0106 - val_accuracy: 0.6863 - val_loss: 1.1380
Epoch 5/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 339ms/step - accuracy: 0.9988 - loss: 0.0067 - val_accuracy: 0.6627 - val_loss: 1.4906
Epoch 6/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 322ms/step - accuracy: 0.9990 - loss: 0.0051 - val_accuracy: 0.6750 - val_loss: 1.4015
Epoc

<keras.src.callbacks.history.History at 0x31d311480>

In [191]:
model1.fit([X_train_padded, X_train_tfidf], y_train_categorical, batch_size=128, epochs=30, validation_split=0.1, callbacks=[early_stopping])

Epoch 1/30
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 332ms/step - accuracy: 0.9852 - loss: 0.0513 - val_accuracy: 0.9981 - val_loss: 0.0153
Epoch 2/30
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 335ms/step - accuracy: 0.9960 - loss: 0.0160 - val_accuracy: 0.9981 - val_loss: 0.0104
Epoch 3/30
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 334ms/step - accuracy: 0.9981 - loss: 0.0087 - val_accuracy: 1.0000 - val_loss: 0.0024
Epoch 4/30
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 341ms/step - accuracy: 0.9991 - loss: 0.0054 - val_accuracy: 1.0000 - val_loss: 0.0013
Epoch 5/30
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 339ms/step - accuracy: 0.9991 - loss: 0.0044 - val_accuracy: 1.0000 - val_loss: 0.0010
Epoch 6/30
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 333ms/step - accuracy: 0.9995 - loss: 0.0032 - val_accuracy: 0.9999 - val_loss: 0.0029
Epoc

<keras.src.callbacks.history.History at 0x390465a80>

In [192]:
y_pred = model.predict([X_test_padded, X_test_tfidf])
y_pred_classes = y_pred.argmax(axis=1)

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step


In [193]:
y_pred1 = model1.predict([X_test_padded, X_test_tfidf])
y_pred1_classes = y_pred1.argmax(axis=1)

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step


In [194]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [195]:
score = f1_score(y_test, y_pred_classes, average='macro')
print('Score on validation = {}'.format(score))

Score on validation = 0.6674831524688168


In [196]:
print('Classification report on test data:')
print(classification_report(y_test, y_pred_classes))

Classification report on test data:
              precision    recall  f1-score   support

           0       0.89      0.67      0.76       750
           1       0.76      0.51      0.61       750
           2       0.56      0.54      0.55       750
           3       0.60      0.98      0.75       750

    accuracy                           0.67      3000
   macro avg       0.70      0.67      0.67      3000
weighted avg       0.70      0.67      0.67      3000



In [197]:
score = f1_score(y_test, y_pred1_classes, average='macro')
print('Score on validation = {}'.format(score))

Score on validation = 0.6674831524688168


In [198]:
print('Classification report on test data:')
print(classification_report(y_test, y_pred1_classes))

Classification report on test data:
              precision    recall  f1-score   support

           0       0.89      0.67      0.76       750
           1       0.76      0.51      0.61       750
           2       0.56      0.54      0.55       750
           3       0.60      0.98      0.75       750

    accuracy                           0.67      3000
   macro avg       0.70      0.67      0.67      3000
weighted avg       0.70      0.67      0.67      3000

