In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from nltk.tokenize import sent_tokenize
from keras import layers, models, optimizers
import csv



In [3]:
import pandas as pd
import numpy as np

In [4]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import random

[nltk_data] Downloading package punkt to /Users/james/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
csv.field_size_limit(999999)
train = pd.read_csv('raw_data/fulltrain.csv', header = None, names=['class','text'])
test = pd.read_csv("raw_data/balancedtest.csv", header = None, names=['class','text'])

In [6]:
train['class'].value_counts()

class
3    17870
1    14047
4     9995
2     6942
Name: count, dtype: int64

In [7]:
one = train.loc[train['class'] == 1]
two = train.loc[train['class'] == 2]
three = train.loc[train['class'] == 3]
four = train.loc[train['class'] == 4]

In [8]:
train = pd.concat([train, one.sample(n=3953), two.sample(n=6000), two.sample(n=5058), three.sample(n=130), four.sample(n=8005)], ignore_index=True)


In [9]:
train['class'].value_counts()

class
1    18000
2    18000
3    18000
4    18000
Name: count, dtype: int64

In [10]:
x_train = train['text']
x_test = test['text']

In [12]:
tokenizer = Tokenizer(num_words=700)
tokenizer.fit_on_texts(x_train.values)
word_index = tokenizer.word_index

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

# Padding
padding_type = 'post'
truncating_type = 'post'

In [13]:
vocab_size = len(word_index) + 1
max_length = 100

X_train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)
X_test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)


In [14]:
vocab_size = len(word_index) + 1                          

maxlen = 100

X_train = pad_sequences(train_sequences, padding='post', maxlen=maxlen)
X_test = pad_sequences(test_sequences, padding='post', maxlen=maxlen)

In [22]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tensorflow.keras.layers import Embedding


In [26]:
import numpy as np
from gensim.models import KeyedVectors

def create_embedding_matrix(model_path, word_index, embedding_dim):
    word_vectors = KeyedVectors.load(model_path)   
    vocab_size = len(word_index) + 1  
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    
    # Iterate through word_index to create the embedding matrix
    for word, idx in word_index.items():
        if word in word_vectors.key_to_index:  # Check if the word is in the KeyedVectors
            embedding_vector = word_vectors[word]
            if embedding_vector is not None:
                embedding_matrix[idx] = embedding_vector[:embedding_dim]
    
    return embedding_matrix

# Assuming your tokenizer has been fitted on your text data
embedding_dim = 300  # Dimensionality of Google News Word2Vec vectors
model_path = 'word2vec-google-news-300.model' 

# Create the embedding matrix
embedding_matrix = create_embedding_matrix(model_path, tokenizer.word_index, embedding_dim)


In [15]:
y_train = train['class'] - 1
y_test = test['class'] - 1

In [16]:
num_classes = 4
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)

In [17]:
y_test_categorical

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [31]:
import tensorflow as tf
embedding_dim = 300

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Conv1D(128, 7, kernel_regularizer=tf.keras.regularizers.l2( l2 = 0.3), activation='relu'),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(0.5),  # Dropout layer to reduce overfitting
    Dense(4, activation='softmax')
    ])





In [32]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=[tf.keras.metrics.Precision(),tf.keras.metrics.Recall(),'accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='min', restore_best_weights=True)


In [33]:
model.fit(X_train_padded, y_train_categorical, 
          validation_split=0.2,  # Use a portion of the training data for validation
          epochs=20, batch_size=64, callbacks=[early_stopping])

Epoch 1/20
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 119ms/step - accuracy: 0.5398 - loss: 4.8340 - precision: 0.6683 - recall: 0.2848 - val_accuracy: 0.6881 - val_loss: 0.9249 - val_precision: 0.7007 - val_recall: 0.6525
Epoch 2/20
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 120ms/step - accuracy: 0.7916 - loss: 0.8267 - precision: 0.8265 - recall: 0.7383 - val_accuracy: 0.7099 - val_loss: 0.9018 - val_precision: 0.7284 - val_recall: 0.6778
Epoch 3/20
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 121ms/step - accuracy: 0.8187 - loss: 0.8017 - precision: 0.8459 - recall: 0.7827 - val_accuracy: 0.6895 - val_loss: 1.0250 - val_precision: 0.7016 - val_recall: 0.6721
Epoch 4/20
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 129ms/step - accuracy: 0.8236 - loss: 0.7825 - precision: 0.8493 - recall: 0.7924 - val_accuracy: 0.7734 - val_loss: 0.8210 - val_precision: 0.7850 - val_recall: 0.7581
Epoch 5/

<keras.src.callbacks.history.History at 0x383168550>

In [34]:
y_pred = model.predict(X_test_padded)
y_pred_classes = y_pred.argmax(axis=1)

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step


In [39]:
model1 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Conv1D(128, 7, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2=0.3)),
    MaxPooling1D(pool_size=2),  # Optional pooling layer after the first convolution
    Conv1D(256, 5, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2=0.3)),  # Additional convolutional layer
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(0.5),  # Dropout layer to reduce overfitting
    Dense(4, activation='softmax')
])



In [40]:
model1.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=[tf.keras.metrics.Precision(),tf.keras.metrics.Recall(),'accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='min', restore_best_weights=True)


In [41]:
model1.fit(X_train_padded, y_train_categorical, 
          validation_split=0.2,  # Use a portion of the training data for validation
          epochs=20, batch_size=64, callbacks=[early_stopping])

Epoch 1/20
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 130ms/step - accuracy: 0.3127 - loss: 9.1991 - precision_1: 0.0000e+00 - recall_1: 0.0000e+00 - val_accuracy: 0.0090 - val_loss: 1.6458 - val_precision_1: 0.0000e+00 - val_recall_1: 0.0000e+00
Epoch 2/20
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 129ms/step - accuracy: 0.3075 - loss: 1.3574 - precision_1: 0.0000e+00 - recall_1: 0.0000e+00 - val_accuracy: 0.0000e+00 - val_loss: 1.6533 - val_precision_1: 0.0000e+00 - val_recall_1: 0.0000e+00
Epoch 3/20
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 129ms/step - accuracy: 0.3153 - loss: 1.3550 - precision_1: 0.0000e+00 - recall_1: 0.0000e+00 - val_accuracy: 0.0090 - val_loss: 1.6740 - val_precision_1: 0.0000e+00 - val_recall_1: 0.0000e+00
Epoch 4/20
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 130ms/step - accuracy: 0.3119 - loss: 1.3555 - precision_1: 0.0000e+00 - recall_1: 0.0000e+00 - val_a

<keras.src.callbacks.history.History at 0x383ccf550>

In [48]:
y_pred1 = model1.predict(X_test_padded)
y_pred_classes1 = y_pred1.argmax(axis=1)

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step


In [43]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [44]:
score = f1_score(y_test, y_pred_classes, average='macro')
print('Score on validation = {}'.format(score))

Score on validation = 0.455602919317178


In [49]:
score = f1_score(y_test, y_pred_classes1, average='macro')
print('Score on validation = {}'.format(score))

Score on validation = 0.1


In [46]:
print('Classification report on test data:')
# Make sure y_test is in the correct format for comparison
print(classification_report(y_test, y_pred_classes))

Classification report on test data:
              precision    recall  f1-score   support

           0       0.39      0.21      0.27       750
           1       0.55      0.51      0.53       750
           2       0.35      0.51      0.42       750
           3       0.58      0.64      0.61       750

    accuracy                           0.47      3000
   macro avg       0.47      0.47      0.46      3000
weighted avg       0.47      0.47      0.46      3000



In [50]:
print('Classification report on test data:')
# Make sure y_test is in the correct format for comparison
print(classification_report(y_test, y_pred_classes1))

Classification report on test data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       750
           1       0.00      0.00      0.00       750
           2       0.25      1.00      0.40       750
           3       0.00      0.00      0.00       750

    accuracy                           0.25      3000
   macro avg       0.06      0.25      0.10      3000
weighted avg       0.06      0.25      0.10      3000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
