# Importing the libraries 

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Read the training and testing splits from CSV files in the working directory.
training_df, testing_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./training_df.csv", "./testing_df.csv")
)


# Applying the tf-idf 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training tokens only, then project both splits
# with that same fitted vectorizer.
tf_idf_vectorizer = TfidfVectorizer().fit(training_df['Tokens'])
tf_idf_training, tf_idf_testing = (
    tf_idf_vectorizer.transform(split_df['Tokens'])
    for split_df in (training_df, testing_df)
)

# Loading the pre-trained word2vec model from the pickle file

In [4]:
# SECURITY: pickle.load can execute arbitrary code while deserializing, so this
# file must come from a trusted source.
with open('./word2vec-google-news-300.pkl', 'rb') as file:
    # The loaded object is later used through key_to_index, vector_size and
    # indexing by a list of words -- presumably gensim KeyedVectors; confirm.
    word2vec_model = pickle.load(file)

In [5]:
import numpy as np
def document_vector(word2vec_model, doc):
    """Return the mean embedding of the in-vocabulary words of ``doc``.

    Parameters
    ----------
    word2vec_model
        Embedding lookup exposing ``key_to_index`` (membership test),
        ``vector_size`` and indexing by a list of words.
    doc
        Whitespace-separated tokens. Any non-string value (for example the
        NaN that pandas produces for an empty CSV cell, or ``None``) is
        treated as an empty document instead of raising ``AttributeError``.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors of the known words, or a zero
        vector of length ``word2vec_model.vector_size`` when the document has
        no known words.
    """
    # Missing documents have no .split(); treat them like an empty document.
    if not isinstance(doc, str):
        return np.zeros(word2vec_model.vector_size)

    # Keep only the tokens the embedding model has a vector for.
    words = [word for word in doc.split() if word in word2vec_model.key_to_index]

    # Nothing to average: fall back to a zero vector.
    if not words:
        return np.zeros(word2vec_model.vector_size)

    # Average the word vectors along the word axis.
    return np.mean(word2vec_model[words], axis=0)

# Embed every document of each split and collect the results into one array
# per split.
training_doc_vectors = np.array(
    list(map(lambda doc: document_vector(word2vec_model, doc), training_df['Tokens']))
)
testing_doc_vectors = np.array(
    list(map(lambda doc: document_vector(word2vec_model, doc), testing_df['Tokens']))
)

Applying feature scaling (min-max scaler) on the vectors 

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training vectors only, then apply
# that same scaling to both splits.
scaler = MinMaxScaler().fit(training_doc_vectors)
scaled_training_doc_vectors, scaled_testing_doc_vectors = (
    scaler.transform(doc_vectors)
    for doc_vectors in (training_doc_vectors, testing_doc_vectors)
)

# Merging the features 

In [10]:
from scipy.sparse import csr_matrix, hstack

# Convert the dense, scaled Word2Vec features to CSR so they can be stacked
# with the sparse TF-IDF matrices.
training_doc_vectors_csr = csr_matrix(scaled_training_doc_vectors)
testing_doc_vectors_csr = csr_matrix(scaled_testing_doc_vectors)

# Combine the features column-wise: TF-IDF columns first, Word2Vec columns
# after. hstack builds the result directly; assigning slices into an empty
# CSR matrix changes its sparsity structure repeatedly and triggers
# SparseEfficiencyWarning.
X_train_combined = hstack([tf_idf_training, training_doc_vectors_csr], format='csr')
X_test_combined = hstack([tf_idf_testing, testing_doc_vectors_csr], format='csr')

In [11]:
# Pull the target column of each split out as an array of class labels.
y_train, y_test = (
    split_df['Class'].values for split_df in (training_df, testing_df)
)


# Applying the machine learning algorithms on single and combined features 

applying the naive bayes algorithm on the tf-idf 

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# Multinomial Naive Bayes trained and evaluated on the TF-IDF features only.
nb_classifier = MultinomialNB().fit(tf_idf_training, y_train)
y_pred_tfidf = nb_classifier.predict(tf_idf_testing)

# Macro-averaged F1 on the held-out test labels.
f1_tfidf = f1_score(y_true=y_test, y_pred=y_pred_tfidf, average='macro')
print(f'F1 Score (TF-IDF): {f1_tfidf}')


F1 Score (TF-IDF): 0.03779686385139291


applying the naive bayes on the word embedding  

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# Multinomial Naive Bayes trained and evaluated on the min-max scaled
# document vectors only.
nb_classifier = MultinomialNB().fit(scaled_training_doc_vectors, y_train)
y_pred_word2vec = nb_classifier.predict(scaled_testing_doc_vectors)

# Macro-averaged F1 on the held-out test labels.
f1_word2vec = f1_score(y_true=y_test, y_pred=y_pred_word2vec, average='macro')
print(f'F1 Score (Word2Vec): {f1_word2vec}')


F1 Score (Word2Vec): 0.03014145878299118


Applying the naive bayes algorithm on both features 

In [62]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# Multinomial Naive Bayes trained and evaluated on the combined
# TF-IDF + Word2Vec feature matrix.
nb_classifier = MultinomialNB()

nb_classifier.fit(X_train_combined, y_train)

y_pred_both = nb_classifier.predict(X_test_combined)

# Macro-averaged F1 on the held-out test labels. The label names the combined
# features so this score is not mistaken for the TF-IDF-only run.
f1_both = f1_score(y_test, y_pred_both, average='macro')
print(f'F1 Score (TF-IDF + Word2Vec): {f1_both}')


F1 Score (TF-IDF): 0.01719166702790167


applying the LogisticRegression algorithm on tf_idf feature

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Initialize the Logistic Regression classifier
lr_classifier = LogisticRegression(max_iter=1000)

# Train the classifier using TF-IDF features alone. Use the matrices built in
# the TF-IDF cell (tf_idf_training / tf_idf_testing); the previously referenced
# names are not defined anywhere in this notebook and fail on a fresh kernel.
lr_classifier.fit(tf_idf_training, y_train)

# Predict on the test set using TF-IDF features alone
y_pred_tfidf = lr_classifier.predict(tf_idf_testing)

# Evaluate the model using the macro-averaged F1 score
f1_tfidf = f1_score(y_test, y_pred_tfidf, average='macro')
print(f'F1 Score (TF-IDF): {f1_tfidf}')


F1 Score (TF-IDF): 0.011454557493862661


applying the LogisticRegression algorithm on word embedding feature

In [64]:
# Logistic Regression trained and evaluated on the unscaled document vectors.
lr_classifier = LogisticRegression(max_iter=1000).fit(training_doc_vectors, y_train)
y_pred_word2vec = lr_classifier.predict(testing_doc_vectors)

# Macro-averaged F1 on the held-out test labels.
f1_word2vec = f1_score(y_true=y_test, y_pred=y_pred_word2vec, average='macro')
print(f'F1 Score (Word2Vec): {f1_word2vec}')


F1 Score (Word2Vec): 0.11540709760731548


applying the LogisticRegression algorithm on both features

In [65]:
# Refit the existing Logistic Regression classifier on the combined feature
# matrix and predict the test split in one chained call.
y_pred_both_log = lr_classifier.fit(X_train_combined, y_train).predict(X_test_combined)

# Macro-averaged F1 on the held-out test labels.
f1_both_log = f1_score(y_true=y_test, y_pred=y_pred_both_log, average='macro')
print(f'F1 Score (both for log reg): {f1_both_log}')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


F1 Score (both for log reg): 0.29358370987457877


applying the SVM algorithm on tf_idf feature

In [66]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Support vector classifier with default settings, trained and evaluated on
# the TF-IDF features only.
svm_classifier = SVC().fit(tf_idf_training, y_train)
y_pred_tfidf_svm = svm_classifier.predict(tf_idf_testing)

# Macro-averaged F1 on the held-out test labels.
f1_tfidf_svm = f1_score(y_true=y_test, y_pred=y_pred_tfidf_svm, average='macro')
print(f'F1 Score (TF-IDF with SVM): {f1_tfidf_svm}')

F1 Score (TF-IDF with SVM): 0.012087712004266992


applying the SVM algorithm on word embedding feature

In [67]:
# Refit the existing SVM on the unscaled document vectors and predict the
# test split in one chained call.
y_pred_word2vec_svm = svm_classifier.fit(training_doc_vectors, y_train).predict(testing_doc_vectors)

# Macro-averaged F1 on the held-out test labels.
f1_word2vec_svm = f1_score(y_true=y_test, y_pred=y_pred_word2vec_svm, average='macro')
print(f'F1 Score (Word2Vec with SVM): {f1_word2vec_svm}')

F1 Score (Word2Vec with SVM): 0.16426533999270984


applying the SVM algorithm on both features

In [68]:
# Refit the existing SVM on the combined TF-IDF + Word2Vec feature matrix.
svm_classifier.fit(X_train_combined, y_train)
y_pred_both_svm = svm_classifier.predict(X_test_combined)

# Macro-averaged F1 on the held-out test labels.
f1_both_svm = f1_score(y_test, y_pred_both_svm, average='macro')

# The label names the combined features so this score is not mistaken for the
# TF-IDF-only SVM run.
print(f'F1 Score (both features with SVM): {f1_both_svm}')

F1 Score (TF-IDF with SVM): 0.09028871786045488


applying the RandomForest algorithm on tf idf feature 

In [69]:
# Initialize the Random Forest classifier. A fixed random_state makes the
# forest (and therefore the reported scores) reproducible across runs.
rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier using TF-IDF features alone
rf_classifier.fit(tf_idf_training, y_train)

# Predict on the test set using TF-IDF features alone
y_pred_tfidf_rf = rf_classifier.predict(tf_idf_testing)

# Evaluate the model using the macro-averaged F1 score
f1_tfidf_rf = f1_score(y_test, y_pred_tfidf_rf, average='macro')
print(f'F1 Score (TF-IDF with Random Forest): {f1_tfidf_rf}')


F1 Score (TF-IDF with Random Forest): 0.011562898027323609


applying the RandomForest algorithm on word embedding feature 

In [70]:
# Refit the existing Random Forest on the unscaled document vectors and
# predict the test split in one chained call.
y_pred_word2vec_rf = rf_classifier.fit(training_doc_vectors, y_train).predict(testing_doc_vectors)

# Macro-averaged F1 on the held-out test labels.
f1_word2vec_rf = f1_score(y_true=y_test, y_pred=y_pred_word2vec_rf, average='macro')
print(f'F1 Score (Word2Vec with Random Forest): {f1_word2vec_rf}')

F1 Score (Word2Vec with Random Forest): 0.14618659502070733


applying the RandomForest algorithm on both features

In [120]:
# Refit the Random Forest on the combined TF-IDF + Word2Vec feature matrix.
rf_classifier.fit(X_train_combined, y_train)

# Predict on the test set using the combined features
y_pred_both_rf = rf_classifier.predict(X_test_combined)

# Evaluate the combined-feature predictions. Score y_pred_both_rf here, not
# the Word2Vec-only predictions from the earlier Random Forest run.
f1_both_rf = f1_score(y_test, y_pred_both_rf, average='macro')
print(f'F1 Score (both features with Random Forest): {f1_both_rf}')

NameError: name 'rf_classifier' is not defined

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential  
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

# Build the word index from the training tokens only. Tokenizer is imported
# above, and the training text comes from training_df["Tokens"], so this cell
# runs on a fresh kernel.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(training_df["Tokens"])

# Encode the test documents as integer sequences and pad/truncate them to a
# fixed length of 100.
test_sequences = tokenizer.texts_to_sequences(testing_df["Tokens"])
x_test = pad_sequences(test_sequences, maxlen=100)

In [22]:

# Build the word index from the training tokens only (num_words=5000).
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(training_df["Tokens"])

# Encode each split as integer sequences using that shared word index.
train_sequences = tokenizer.texts_to_sequences(training_df["Tokens"])
test_sequences = tokenizer.texts_to_sequences(testing_df["Tokens"])

# Pad/truncate every sequence to a fixed length of 100.
x_train = pad_sequences(train_sequences, maxlen=100)
x_test = pad_sequences(test_sequences, maxlen=100)


In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# One output unit per distinct label in the training set.
num_classes = len(np.unique(y_train))

# Embedding -> LSTM -> softmax classifier over the padded token sequences.
# input_dim and input_length mirror the tokenizer's num_words=5000 and the
# padding maxlen=100 used when building x_train / x_test.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    LSTM(128),
    Dense(num_classes, activation='softmax'),
])

# categorical_crossentropy expects one-hot encoded targets.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)




In [24]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Fit a single encoder on the training labels; its class -> index mapping is
# the one the network's output units are trained against.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Size the one-hot targets from the fitted classes instead of a hard-coded
# count, so they always match the number of distinct training labels.
y_train_one_hot = to_categorical(y_train_encoded, num_classes=len(label_encoder.classes_))

# Reuse the SAME mapping for the test labels. Fitting a second encoder on
# y_test would assign different indices whenever the test split lacks some
# training classes, making the predicted indices incomparable. transform()
# raises ValueError if y_test contains a label never seen in training.
y_test_encoded = label_encoder.transform(y_test)

In [25]:
# Train the LSTM on the padded training sequences and one-hot labels.
model.fit(
    x=x_train,
    y=y_train_one_hot,
    batch_size=1500,
    epochs=5,
)


Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1s/step - accuracy: 0.2045 - loss: 4.4420
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - accuracy: 0.2487 - loss: 3.3279
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - accuracy: 0.3265 - loss: 2.8940
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - accuracy: 0.3315 - loss: 2.6943
Epoch 5/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - accuracy: 0.3546 - loss: 2.5540


<keras.src.callbacks.history.History at 0x24fddfe3ed0>

In [26]:

# Class probabilities for every test sequence; the predicted class is the
# index of the largest probability in each row.
y_pred_lstm_probs = model.predict(x_test)
y_pred_lstm = y_pred_lstm_probs.argmax(axis=1)

# Macro-averaged F1 against the integer-encoded test labels.
f1_lstm = f1_score(y_true=y_test_encoded, y_pred=y_pred_lstm, average='macro')
print(f'F1 Score (LSTM Network): {f1_lstm}')


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step
F1 Score (LSTM Network): 0.016838948822817355
