## extract training data as data frame

In [25]:
import os
import pandas as pd

def read_documents(folder_path):
    classes = []
    documents = []

    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
                file_path = os.path.join(root, file_name)
                with open(file_path, 'r') as file:
                    content = file.read()
                    class_name = os.path.basename(root)
                    classes.append(class_name)
                    documents.append(content.strip())

    df = pd.DataFrame({'Class': classes, 'Document': documents})
    return df


folder_path = './Data'
df = read_documents(folder_path)

df 


Unnamed: 0,Class,Document
0,acq,COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SAL...
1,acq,OHIO MATTRESS <OMT> MAY HAVE LOWER 1ST QTR NET...
2,acq,MCLEAN'S <MII> U.S. LINES SETS ASSET TRANSFER\...
3,acq,CHEMLAWN <CHEM> RISES ON HOPES FOR HIGHER BIDS...
4,acq,<COFAB INC> BUYS GULFEX FOR UNDISCLOSED AMOUNT...
...,...,...
11408,zinc,PEGASUS GOLD <PGULF> STARTS MILLING IN MONTANA...
11409,zinc,"WORLD ZINC STOCKS FALL 7,700 TONNES IN FEBRUAR..."
11410,zinc,"LME DETAILS MARCH 1987 TURNOVER\n\n LONDON,..."
11411,zinc,BALL <BLL> TO SUPPLY PENNY BLANKS TO MINTS\n\n...


## extract test data as data frame

In [26]:
folder_path = './test'
testing_df = read_documents(folder_path)

remove Not important  tags (\n)

In [27]:
df['Document'] = df['Document'].str.replace('\n', ' ')
testing_df['Document'] = testing_df['Document'].str.replace('\n', ' ')


## Tokenization

In [28]:
import nltk
from nltk.tokenize import word_tokenize

df['Tokens'] = df['Document'].apply(lambda x: word_tokenize(x))
testing_df['Tokens'] = testing_df['Document'].apply(lambda x: word_tokenize(x))


## Normalization

In [29]:
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()
df['Tokens'] = df['Tokens'].apply(lambda tokens: [porter_stemmer.stem(token) for token in tokens])
testing_df['Tokens'] = testing_df['Tokens'].apply(lambda tokens: [porter_stemmer.stem(token) for token in tokens])


## Vocabulary set extraction for training Data

In [30]:
unique_tokens = set()
for tokens in df['Tokens']:
    unique_tokens.update(tokens)

Vocabulary  = pd.DataFrame({'Tokens': list(unique_tokens)})

Vocabulary 

Unnamed: 0,Tokens
0,342.7
1,132-1/2
2,b.c
3,clement
4,automak
...,...
39517,esb
39518,qtec
39519,120/125
39520,7703625


In [31]:
# file_path = 'vocabulary.csv'
# Vocabulary.to_csv(file_path)

clean vocabulart

In [32]:
import re

pattern = re.compile('[a-zA-Z]')

filtered_tokens = [
    [token for token in tokens if pattern.search(token) and not token.isdigit()and len(token) > 2]
    for tokens in df['Tokens']
]
df['Tokens'] = filtered_tokens


filtered_token = [
    [token for token in tokens if pattern.search(token) and not token.isdigit()and len(token) > 2]
    for tokens in testing_df['Tokens']
]
testing_df['Tokens'] = filtered_token

In [33]:

df['Document'] = df['Document'].str.replace('/', ' ')
testing_df['Document'] = testing_df['Document'].str.replace('/', ' ')

In [34]:
df

Unnamed: 0,Class,Document,Tokens
0,acq,COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SAL...,"[comput, termin, system, cpml, complet, sale, ..."
1,acq,OHIO MATTRESS <OMT> MAY HAVE LOWER 1ST QTR NET...,"[ohio, mattress, omt, may, have, lower, 1st, q..."
2,acq,MCLEAN'S <MII> U.S. LINES SETS ASSET TRANSFER ...,"[mclean, mii, u.s., line, set, asset, transfer..."
3,acq,CHEMLAWN <CHEM> RISES ON HOPES FOR HIGHER BIDS...,"[chemlawn, chem, rise, hope, for, higher, bid,..."
4,acq,<COFAB INC> BUYS GULFEX FOR UNDISCLOSED AMOUNT...,"[cofab, inc, buy, gulfex, for, undisclos, amou..."
...,...,...,...
11408,zinc,PEGASUS GOLD <PGULF> STARTS MILLING IN MONTANA...,"[pegasu, gold, pgulf, start, mill, montana, je..."
11409,zinc,"WORLD ZINC STOCKS FALL 7,700 TONNES IN FEBRUAR...","[world, zinc, stock, fall, tonn, februari, ein..."
11410,zinc,"LME DETAILS MARCH 1987 TURNOVER LONDON, A...","[lme, detail, march, turnov, london, april, th..."
11411,zinc,BALL <BLL> TO SUPPLY PENNY BLANKS TO MINTS ...,"[ball, bll, suppli, penni, blank, mint, munci,..."


## Feature Encoding

In [35]:
df['Tokens'] = df['Tokens'].apply(' '.join)
testing_df['Tokens'] = testing_df['Tokens'].apply(' '.join)

tf-idf

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer()

tf_idf_vectorizer.fit(df['Tokens'])
tf_idf_training = tf_idf_vectorizer.transform(df['Tokens'])
tf_idf_testing = tf_idf_vectorizer.transform(testing_df['Tokens'])

Word2Vec

In [37]:
import gensim.downloader as api

word2vec = api.load('word2vec-google-news-300')


In [38]:
import numpy as np
def document_vector(word2vec_model, doc):
    words = [word for word in doc.split() if word in word2vec_model.key_to_index]
    
    if len(words) == 0:
        return np.zeros(word2vec_model.vector_size)
    
    doc_vector = np.mean(word2vec_model[words], axis=0)
    return doc_vector

training_doc_vectors = np.array([document_vector(word2vec, doc) for doc in df['Tokens']])
testing_doc_vectors = np.array([document_vector(word2vec, doc) for doc in testing_df['Tokens']])

In [39]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaler.fit(training_doc_vectors)

scaled_training_doc_vectors = scaler.transform(training_doc_vectors)


In [40]:
y_train = df['Class'].values
y_test = testing_df['Class'].values

 Naïve Bayes

tf_idf

In [41]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

nb_classifier = MultinomialNB()

nb_classifier.fit(tf_idf_training, y_train)

y_pred_tfidf = nb_classifier.predict(tf_idf_testing)

f1_tfidf = f1_score(y_test, y_pred_tfidf, average='macro')
print(f'F1 Score (TF-IDF): {f1_tfidf}')

F1 Score (TF-IDF): 0.03779686385139291


word2vec

In [42]:

nb_classifier = MultinomialNB()

nb_classifier.fit(scaled_training_doc_vectors, y_train)

y_pred_word2vec = nb_classifier.predict(testing_doc_vectors)

f1_word2vec = f1_score(y_test, y_pred_word2vec, average='macro')
print(f'F1 Score (Word2Vec): {f1_word2vec}')

F1 Score (Word2Vec): 0.0046742535492290925


SVC


tf_idf

In [43]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

svm_classifier = SVC()
svm_classifier.fit(tf_idf_training, y_train)
y_pred_tfidf_svm = svm_classifier.predict(tf_idf_testing)
f1_tfidf_svm = f1_score(y_test, y_pred_tfidf_svm, average='macro')

print(f'F1 Score (TF-IDF with SVC): {f1_tfidf_svm}')

F1 Score (TF-IDF with SVC): 0.23927578465071792


word2vec

In [44]:
svm_classifier.fit(training_doc_vectors, y_train)

y_pred_word2vec_svm = svm_classifier.predict(testing_doc_vectors)

f1_word2vec_svm = f1_score(y_test, y_pred_word2vec_svm, average='macro')
print(f'F1 Score (Word2Vec with SVC): {f1_word2vec_svm}')

F1 Score (Word2Vec with SVC): 0.16426533999270984


Random Forest

tf_idf

In [45]:
rf_classifier = RandomForestClassifier()

rf_classifier.fit(tf_idf_training, y_train)

y_pred_tfidf_rf = rf_classifier.predict(tf_idf_testing)

f1_tfidf_rf = f1_score(y_test, y_pred_tfidf_rf, average='macro')
print(f'F1 Score (TF-IDF with Random Forest): {f1_tfidf_rf}')

F1 Score (TF-IDF with Random Forest): 0.19291925417100453


Word2Vec 

In [46]:
rf_classifier = RandomForestClassifier()

rf_classifier.fit(training_doc_vectors, y_train)

y_pred_word2vec_rf = rf_classifier.predict(testing_doc_vectors)

f1_word2vec_rf = f1_score(y_test, y_pred_word2vec_rf, average='macro')
print(f'F1 Score (Word2Vec with Random Forest): {f1_word2vec_rf}')

F1 Score (Word2Vec with Random Forest): 0.1519665811367221
