# Importing the required libraries

In [17]:
import os
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

# Paths to the training and testing data

In [18]:
# Root folders of the two corpus splits, given relative to the working directory.
training_folder_path, testing_folder_path = './training', './testing'


In [19]:
def read_documents(training_folder_path, encoding=None):
    """Load every file under a folder tree into a DataFrame, one row per file.

    The name of the directory that directly contains a file is used as that
    file's class label. Despite its name, ``training_folder_path`` may point
    to any split; this notebook calls the function for both the training and
    the testing folders.

    Parameters
    ----------
    training_folder_path : str
        Root folder that is walked recursively.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform's default encoding.

    Returns
    -------
    pandas.DataFrame
        Columns ``Class`` and ``Document`` (file content with leading and
        trailing whitespace stripped). Rows are ordered by directory name and
        then by file name, so the row index is reproducible across machines.
    """
    classes = []
    documents = []

    for root, dirs, files in os.walk(training_folder_path):
        # os.walk yields entries in arbitrary, filesystem-dependent order.
        # Sorting `dirs` in place fixes the order in which sub-folders are
        # visited; sorting `files` fixes the order inside each folder.
        dirs.sort()
        class_name = os.path.basename(root)
        for file_name in sorted(files):
            file_path = os.path.join(root, file_name)
            with open(file_path, 'r', encoding=encoding) as file:
                content = file.read()
            classes.append(class_name)
            documents.append(content.strip())

    return pd.DataFrame({'Class': classes, 'Document': documents})


# Build one DataFrame per split with the same loader (training first, then testing).
training_df, testing_df = (
    read_documents(split_path)
    for split_path in (training_folder_path, testing_folder_path)
)


# Word Tokenization: Split the text into uni-gram tokens.


In [20]:
# Tokenize each raw document with NLTK's word_tokenize and store the resulting
# token list per row in a new 'Tokens' column.
training_df['Tokens'] = training_df['Document'].apply(word_tokenize)

# Token Normalization: Use the Porter stemmer to reduce each token to its stem


In [21]:
# Stem every token with NLTK's Porter stemmer, then join each document's stems
# with single spaces to build the 'ProcessedDocument' text column.
# NOTE: 'Tokens' is overwritten in place, so re-running this cell stems the
# already-stemmed tokens a second time.
porter_stemmer = PorterStemmer()
stemmed_tokens = training_df['Tokens'].apply(
    lambda token_list: list(map(porter_stemmer.stem, token_list))
)
training_df['Tokens'] = stemmed_tokens
training_df['ProcessedDocument'] = stemmed_tokens.apply(' '.join)
training_df

Unnamed: 0,Class,Document,Tokens,ProcessedDocument
0,acq,COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SAL...,"[comput, termin, system, <, cpml, >, complet, ...",comput termin system < cpml > complet sale com...
1,acq,OHIO MATTRESS <OMT> MAY HAVE LOWER 1ST QTR NET...,"[ohio, mattress, <, omt, >, may, have, lower, ...",ohio mattress < omt > may have lower 1st qtr n...
2,acq,MCLEAN'S <MII> U.S. LINES SETS ASSET TRANSFER\...,"[mclean, 's, <, mii, >, u.s., line, set, asset...",mclean 's < mii > u.s. line set asset transfer...
3,acq,CHEMLAWN <CHEM> RISES ON HOPES FOR HIGHER BIDS...,"[chemlawn, <, chem, >, rise, on, hope, for, hi...",chemlawn < chem > rise on hope for higher bid ...
4,acq,<COFAB INC> BUYS GULFEX FOR UNDISCLOSED AMOUNT...,"[<, cofab, inc, >, buy, gulfex, for, undisclos...",< cofab inc > buy gulfex for undisclos amount ...
...,...,...,...,...
11408,zinc,PEGASUS GOLD <PGULF> STARTS MILLING IN MONTANA...,"[pegasu, gold, <, pgulf, >, start, mill, in, m...",pegasu gold < pgulf > start mill in montana je...
11409,zinc,"WORLD ZINC STOCKS FALL 7,700 TONNES IN FEBRUAR...","[world, zinc, stock, fall, 7,700, tonn, in, fe...","world zinc stock fall 7,700 tonn in februari e..."
11410,zinc,"LME DETAILS MARCH 1987 TURNOVER\n\n LONDON,...","[lme, detail, march, 1987, turnov, london, ,, ...","lme detail march 1987 turnov london , april 2 ..."
11411,zinc,BALL <BLL> TO SUPPLY PENNY BLANKS TO MINTS\n\n...,"[ball, <, bll, >, to, suppli, penni, blank, to...",ball < bll > to suppli penni blank to mint mun...


# Vocabulary Set Extraction.


In [22]:
# Collect the distinct tokens found across all rows of training_df['Tokens'].
vocabulary = set().union(*training_df['Tokens'])

# A set of strings has no stable iteration order (it varies with the
# interpreter's hash seed), so sort the tokens to give Vocabulary_df the same
# row order on every run.
Vocabulary_df = pd.DataFrame({'Tokens': sorted(vocabulary)})

In [24]:
# Inspect the testing split DataFrame.
testing_df

Unnamed: 0,Class,Document
0,acq,SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERG...
1,acq,BOND CORP STILL CONSIDERING ATLAS MINING BAIL-...
2,acq,CRA SOLD FORREST GOLD FOR 76 MLN DLRS - WHIM C...
3,acq,ANHEUSER-BUSCH JOINS BID FOR SAN MIGUEL\n\n ...
4,acq,MONIER SAYS BRITAIN'S REDLAND MAY BID FOR IT\n...
...,...,...
4019,zinc,FORCE MAJEURE LIFTED AT CAJAMARQUILLA\n\n L...
4020,zinc,NORANDA BRUNSWICK MINERS VOTE MONDAY ON CONTRA...
4021,zinc,NO COMINCO STRIKE TALKS SCHEDULED\n\n TRAIL...
4022,zinc,AMAX ZINC CO RAISES DYECAST ALLOY PRICES\n\n ...


In [26]:
# Persist both splits as CSV files in the parent directory, without the row index.
for frame, csv_path in (
    (testing_df, "../testing_df.csv"),
    (training_df, "../training_df.csv"),
):
    frame.to_csv(csv_path, index=False)

In [28]:
# Inspect the training split DataFrame.
training_df

Unnamed: 0,Class,Document,Tokens,ProcessedDocument
0,acq,COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SAL...,"[comput, termin, system, <, cpml, >, complet, ...",comput termin system < cpml > complet sale com...
1,acq,OHIO MATTRESS <OMT> MAY HAVE LOWER 1ST QTR NET...,"[ohio, mattress, <, omt, >, may, have, lower, ...",ohio mattress < omt > may have lower 1st qtr n...
2,acq,MCLEAN'S <MII> U.S. LINES SETS ASSET TRANSFER\...,"[mclean, 's, <, mii, >, u.s., line, set, asset...",mclean 's < mii > u.s. line set asset transfer...
3,acq,CHEMLAWN <CHEM> RISES ON HOPES FOR HIGHER BIDS...,"[chemlawn, <, chem, >, rise, on, hope, for, hi...",chemlawn < chem > rise on hope for higher bid ...
4,acq,<COFAB INC> BUYS GULFEX FOR UNDISCLOSED AMOUNT...,"[<, cofab, inc, >, buy, gulfex, for, undisclos...",< cofab inc > buy gulfex for undisclos amount ...
...,...,...,...,...
11408,zinc,PEGASUS GOLD <PGULF> STARTS MILLING IN MONTANA...,"[pegasu, gold, <, pgulf, >, start, mill, in, m...",pegasu gold < pgulf > start mill in montana je...
11409,zinc,"WORLD ZINC STOCKS FALL 7,700 TONNES IN FEBRUAR...","[world, zinc, stock, fall, 7,700, tonn, in, fe...","world zinc stock fall 7,700 tonn in februari e..."
11410,zinc,"LME DETAILS MARCH 1987 TURNOVER\n\n LONDON,...","[lme, detail, march, 1987, turnov, london, ,, ...","lme detail march 1987 turnov london , april 2 ..."
11411,zinc,BALL <BLL> TO SUPPLY PENNY BLANKS TO MINTS\n\n...,"[ball, <, bll, >, to, suppli, penni, blank, to...",ball < bll > to suppli penni blank to mint mun...


In [33]:
# TfidfVectorizer is already imported in the notebook's first cell.
# Learn the TF-IDF vocabulary and IDF weights from the training split only.
tfidf_vectorizer = TfidfVectorizer()
tfidf_training_matrix = tfidf_vectorizer.fit_transform(training_df['ProcessedDocument'])

# Run the testing documents through the same tokenize -> stem -> join pipeline
# as the training documents, so their terms can match the stemmed vocabulary.
# testing_df itself is left unmodified.
testing_processed_documents = testing_df['Document'].apply(
    lambda document: ' '.join(
        porter_stemmer.stem(token) for token in word_tokenize(document)
    )
)

# Use transform, not fit_transform: re-fitting on the testing data would
# replace the training vocabulary and IDF weights, leaving the two matrices in
# different feature spaces and leaking test-set statistics into the model.
tfidf_testing_matrix = tfidf_vectorizer.transform(testing_processed_documents)

In [37]:
import pickle

import gensim.downloader as api
import numpy as np
import pandas as pd

# Name of the pretrained model requested from gensim's downloader API, and the
# file in the parent directory that receives the pickled copy.
word2vec_model_name = 'word2vec-google-news-300'
word2vec_pickle_path = '../word2vec-google-news-300.pkl'

word2vec_model = api.load(word2vec_model_name)

# Serialize the loaded model object with pickle.
with open(word2vec_pickle_path, 'wb') as pickle_file:
    pickle.dump(word2vec_model, pickle_file)

