In [1]:
import zipfile
import os

In [2]:
def unzip_file(zip_path, extract_to='.'):
    #unziping to a directory
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

In [3]:
def read_text_files(directory):
    # List to store the content of each file
    text_files_content = []

    # Iterate over all files in the directory
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='latin-1') as file:
                # Read the content and store it
                text_files_content.append(file.read())

    return text_files_content

In [4]:
def tokenize_text(text):
    # Splitting the text by spaces to get individual words
    tokens = text.split()
    return tokens

In [5]:
def to_lowercase(tokens):
    # Convert each token to lowercase
    return [token.lower() for token in tokens]

In [6]:
stop_words = ['the', 'and', 'is', 'in', 'at', 'of', 'a', 'an']

In [7]:
# Remove tokens that are in the stop_words list
def remove_stop_words(tokens, stop_words):
    return [token for token in tokens if token not in stop_words]

In [8]:
# Remove common suffixes from tokens
def simple_stemmer(tokens):
    suffixes = ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']

    stemmed_tokens = []
    for token in tokens:
        for suffix in suffixes:
            if token.endswith(suffix):
                token = token[:-len(suffix)]
        stemmed_tokens.append(token)

    return stemmed_tokens

In [9]:
# Replace tokenbs with their base form using a dictionary
def simple_lemmatizer(tokens):
    lemma_dict = {
        'better': 'good',
        'ran': 'run',
        'running': 'run',
        'cats': 'cat'
    }

    lemmatized_tokens = []
    for token in tokens:
        # Replace with the base form if it exists in lemma_dict
        if token in lemma_dict:
            lemmatized_tokens.append(lemma_dict[token])
        else:
            lemmatized_tokens.append(token)

    return lemmatized_tokens


In [10]:
# Create a vocabulary of unique words
def build_vocabulary(tokens):
    vocabulary = {}
    for token in tokens:
        if token not in vocabulary:
            vocabulary[token] = len(vocabulary)
    return vocabulary

In [11]:
def vectorize_text(tokens, vocabulary):
    # Convert text into a vector based on word counts ( Bag of Words )
    vector = [0] * len(vocabulary)
    for token in tokens:
        if token in vocabulary:
            vector[vocabulary[token]] += 1
    return vector

In [12]:
zip_path = 'archive.zip'
extract_to = 'archive'

In [13]:
unzip_file(zip_path, extract_to)

In [14]:
texts = read_text_files(extract_to)

In [16]:
for i, text in enumerate(texts):
    print(f"Processing Text file {i+1}:")

    tokens = tokenize_text(text)
    tokens = to_lowercase(tokens)
    tokens = remove_stop_words(tokens, stop_words)
    tokens = simple_stemmer(tokens)
    tokens = simple_lemmatizer(tokens)

    vocabulary = build_vocabulary(tokens)
    vector = vectorize_text(tokens, vocabulary)

    print("Tokens:", tokens[:10])
    print("Vector:", vector[:10])
    print("Vocabulary:", len(vocabulary))

Processing Text file 1:
Tokens: ['newsgroup:', 'sci.crypt', 'document_id:', '14147', 'from:', 'marc', 'vanheyningen', '<mvanheyn@cs.indiana.edu>', 'subject:', 'ripem']
Vector: [1984, 2070, 1982, 2, 2126, 88, 34, 18, 2000, 216]
Vocabulary: 32208
Processing Text file 2:
Tokens: ['newsgroup:', 'comp.sys.mac.hardware', 'document_id:', '50418', 'from:', 'xor@clotho.acm.rpi.edu', '(joe', 'schwartz)', 'subject:', 're:']
Vector: [1922, 1934, 1922, 2, 1940, 4, 4, 4, 1940, 1076]
Vocabulary: 22585
Processing Text file 3:
Tokens: ['newsgroup:', 'misc.forsale', 'document_id:', '70337', 'from:', 'kedz@bigwpi.wpi.edu', '(john', 'kedziora)', 'subject:', 'motorcycle']
Vector: [1944, 1952, 1944, 2, 1956, 2, 46, 2, 1978, 4]
Vocabulary: 24556
Processing Text file 4:
Tokens: ['newsgroup:', 'soc.religion.christian', 'document_id:', '20361', 'from:', 'jenk@microsoft.com', '(jen', 'kilmer)', 'subject:', 're:']
Vector: [2004, 2020, 1994, 2, 2006, 20, 12, 12, 1998, 1522]
Vocabulary: 34652
Processing Text file 5