In [2]:
import os
import tarfile
import string
from collections import defaultdict
import shutil
import numpy as np
from collections import defaultdict

In [6]:
# Working directory of the notebook kernel; later cells build the dataset
# archive path and the extracted-corpus path from it.
current_path = os.getcwd()

In [None]:
# Locate the dataset archive inside the current working directory.
current_path = os.getcwd()
tar_gz_path = os.path.join(current_path, "20_newsgroups.tar.gz")

# Unpack everything under ./newsgroup_data.
with tarfile.open(tar_gz_path, mode="r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # Extraction filters are available: the "data" filter rejects members
        # that would land outside the destination (absolute paths, "..",
        # links pointing elsewhere), which a plain extractall() would accept.
        tar.extractall(path="newsgroup_data", filter="data")
    else:
        # Interpreter predates extraction filters; fall back to the legacy call.
        tar.extractall(path="newsgroup_data")

print("Extraction complete.")


def process_file(file_path, header_lines=4):
    """Drop the leading header lines of a file, rewriting the file in place.

    Args:
        file_path: Path of the file to rewrite.
        header_lines: Number of lines to remove from the top of the file.
            Defaults to 4, the value this notebook has always used.

    Note:
        The operation is destructive and not idempotent: every call removes
        another ``header_lines`` lines. A file with fewer lines than
        ``header_lines`` ends up empty.
    """
    # latin-1 maps each byte to exactly one code point, so any file can be read
    # and written back without a UnicodeDecodeError and without altering bytes
    # (the platform default encoding fails on posts that are not valid UTF-8).
    with open(file_path, 'r', encoding='latin-1') as file:
        lines = file.readlines()

    with open(file_path, 'w', encoding='latin-1') as file:
        file.writelines(lines[header_lines:])


# Strip the header lines from every post and move the second half of each
# topic's posts into a parallel "20_newsgroups_test" tree.
#
# Only the topic folders directly under newsgroup_data/20_newsgroups are
# visited. Walking all of "newsgroup_data" would also treat the root and the
# "20_newsgroups" folder themselves as topics (creating bogus class folders in
# the test tree) and, on a re-run, would descend into the test tree as well.
train_root = os.path.join("newsgroup_data", "20_newsgroups")

for topic in sorted(os.listdir(train_root)):
    root = os.path.join(train_root, topic)
    if not os.path.isdir(root):
        continue

    test_topic_dir = f"./newsgroup_data/20_newsgroups_test/{topic}"
    os.makedirs(test_topic_dir, exist_ok=True)

    # Sort so that the train/test split is the same on every run and platform.
    files = sorted(
        name for name in os.listdir(root)
        if os.path.isfile(os.path.join(root, name))
    )
    halfway_mark = len(files) // 2
    for index, file in enumerate(files):
        file_path = os.path.join(root, file)
        process_file(file_path)
        if index >= halfway_mark:
            # Second half of the topic goes to the test tree.
            new_file_path = os.path.join(test_topic_dir, file)
            shutil.move(file_path, new_file_path)

        print(f"Processed: {file_path}")

print("All files processed.")

In [7]:
# Translation table that deletes every character of string.punctuation.
# Built once here instead of on every call: the function runs once per token.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(word: str) -> str:
    """Return ``word`` lower-cased with all ``string.punctuation`` characters removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is deleted;
    a token made up solely of such characters yields the empty string.
    """
    return word.lower().translate(_PUNCTUATION_TABLE)

In [8]:
# Per-class word counts: word_freq[topic] = {word: count}
word_freq = {}  # {class: defaultdict(int)}
# Corpus-wide word counts, used afterwards to find the most frequent words.
vocabulary = defaultdict(int)  # {word: int}
parent_folder = f"{current_path}/newsgroup_data/20_newsgroups/"
# Reserved key holding each class's total token count. It cannot collide with
# a real token: cleaned tokens are lower-cased and stripped of '_' and '/'.
TOTAL_WORD_COUNT_FOR_CLASS = 'TOTAL_WORD_COUNT_FOR_CLASS/TOPIC'

# Each sub-folder of parent_folder is one topic (class).
for topic in sorted(os.listdir(parent_folder)):
    topic_dir = os.path.join(parent_folder, topic)
    if not os.path.isdir(topic_dir):
        continue
    print(topic)
    word_freq[topic] = defaultdict(int)

    for file in sorted(os.listdir(topic_dir)):
        file_path = os.path.join(topic_dir, file)
        if not os.path.isfile(file_path):
            continue

        # Undecodable bytes are dropped rather than aborting the whole pass.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()

        # Whitespace tokenisation, then punctuation stripping per token.
        for word in content.split():
            cleaned_word = remove_punctuation(word)
            if not cleaned_word:
                # Token was punctuation only; counting '' as a word would
                # disagree with tokenize_and_preprocess, which drops it.
                continue
            word_freq[topic][cleaned_word] += 1
            word_freq[topic][TOTAL_WORD_COUNT_FOR_CLASS] += 1
            vocabulary[cleaned_word] += 1

alt.atheism
comp.graphics
comp.os.ms-windows.misc
comp.sys.ibm.pc.hardware
comp.sys.mac.hardware
comp.windows.x
misc.forsale
rec.autos
rec.motorcycles
rec.sport.baseball
rec.sport.hockey
sci.crypt
sci.electronics
sci.med
sci.space
soc.religion.christian
talk.politics.guns
talk.politics.mideast
talk.politics.misc
talk.religion.misc


In [9]:
# Rank the corpus vocabulary from most to least frequent (ties keep their
# original insertion order because the sort is stable).
vocab = sorted(vocabulary.items(), key=lambda x: x[1], reverse=True)

# The 300 most frequent words form the stop-word set.
top_300 = {word for word, _ in vocab[:300]}

# Every remaining word gets one extra count in every topic, so each topic's
# counter ends up with an entry for each non-stop word.
for word, _ in vocab[300:]:
    for topic_counts in word_freq.values():
        topic_counts[word] += 1

# Names of all topics seen while counting.
topics = set(word_freq)

In [10]:
# One-vs-all labelling: the target class is labelled 1; every other class is labelled -1.
# Step 1: Load documents and preprocess
def load_documents(dataset_dir):
    """Read every document below ``dataset_dir``, one sub-folder per class.

    Args:
        dataset_dir: Directory whose immediate sub-directories are the classes.

    Returns:
        A tuple ``(documents, labels, classes)``:
        ``documents`` is the list of file contents, ``labels`` the class name
        of each document (same order), and ``classes`` the sorted list of
        class (sub-directory) names.

    Only sub-directories count as classes, so stray files in ``dataset_dir``
    do not show up in ``classes``. Documents are read in sorted name order so
    that row order is reproducible; nested directories inside a class folder
    are skipped.
    """
    documents = []
    labels = []
    classes = sorted(
        name for name in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, name))
    )

    for class_name in classes:
        class_folder = os.path.join(dataset_dir, class_name)
        for doc_name in sorted(os.listdir(class_folder)):
            doc_path = os.path.join(class_folder, doc_name)
            if not os.path.isfile(doc_path):
                continue
            # Undecodable bytes are dropped instead of raising.
            with open(doc_path, 'r', encoding='utf-8', errors='ignore') as file:
                documents.append(file.read())
            labels.append(class_name)
    return documents, labels, classes

# Step 2: Tokenization and Preprocessing (with stop words and punctuation removal)
def tokenize_and_preprocess(doc):
    """Split ``doc`` on whitespace and return its cleaned, filtered tokens.

    Each token is passed through ``remove_punctuation``; tokens that come out
    empty or that belong to the ``top_300`` stop-word set are discarded.
    Token order and duplicates are preserved.
    """
    cleaned_tokens = (remove_punctuation(raw) for raw in doc.split())
    return [tok for tok in cleaned_tokens if tok and tok not in top_300]

# Step 3: Build the vocabulary
def build_vocabulary(tokenized_docs):
    """Collect the distinct tokens of all documents.

    Returns:
        ``(terms, word_index)`` where ``terms`` is the sorted list of distinct
        tokens and ``word_index`` maps each token to its position in ``terms``.
    """
    # Sorting gives every term a stable column position.
    terms = sorted(set().union(*tokenized_docs))
    word_index = dict(zip(terms, range(len(terms))))
    return terms, word_index

# Step 4: Calculate DF for all terms
def compute_df(tokenized_docs, word_index):
    """Count, for each term, the number of documents that contain it.

    Returns a float NumPy vector with one entry per ``word_index`` entry,
    positioned by the term's index. A token missing from ``word_index``
    raises ``KeyError``.
    """
    doc_freq = np.zeros(len(word_index))
    for tokens in tokenized_docs:
        # A term counts once per document, however often it occurs in it.
        for position in map(word_index.__getitem__, set(tokens)):
            doc_freq[position] += 1
    return doc_freq

# Step 5: Calculate IDF
def compute_idf(df, total_docs):
    """Return the inverse document frequency ``log(total_docs / (1 + df))``.

    The ``1 +`` in the denominator keeps the division defined when a document
    frequency is zero. A term present in every document therefore gets a
    slightly negative value.
    """
    smoothed_df = df + 1
    return np.log(total_docs / smoothed_df)

# Step 6: Compute TF-IDF for each document
def compute_tfidf(tokenized_docs, word_index, idf):
    """Build the dense TF-IDF matrix, one row per document.

    Args:
        tokenized_docs: List of token lists, one per document.
        word_index: Mapping from token to column index.
        idf: Vector of inverse document frequencies, indexed like ``word_index``.

    Returns:
        A float NumPy array of shape ``(len(tokenized_docs), len(word_index))``.
        Term frequency is the raw count divided by the document's token count.
        A document with no tokens yields an all-zero row.

    Raises:
        KeyError: If a token is missing from ``word_index``.
    """
    total_docs = len(tokenized_docs)
    tfidf_matrix = np.zeros((total_docs, len(word_index)))

    for doc_idx, doc_tokens in enumerate(tokenized_docs):
        if not doc_tokens:
            # Nothing survived preprocessing: keep the zero row rather than
            # dividing 0 by 0, which would fill the row with NaN.
            continue
        # Work directly on the matrix row (a view) instead of allocating a
        # temporary vocabulary-sized vector for every document.
        row = tfidf_matrix[doc_idx]
        for word in doc_tokens:
            row[word_index[word]] += 1
        row /= len(doc_tokens)  # Normalise TF by document length
        row *= idf
    return tfidf_matrix

# Step 7: Generate One-vs-All Labels for SVM
def create_one_vs_all_labels(labels, classes):
    """Build one +1/-1 target vector per class.

    Returns a dict mapping each class name to a NumPy array with one entry per
    document: 1 where the document's label equals that class, -1 elsewhere.
    """
    return {
        class_name: np.array([1 if label == class_name else -1 for label in labels])
        for class_name in classes
    }

# Step 8: Main function to create feature matrix and labels
def prepare_data_for_svm(dataset_dir):
    """Run the whole pipeline from a dataset folder to SVM-ready arrays.

    Loads the documents, tokenises them, builds the vocabulary, computes
    DF/IDF and the TF-IDF matrix, and creates one-vs-all label vectors.
    A progress message is printed around each stage.

    Returns:
        ``(tfidf_matrix, label_vectors, classes)`` as produced by
        ``compute_tfidf``, ``create_one_vs_all_labels`` and ``load_documents``.
    """
    # Stage 1: read the raw documents and tokenise them.
    print('loading doc')
    raw_docs, doc_labels, class_names = load_documents(dataset_dir)
    print('loaded doc')
    token_lists = list(map(tokenize_and_preprocess, raw_docs))
    print('done')
    n_docs = len(raw_docs)

    # Stage 2: vocabulary. Only the token -> column mapping is needed below.
    # (This message is emitted before the vocabulary is actually built.)
    print('built vocab')
    _, term_to_column = build_vocabulary(token_lists)
    print('done building vocab')

    # Stage 3: document frequencies and their inverse.
    doc_freq = compute_df(token_lists, term_to_column)
    print('computed df')
    inverse_doc_freq = compute_idf(doc_freq, n_docs)
    print('done with idf')

    # Stage 4: feature matrix.
    features = compute_tfidf(token_lists, term_to_column, inverse_doc_freq)
    print('got tfidf')

    # Stage 5: one +1/-1 target vector per class.
    targets = create_one_vs_all_labels(doc_labels, class_names)

    return features, targets, class_names

# Step 9: Use the function to prepare data
# Folder holding one sub-folder per newsgroup; the path is relative to the
# kernel's working directory, so adjust it if the data lives elsewhere.
dataset_dir = './newsgroup_data/20_newsgroups/'  # Update with your dataset path
tfidf_matrix, label_vectors, classes = prepare_data_for_svm(dataset_dir)

# tfidf_matrix: dense 2D NumPy array, one row per document and one column per
#   vocabulary term (allocated in full by compute_tfidf, so it can be large).
# label_vectors: dict mapping each class name to its +1/-1 label array.

KeyboardInterrupt: 