In [31]:
import os
from os import listdir, path
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
ps = PorterStemmer()   # shared stemmer instance; gather_20newgroups_data uses it to stem every kept token
import re

# Stop-word set: NLTK's English list extended with spelled-out numbers and ordinals.
# NOTE: this rebinds the imported `stopwords` corpus reader to a plain set of strings.
number = [
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
    'hundred', 'thousand',
    '1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th',
]
stopwords = set(stopwords.words('english')).union(number)

def gather_20newgroups_data(path, output_dir="D:\\movedFromC\\123\\20192\\PRJ2\\Project2", encoding=None):
    """Clean the 20 Newsgroups corpus and write it out as one document per line.

    `path` must contain at least two sub-directories. After sorting by name,
    the first is used as the test split and the second as the training split
    (this matches the stock ``20news-bydate-test`` / ``20news-bydate-train``
    layout). Every entry of the training directory is treated as a class; its
    position in the sorted class list is the numeric label.

    Each file is lower-cased, split on non-word characters, filtered against
    the module-level `stopwords` set, stemmed with the module-level `ps`
    stemmer and stored as ``<label><fff><filename><fff><content>``.

    Args:
        path: Directory holding the train and test folders (a trailing
            separator is accepted but no longer required).
        output_dir: Directory that receives ``processedTrain.txt``,
            ``processedTest.txt`` and ``processedFull.txt``. Defaults to the
            location that used to be hard-coded.
        encoding: Encoding used to read the raw documents. ``None`` keeps the
            platform default. The output files are always written with the
            platform default so cells that read them back without an explicit
            encoding stay consistent.

    Raises:
        IndexError: If `path` contains fewer than two sub-directories.
    """
    # NOTE: the `path` parameter shadows the `path` name imported from `os`,
    # so path helpers are always reached through `os.path` in here.
    # os.listdir returns entries in arbitrary order; sort so that picking the
    # splits by position is deterministic.
    split_dirs = sorted(
        os.path.join(path, dir_name)
        for dir_name in os.listdir(path)
        if os.path.isdir(os.path.join(path, dir_name))
    )
    test_dir, train_dir = split_dirs[0], split_dirs[1]
    list_newgroups = sorted(os.listdir(train_dir))   # class names; index == label

    def collect_data_from(parent_dir, newgroups_list):
        """Return one ``label<fff>filename<fff>content`` string per document."""
        data = []
        for label, class_name in enumerate(newgroups_list):
            dir_path = os.path.join(parent_dir, class_name)
            files = sorted(
                (filename, os.path.join(dir_path, filename))
                for filename in os.listdir(dir_path)
                if os.path.isfile(os.path.join(dir_path, filename))
            )
            for filename, filepath in files:
                with open(filepath, encoding=encoding) as f:
                    text = f.read().lower()
                # re.split yields '' at the text boundaries; drop those so no
                # empty tokens are stemmed and emitted.
                words = [
                    ps.stem(word)
                    for word in re.split(r'\W+', text)
                    if word and word not in stopwords
                ]
                content = ' '.join(words)
                # The document must fit on one line of the output file
                # (an empty document is allowed).
                assert len(content.splitlines()) <= 1
                data.append(str(label) + '<fff>' + filename + '<fff>' + content)
        return data

    # Cleaned train and test splits, plus their concatenation.
    train_data = collect_data_from(train_dir, list_newgroups)
    test_data = collect_data_from(test_dir, list_newgroups)
    full_data = train_data + test_data

    outputs = (
        ("processedTrain.txt", train_data),
        ("processedTest.txt", test_data),
        ("processedFull.txt", full_data),
    )
    for out_name, documents in outputs:
        with open(os.path.join(output_dir, out_name), 'w') as f:
            f.write('\n'.join(documents))

In [140]:
from collections import defaultdict 
import numpy as np

def generate_vocab(data_path, df, output_path="D:\\movedFromC\\123\\20192\\PRJ2\\Project2\\words_idfs.txt"):
    """Build the vocabulary with IDF weights and write it to `output_path`.

    Args:
        data_path: File with one document per line; the text is the last
            ``<fff>``-separated field of each line.
        df: Document-frequency cutoff. Only words that occur in strictly more
            than `df` documents are kept; purely numeric words are dropped.
        output_path: Destination file, one ``word<fff>idf`` entry per line.
            Defaults to the location that used to be hard-coded.

    The entries are sorted by decreasing IDF. Ties are broken alphabetically
    so that the line order, which later serves as the word ID, does not depend
    on set iteration order (and therefore on string hash randomisation).
    """
    def compute_idf(doc_freq, corpus_size):
        """IDF (base-10 log) of a word that appears in `doc_freq` documents."""
        assert doc_freq > 0
        return np.log10(corpus_size * 1. / doc_freq)

    with open(data_path) as f:
        lines = f.read().splitlines()
    corpus_size = len(lines)    # number of documents

    doc_count = defaultdict(int)   # word -> number of documents containing it
    for line in lines:             # each line is one document
        text = line.split('<fff>')[-1]
        for word in set(text.split()):   # count each word once per document
            doc_count[word] += 1

    words_idfs = [
        (word, compute_idf(count, corpus_size))
        for word, count in doc_count.items()
        if count > df and not word.isdigit()
    ]
    words_idfs.sort(key=lambda pair: (-pair[1], pair[0]))
    print("Vocal size: {}".format(len(words_idfs)))
    with open(output_path, 'w') as f:
        f.write('\n'.join([word + '<fff>' + str(idf) for word, idf in words_idfs]))

In [148]:
def get_tf_idf(data_path,
               vocab_path="D:\\movedFromC\\123\\20192\\PRJ2\\Project2\\words_idfs.txt",
               output_path="D:\\movedFromC\\123\\20192\\PRJ2\\Project2\\words_tfidfs.txt"):
    """Turn every document into an L2-normalised sparse TF-IDF vector.

    Args:
        data_path: File with one ``label<fff>doc_id<fff>text`` document per
            line; `label` and `doc_id` must be integers.
        vocab_path: File with one ``word<fff>idf`` entry per line. The line
            index of a word is its feature ID. Defaults to the location that
            used to be hard-coded.
        output_path: Destination file; each line is
            ``label<fff>doc_id<fff>id:value id:value ...`` with the features
            sorted by ID. Defaults to the location that used to be hard-coded.

    Raises:
        AssertionError: If a document contains no vocabulary word.
    """
    # Load the vocabulary: line order defines the feature IDs.
    with open(vocab_path, 'r') as f:
        words_idfs = []
        for line in f.read().splitlines():
            fields = line.split('<fff>')
            words_idfs.append((fields[0], float(fields[1])))

    word_IDs = {word: idx for idx, (word, _) in enumerate(words_idfs)}
    idfs = dict(words_idfs)

    with open(data_path) as f:
        docs = []
        for line in f.read().splitlines():
            fields = line.split('<fff>')   # split once instead of once per field
            docs.append((int(fields[0]), int(fields[1]), fields[2]))

    data_tf_idf = []
    for label, doc_id, text in docs:
        # Count in-vocabulary terms in a single pass (the previous
        # list.count() per distinct word was quadratic in document length).
        term_counts = {}
        for word in text.split():
            if word in idfs:
                term_counts[word] = term_counts.get(word, 0) + 1
        assert len(term_counts) > 0, \
            "document {} contains no vocabulary words".format(doc_id)
        max_term_freq = max(term_counts.values())

        # Sort by feature ID so the output does not depend on set/hash order.
        words_tfidfs = sorted(
            (word_IDs[word], count * 1. / max_term_freq * idfs[word])
            for word, count in term_counts.items()
        )
        norm = np.sqrt(sum(value ** 2 for _, value in words_tfidfs))
        if norm == 0:
            # Every matched word has IDF 0; keep the zeros instead of writing nan.
            norm = 1.0

        words_tfidfs_normalized = [
            str(idx) + ':' + str(value / norm) for idx, value in words_tfidfs
        ]
        data_tf_idf.append((label, doc_id, ' '.join(words_tfidfs_normalized)))

    with open(output_path, 'w') as f:
        f.write('\n'.join([
            str(label) + '<fff>' + str(doc_id) + '<fff>' + value
            for label, doc_id, value in data_tf_idf
        ]))

In [32]:
# Preprocess the raw 20news-bydate corpus; this writes the processedTrain / processedTest /
# processedFull text files (one cleaned document per line).
gather_20newgroups_data("D:\\movedFromC\\123\\20192\\PRJ2\\20news-bydate\\")

In [149]:
# Build the vocabulary/IDF file from the processed training set. The second argument is the
# document-frequency cutoff: only words found in more than 10 documents are kept.
generate_vocab("D:\\movedFromC\\123\\20192\\PRJ2\\Project2\\processedTrain.txt", 10)

Vocal size: 10272


In [150]:
# Compute normalized TF-IDF vectors for the processed training documents and write them
# to words_tfidfs.txt.
get_tf_idf("D:\\movedFromC\\123\\20192\\PRJ2\\Project2\\processedTrain.txt")