In [1]:
import pandas as pd
import os
import random
import pickle

# Filename stems (no extension) listed in each class's CSV index:
train_fiction_fnames = pd.read_csv('./Fiction_Small.csv')['Filename'].tolist()
train_nonfiction_fnames = pd.read_csv('./NonFiction.csv')['Filename'].tolist()

# Non-fiction is downsampled: shuffle reproducibly (fixed seed), then keep
# only the first 4558 entries for our analysis.
random.seed(7)
random.shuffle(train_nonfiction_fnames)
del train_nonfiction_fnames[4558:]

# The files on disk carry a .txt extension:
train_fiction = [stem + '.txt' for stem in train_fiction_fnames]
train_nonfiction = [stem + '.txt' for stem in train_nonfiction_fnames]

# Scanning the dataset directory.
path = '../Dataset/'

# Keep only real sub-directories. os.listdir() returns entries in arbitrary
# order, so stray non-folder entries are filtered out explicitly instead of
# skipping whatever happens to be listed first. Sorting keeps runs repeatable.
folders = [name for name in sorted(os.listdir(path))
           if os.path.isdir(os.path.join(path, name))]

# key_to_txts maps a folder name to the list of filenames inside that folder;
# allFilenamesInDataset is the same filenames flattened into one list.
# Each folder is listed once and the result feeds both structures.
key_to_txts = {}
allFilenamesInDataset = []
for folder_name in folders:
    txts_in_folder = os.listdir(os.path.join(path, folder_name))
    key_to_txts[folder_name] = txts_in_folder
    allFilenamesInDataset.extend(txts_in_folder)
    
    
    
# Mapping each filename to the folder name:

# For Fiction:
fiction_train_FolderNames = []
for fname in train_fiction:
    for folder in key_to_txts.keys():
        if fname in key_to_txts[folder]:
            fiction_train_FolderNames.append(folder)
# print len(fiction_train_FolderNames)

# For Non-Fiction:
non_fiction_train_FolderNames = []
for fname in train_nonfiction:
    for folder in key_to_txts.keys():
        if fname in key_to_txts[folder]:
            non_fiction_train_FolderNames.append(folder)
# print len(non_fiction_train_FolderNames)

print "Fiction comes from: ", set(fiction_train_FolderNames)
print "Non-Fiction comes from: ", set(non_fiction_train_FolderNames)

# fiction/nonfiction is a list of tuples; first element of tuple is the txt filename, second element is its folder name.
fiction = zip(train_fiction, fiction_train_FolderNames)
nonfiction = zip(train_nonfiction, non_fiction_train_FolderNames)

Fiction comes from:  set(['LitAndLang_2', 'SSAndFineArt', 'HistAndGeo', 'MedSciTech', 'Law', 'RelAndPhil', 'LitAndLang_1'])
Non-Fiction comes from:  set(['RelAndPhil', 'SSAndFineArt', 'HistAndGeo', 'MedSciTech', 'Law', 'LitAndLang_2', 'LitAndLang_1'])


In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import defaultdict
from heapq import nlargest
from nltk.corpus import stopwords
from string import punctuation


def init_stopwords():
    '''
    Builds the set of every token treated as a stopword.

    The set combines three sources:
      - the NLTK stopword list of every language available in the corpus,
      - the first column of the English and French name dictionaries
        (header-less CSV files under stopwords_prof),
      - each character of string.punctuation.

    Returns a set containing all of the above.
    '''
    # Language stopwords: ask the corpus reader which word lists it has rather
    # than listing a machine-specific nltk_data directory on disk.
    lang_stopwords = []
    for lang in stopwords.fileids():
        if lang != 'README':
            lang_stopwords.extend(stopwords.words(lang))

    # By Prof. Andrew:
    EnglishNames = pd.read_csv('../Document Classification 4.0/stopwords_prof/Dict_English_NamesPlus.csv', header=None)[0].tolist()
    FrenchNames = pd.read_csv('../Document Classification 4.0/stopwords_prof/Dict_French_NamesPlus.csv', header=None)[0].tolist()

    return set(lang_stopwords + EnglishNames + FrenchNames + list(punctuation))


def calculate_frequencies(sentences_ll, user_stopwords=None):  # sentences_ll is a list of lists
    '''
    Counts how often each word occurs across all sentences.

    sentences_ll   : list of sentences, each sentence being a list of words.
    user_stopwords : optional; accepted so existing callers keep working, but
                     ignored, because this variant applies no stopword
                     filtering at all.

    Words are lower-cased before counting, and only words longer than three
    characters are counted.

    Returns a defaultdict(int) with each word as the key and its count as the
    value.
    '''
    frequency = defaultdict(int)    # default value : 0

    # No stopword set is built here: nothing below consults one, and building
    # it once per document is expensive work that would be thrown away.
    for sentence in sentences_ll:
        for word in sentence:
            word = word.lower()

            # Case I: No stopwords; Just one condition: len > 3
            if len(word) > 3:
                frequency[word] += 1

    return frequency

def get_features(text, n, user_stopwords=None):  # n is the desired no. of features
    '''
    Returns the n most frequent words of a document.

    text           : the document, either as a UTF-8 encoded byte string (as
                     read from a file) or as already-decoded text.
    n              : number of features (words) wanted.
    user_stopwords : optional; passed through to calculate_frequencies.

    Returns a list of at most n words, most frequent first.
    '''
    # Decode only byte strings. Calling .decode() on already-decoded text
    # triggers an implicit ASCII encode first and fails on non-ASCII input.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    # One list of word tokens per sentence.
    sentences_ll = [word_tokenize(sentence) for sentence in sent_tokenize(text)]

    frequency = calculate_frequencies(sentences_ll, user_stopwords)
    return nlargest(n, frequency, key=frequency.get)


def run_and_pickle(n):
    numberOfFeatures = n
    print "Running for", n
    # They are a list of lists where each list represents a document as a collection of n frequent words.
    features_fiction = []
    features_nonfiction = []

    print "Fiction:"
    k = 0
    for (n, folder) in fiction:
        if k % 500 == 0:
            print k
        k += 1
        with open('../Dataset/' + folder + '/' + n) as f:
            text = f.read()
            features_fiction.append(get_features(text, numberOfFeatures))

    print "\n\nNonFiction:"
    k = 0
    for (n, folder) in nonfiction:
        if k % 500 == 0:
            print k
        k += 1
        with open('../Dataset/' + folder + '/' + n) as f:
            text = f.read()
            features_nonfiction.append(get_features(text, numberOfFeatures))
            
    # Pickling the results:
    with open('./features/no_stopwords/fiction_'+str(numberOfFeatures)+'.pickle', 'wb') as f:
        pickle.dump(features_fiction, f)
    with open('./features/no_stopwords/non_fiction_'+str(numberOfFeatures)+'.pickle', 'wb') as f:
        pickle.dump(features_nonfiction, f)
        
# run_and_pickle(50)
# run_and_pickle(100)
# run_and_pickle(500)
# run_and_pickle(1000)
# run_and_pickle(3000)