In [1]:
'''
Now, getting all filenames from Fiction_Big for calculating feature-vectors.
'''
import pandas as pd
fiction_big = pd.read_csv('./Fiction_Big.csv')
fn_fiction_big = fiction_big['Filename'].tolist()
print len(fn_fiction_big)
fn_fiction_big = [f + '.txt' for f in fn_fiction_big]
print len(fn_fiction_big)
print fn_fiction_big[:5]

15036
15036
['0000200100.xml.txt', '0000200200.xml.txt', '0000200400.xml.txt', '0000200500.xml.txt', '0000200601.xml.txt']


In [2]:
'''
Checking if all 15,036 files are present in my dataset.
'''
import os

# Adding all filenames in the dataset to one list:
path = '../Dataset/'
allFilenamesInDataset = []
folders = os.listdir(path)[1:]
for folder_name in folders:
    allFilenamesInDataset.extend(os.listdir(path+folder_name))
print "There are " + str(len(allFilenamesInDataset)) + " files in our dataset."

for fn in fn_fiction_big:
    if fn not in allFilenamesInDataset:
        print fn
print "Done."

There are 154924 files in our dataset.
Done.


In [3]:
'''
Checking the intersection of Fiction_Small and Fiction_Big:
'''
fnames_fiction_small = pd.read_csv('./Fiction_Small.csv')['Filename'].tolist()
small_and_big_intersection = fiction_big.loc[fiction_big['Filename'].isin(fnames_fiction_small)]
intersection_fnames = small_and_big_intersection['Filename'].tolist()
print len(fnames_fiction_small)
# There are 3893 files in ans; Hence, some of the fiction_small is not in fiction_big. Finding out which ones:
notInBig = set(fnames_fiction_small) - set(intersection_fnames)
print len(notInBig)
print len(notInBig) + len(intersection_fnames)
# print notInBig

4558
665
4558


#### There are 665 documents in Fiction_Small that are not in Fiction_Big; they are collected in the set `notInBig`.

In [4]:
'''
Mapping each filename to its folder.
'''
# Generating a dictionary key to map filenames to folder names- Key: GenRef; Value: list of all filenames in GenRef.
key_to_txts = {}
for folder_name in folders:
    temp = os.listdir(path+folder_name)
    key_to_txts[folder_name] = temp

# Mapping each filename to the folder name:
fiction_FolderNames = []
for fname in fn_fiction_big:
    for folder in key_to_txts.keys():
        if fname in key_to_txts[folder]:
            fiction_FolderNames.append(folder)

print "They come from: ", set(fiction_FolderNames)

# fiction is a list of tuples; first element of tuple is the txt filename, second element is its folder name.
fiction = zip(fn_fiction_big, fiction_FolderNames)
print fiction[:5]

# Pickling the list 'fiction' because it matters for the ordering:
import pickle

with open('./order_fiction.pickle', 'wb') as f:
    pickle.dump(fiction, f)

In [None]:
'''
Calculating feature-vectors for each filename in the test set (Fiction_Big).
'''

# NOTE: Calculating feature-vectors assuming Model 2.

from nltk.tokenize import word_tokenize, sent_tokenize
from collections import defaultdict
from heapq import nlargest
from nltk.corpus import stopwords
from string import punctuation
import unicodecsv as csv

def calculate_frequencies(sentences_ll):  # sentences_ll is a list of lists
    '''
    Count how often each sufficiently long word occurs.

    sentences_ll: a list of sentences, each sentence being a list of words.
    Every word is lower-cased before counting, and only words longer than
    3 characters are counted. No stop-word filtering is applied.

    Returns a defaultdict(int) with each word as the key and its count as
    the value.
    '''
    counts = defaultdict(int)    # unseen words start at 0

    # Flatten the nested sentence/word structure into one lower-cased stream.
    lowered_tokens = (token.lower() for tokens in sentences_ll for token in tokens)
    for token in lowered_tokens:
        # Case I: No stopwords; Just one condition: len > 3
        if len(token) <= 3:
            continue
        counts[token] += 1

    return counts

def get_features(text, n):  # n is the desired no. of features
    '''
    Extract the n most frequent words of a document.

    text: the document, either as UTF-8 encoded bytes (what reading a file
          yields under Python 2) or as already-decoded text.
    n:    the desired number of features.

    The text is split into sentences, each sentence into words, and the
    words are counted by calculate_frequencies (lower-cased, length > 3).

    Returns a list of at most n words, most frequent first.
    '''
    # Only decode raw bytes. Calling .decode() on text that is already
    # decoded makes Python 2 first encode it as ASCII, which raises for any
    # non-ASCII character.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    # One list of word tokens per sentence.
    sentences_ll = [word_tokenize(s) for s in sent_tokenize(text)]

    frequency = calculate_frequencies(sentences_ll)
    return nlargest(n, frequency, key=frequency.get)


def run_and_pickle(nf):
    numberOfFeatures = nf
    print "Running for", nf
    # They are a list of lists where each list represents a document as a collection of n frequent words.
    features_fiction = []

    print "Fiction:"
    k = 0
    for (n, folder) in fiction:
        if k % 500 == 0:
            print k
        k += 1
        with open('../Dataset/' + folder + '/' + n) as f:
            text = f.read()
            features_fiction.append(get_features(text, numberOfFeatures))

    # Pickling the results:
    with open('./features/test_big_fiction_'+str(numberOfFeatures)+'.pickle', 'wb') as f:
        pickle.dump(features_fiction, f)
    

    # CSV-ing the results:
    with open('./features/testbigfiction_'+str(numberOfFeatures)+'.csv', 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        writer.writerows(features_fiction)

    return features_fiction
        
# Compute the 500-word feature lists for every document in `fiction`, write
# them to ./features/ (pickle + CSV) and keep the returned list in memory.
features_fiction_big = run_and_pickle(500)

In [4]:
'''
Calculating CountVec feature vectors for test-fiction-big.
'''

with open('./features/test_big_fiction_500.pickle', 'rb') as f:
    features_fiction_big = pickle.load(f)

# Getting it ready for Count Vectorizer:
countvec_500 = []

for doc in features_fiction_big:
    temp = ' '.join(doc)
    countvec_500.append(temp)
    
Xtest_500 = vectorizer500.transform(countvec_500)

print Xtest_500.shape

(15036, 154749)
