In [32]:
import os
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from statistics import mean 

def read_text(src = None, filetype = 'txt'):
    """
    this function reads all txt files under specified dir
    and does some corresponding feature engineering (see readme for an exhaustive list) 
    output is a list of dicts, with each dict containing relevant features
    """
    
    def word_count(str):
        """
        turns a string into counts, return a dict
        """
        counts = dict()
        words = str.split()
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        return counts

    wd = os.getcwd() if src != None else src
    files = [f for f in os.listdir(wd) if f.endswith(filetype)]
    extracted = [{} for f in files]

    for i in range(len(files)):
        
        f = files[i]
        
        with open(f) as t:
            
            text = t.read()
            d = word_count(text)
            tokens = sent_tokenize(text)
            special_char = [c for c in text if not c.isalpha() and not c.isdigit() and c != '\n' and c!=' ']
            
            # extracted[i]['file_name'] = f
            extracted[i]['total word counts'] = sum(d.values())
            extracted[i]['unique word counts'] = len(d.values())
            extracted[i]['number of sentences'] = len(tokens)
            extracted[i]['average length of sentences (characters)'] = sum(map(len, tokens))/len(tokens)
            # extracted[i]['average length of sentences (words)'] = mean([len(s.split()) for s in tokens])
            extracted[i]['special characters'] = len(special_char)
    
    return files, extracted


import time
t = time.time()
f, result = read_text()
print(f"Files processed: {f}", *result, sep='\n\n')
print(f"\n===== {len(result)} files processed, total time taken: {time.time()-t}s =====")

Files processed: ['test01.txt', 'test02.txt', 'test.txt']

{'file_name': 'test01.txt', 'total word counts': 42, 'unique word counts': 35, 'number of sentences': 2, 'average length of sentences (characters)': 139.5, 'average length of sentences (words)': 21, 'special characters': 3}

{'file_name': 'test02.txt', 'total word counts': 41, 'unique word counts': 33, 'number of sentences': 3, 'average length of sentences (characters)': 92.33333333333333, 'average length of sentences (words)': 13.666666666666666, 'special characters': 4}

{'file_name': 'test.txt', 'total word counts': 11, 'unique word counts': 8, 'number of sentences': 1, 'average length of sentences (characters)': 38.0, 'average length of sentences (words)': 11, 'special characters': 2}

===== 3 files processed, total time taken: 0.0011248588562011719s =====
