In [1]:
#with this script, we create a csv-file containing the names of all relevant books by Virginia Woolf and the amount of all words as well as the amount of distinct words and a calculated value representing a creativity-score, meaning the amount of distinct words per 1000 words for each book

import os
import nltk
import string
import pandas as pd

In [2]:
#define path where all Woolf-files are stored
path_of_folder = "C:\\Users\\Jakob\\Desktop\\Werkzeuge Hausarbeit\\Virginia_Woolf_Files\\Bearbeitete_Downloads"

#check folder to get an overview over the files
os.listdir(path_of_folder)

['1915 The Voyage Out.txt',
 '1920 Night and Day.txt',
 '1921 Monday or Tuesday.txt',
 "1922 Jacob's Room.txt",
 '1925 Mrs Dalloway.txt',
 '1927 To the Lighthouse.txt',
 '1928 Orlando.txt',
 '1931 The Waves.txt',
 '1933 Flush.txt',
 '1937 The Years.txt',
 '1941 Between the Acts.txt']

In [3]:
#create an iterator for all files in the folder
file_iter = os.scandir(path_of_folder)

In [4]:
#use the iterator to open every file. Extract the name of the book from the file name
#extract the complete text from the file by iterating over every line and appending it to the string "text" if it does not start with "chapter" and - as it is the case in "To the Lighthouse", where chapters are given numbers - if it does not contain a number only
#save each book name and its text in a dictionary

all_texts = {}

for f in file_iter:
    myfile = open(path_of_folder + "\\" + f.name, 'r', encoding='utf-8')
    f_name = f.name[5:-4]
    text = ""
    
    for line in myfile.readlines():
        if (line.lower().startswith("chapter")) or (line.strip().isdigit()):
            continue
        else:
            text += line
    #replace \n-annotation in the string "text" with normal whitespace      
    text = text.replace('\n', ' ')
    all_texts[f_name] = text
    myfile.close()
     

In [5]:
#iterate over all texts, tokenize it with the nltk, and create a list of all words of every book. Save the book name and a list of its words to a new dictionary

words_all_texts = {}

for key in all_texts:
    sent_list = nltk.sent_tokenize(all_texts[key])
    words_list = []
    for sent in sent_list:
        words_list_per_sent = nltk.word_tokenize(sent)
        words_list.extend(words_list_per_sent)
    words_all_texts[key]=words_list
    

In [6]:
#in the following cells, we will count the amount of distinct words as well as the total amount of words to get some sort of creativity-score for each book
#before that, however, we lowercase each text and remove all punctuation marks as both English capitalization and punctuation do not testify to a person's literary creativity

for key in words_all_texts:
    new_word_list = []
    for word in words_all_texts[key]:
        if word in string.punctuation:
            continue
        else:
            new_word_list.append(word.lower())
    words_all_texts[key]=new_word_list

In [7]:
#iterate over the new dictionary. Count how many distinct words every book has by trying to add them to a set; which will only accept new/unique items. Save the book name and the length of the created set to a new list

count_words_distinct = []

for key in words_all_texts:
    word_set = set()
    for word in words_all_texts[key]:
        word_set.add(word)
    count_words_distinct.append(len(word_set))

In [8]:
#iterate over that dictionary again. Count how many words there are in total

count_words_total = []

for key in words_all_texts:
    word_amount = len(words_all_texts[key])
    count_words_total.append(word_amount)

In [9]:
#now calculate the amount of new words per 1000 words in total. This will be our creativity-score

distinct_words_per_1000_words = []

for i in range(0,len(count_words_distinct)):
    new_words_per_word = count_words_distinct[i]/count_words_total[i]
    distinct_words_per_1000_words.append(new_words_per_word*1000)
    

In [10]:
#create a DataFrame out of the results of the last operations

df_word_count = pd.DataFrame(columns = ['name', 'count_words_total', 'count_words_distinct', 'distinct_words_per_1000_words'])

index = 0
for key in words_all_texts:
    df_word_count = df_word_count.append({'name':key, 'count_words_total':count_words_total[index], 'count_words_distinct':count_words_distinct[index], 'distinct_words_per_1000_words': distinct_words_per_1000_words[index]}, ignore_index = True)
    index += 1

In [11]:
#check if DataFrame-setup worked
df_word_count

Unnamed: 0,name,count_words_total,count_words_distinct,distinct_words_per_1000_words
0,The Voyage Out,146530,10929,74.585409
1,Night and Day,177794,11734,65.997728
2,Monday or Tuesday,19637,3796,193.30855
3,Jacob's Room,57977,7450,128.499232
4,Mrs Dalloway,65816,7150,108.636198
5,To the Lighthouse,71214,6813,95.669391
6,Orlando,80442,9452,117.500808
7,The Waves,78626,8924,113.499351
8,Flush,35311,5293,149.896633
9,The Years,138161,8568,62.014606


In [12]:
#export the DataFrame to a csv-file to be able to work with it later
df_word_count.to_csv(r'C:\\Users\\Jakob\\Desktop\\Werkzeuge Hausarbeit\\Ergebnisse\\word_count.csv', index=False)