# Type-Token Analysis without split hyphenations

(Short stories as collections)

### Importing modules

In [1]:
import glob # Used for importing files matching a specific pattern
from pathlib import Path # Used to interact more easily with the file system
import pandas as pd # Used to shape data as tables and import that data into excel
import string # Used to have the ascii list of characters

### Importing and setting up functions for cleaning up and processing passages

In [2]:
# Creating a sorted list of corpus documents and their titles
directory_path = 'resources/doyle_holmes_comb/' # Sets directory path to the combined Holmes corpus
document_pattern = str(Path(directory_path) / '*.txt') # Builds the search pattern with exactly one separator, whether or not directory_path ends in a slash
# glob returns files in an arbitrary, OS-dependent order, so the list is sorted to keep the results in the same (chronological, as file names start with the year) order on every machine
# as_posix() turns any Windows backslashes into forward slashes
document_files = sorted(Path(document).as_posix() for document in glob.glob(document_pattern))
document_titles = [Path(document).stem for document in document_files] # Creates a list of document titles by taking the file list and removing the directory and the trailing '.txt'

In [3]:
# Creating a function to clean the words for each document and setting up a stopwords list
acceptable_characters = list(string.ascii_letters + string.digits + '-') # Creates a list of acceptable characters (lower- and upper-case letters, numbers and the hyphen)

# Opens the comma-separated stopwords file, reads it, and closes the file again once it has been read
with open('resources/stopwords.txt', encoding='utf-8') as stopwords_file:
    raw_stopwords = stopwords_file.read().split(',') # Splits the file's contents into a Python list
# Strips surrounding whitespace/new-lines (a stopword such as ' the' or 'you\n' could never match a word) and drops empty entries
stopwords = [entry.strip() for entry in raw_stopwords if entry.strip()]

def clean_words(word):
    """Return word in lower case with every character not in acceptable_characters removed."""
    kept_characters = [char for char in word if char in acceptable_characters] # Keeps only the hyphens, letters and numbers
    return ''.join(kept_characters).lower() # Glues the kept characters back together and lower-cases the result

In [4]:
# Creating a function to do the type-token analysis for each text
def type_token(filepath, title, stop_words=None):
    """Calculate the type-token statistics for one document.

    Tokens are all whitespace-separated words in the file. Types are the
    distinct words left over once the stopwords have been removed.

    Args:
        filepath: Path of the UTF-8 encoded .txt file to analyse.
        title: Title of the document; its first four characters are used as the year.
        stop_words: Optional collection of words to leave out when counting types.
            When omitted, the notebook-level ``stopwords`` list is used.

    Returns:
        A dictionary with the keys 'text', 'year', 'TTR', 'types' and 'tokens'.
        For a file without any words, 'TTR' is 0.0 (instead of dividing by zero).
    """
    if stop_words is None:
        stop_words = stopwords # Falls back on the stopwords list loaded earlier in the notebook
    ignored_words = set(stop_words) # A set makes each "is this a stopword?" check fast

    with open(filepath, encoding='utf8') as f: # Opens the .txt file as variable f (and closes it afterwards)
        full_passage = f.read().split() # Splits the text into words on any whitespace, including new-lines

    tokens = len(full_passage) # The number of 'tokens' is the number of words in the document, stopwords included

    # NOTE(review): words are compared exactly as they appear in the file (no lower-casing or
    # punctuation stripping). clean_words is defined in this notebook but is not applied here -
    # confirm that the corpus files are already cleaned.
    unique_terms = {word for word in full_passage if word not in ignored_words} # The distinct non-stopword words; a set also copes with a document that has none
    types = len(unique_terms) # The number of 'types' is the number of distinct non-stopword words

    ttr = round(types / tokens, 5) if tokens else 0.0 # Type-token ratio rounded to five decimal places; 0.0 for an empty document
    passage_year = title[0:4] # The document's year is the first four characters of the file's name

    # Returns a dictionary containing the document's title, year, type-token ratio (TTR), number of types, and number of tokens
    return {'text': title, 'year': passage_year, 'TTR': ttr, 'types': types, 'tokens': tokens}

### Processing documents

In [5]:
# Collecting the type-token results for every document in the corpus
type_token_data = [] # Will hold one dictionary of type-token data per document

# Filepaths and titles are in the same order, so they are walked through as pairs
for document_filepath, document_title in zip(document_files, document_titles):
    document_data = type_token(document_filepath, document_title) # Analyses this document
    type_token_data.append(document_data) # Stores its results alongside those of the other documents
    print(document_title) # Shows which document has just been finished

1887_Study_in_Scarlet
1890_Sign_of_the_Four
1892_Adventures_of_Sherlock_Holmes
1894_Memoirs_of_Sherlock_Holmes
1902_Hound_of_the_Baskervilles
1905_Return_of_Sherlock_Holmes
1915_Valley_of_Fear
1917_His_Last_Bow
1927_Case-book_of_Sherlock_Holmes


### Writing this data to Excel

In [6]:
!pip install xlsxwriter # Installs xlsxwriter so it can be used
import xlsxwriter # Used as an engine for writing to Excel .xlsx files

ERROR: Invalid requirement: '#'

[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# Turns the type-token data into a table and saves that table as an Excel file, which we can then use for visualisations
type_token_df = pd.DataFrame(type_token_data) # One row per document, one column per dictionary key

output_filename = 'holmes_type_token_comb.xlsx' # Name of the Excel spreadsheet to create
with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer: # Opens the spreadsheet (it is saved and closed when this block ends) and...
    type_token_df.to_excel(writer, sheet_name='type_token') # Writes the table to a sheet called 'type_token'

#### Update log
- Decided to add a date to each document to facilitate plotting by date (12.12.2023, 14:31)
- Fixed an issue with files not being closed (12.12.2023, 18:06)

#### Notes
- The use of 'passage' rather than 'document' or 'text' in some variable names is a holdover from an earlier version, which I have left as is