# Type-Token Analysis without split hyphenations

(Separated short stories)

### Importing modules

In [16]:
import glob # Used for importing files matching a specific pattern
from pathlib import Path # Used to interact more easily with the file system
import pandas as pd # Used to shape data as tables and import that data into Excel
import string # Used to have the ascii list of characters

### Listing the corpus files and setting up functions for cleaning and processing passages

In [17]:
# Creating a list of corpus documents and their titles
directory_path = 'resources/doyle_holmes_sep/' # Sets directory path to the separated Holmes corpus
# glob returns matches in arbitrary, OS-dependent order, so the list is sorted to keep the processing order
# (and therefore the row order of the results) the same on every run and every machine.
# Path(...).as_posix() renders each path with forward slashes, including on Windows where glob yields backslashes.
document_files = sorted(
    Path(found_file).as_posix()
    for found_file in glob.glob(str(Path(directory_path) / '*.txt')) # Every .txt file (document) directly inside the directory
)
document_titles = [Path(document).stem for document in document_files] # Creates a list of document titles by taking the file list and removing the directory and the trailing '.txt'

In [18]:
# Setting up the characters that clean_words is allowed to keep in a word
acceptable_characters = [
    *string.ascii_lowercase, # a-z
    *string.ascii_uppercase, # A-Z
    *string.digits, # 0-9
    '-', # Hyphens are kept as well
]

# Reads the comma-separated stopwords file; the with-block guarantees the file is closed again afterwards
with open('resources/stopwords.txt', encoding='utf-8') as stopwords_file:
    stopwords = stopwords_file.read()
# Splits the text into a Python list, trimming whitespace/newlines around each entry and dropping empty entries
# (an entry with surrounding whitespace could never equal a word produced by str.split())
stopwords = [entry.strip() for entry in stopwords.split(",") if entry.strip()]

def clean_words(word):
    """Return `word` in lowercase with every character outside `acceptable_characters` removed.

    Args:
        word: The string to clean.

    Returns:
        The lowercased string made of the characters of `word` that appear in
        the module-level `acceptable_characters` list, in their original order.
    """
    kept_characters = [symbol for symbol in word if symbol in acceptable_characters] # Keeps only hyphens, letters and numbers
    return ''.join(kept_characters).lower() # Glues the kept characters back together and lowercases the result

In [19]:
# Creating a function to do the type-token analysis for each text
def type_token(filepath, title):
    """Compute the type-token statistics for a single document.

    Args:
        filepath: Path of the UTF-8 .txt file to analyse.
        title: Document title; characters 0-3 are read as the year and
            characters 5-6 as the two-digit month (e.g. '1892_02_Speckled_Band').

    Returns:
        A dictionary with the keys 'text' (the title), 'date' (a string: the year
        followed by a decimal fraction for the month), 'TTR' (types divided by
        tokens, rounded to five decimal places; 0.0 for a document with no words),
        'types' (number of distinct non-stopword words) and 'tokens' (number of
        words, stopwords included).

    Raises:
        ValueError: If characters 5-6 of `title` are not an integer.
    """
    with open(f'{filepath}', encoding='utf8') as f: # Opens the .txt file as variable f
        full_passage = f.read().lower() # Reads the .txt file into a string and then closes the file
    # str.split() with no argument splits on any run of whitespace, so new-lines need no separate handling
    cleaned_words = [clean_words(word) for word in full_passage.split()] # Keeps the cleaned version of every word (punctuation other than hyphens removed)
    full_passage = [word for word in cleaned_words if word] # Drops entries that consisted only of punctuation and are now empty
    tokens = len(full_passage) # The number of words in the document - this is the number of 'tokens'

    stopword_lookup = set(stopwords) # A set makes each "is this a stopword?" check fast
    passage = [word for word in full_passage if word not in stopword_lookup] # The document's words with the stopwords removed
    types = len(set(passage)) # The number of distinct non-stopword words - this is the number of 'types'
    ttr = round(types / tokens, 5) if tokens else 0.0 # Type-token ratio rounded to five decimal places; an empty document gets 0.0 instead of a division error

    passage_date = title[0:4] # The document's year (the first four characters of the file's name)
    passage_month_n = int(title[5:7]) - 1 # The document's month as a number counted from zero; int() copes with a leading zero such as '02'

    # The following maths works by estimating one month into the year being 0.08333, so, if the year is 2023, January is 2023.0 (so the month needs 1 subtracted), February is 2023.08333 and so on; the purpose of all this is to make visualisations in Excel easier
    passage_month_calc = round(passage_month_n * 0.08333, 5) # Multiplies the month by 0.08333
    passage_date = passage_date + str(passage_month_calc)[1:] # Drops the leading zero of the fraction (e.g. '0.08333' -> '.08333') and adds it onto the end of the year string
    document_type_token = {'text': title, 'date': passage_date, 'TTR': ttr, 'types': types,'tokens': tokens} # Creates a dictionary containing the document's title, date, type-token ratio (TTR), number of types and number of tokens
    return document_type_token # Returns this dictionary to be used by the wider loop

### Processing documents

In [20]:
# Applying type_token to every document in the corpus
type_token_data = [] # Starts empty each time the cell runs; receives one result dictionary per document

corpus_pairs = zip(document_files, document_titles) # Pairs every filepath with its document title
for document_filepath, document_title in corpus_pairs:
    document_result = type_token(document_filepath, document_title) # The type-token data for this document
    type_token_data.append(document_result) # Stores it alongside the results of the other documents
    print(document_title) # Shows the title once the document has been processed

1887_12_Study_in_Scarlet
1890_02_Sign_of_the_Four
1891_06_Scandal_in_Bohemia
1891_08_Red-Headed_League
1891_09_Case_of_Identity
1891_10_Boscome_Valley_Mystery
1891_11_Five_Orange_Pips
1891_12_Man_with_the_Twisted_Lip
1892_01_Blue_Carbuncle
1892_02_Speckled_Band
1892_03_Engineers_Thumb
1892_04_Noble_Bachelor
1892_05_Beryl_Coronet
1892_06_Copper_Beeches
1892_12_Silver_Blaze
1893_01_Cardboard_Box
1893_02_Yellow_Face
1893_03_Stockbrokers_Clerk
1893_04_Gloria_Scott
1893_05_Musgrave_Ritual
1893_06_Reigate_Squire
1893_07_Crooked_Man
1893_08_Resident_Patient
1893_09_Greek_Interpreter
1893_11_Naval_Treaty
1893_12_The_Final_Problem
1902_03_Hound_of_the_Baskervilles
1903_10_Empty_House
1903_11_Norwood_Builder
1903_12_Dancing_Men
1904_01_Solitary_Cyclist
1904_02_Priory_School
1904_03_Black_Peter
1904_04_Charles_Augustus_Milverton
1904_05_Six_Napoleons
1904_06_Three_Students
1904_07_Golden_Pince-Nez
1904_08_Missing_Three-Quarter
1904_09_Abbey_Grange
1904_12_Second_Stain
1908_08_Wisteria_Lodge
1908_

### Writing this data to Excel

In [21]:
!pip install xlsxwriter 
# Installs xlsxwriter so it can be used
import xlsxwriter # Used as an engine for writing to Excel .xlsx files




[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
# Puts the type-token data into a Pandas dataframe and then writes this to an Excel file, which we can then use for visualisations
type_token_df = pd.DataFrame(type_token_data) # Makes the list of result dictionaries into a Pandas dataframe (a table, essentially)

with pd.ExcelWriter('holmes_type_token_sep.xlsx', engine='xlsxwriter') as writer: # Opens an Excel spreadsheet with the above name and...
    type_token_df.to_excel(writer, sheet_name='type_token') # Writes the type-token dataframe to a sheet called 'type_token'

#### Update log
- Decided to add a date to each document to facilitate plotting by date (12.12.2023, 14:31)
- Fixed an issue with files not being closed (12.12.2023, 18:06)
- Managed to get the date in a format that should cooperate with Excel (12.12.2023, 20:01)
- Realised I had not run clean_words and thus hadn't made words lowercase and fixed this. (18.12.2023, 10:54)

#### Notes
- 'Passages' rather than 'documents' or 'texts' in variable names is a holdover from an earlier version, which I have allowed to remain as is