# Natural Language Processing for parts-of-speech with spaCy

### Importing modules

In [11]:
# Installing spaCy (-U upgrades an existing installation to the newest available release)
# NOTE(review): '!pip' runs pip in a subshell, which may not be the environment this kernel uses - consider '%pip install' with a pinned version; confirm against your setup
!pip install -U spacy

# Downloading spaCy's English-language model (the 'en_core_web_sm' package that spacy.load() is given in the imports cell)
!python -m spacy download en_core_web_sm

^C



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 544.7 kB/s eta 0:00:24
      --------------------------------------- 0.3/12.8 MB 1.7 MB/s eta 0:00:08
     - -------------------------------------- 0.6/12.8 MB 3.2 MB/s eta 0:00:04
     --- ------------------------------------ 1.1/12.8 MB 4.7 MB/s eta 0:00:03
     ----- ---------------------------------- 1.6/12.8 MB 6.0 MB/s eta 0:00:02
     ------ --------------------------------- 2.2/12.8 MB 7.0 MB/s eta 0:00:02
     --------- ------------------------------ 3.1/12.8 MB 8.5 MB/s eta 0:00:02
     ---------- ----------------------------- 3.5/12.8 MB 8.9 MB/s eta 0:00:02
     ------------- -------------------------- 4


[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import spacy # Used for natural language processing
from spacy import displacy
from collections import Counter # Used for counting the most/least common words/bigrams
import pandas as pd # Used to shape data as tables and import that data into Excel
import glob # Used for importing files matching a specific pattern
from pathlib import Path # Used to interact more easily with the file system

# Loads spaCy's English-language model once and binds it to the global name 'nlp'; top_speech_parts calls nlp(...) on each document's text
nlp = spacy.load('en_core_web_sm') # The model name matches the package fetched by the 'spacy download' command in the install cell

### Developing functions for getting parts-of-speech data and bigram data

In [86]:
# Creating a function for working out the most common adjectives, nouns, pronouns and verbs in each text
def top_speech_parts(filepath):
    """Tally the adjectives, nouns, pronouns and verbs found in one text file.

    The file is read as UTF-8, converted to lowercase and then run through the
    module-level spaCy pipeline ``nlp``.

    Args:
        filepath: Path of the .txt file (document) to analyse.

    Returns:
        A 5-tuple ``(adjs_tally, nouns_tally, pronouns_tally, verbs_tally, tokens)``.
        Each tally is a list of ``(word, count)`` tuples ordered from most to
        least common; ``tokens`` is whatever ``make_tokens`` returns for the
        parsed document.
    """
    with open(filepath, encoding="utf-8") as text_file: # The file is closed again as soon as its text has been read
        lowercased_text = text_file.read().lower() # Lowercasing first means 'The' and 'the' are counted as the same word
    # NOTE(review): the tagger sees lowercased text here, which may affect its part-of-speech decisions - confirm this is acceptable
    document = nlp(lowercased_text)

    tracked_tags = ('ADJ', 'NOUN', 'PRON', 'VERB') # The order here fixes the order of the returned tallies
    words_by_tag = {tag: [] for tag in tracked_tags} # One list of words per part of speech we care about

    for token in document: # For every word in the document...
        matching_words = words_by_tag.get(token.pos_) # Look up the list for its part of speech (None if we aren't tracking it)
        if matching_words is not None:
            matching_words.append(token.text) # Record the word against its part of speech

    # Counter counts each word and most_common() orders the (word, count) tuples from most to least common
    tallies = [Counter(words_by_tag[tag]).most_common() for tag in tracked_tags]
    tokens = make_tokens(document) # Stores what the make_tokens function (defined in the next cell) returns

    return (*tallies, tokens) # Adjectives, nouns, pronouns, verbs, then the list of tokens and their parts of speech

In [80]:
# Creating a function for splitting a document into a list of words and their parts-of-speech tags
def make_tokens(doc): # Takes in an already parsed document
    """Pair every alphabetic token in ``doc`` with its part-of-speech tag.

    Args:
        doc: An iterable of token objects exposing ``is_alpha``, ``text`` and
            ``pos_`` (in this notebook, a parsed spaCy document).

    Returns:
        A list of ``(text, pos_)`` tuples in document order. Tokens whose
        ``is_alpha`` is false (punctuation, numbers and so on) are left out.
    """
    # Keep each token and its part-of-speech tag, skipping anything that isn't purely alphabetic
    return [(word.text, word.pos_) for word in doc if word.is_alpha]

In [81]:
# Creating a function for getting ngrams (groups of consecutive words)
def get_ngrams(word_list,number_consecutive_words): # Takes in a pre-made list of tokens and their part-of-speech tags, and the desired length of ngram
    """Slice ``word_list`` into every run of ``number_consecutive_words`` neighbouring items.

    Args:
        word_list: The list to slice (here, ``(word, pos_tag)`` tuples).
        number_consecutive_words: How many consecutive items make up one ngram.

    Returns:
        A list of ngrams in order of appearance, each one a slice of
        ``word_list``. The list is empty when ``word_list`` is shorter than
        the requested ngram length.
    """
    window_count = len(word_list) - number_consecutive_words + 1 # Number of positions at which a full-length ngram still fits
    # Take one slice per starting position
    return [word_list[start:start + number_consecutive_words] for start in range(window_count)]

In [82]:
# Creating a function for seeing which words appear close to a searched for word
def get_neighbour_words(keyword,bigrams,pos_label=None): # Takes in a keyword we're looking for, a list of ngrams (bigrams in our case), and optionally the part-of-speech tag we're looking for
    """Count the words that share an ngram with ``keyword``.

    Args:
        keyword: The word to search for; matching is case-insensitive.
        bigrams: An iterable of ngrams, each a sequence of ``(word, pos_tag)``
            tuples (bigrams in this notebook).
        pos_label: The part-of-speech tag a neighbour must carry to be counted.
            ``None`` (the default) counts neighbours of every part of speech;
            the string ``'None'`` is treated the same way for backward
            compatibility.

    Returns:
        A list of ``(neighbour_word, count)`` tuples, with the words in
        lowercase, ordered from most to least common.
    """
    keyword = keyword.lower() # Makes the keyword lowercase - all words are compared in lowercase so case-sensitivity isn't an issue

    # The default used to be the string 'None', which never satisfied the '== None' check, so a search
    # without a part-of-speech filter always came back empty. Both spellings now mean "no filter".
    any_part_of_speech = pos_label is None or pos_label == 'None'

    neighbour_words = [] # Will store the words found next to the searched for keyword
    for bigram in bigrams: # For every ngram in the list of ngrams...
        lowered_pairs = [(word.lower(), label) for word, label in bigram] # Lowercase each word once, keeping its label
        if not any(word == keyword for word, _ in lowered_pairs): # Skip ngrams that don't contain the keyword
            continue
        for word, label in lowered_pairs: # For each word and associated label...
            if word != keyword and (any_part_of_speech or label == pos_label): # If it isn't the keyword and it passes the part-of-speech filter...
                neighbour_words.append(word) # Add it to the list of neighbour words

    return Counter(neighbour_words).most_common() # Returns the neighbour words a) counted and put into tuples and b) ordered from most to least common

In [83]:
# Creating a list of corpus documents and their titles
directory_path = 'resources/doyle_holmes_comb/' # Sets directory path to the combined Holmes corpus
search_pattern = str(Path(directory_path) / '*.txt') # Path joins the folder and the pattern without doubling the slash
# glob returns matches in no particular order, so the list is sorted to process the documents in the same order on every run;
# as_posix() turns Windows backslashes into forward slashes without touching file names on other systems
document_files = sorted(Path(match).as_posix() for match in glob.glob(search_pattern)) # Creates a list with the path of each .txt file (document) in the directory
document_titles = [Path(document).stem for document in document_files] # Creates a list of document titles by taking the file list and removing the folder and the trailing '.txt'

### Processing documents

In [87]:
# Create both output folders up front so that writing the workbooks below does not fail when a folder is missing
for output_folder in ('natural_lang/expanded/parts_of_speech', 'natural_lang/expanded/bigrams'):
    Path(output_folder).mkdir(parents=True, exist_ok=True) # exist_ok=True makes this safe to re-run

for document_filepath, document_title in zip(document_files, document_titles): # For every filepath and its associated document title...
    # Retrieve part-of-speech data
    adjs,nouns,pronouns,verbs,tokens = top_speech_parts(document_filepath) # Apply the top_speech_parts function to the document and store the results

    # Convert part-of-speech data to dataframes (each tally is a list of (word, count) tuples, so it maps onto two columns)
    adjs_df = pd.DataFrame(adjs, columns=['adjective', 'count']) # Convert the adjective data into a Pandas dataframe (a table, essentially)
    nouns_df = pd.DataFrame(nouns, columns=['noun', 'count']) # Convert the noun data into a Pandas dataframe
    pronouns_df = pd.DataFrame(pronouns, columns=['pronoun', 'count']) # Convert the pronoun data into a Pandas dataframe
    verbs_df = pd.DataFrame(verbs, columns=['verb', 'count']) # Convert the verb data into a Pandas dataframe

    # Write these dataframes to an Excel file, one sheet per part of speech
    with pd.ExcelWriter(f'natural_lang/expanded/parts_of_speech/{document_title.lower()}_parts_of_speech.xlsx', engine='xlsxwriter') as writer: # Open an Excel file in the listed folder with the name of the document and '_parts_of_speech' and ...
        adjs_df.to_excel(writer, sheet_name='adjectives') # Writes the adjective data to a sheet called 'adjectives'
        nouns_df.to_excel(writer, sheet_name='nouns') # Writes the noun data to a sheet called 'nouns'
        pronouns_df.to_excel(writer, sheet_name='pronouns') # Writes the pronoun data to a sheet called 'pronouns'
        verbs_df.to_excel(writer, sheet_name='verbs') # Writes the verb data to a sheet called 'verbs'
        # The file is saved and closed when the 'with' block ends

    # Retrieve bigrams for Sherlock Holmes
    ngrams = get_ngrams(tokens,2) # Get bigrams from the list of tokens returned by top_speech_parts
    sherlock_adjs = get_neighbour_words('Sherlock',ngrams,pos_label='ADJ') # Get the ordered list of adjectives used around 'Sherlock' by running get_neighbour_words
    sherlock_verbs = get_neighbour_words('Sherlock',ngrams,pos_label='VERB') # Get the ordered list of verbs used around 'Sherlock' by running get_neighbour_words
    holmes_adjs = get_neighbour_words('Holmes',ngrams,pos_label='ADJ') # Get the ordered list of adjectives used around 'Holmes' by running get_neighbour_words
    holmes_verbs = get_neighbour_words('Holmes',ngrams,pos_label='VERB') # Get the ordered list of verbs used around 'Holmes' by running get_neighbour_words

    # Convert bigram data to dataframes
    sherlock_adjs_df = pd.DataFrame(sherlock_adjs, columns=['adjective', 'count']) # Convert the 'Sherlock' adjectives to a Pandas dataframe
    sherlock_verbs_df = pd.DataFrame(sherlock_verbs, columns=['verb', 'count']) # Convert the 'Sherlock' verbs to a Pandas dataframe
    holmes_adjs_df = pd.DataFrame(holmes_adjs, columns=['adjective', 'count']) # Convert the 'Holmes' adjectives to a Pandas dataframe
    holmes_verbs_df = pd.DataFrame(holmes_verbs, columns=['verb', 'count']) # Convert the 'Holmes' verbs to a Pandas dataframe

    # Write these dataframes to an Excel file, one sheet per keyword and part of speech
    with pd.ExcelWriter(f'natural_lang/expanded/bigrams/{document_title.lower()}_bigrams.xlsx', engine='xlsxwriter') as writer: # This opens an Excel file in the listed folder with the name of the document and '_bigrams' and ...
        sherlock_adjs_df.to_excel(writer, sheet_name='sherlock_adj') # Writes the 'Sherlock' adjectives to a sheet called 'sherlock_adj'
        sherlock_verbs_df.to_excel(writer, sheet_name='sherlock_verb') # Writes the 'Sherlock' verbs to a sheet called 'sherlock_verb'
        holmes_adjs_df.to_excel(writer, sheet_name='holmes_adj') # Writes the 'Holmes' adjectives to a sheet called 'holmes_adj'
        holmes_verbs_df.to_excel(writer, sheet_name='holmes_verb') # Writes the 'Holmes' verbs to a sheet called 'holmes_verb'
        # The file is saved and closed when the 'with' block ends

    print(document_title) # This prints the document title when it's done doing the above to let the user know it has been processed

1887_Study_in_Scarlet
1890_Sign_of_the_Four
1892_Adventures_of_Sherlock_Holmes
1894_Memoirs_of_Sherlock_Holmes
1902_Hound_of_the_Baskervilles
1905_Return_of_Sherlock_Holmes
1915_Valley_of_Fear
1917_His_Last_Bow
1927_Case-book_of_Sherlock_Holmes


#### Update / progress log
- I spent a significant amount of time troubleshooting only to find out I had not included an underscore in .pos_ (13.12.2023, 14:14)
- Decided to take the simplest route and write each document's results to its own Excel files, which I will then combine manually (13.12.2023, 14:32)
- Updated the parts-of-speech section to convert all words to lowercase so I don't get double-counts for the same word (13.12.2023, 20:11)

In [None]:
# Retrieve bigrams for Sherlock Holmes
# NOTE(review): this cell repeats the bigram step of the processing loop. It was indented as if it were still inside that loop,
# which raises an IndentationError in a cell of its own, so it has been dedented. It relies on 'tokens' and 'document_title'
# left over from the loop's final iteration, so it only re-exports the last document - consider deleting it.
ngrams = get_ngrams(tokens,2) # Get bigrams from the list of tokens we made earlier
sherlock_adjs = get_neighbour_words('Sherlock',ngrams,pos_label='ADJ') # Get the ordered list of adjectives used around 'Sherlock' by running get_neighbour_words
sherlock_verbs = get_neighbour_words('Sherlock',ngrams,pos_label='VERB') # Get the ordered list of verbs used around 'Sherlock' by running get_neighbour_words
holmes_adjs = get_neighbour_words('Holmes',ngrams,pos_label='ADJ') # Get the ordered list of adjectives used around 'Holmes' by running get_neighbour_words
holmes_verbs = get_neighbour_words('Holmes',ngrams,pos_label='VERB') # Get the ordered list of verbs used around 'Holmes' by running get_neighbour_words

# Convert bigram data to dataframes
sherlock_adjs_df = pd.DataFrame(sherlock_adjs, columns=['adjective', 'count']) # Convert the 'Sherlock' adjectives to a Pandas dataframe
sherlock_verbs_df = pd.DataFrame(sherlock_verbs, columns=['verb', 'count']) # Convert the 'Sherlock' verbs to a Pandas dataframe
holmes_adjs_df = pd.DataFrame(holmes_adjs, columns=['adjective', 'count']) # Convert the 'Holmes' adjectives to a Pandas dataframe
holmes_verbs_df = pd.DataFrame(holmes_verbs, columns=['verb', 'count']) # Convert the 'Holmes' verbs to a Pandas dataframe

# Write these dataframes to an Excel file
with pd.ExcelWriter(f'natural_lang/expanded/bigrams/{document_title.lower()}_bigrams.xlsx', engine='xlsxwriter') as writer: # This opens an Excel file in the listed folder with the name of the document and '_bigrams' and ...
    sherlock_adjs_df.to_excel(writer, sheet_name='sherlock_adj') # Writes the 'Sherlock' adjectives to a sheet called 'sherlock_adj'
    sherlock_verbs_df.to_excel(writer, sheet_name='sherlock_verb') # Writes the 'Sherlock' verbs to a sheet called 'sherlock_verb'
    holmes_adjs_df.to_excel(writer, sheet_name='holmes_adj') # Writes the 'Holmes' adjectives to a sheet called 'holmes_adj'
    holmes_verbs_df.to_excel(writer, sheet_name='holmes_verb') # Writes the 'Holmes' verbs to a sheet called 'holmes_verb'