## Analysis of long words and word frequency in Parks Canada BIAs

### Step 1: Load libraries and BIA project descriptions.

The project descriptions will be tokenized into a list for counting word frequency.

In [10]:
import pandas as pd
import numpy as np

from pprint import pprint # pretty printer

import re # regular expressions

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/whitneylight/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the cleaned project descriptions from the processed registry workbook

registry_path = "data/processed/BIA_50select_edited.xlsx"
reg_data = pd.read_excel(registry_path)
reg_data.head(n=3)

Unnamed: 0,BIA_no,title,description,field_unit,park,lat,lon,province,contact_internal,fuia_num,...,attach_ia,created,modified,created_by,modified_by,id,item_type,path,lang_check,wordcount
0,87,Wildfire Risk Reduction at Fort Walsh National...,The purpose of this project is to reduce the r...,Saskatchewan South,Fort Walsh National Historic Site of Canada,49.573414,-109.882274,Saskatchewan,Sherri Clifford,SSFU-2020-035-FW,...,,2020-12-07 12:13:59,2021-03-01 17:16:27,Krista Cairns,Krista Cairns,192,Item,sites/ImpactAssessment/Lists/DIA Tracking List,en,149
1,110,Shoreline Protection - Seawall Repairs,"An armoured stone seawall, located around the ...",Southern New Brunswick,St. Andrews Blockhouse National Historic Site ...,45.07707,67.06198,New Brunswick,Shirley Butland,NBSouth-2020-EIA-4,...,,2020-09-25 10:24:42,2021-02-24 15:02:33,Shirley Butland,Shirley Butland,169,Item,sites/ImpactAssessment/Lists/DIA Tracking List,en,139
2,197,Big Island Marsh DFO Fisheries Habitat Offset ...,A Fisheries Act Authorization is required from...,Waterways,Trent-Severn Waterway National Historic Site o...,44.106416,-77.231107,Ontario,Randy Power,TS-2019-05,...,,2020-02-19 09:18:53,2021-07-27 11:54:55,Randal Power,Randal Power,82,Item,sites/ImpactAssessment/Lists/DIA Tracking List,en,155


In [3]:
# Pair every BIA number with its description text: [(BIA_no, description), ...]

data = [
    (bia_no, description)
    for bia_no, description in zip(reg_data["BIA_no"], reg_data["description"])
]
pprint(data)

[(87,
  'The purpose of this project is to reduce the risk of wildfire at Fort Walsh '
  'National Historic Site by thinning pockets of fuel in two forested areas '
  'near the fort site. The total area to be treated is 25 ha. The absence of '
  'fire and the maturity of the forest around the fort site has resulted in '
  'the accumulation of ladder fuels, old or dying mature trees and downed '
  'woody material. By reducing the fuel load on site, removing trees and '
  'administering various FireSmart techniques (stand thinning, ladder fuel '
  'removal etc.) the impact of a wildfire on the site can be reduced. Ladder '
  'fuels will be removed and tree limbs pruned to a height of 2m+ using '
  'chainsaws and other various hand tools.  Dense areas will be thinned by '
  'hand felling and deadfall will also be removed by hand. This project will '
  'be assessed with a basic impact assessment. '),
 (110,
  'An armoured stone seawall, located around the perimeter of the shoreline '
  'ad

#### Tokenize and lowercase the words

In [4]:
# Tokenize each description into a list of word tokens with a regular expression.
# The pattern is written as a raw string so that "\w" reaches the regex engine
# unchanged; in a plain string literal "\w" is an invalid escape sequence, which
# newer Python versions warn about.

datawords = [(ID, re.findall(r"\w+", text)) for ID, text in data]
pprint(datawords)

[(87,
  ['The',
   'purpose',
   'of',
   'this',
   'project',
   'is',
   'to',
   'reduce',
   'the',
   'risk',
   'of',
   'wildfire',
   'at',
   'Fort',
   'Walsh',
   'National',
   'Historic',
   'Site',
   'by',
   'thinning',
   'pockets',
   'of',
   'fuel',
   'in',
   'two',
   'forested',
   'areas',
   'near',
   'the',
   'fort',
   'site',
   'The',
   'total',
   'area',
   'to',
   'be',
   'treated',
   'is',
   '25',
   'ha',
   'The',
   'absence',
   'of',
   'fire',
   'and',
   'the',
   'maturity',
   'of',
   'the',
   'forest',
   'around',
   'the',
   'fort',
   'site',
   'has',
   'resulted',
   'in',
   'the',
   'accumulation',
   'of',
   'ladder',
   'fuels',
   'old',
   'or',
   'dying',
   'mature',
   'trees',
   'and',
   'downed',
   'woody',
   'material',
   'By',
   'reducing',
   'the',
   'fuel',
   'load',
   'on',
   'site',
   'removing',
   'trees',
   'and',
   'administering',
   'various',
   'FireSmart',
   'techniques',
   'stand

   'water',
   'shoreline',
   'anchoring',
   'systems',
   'and',
   'installation',
   'of',
   'boardwalks',
   'and',
   'kiosks',
   'at',
   'shore',
   'landing',
   'points',
   'Replacements',
   'and',
   'or',
   'minor',
   'location',
   'adjustments',
   'of',
   'mooring',
   'buoy',
   'anchors',
   'may',
   'also',
   'be',
   'carried',
   'out',
   'where',
   'necessary',
   'Dock',
   'project',
   'locations',
   'include',
   'Adelaide',
   'Island',
   'East',
   'Grenadier',
   'Island',
   'Georgina',
   'Island',
   'Constance',
   'Island',
   'Mulcaster',
   'Island',
   'Milton',
   'Island',
   'Hill',
   'Island',
   'and',
   'Stovin',
   'Island',
   'Project',
   'work',
   'is',
   'planned',
   'to',
   'be',
   'commence',
   'in',
   'April',
   '2020',
   'and',
   'conclude',
   'by',
   'November',
   '2020',
   'This',
   'project',
   'will',
   'be',
   'assessed',
   'with',
   'a',
   'Basic',
   'Impact',
   'Assessment']),
 (271,
  ['A

   'over',
   'the',
   'Peace',
   'River',
   'ice',
   'depth',
   'measurements',
   'removal',
   'of',
   'pack',
   'ice',
   'on',
   'the',
   'north',
   'and',
   'south',
   'shore',
   'of',
   'the',
   'Peace',
   'River',
   'the',
   'flooding',
   'of',
   'the',
   'river',
   'until',
   'adequate',
   'ice',
   'depth',
   'is',
   'achieved',
   'The',
   'winter',
   'road',
   'then',
   'continues',
   'through',
   'the',
   'forest',
   'which',
   'requires',
   'snow',
   'removal',
   'brushing',
   'of',
   'fallen',
   'trees',
   'willows',
   'removal',
   'of',
   'large',
   'rocks',
   'tree',
   'stumps',
   'etc',
   'Coordination',
   'is',
   'required',
   'with',
   'the',
   'southern',
   'crew',
   'working',
   'at',
   'the',
   'Fort',
   'Chipewyan',
   'Rocher',
   'River',
   'Ice',
   'crossing',
   'Maintenance',
   'of',
   'ice',
   'bridge',
   'winter',
   'road',
   'is',
   'required',
   'throughout',
   'the',
   'season',
 

   'stumps',
   'to',
   'control',
   'regrowth',
   'Garlon',
   'a',
   'triclopyr',
   'herbicide',
   'was',
   'selected',
   'for',
   'this',
   'trial',
   'because',
   'it',
   'specifically',
   'targets',
   'broadleaf',
   'species',
   'successfully',
   'controls',
   'resprouting',
   'and',
   'has',
   'little',
   'to',
   'no',
   'long',
   'term',
   'effects',
   'on',
   'grasses',
   'or',
   'sedges',
   'as',
   'has',
   'been',
   'documented',
   'in',
   'peer',
   'reviewed',
   'academic',
   'research',
   'Garlon',
   'will',
   'be',
   'applied',
   'in',
   'a',
   'manner',
   'so',
   'as',
   'to',
   'avoid',
   'leaching',
   'aerial',
   'drift',
   'or',
   'surface',
   'runoff',
   'The',
   'proposed',
   'use',
   'of',
   'Garlon',
   'herbicide',
   'is',
   'consistent',
   'with',
   'Parks',
   'Canada',
   's',
   'Integrated',
   'Pest',
   'Management',
   'Plan',
   'for',
   'Gulf',
   'Islands',
   'National',
   'Park',
   '

   'to',
   'Highway',
   '93',
   'South',
   'highway',
   'closure',
   'time',
   'and',
   'gains',
   'in',
   'operational',
   'efficiency',
   'and',
   'worker',
   'safety',
   'The',
   'total',
   'footprint',
   'for',
   'four',
   'Gazex',
   'RACS',
   'units',
   'including',
   'foundations',
   'shelters',
   'and',
   'pipelines',
   'is',
   '230',
   'm',
   'A',
   'maximum',
   'of',
   'three',
   'leveled',
   'work',
   'pads',
   'will',
   'be',
   'required',
   'per',
   'site',
   '72',
   'm²',
   'footprint',
   'One',
   'or',
   'two',
   'permanent',
   'helipads',
   'may',
   'also',
   'be',
   'required',
   '50',
   'm²',
   'maximum',
   'footprint']),
 (272,
  ['Hunters',
   'Lake',
   'is',
   'a',
   'one',
   'way',
   'trail',
   'that',
   'incorporates',
   'various',
   'landscape',
   'features',
   'including',
   'jack',
   'pine',
   'ridges',
   'aspen',
   'forest',
   'and',
   'aquatic',
   'ecosytems',
   'During',
   'a',
  

In [5]:
# Normalise case: rebuild every (ID, words) pair with each token lowercased
datawords = [(ID, list(map(str.lower, words))) for ID, words in datawords]
pprint(datawords)

[(87,
  ['the',
   'purpose',
   'of',
   'this',
   'project',
   'is',
   'to',
   'reduce',
   'the',
   'risk',
   'of',
   'wildfire',
   'at',
   'fort',
   'walsh',
   'national',
   'historic',
   'site',
   'by',
   'thinning',
   'pockets',
   'of',
   'fuel',
   'in',
   'two',
   'forested',
   'areas',
   'near',
   'the',
   'fort',
   'site',
   'the',
   'total',
   'area',
   'to',
   'be',
   'treated',
   'is',
   '25',
   'ha',
   'the',
   'absence',
   'of',
   'fire',
   'and',
   'the',
   'maturity',
   'of',
   'the',
   'forest',
   'around',
   'the',
   'fort',
   'site',
   'has',
   'resulted',
   'in',
   'the',
   'accumulation',
   'of',
   'ladder',
   'fuels',
   'old',
   'or',
   'dying',
   'mature',
   'trees',
   'and',
   'downed',
   'woody',
   'material',
   'by',
   'reducing',
   'the',
   'fuel',
   'load',
   'on',
   'site',
   'removing',
   'trees',
   'and',
   'administering',
   'various',
   'firesmart',
   'techniques',
   'stand

   'buoy',
   'anchors',
   'may',
   'also',
   'be',
   'carried',
   'out',
   'where',
   'necessary',
   'dock',
   'project',
   'locations',
   'include',
   'adelaide',
   'island',
   'east',
   'grenadier',
   'island',
   'georgina',
   'island',
   'constance',
   'island',
   'mulcaster',
   'island',
   'milton',
   'island',
   'hill',
   'island',
   'and',
   'stovin',
   'island',
   'project',
   'work',
   'is',
   'planned',
   'to',
   'be',
   'commence',
   'in',
   'april',
   '2020',
   'and',
   'conclude',
   'by',
   'november',
   '2020',
   'this',
   'project',
   'will',
   'be',
   'assessed',
   'with',
   'a',
   'basic',
   'impact',
   'assessment']),
 (271,
  ['a',
   'new',
   'culvert',
   'needs',
   'to',
   'be',
   'installed',
   'on',
   'the',
   'boundary',
   'road',
   'north',
   'of',
   'osten',
   'lake',
   'camping',
   'access',
   'on',
   'rr',
   '205',
   'the',
   'culvert',
   'has',
   'collapsed',
   'and',
   'completel

   'zone',
   'ii']),
 (238,
  ['the',
   'project',
   'includes',
   'mobilizing',
   'and',
   'staging',
   'equipment',
   'at',
   'moose',
   'island',
   'fuel',
   'heavy',
   'equipment',
   'flooding',
   'pumps',
   'chainsaws',
   'etc',
   'construction',
   'of',
   'the',
   'ice',
   'bridge',
   'over',
   'the',
   'peace',
   'river',
   'ice',
   'depth',
   'measurements',
   'removal',
   'of',
   'pack',
   'ice',
   'on',
   'the',
   'north',
   'and',
   'south',
   'shore',
   'of',
   'the',
   'peace',
   'river',
   'the',
   'flooding',
   'of',
   'the',
   'river',
   'until',
   'adequate',
   'ice',
   'depth',
   'is',
   'achieved',
   'the',
   'winter',
   'road',
   'then',
   'continues',
   'through',
   'the',
   'forest',
   'which',
   'requires',
   'snow',
   'removal',
   'brushing',
   'of',
   'fallen',
   'trees',
   'willows',
   'removal',
   'of',
   'large',
   'rocks',
   'tree',
   'stumps',
   'etc',
   'coordination',
   'is',

   'traditional',
   'control',
   'methods',
   'of',
   'hand',
   'pulling',
   'or',
   'clipping',
   'ineffective',
   'continued',
   'english',
   'hawthorn',
   'invasion',
   'will',
   'degrade',
   'ecosystem',
   'health',
   'by',
   'supplanting',
   'native',
   'species',
   'and',
   'changing',
   'local',
   'environmental',
   'conditions',
   'to',
   'favor',
   'shrubs',
   'and',
   'trees',
   'rather',
   'than',
   'meadow',
   'species',
   'an',
   'experimental',
   'approach',
   'is',
   'being',
   'taken',
   'to',
   'compare',
   'the',
   'efficacy',
   'of',
   'several',
   'physical',
   'and',
   'chemical',
   'removal',
   'methods',
   'in',
   'order',
   'to',
   'determine',
   'best',
   'practices',
   'for',
   'english',
   'hawthorn',
   'control',
   'english',
   'hawthorn',
   'shrubs',
   'will',
   'be',
   'cut',
   'down',
   'using',
   'hand',
   'saws',
   'or',
   'power',
   'tools',
   'eg',
   'brush',
   'cutters',
   

   'a',
   'healthy',
   'understory',
   'there',
   'are',
   'few',
   'social',
   'trails',
   'through',
   'the',
   'area',
   'that',
   'should',
   'be',
   'used',
   'as',
   'access',
   'routes',
   'where',
   'possible',
   'to',
   'minimize',
   'disturbance']),
 (114,
  ['parks',
   'canada',
   'is',
   'proposing',
   'to',
   'install',
   'four',
   'remote',
   'avalanche',
   'control',
   'system',
   'racs',
   'units',
   'on',
   'mt',
   'whymper',
   'currently',
   'parks',
   'canada',
   'agency',
   'performs',
   'avalanche',
   'control',
   'by',
   'placing',
   'explosives',
   'from',
   'helicopters',
   'in',
   'starting',
   'zones',
   'of',
   'mt',
   'whymper',
   'however',
   'control',
   'by',
   'helicopter',
   'is',
   'often',
   'limited',
   'by',
   'poor',
   'visibility',
   'and',
   'wind',
   'in',
   'the',
   'starting',
   'zones',
   'effecting',
   'timing',
   'and',
   'efficiency',
   'of',
   'avalanche',
   'co

#### Remove stopwords

In [11]:
# English stopwords from NLTK, held in a set for membership tests
stop_words = set(stopwords.words('english'))

# Keep, per project, only the tokens that are not stopwords
filtered_words = [
    (ID, list(filter(lambda token: token not in stop_words, words)))
    for ID, words in datawords
]
pprint(filtered_words)

[(87,
  ['purpose',
   'project',
   'reduce',
   'risk',
   'wildfire',
   'fort',
   'walsh',
   'national',
   'historic',
   'site',
   'thinning',
   'pockets',
   'fuel',
   'two',
   'forested',
   'areas',
   'near',
   'fort',
   'site',
   'total',
   'area',
   'treated',
   '25',
   'ha',
   'absence',
   'fire',
   'maturity',
   'forest',
   'around',
   'fort',
   'site',
   'resulted',
   'accumulation',
   'ladder',
   'fuels',
   'old',
   'dying',
   'mature',
   'trees',
   'downed',
   'woody',
   'material',
   'reducing',
   'fuel',
   'load',
   'site',
   'removing',
   'trees',
   'administering',
   'various',
   'firesmart',
   'techniques',
   'stand',
   'thinning',
   'ladder',
   'fuel',
   'removal',
   'etc',
   'impact',
   'wildfire',
   'site',
   'reduced',
   'ladder',
   'fuels',
   'removed',
   'tree',
   'limbs',
   'pruned',
   'height',
   '2m',
   'using',
   'chainsaws',
   'various',
   'hand',
   'tools',
   'dense',
   'areas',
   'thin

   'main',
   'station',
   'proposed',
   'location',
   'located',
   'approximately',
   '1',
   '8',
   'km',
   'east',
   'main',
   'station',
   'passes',
   'one',
   'narrowest',
   'sections',
   'dunes',
   '100m',
   'long',
   'currently',
   'almost',
   'devoid',
   'vegetation',
   'easily',
   'navigable',
   'sinpr',
   'vehicles',
   'project',
   'assessed',
   'basic',
   'impact',
   'assessment']),
 (49,
  ['impact',
   'assessment',
   'required',
   'address',
   'potential',
   'impacts',
   'resulting',
   'repair',
   'class',
   'a2',
   'pedestrian',
   'bridge',
   'west',
   'coast',
   'trail',
   'pacific',
   'rim',
   'national',
   'park',
   'reserve',
   'bridge',
   'repairs',
   'required',
   'ensure',
   'safety',
   'visitors',
   'staff',
   'traversing',
   'maintaining',
   'trail',
   'limited',
   'ground',
   'disturbance',
   'within',
   'riparian',
   'habitat',
   'required',
   'upgrade',
   'mid',
   'span',
   'bridge',
   'supp

 (39,
  ['fundy',
   'national',
   'park',
   'intends',
   'initiate',
   'vegetation',
   'management',
   'plan',
   'reduce',
   'potential',
   'hazards',
   'excessive',
   'vegetation',
   'growth',
   'seepage',
   'caused',
   'deep',
   'roots',
   'cavities',
   'created',
   'decaying',
   'roots',
   'erosion',
   'caused',
   'uprooted',
   'trees',
   'competition',
   'undesired',
   'vegetation',
   'dense',
   'vegetation',
   'obscure',
   'ability',
   'observe',
   'deficiencies',
   'etc',
   'bennett',
   'lake',
   'dam',
   'wolfe',
   'lake',
   'water',
   'retaining',
   'structure',
   'left',
   'unchecked',
   'existing',
   'vegetation',
   'could',
   'pose',
   'serious',
   'threat',
   'strength',
   'stabilize',
   'structures',
   'proposed',
   'vegetation',
   'management',
   'activities',
   'conducted',
   'periodically',
   'growing',
   'season',
   'repeated',
   'annually',
   'basic',
   'impact',
   'assessment',
   'required',
   'inve

#### Create master list of all words

In [12]:
# Pull the token list (second element) out of every project tuple
words_only = [project[1] for project in filtered_words]

# Flatten the per-project lists into a single master list of words
master_wordlist = [word for sublist in words_only for word in sublist]

print(len(master_wordlist))

4397


### Step 2: Count the frequency of words in the master word list (project descriptions)

In [13]:
from collections import Counter

In [36]:
# Tally how often each word occurs in the master word list
wordcounts = Counter()
wordcounts.update(master_wordlist)
pprint(wordcounts)

Counter({'project': 68,
         'site': 52,
         'park': 50,
         'area': 46,
         'national': 36,
         'work': 35,
         'island': 32,
         'existing': 30,
         'trail': 27,
         'new': 23,
         'areas': 22,
         'fire': 21,
         'canada': 21,
         'water': 20,
         'road': 20,
         'proposed': 20,
         'impact': 19,
         'assessment': 19,
         'within': 19,
         'forest': 17,
         'also': 17,
         'basic': 17,
         'required': 17,
         'parks': 17,
         'equipment': 17,
         'bridge': 17,
         'culvert': 17,
         'vegetation': 16,
         'assessed': 15,
         'access': 15,
         'management': 15,
         'approximately': 15,
         'deer': 15,
         'station': 15,
         '2020': 15,
         'replacement': 15,
         'habitat': 14,
         'main': 14,
         'species': 14,
         'reduce': 13,
         'trees': 13,
         'construction': 13,
         'lake'

         'perimeter': 1,
         'st': 1,
         'andrews': 1,
         'blockhouse': 1,
         'wave': 1,
         'action': 1,
         'passamaquoddy': 1,
         'cause': 1,
         'rehabilitate': 1,
         'raise': 1,
         'geotextile': 1,
         'fabric': 1,
         'reinstating': 1,
         'sitting': 1,
         'fitted': 1,
         'pieces': 1,
         'relation': 1,
         'tsw': 1,
         'trenton': 1,
         'glen': 1,
         'miller': 1,
         'provisions': 1,
         'offsetting': 1,
         'causeway': 1,
         '1950': 1,
         'decreased': 1,
         'flows': 1,
         'cattails': 1,
         'cover': 1,
         'entire': 1,
         'expand': 1,
         'enhancements': 1,
         'constructing': 1,
         'widening': 1,
         'deepening': 1,
         'cattail': 1,
         'width': 1,
         'stay': 1,
         'offset': 1,
         'enhancement': 1,
         'strategic': 1,
         'dozer': 1,
         'border': 1,


         'entails': 1,
         'selective': 1,
         'pruning': 1,
         'principles': 1,
         'loading': 1,
         'providing': 1,
         'safer': 1,
         'firefighters': 1,
         'event': 1,
         'wildland': 1,
         'interface': 1,
         'tumbo': 1,
         'wetland': 1,
         'saturna': 1,
         'warburton': 1,
         'pike': 1,
         'summit': 1,
         'roe': 1,
         'january': 1,
         'latitude': 1,
         'longitude': 1,
         'pca': 1,
         'segment': 1,
         'segments': 1,
         'bob': 1,
         'hunter': 1,
         'memorial': 1,
         'boyle': 1,
         'cemetery': 1,
         'intended': 1,
         'boardwalk': 1,
         'wet': 1,
         'markham': 1,
         'gateway': 1,
         'welcome': 1,
         'separate': 1,
         'farming': 1,
         'mackenzie': 1,
         '407': 1,
         '14th': 1,
         'routing': 1,
         'private': 1,
         'property': 1,
         'owner':

#### Refine count to words with 3 syllables or more

In [18]:
# Import a library for counting syllables: the `cmudict` corpus shipped with NLTK

nltk.download('cmudict')  # downloads (or confirms) the cmudict package in nltk_data
from nltk.corpus import cmudict
# `d` is the lookup table used by nsyl(): it is indexed with a lowercased word and
# yields that word's pronunciations. Presumably each pronunciation is a list of
# phoneme strings whose vowel phonemes end in a stress digit -- TODO confirm
# against the NLTK cmudict documentation.
d = cmudict.dict()
            

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/whitneylight/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [14]:
# Syllable lookup: one count per pronunciation listed in the dictionary `d`

def nsyl(word):
    """Return a list of syllable counts for `word`.

    The lowercased word is looked up in the pronunciation dictionary `d`.
    For every pronunciation found, the phonemes whose last character is a
    digit are counted, giving one count per pronunciation.

    Words missing from `d` are passed on to the `syllables` helper and its
    result is returned unchanged.
    """
    try:
        pronunciations = d[word.lower()]
    except KeyError:
        # not in the dictionary: fall back to the heuristic helper
        return syllables(word)

    counts = []
    for phonemes in pronunciations:
        counts.append(sum(1 for phoneme in phonemes if phoneme[-1].isdigit()))
    return counts

In [54]:
# define helper function to count sylls in words that are not in the dictionary

def syllables(word):
    """Estimate the number of syllables in `word` with a vowel-group heuristic.

    The count is the number of vowel groups (a vowel from 'aeiouy' that does
    not directly follow another vowel), minus one for a trailing 'e', plus one
    for a trailing 'le'. Any non-empty word is given at least one syllable.

    Parameters
    ----------
    word : str
        The word to analyse; it is lowercased before counting.

    Returns
    -------
    list of int
        A single-element list holding the estimated count, the same shape as
        the lists produced by `nsyl`. An empty string yields [0].
    """
    #referred from stackoverflow.com/questions/14541303/count-the-number-of-syllables-in-a-word
    if not word:
        # guard: indexing word[0] below would raise IndexError on empty input
        return [0]
    count = 0
    vowels = 'aeiouy'
    word = word.lower()
    if word[0] in vowels:
        count +=1
    for index in range(1,len(word)):
        # a vowel only starts a new syllable when it follows a non-vowel
        if word[index] in vowels and word[index-1] not in vowels:
            count +=1
    if word.endswith('e'):
        # a trailing 'e' is treated as silent
        count -= 1
    if word.endswith('le'):
        # ...except in a trailing 'le', which gets its syllable back
        count += 1
    if count == 0:
        # every non-empty word has at least one syllable
        count += 1
    return [count]

In [55]:
# Spot-check nsyl() on two sample words
for sample in ("fire", "firesmart"):
    print(nsyl(sample))

[2, 1]
[3]


In [20]:
# Drop the tokens that consist of digits only (e.g. '25', '2020')

wordlist_clean = [token for token in master_wordlist if not token.isdigit()]

print(wordlist_clean)

['purpose', 'project', 'reduce', 'risk', 'wildfire', 'fort', 'walsh', 'national', 'historic', 'site', 'thinning', 'pockets', 'fuel', 'two', 'forested', 'areas', 'near', 'fort', 'site', 'total', 'area', 'treated', 'ha', 'absence', 'fire', 'maturity', 'forest', 'around', 'fort', 'site', 'resulted', 'accumulation', 'ladder', 'fuels', 'old', 'dying', 'mature', 'trees', 'downed', 'woody', 'material', 'reducing', 'fuel', 'load', 'site', 'removing', 'trees', 'administering', 'various', 'firesmart', 'techniques', 'stand', 'thinning', 'ladder', 'fuel', 'removal', 'etc', 'impact', 'wildfire', 'site', 'reduced', 'ladder', 'fuels', 'removed', 'tree', 'limbs', 'pruned', 'height', '2m', 'using', 'chainsaws', 'various', 'hand', 'tools', 'dense', 'areas', 'thinned', 'hand', 'felling', 'deadfall', 'also', 'removed', 'hand', 'project', 'assessed', 'basic', 'impact', 'assessment', 'armoured', 'stone', 'seawall', 'located', 'around', 'perimeter', 'shoreline', 'adjacent', 'st', 'andrews', 'blockhouse', 'na

In [21]:
# Get rid of things that aren't words, i.e. tokens that contain a digit (e.g. 2m or m2).
# Note: unit tokens without any digit, such as km, are not caught by this check.

# receives every token that contains at least one digit character
notwords = []

# receives every remaining token
wordlist2 = []

def containsNumber(value):
    """Return True when at least one character of `value` is a digit, else False."""
    found_digit = False
    for char in value:
        # every character is inspected; any single digit flips the flag
        if char.isdigit():
            found_digit = True
    return found_digit

# Route each token: those containing a digit go to notwords, the rest to wordlist2
for word in wordlist_clean:
    bucket = notwords if containsNumber(word) else wordlist2
    bucket.append(word)

print(notwords)
            

['2m', 'km2', 'w3m', '600mm', '450mm', '100m', '100m', 'a2', '5m', '8m', '20m2', '30m', '3rd', '2m', '5m', '2m', '7m', '1m', '16th', '14th', '16th', 'm2', 'm²', 'm²', 'w3m', '5m', '5m', '1m']


In [56]:
# Look up the syllable counts of every word in the cleaned word list

syl_list = [nsyl(word) for word in wordlist2]

# Pair each word with its list of syllable counts

pair_list = [(word, counts) for word, counts in zip(wordlist2, syl_list)]

pair_list[0]


('purpose', [2])

In [57]:
# Show the words for which nsyl() returned more than one syllable count

for pair in pair_list:
    counts = pair[1]
    if len(counts) <= 1:
        continue  # a single count needs no inspection
    print(pair)

('project', [2, 2])
('reduce', [2, 2, 2])
('national', [3, 2])
('fuel', [2, 1])
('treated', [2, 2])
('ha', [1, 2])
('fire', [2, 1])
('maturity', [4, 4])
('forest', [2, 2])
('around', [2, 2])
('resulted', [3, 3, 3])
('mature', [2, 2])
('reducing', [3, 3, 3])
('fuel', [2, 1])
('fuel', [2, 1])
('impact', [2, 2])
('reduced', [2, 2, 2])
('project', [2, 2])
('impact', [2, 2])
('located', [3, 2])
('around', [2, 2])
('st', [1, 1])
('national', [3, 2])
('protection', [3, 3])
('identified', [4, 4])
('result', [2, 2])
('potential', [3, 3])
('cause', [1, 1])
('protection', [3, 3])
('repairs', [2, 2])
('project', [2, 2])
('rehabilitate', [5, 5])
('repairing', [3, 3])
('damaged', [2, 2])
('degraded', [3, 3])
('fitted', [2, 2])
('concrete', [2, 2])
('pieces', [2, 2])
('required', [3, 2])
('impacts', [2, 2, 2, 2])
('associated', [5, 5])
('projects', [2, 2, 2, 2])
('offsetting', [3, 3])
('causeway', [2, 2])
('decreased', [2, 2])
('mainland', [2, 2])
('caused', [1, 1])
('mainland', [2, 2])
('new', [1, 1

In [58]:
# Working assumption from the inspection above: the first count in each list is
# taken as the word's syllable count.
# Reduce every (word, [counts...]) pair to (word, first_count).

new_list = [(pair[0], pair[1][0]) for pair in pair_list]

new_list

[('purpose', 2),
 ('project', 2),
 ('reduce', 2),
 ('risk', 1),
 ('wildfire', 3),
 ('fort', 1),
 ('walsh', 1),
 ('national', 3),
 ('historic', 3),
 ('site', 1),
 ('thinning', 2),
 ('pockets', 2),
 ('fuel', 2),
 ('two', 1),
 ('forested', 3),
 ('areas', 3),
 ('near', 1),
 ('fort', 1),
 ('site', 1),
 ('total', 2),
 ('area', 3),
 ('treated', 2),
 ('ha', 1),
 ('absence', 2),
 ('fire', 2),
 ('maturity', 4),
 ('forest', 2),
 ('around', 2),
 ('fort', 1),
 ('site', 1),
 ('resulted', 3),
 ('accumulation', 5),
 ('ladder', 2),
 ('fuels', 2),
 ('old', 1),
 ('dying', 2),
 ('mature', 2),
 ('trees', 1),
 ('downed', 1),
 ('woody', 2),
 ('material', 4),
 ('reducing', 3),
 ('fuel', 2),
 ('load', 1),
 ('site', 1),
 ('removing', 3),
 ('trees', 1),
 ('administering', 5),
 ('various', 3),
 ('firesmart', 3),
 ('techniques', 2),
 ('stand', 1),
 ('thinning', 2),
 ('ladder', 2),
 ('fuel', 2),
 ('removal', 3),
 ('etc', 4),
 ('impact', 2),
 ('wildfire', 3),
 ('site', 1),
 ('reduced', 2),
 ('ladder', 2),
 ('fuels',

In [61]:
# Collect the (word, syllables) pairs whose syllable count is 3 or more

long_words = [pair for pair in new_list if pair[1] >= 3]

print(len(new_list)) # count all words
print(len(long_words))    # count all long words

4221
1314


In [62]:
# Percentage of long words in all words
# Computed from the lists themselves instead of retyped totals, so the figure
# stays correct whenever the input data or the cleaning steps change.

len(long_words) / len(new_list) * 100

31.13006396588486

About 31% of the meaningful words in the project description text have 3 or more syllables.

In [64]:
# Frequency of long words

# drop the syllable counts so that only the words themselves are tallied
long = [pair[0] for pair in long_words]

long_count = Counter()
long_count.update(long)
pprint(long_count)
    

Counter({'area': 46,
         'national': 36,
         'existing': 30,
         'areas': 22,
         'canada': 21,
         'assessment': 19,
         'required': 17,
         'equipment': 17,
         'vegetation': 16,
         'management': 15,
         'approximately': 15,
         'replacement': 15,
         'habitat': 14,
         'construction': 13,
         'installation': 10,
         'including': 10,
         'removal': 9,
         'located': 9,
         'proposing': 9,
         'expected': 9,
         'boundary': 9,
         'reduction': 9,
         'disturbance': 8,
         'adjacent': 7,
         'currently': 7,
         'operations': 7,
         'wildfire': 6,
         'historic': 6,
         'removing': 6,
         'etc': 6,
         'potential': 6,
         'reclamation': 6,
         'developed': 6,
         'visitor': 6,
         'objective': 6,
         'maintenance': 6,
         'constructed': 6,
         'activities': 6,
         'various': 5,
         'inspection'