# Day 16 - List Comprehensions

---


## Create a list of lowercase names

In [1]:
names = 'tim sara lily ella daisy princess darcy sookie ginny'.split()
print(names )

['tim', 'sara', 'lily', 'ella', 'daisy', 'princess', 'darcy', 'sookie', 'ginny']


---

## Print each name in title case


In [2]:
for name in names:
    print(name.title())


Tim
Sara
Lily
Ella
Daisy
Princess
Darcy
Sookie
Ginny


---

## Print only names that start with the letters A-G

- Use the `string.ascii_lowercase` attribute to get a list of all ascii lowercase letters.
    - Don't forget to import the `string` module.
    - The return value for the attribute `ascii_lowercase` is a string with no spaces.
        - Use the `list()` method to convert the string to a list.
- Slice the list at an index of `6` to get the first 7 letters of the alphabet.


In [3]:
import string
alphabet_lower = list(string.ascii_lowercase[:13])

new_names = []
for name in names:
    if name[0].lower() in alphabet_lower:
        new_names.append(name.title())

print(new_names)

['Lily', 'Ella', 'Daisy', 'Darcy', 'Ginny']


---

## Refactor using a list comprehension


In [4]:
name_list_2 = [name.title() for name in names if name[0] in alphabet_lower]
print(name_list_2)

['Lily', 'Ella', 'Daisy', 'Darcy', 'Ginny']


### Confirm both list configurations produce the same result


In [5]:
assert new_names == name_list_2

---

# Day 16a - More List Comprehensions


## Parse the text of Harry Potter


In [6]:
""" Documented URL will not open via requests.get() method (HTTP 406 error).
    Stored file contents locally
"""

# Set the file to open
text_file = 'harry.txt'

# Open the file
with open(
    file=text_file,
    mode='rt',
    encoding='utf-8'
) as file:
    # Read the file contents, convert to lowercase, and convert each word to a list item
    words = file.read().lower().split()

# Display the wordr count of the file contents
print(f'Total words in file: {len(words)}')


Total words in file: 4656


---

## Get the most common words from the text


In [7]:
# Use collections.Counter to get the top N words, by count
from collections import Counter

count = Counter(words)
count = count.most_common(5)
print(count)

# Does the list contain non-text characters?
'-' in words

[('the', 202), ('he', 136), ('a', 108), ('and', 100), ('to', 93)]


True

---

## Remove non-alphabetic characters, to keep them from counting like actual words


In [8]:
# Use a regular expression to find non-word characters
import re

""" Use a list comprehension with the re.sub() method
# \W matches any non-alphanumeric characters [^a-zA-Z0-9_]

First argument is the match pattern
Second argument is the replace pattern
Third argument is the object to search, which is also the loop variable in the list comprehension

Effectively, this loops through every word in the list of words and matches only those list items without non-alphanumeric characters
"""
words = [re.sub(r'\W+', r'', word) for word in words]

# Re-count the words
count = Counter(words)
count = count.most_common(5)
print(count)

# Does the list contain non-text characters?
'-' in words

[('the', 204), ('he', 137), ('a', 112), ('and', 102), ('to', 94)]


False

---

## Remove any stopwords (a, the, etc.) from the list


In [9]:
""" Documented URL will not open via requests.get() method (HTTP 406 error).
    Stored file contents locally
"""

# Set the file to open
text_file = 'stopwords.txt'

# Open the file
with open(
    file=text_file,
    mode='rt',
    encoding='utf-8'
) as file:
    # Read the file contents, convert to lowercase, and convert each word to a list item
    stopwords = file.read().lower().split()

# Display the wordr count of the file contents
print(f'Total stopwords in file: {len(words)}')

Total stopwords in file: 4656


In [10]:
""" Remove all blanks and stopwords with a list comprehension.
    'word.strip()' returns False if the value of 'word' is an empty string ('')
    An empty string always returns false.
"""
words = [word for word in words if word.strip() and word not in stopwords]

# Re-count the words
count = Counter(words)
count = count.most_common(5)
print(count)

[('dursley', 45), ('dumbledore', 35), ('said', 32), ('mr', 30), ('professor', 30)]


In [11]:
# Check for a stopword in the list
assert 'the' not in words