In [8]:
#!/usr/bin/env python3

import nltk
from collections import Counter
# nltk.download('punkt') # punkt is an unsupervised model and can be used on data without labels

# Labels returned by the gender classifier.
MALE, FEMALE, UNKNOWN, BOTH = "male", "female", "unknown", "both"

# Lower-case marker words for each gender. They are stored as sets so that
# membership checks and intersections against a sentence's tokens are cheap
# and duplicates are ignored.
MALE_WORDS = {
    "guy", "spokesman", "chairman", "men's", "men", "him", "he's", "his",
    "boy", "boyfriend", "boyfriends", "boys", "brother", "brothers", "dad",
    "dads", "dude", "father", "fathers", "fiance", "gentleman", "gentlemen",
    "god", "grandfather", "grandpa", "grandson", "groom", "he", "himself",
    "husband", "husbands", "king", "male", "man", "mr", "nephew", "nephews",
    "priest", "prince", "son", "sons", "uncle", "uncles", "waiter", "widower",
    "widowers",
}

FEMALE_WORDS = {
    "heroine", "spokeswoman", "chairwoman", "women's", "actress", "women",
    "she's", "her", "aunt", "aunts", "bride", "daughter", "daughters", "female",
    "fiancee", "girl", "girlfriend", "girlfriends", "girls", "goddess",
    "granddaughter", "grandma", "grandmother", "herself", "ladies", "lady",
    "mom", "moms", "mother", "mothers", "mrs", "ms", "niece", "nieces",
    "priestess", "princess", "queens", "she", "sister", "sisters", "waitress",
    "widow", "widows", "wife", "wives", "woman",
}

[nltk_data] Downloading package punkt to /Users/yutian/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [16]:
def genderize(words):
    """Classify a collection of tokens by the gendered words it contains.

    Args:
        words: an iterable of word tokens. The lookup is an exact match
            against MALE_WORDS / FEMALE_WORDS, so tokens should already be
            lower-cased. Passing a raw ``str`` is iterated character by
            character and therefore never matches a whole word.

    Returns:
        MALE if only male words are present, FEMALE if only female words
        are present, BOTH if both kinds appear, otherwise UNKNOWN.
    """
    has_male = bool(MALE_WORDS.intersection(words))
    has_female = bool(FEMALE_WORDS.intersection(words))

    if has_male and has_female:
        return BOTH
    if has_male:
        return MALE
    if has_female:
        return FEMALE
    return UNKNOWN

In [17]:
# Demonstration: intersecting the word set with a raw string.
# set.intersection() iterates its argument, and iterating a str yields single
# characters, so no whole word such as 'men' or 'son' can ever match.
words = "I love men but I don't like son"
len(MALE_WORDS.intersection(words))
# The result is 0: the text must be split into word tokens before matching.

0

In [18]:
def count_gender(sentences):
    """Tally sentences and tokens per gender label.

    Args:
        sentences: an iterable of sentences, each one a sized collection of
            tokens that genderize() can classify.

    Returns:
        A ``(sentence_counts, word_counts)`` pair of Counters keyed by gender
        label: the number of sentences with that label, and the summed
        ``len()`` of those sentences.
    """
    sentence_counts = Counter()
    word_counts = Counter()

    for tokens in sentences:
        label = genderize(tokens)
        sentence_counts[label] += 1
        # Every token of the sentence is attributed to the sentence's label.
        word_counts[label] += len(tokens)

    return sentence_counts, word_counts

In [21]:
# Quick experiment with the raw string from the earlier cell.
# Both list entries are the same string, so the set collapses to one element.
sentences = set([words, words])
# The only element is the whole sentence, which is not itself a gender word,
# so the intersections inside genderize() are empty.
genderize(sentences)
# NOTE(review): each element is an untokenized str here, so genderize() sees
# single characters and len() counts characters rather than words; pass
# tokenized sentences (lists of lower-cased words) for meaningful counts.
count_gender(sentences)

'unknown'

In [22]:
def parse_gender(text):
    """Print the share of tokens falling under each gender label.

    The text is split into sentences and each sentence into lower-cased
    tokens with NLTK. count_gender() then labels every sentence, and one line
    per label is printed with that label's percentage of all tokens and its
    sentence count. Token counts are the lengths of the NLTK token lists, so
    punctuation tokens are included.

    Args:
        text: the raw document as a single string.

    Returns:
        None; the report is written to standard output.
    """
    tokenized = []
    for raw_sentence in nltk.sent_tokenize(text):
        tokenized.append(
            [token.lower() for token in nltk.word_tokenize(raw_sentence)]
        )

    sentence_counts, word_counts = count_gender(tokenized)
    total_words = sum(word_counts.values())

    template = "{:0.3f}% {} ({} sentences)"
    for label, n_words in word_counts.items():
        share = (n_words / total_words) * 100
        print(template.format(share, label, sentence_counts[label]))

In [25]:
# Load the sample article as one raw string.
# The encoding is explicit because the text contains non-ASCII punctuation
# (curly apostrophes, em dashes); relying on the platform default would make
# the read fail or garble characters on systems whose default is not UTF-8.
# NOTE(review): the path is absolute and machine-specific; adjust it (or derive
# it from a project root) before running elsewhere.
with open('/Users/yutian/Dropbox/Python/atap/snippets/ch01/ballet.txt', 'r', encoding='utf-8') as f:
    f_read = f.read()
# f_read now holds the raw, untokenized text of the article.

In [35]:
# Tokenize the article: one list of lower-cased tokens per sentence.
sentences = [
    [word.lower() for word in nltk.word_tokenize(sentence)]
    for sentence in nltk.sent_tokenize(f_read)  # a comprehension puts its output expression first and the `for` clause after it, so `sentence` is bound here and the inner list above is built once per sentence
]
# NOTE(review): this prints the whole tokenized article; slice it
# (e.g. sentences[:3]) to keep the output short.
print(sentences)

[['with', 'apologies', 'to', 'james', 'brown', ',', 'the', 'hardest', 'working', 'people', 'in', 'show', 'business', 'may', 'well', 'be', 'ballet', 'dancers', '.'], ['and', 'at', 'new', 'york', 'city', 'ballet', ',', 'none', 'work', 'harder', 'than', 'the', 'dancers', 'in', 'its', 'lowest', 'rank', ',', 'the', 'corps', 'de', 'ballet', '.'], ['during', 'the', 'first', 'week', 'of', 'the', 'company', '’', 's', 'winter', 'season', ',', 'claire', 'kretzschmar', ',', '24', ',', 'a', 'rising', 'corps', 'member', ',', 'danced', 'in', 'all', 'seven', 'performances', ',', 'appearing', 'in', 'five', 'ballets', ',', 'sometimes', 'changing', 'costumes', 'at', 'intermission', 'to', 'dance', 'two', 'roles', 'in', 'a', 'night', '.'], ['but', 'her', 'work', 'onstage', 'did', 'not', 'even', 'begin', 'to', 'capture', 'the', 'stamina', 'required', 'to', 'be', 'in', 'the', 'corps', '.'], ['spending', 'a', 'week', 'shadowing', 'ms.', 'kretzschmar', 'was', 'exhausting', '—', 'she', 'gave', 'new', 'meaning',

In [34]:
# Step-by-step alternative to the nested comprehension: a named helper
# tokenizes one sentence, and the caller applies it to every sentence.
def tokenize_sentence(sentence):
    """Return the lower-cased NLTK word tokens of a single sentence string."""
    tokens = nltk.word_tokenize(sentence)
    return [token.lower() for token in tokens]

# Split the raw text into sentence strings, then tokenize each one.
sentences = nltk.sent_tokenize(f_read)
tokenized_sentences = list(map(tokenize_sentence, sentences))

print(tokenized_sentences)

[['with', 'apologies', 'to', 'james', 'brown', ',', 'the', 'hardest', 'working', 'people', 'in', 'show', 'business', 'may', 'well', 'be', 'ballet', 'dancers', '.'], ['and', 'at', 'new', 'york', 'city', 'ballet', ',', 'none', 'work', 'harder', 'than', 'the', 'dancers', 'in', 'its', 'lowest', 'rank', ',', 'the', 'corps', 'de', 'ballet', '.'], ['during', 'the', 'first', 'week', 'of', 'the', 'company', '’', 's', 'winter', 'season', ',', 'claire', 'kretzschmar', ',', '24', ',', 'a', 'rising', 'corps', 'member', ',', 'danced', 'in', 'all', 'seven', 'performances', ',', 'appearing', 'in', 'five', 'ballets', ',', 'sometimes', 'changing', 'costumes', 'at', 'intermission', 'to', 'dance', 'two', 'roles', 'in', 'a', 'night', '.'], ['but', 'her', 'work', 'onstage', 'did', 'not', 'even', 'begin', 'to', 'capture', 'the', 'stamina', 'required', 'to', 'be', 'in', 'the', 'corps', '.'], ['spending', 'a', 'week', 'shadowing', 'ms.', 'kretzschmar', 'was', 'exhausting', '—', 'she', 'gave', 'new', 'meaning',

In [6]:
from pathlib import Path
# Show the directory the kernel is running in; relative paths resolve from here.
# TODO: decide how project paths should be anchored. One option is to define a
# single base directory once (for example, derived from Path.cwd()) and build
# every data path from it instead of hard-coding absolute paths — confirm the
# intended project layout first.
Path.cwd()

PosixPath('/Users/yutian/Dropbox/Python/atap/code')

In [9]:
# Run the full report on the sample article. Inside a notebook __name__ is
# '__main__', so this guard also fires when the cell is executed.
if __name__ == '__main__':
    # The encoding is explicit because the article contains non-ASCII
    # punctuation; the platform default is not guaranteed to be UTF-8.
    # NOTE(review): absolute, machine-specific path; adjust before running
    # elsewhere.
    with open('/Users/yutian/Dropbox/Python/atap/snippets/ch01/ballet.txt', 'r', encoding='utf-8') as f:
        parse_gender(f.read())

39.269% unknown (48 sentences)
52.994% female (38 sentences)
4.393% both (2 sentences)
3.344% male (3 sentences)
