# Python Basic

### File Operations

Read file line by line

In [1]:
lines = []
for line in open('building_global_community.txt'):
    # delete the blank and line feed at the begining and end
    line = line.strip()
    # add processed line text into list 'lines'
    lines.append(line)

In [4]:
lines[2]

"On our journey to connect the world, we often discuss products we're building and updates on our business. Today I want to focus on the most important question of all: are we building the world we all want?"

or you can just write

In [31]:
# list comprehension
lines = [line.strip() for line in open('building_global_community.txt')]

In [32]:
lines[0]

'To our community,'

### String operations

In [5]:
sentence = "I want to eat an apple ."

#### string indexing

In [6]:
sentence[5]

't'

In [7]:
sentence[10:13]

'eat'

In [8]:
sentence[-1]

'.'

In [9]:
sentence[10:-3]

'eat an appl'

#### find sequences in string

In [12]:
sentence.find('ttt')

-1

find from right-hand side

In [13]:
sentence.rfind('a')

17

find with a starting point

In [15]:
sentence.find('a')

3

return -1 when not found

In [18]:
sentence.find('can')

-1

combine the use of subsequence and find

In [19]:
sentence[sentence.find('want to'):sentence.rfind('.')]

'want to eat an apple '

### String Normalization

In [20]:
sentence

'I want to eat an apple .'

In [21]:
sentence.lower()

'i want to eat an apple .'

In [82]:
sentence.upper()

'I WANT TO EAT AN APPLE .'

In [83]:
sentence.capitalize()

'I want to eat an apple .'

In [17]:
'Aa'.isupper()

False

In [87]:
'A'.islower()

False

In [92]:
'apple'.isalpha()

True

In [18]:
'20'.isdigit()

True

In [94]:
'20.9'.isdigit()

False

In [19]:
'20'.isdecimal()

True

In [20]:
'furen5566'.isalnum()

True

### split sentence by blank

In [21]:
# the result is list of words in the sentence
sentence.split(' ')

['I', 'want', 'to', 'eat', 'an', 'apple', '.']

In [22]:
sentence.endswith('.')

True

In [23]:
sentence.startswith('He wants')

False

## Dictionary examples

In [25]:
# book = dict()
book = {}

In [26]:
book['title'] = 'Natural Language Processing with Python'
book['author'] = 'Bird, Klein, and Loper'
book['year'] = 2009

In [33]:
book['author'] = [author.strip(' and') for author in 'Bird, Klein, and Loper'.split(', ')]

In [38]:
test = []
for author in 'Bird, Klein, and Loper'.split(', '):
    test.append(author)

In [39]:
test

['Bird', 'Klein', 'and Loper']

In [34]:
book['author']

['Bir', 'Klei', 'Loper']

In [27]:
book

{'author': 'Bird, Klein, and Loper',
 'title': 'Natural Language Processing with Python',
 'year': 2009}

In [28]:
book.keys()

dict_keys(['title', 'author', 'year'])

In [29]:
book.values()

dict_values(['Natural Language Processing with Python', 'Bird, Klein, and Loper', 2009])

In [30]:
book.items()

dict_items([('title', 'Natural Language Processing with Python'), ('author', 'Bird, Klein, and Loper'), ('year', 2009)])

string formatting

In [28]:
'%s is a book written by %s in %d' % (book['title'], book['author'], book['year'])

'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'

In [108]:
'{0} is a book written by {1} in {2}'.format(book['title'], book['author'], book['year'])

'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'

In [109]:
# advanced formatting
'{title} is a book written by {author} in {year}'.format(**book)

'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'

## Counting Example

In [40]:
data = ['red', 'red', 'red', 'red', 'yellow', 'yellow', 'yellow', 'blue', 'blue']

In [43]:
counter = {}
for color in data:
    if color in counter:
        counter[color] += 1
    else:
        counter[color] = 1

In [44]:
counter

{'blue': 2, 'red': 4, 'yellow': 3}

### use default dictionary

In [45]:
from collections import defaultdict
counter = defaultdict(lambda: 0)  # default value function is 0
counter = defaultdict(int)  # default value function is "int", which initialize to 0

In [46]:
for color in data:
    counter[color] += 1

In [47]:
counter

defaultdict(int, {'blue': 2, 'red': 4, 'yellow': 3})

### use built-in Counter

In [49]:
from collections import Counter

In [50]:
counter = Counter(data)

In [51]:
counter

Counter({'blue': 2, 'red': 4, 'yellow': 3})

In [52]:
new_data = ['blue', 'red', 'blue', 'yellow', 'blue', 'yellow', 'blue', 'yellow', 'blue']
counter.update(new_data)

In [53]:
counter

Counter({'blue': 7, 'red': 5, 'yellow': 6})

#### most common elements

In [54]:
counter.most_common()

[('blue', 7), ('yellow', 6), ('red', 5)]

In [55]:
counter.most_common(2)

[('blue', 7), ('yellow', 6)]

In [56]:
for color, count in counter.most_common():
    print('{0}: {1}'.format(color, count))

blue: 7
yellow: 6
red: 5


In [57]:
# clear counter
counter.clear()
print(counter['blue'])

0


# Exercise

compute the word frequencies in "Building_Global_Community.txt"
- read sentences from file "Building_Global_Community.txt"
- split sentences into words (split, or nltk word_tokenize)
- filter out symbols (isalpha, isdigit, isalnum)
- normalize words and count ('Word' and 'word' are considered as the same word)
- count the occurance of words (counting exmaple)

write your code here

In [1]:
# write your code here
import nltk
from nltk.corpus import stopwords
text = open('building_global_community.txt').read()
words = nltk.tokenize.word_tokenize(text)
# filter out symbols
words = [word for word in words if word.isalpha()]

# normalize words and count
words = [word.lower() for word in words]

#filter out the stop word
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))
words = [word for word in words if word not in stopwords]

#count the occurance of words
wordCounter = nltk.FreqDist(words)

print wordCounter.most_common(20)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tsunh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[('community', 80), ('people', 62), ('us', 38), ('world', 36), ('social', 31), ('help', 26), ('communities', 25), ('global', 24), ('facebook', 24), ('infrastructure', 24), ('content', 23), ('groups', 23), ('many', 22), ('share', 20), ('like', 19), ('important', 19), ('building', 19), ('around', 18), ('together', 18), ('build', 16)]


### Save the result into a csv file

https://docs.python.org/3/library/csv.html

In [2]:
import csv

write word count result

In [3]:
with open('wordcount.csv', 'w') as csvfile:
    # set up header
    fieldnames = ['word', 'count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for word, count in wordCounter.most_common():
        writer.writerow({'word': word, 'count': count})

read csv

In [4]:
with open('wordcount.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        print(row['word'], row['count'])

('community', '80')
('people', '62')
('us', '38')
('world', '36')
('social', '31')
('help', '26')
('communities', '25')
('global', '24')
('facebook', '24')
('infrastructure', '24')
('content', '23')
('groups', '23')
('many', '22')
('share', '20')
('like', '19')
('important', '19')
('building', '19')
('around', '18')
('together', '18')
('build', '16')
('new', '16')
('friends', '15')
('even', '14')
('want', '14')
('must', '13')
('local', '13')
('across', '13')
('safe', '12')
('need', '12')
('news', '12')
('issues', '11')
('work', '11')
('standards', '11')
('see', '11')
('tools', '10')
('today', '10')
('media', '10')
('personal', '10')
('connect', '10')
('one', '10')
('built', '10')
('common', '10')
('come', '10')
('hope', '9')
('also', '9')
('part', '9')
('well', '9')
('understanding', '8')
('example', '8')
('information', '8')
('may', '8')
('every', '8')
('civic', '8')
('always', '8')
('greatest', '8')
('different', '8')
('opportunity', '8')
('whether', '8')
('time', '8')
('system', '7'

## Bonus

Here I would like to build a dictionary with following structure:

POS_wordcount = {

    "VB":{
    
        play:1
        
        go:23
        
        ...
        
    },
    
    "NN":{
    
        community:80,
        
        government:52
        
        ...
        
    }
   
}

In [5]:
nltk.FreqDist(nltk.pos_tag(words)).items()
POS_wordcount = {}

for i in nltk.FreqDist(nltk.pos_tag(words)).items():
    if i[0][1] in POS_wordcount:
        POS_wordcount[i[0][1]].update({i[0][0]:i[1]})
    else:
        POS_wordcount[i[0][1]] = {}
        POS_wordcount[i[0][1]].update({i[0][0]:i[1]})

In [6]:
POS_wordcount

{'CD': {'billion': 4,
  'five': 1,
  'hundred': 1,
  'million': 4,
  'one': 10,
  'ten': 1,
  'three': 1,
  'two': 7},
 'DT': {'another': 1, 'every': 8},
 'IN': {'across': 13,
  'alongside': 1,
  'although': 2,
  'among': 2,
  'anew': 1,
  'around': 16,
  'begin': 1,
  'berlin': 1,
  'beyond': 2,
  'explore': 1,
  'like': 18,
  'past': 5,
  'since': 5,
  'upon': 1,
  'videos': 1,
  'whether': 8,
  'within': 2,
  'without': 4,
  'worldwide': 2,
  'worth': 1},
 'JJ': {'able': 3,
  'accuracy': 1,
  'act': 1,
  'active': 1,
  'actual': 1,
  'additional': 1,
  'address': 1,
  'affect': 1,
  'ai': 1,
  'american': 1,
  'artificial': 2,
  'attempted': 1,
  'aware': 1,
  'bad': 3,
  'banning': 1,
  'basic': 1,
  'big': 1,
  'black': 2,
  'bother': 1,
  'build': 1,
  'child': 1,
  'civic': 5,
  'civil': 1,
  'clear': 3,
  'collective': 7,
  'come': 1,
  'common': 10,
  'complete': 2,
  'concert': 1,
  'connect': 3,
  'consistent': 1,
  'content': 8,
  'contribute': 1,
  'controversial': 1,
  'c

In [7]:
POS_wordcount["NN"]["community"]

80