# Section one: Processing Text and Strings in Python
- string.split ( ) and string.join ( )
- List operations on strings 
- index ( ) and find ( )
- The string library
- string.startswith ( )
- string.isalpha ( )
- string.strip ( )  -- remove white space
- NLTK library

In [1]:
import json
import string

In [2]:
# NOTE(review): absolute, user-specific location of the Yelp review file;
# adjust it to wherever the dataset lives on the machine running this notebook.
path = '/Users/yaohanli/Downloads/yelp_dataset/review.json'

In [4]:
# Open the review file for line-by-line reading. The encoding is passed
# explicitly so decoding does not depend on the platform's default locale
# encoding (the same file is opened with encoding='utf8' later on).
f = open(path, 'r', encoding='utf8')

In [6]:
# Only the first review is needed in this section: parse one JSON line and
# then release the file handle instead of leaving it open (the file is
# reopened later, when many records are loaded).
d = json.loads(f.readline())
f.close()

In [7]:
d

{'review_id': 'Q1sbwvVQXV2734tPgoKj4Q',
 'user_id': 'hG7b0MtEbXx5QzbzE6C_VA',
 'business_id': 'ujmEBvifdJM6h6RLv4wQIg',
 'stars': 1.0,
 'useful': 6,
 'funny': 1,
 'cool': 0,
 'text': 'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.',
 'date': '2013-05-07 04:34:36'}

In [9]:
review = d['text']

In [10]:
# Tokenization: split() with no arguments breaks the text on whitespace,
# so punctuation stays attached to its word (e.g. 'service?', '$8Gs.').
reviewwords = review.split()

In [11]:
reviewwords

['Total',
 'bill',
 'for',
 'this',
 'horrible',
 'service?',
 'Over',
 '$8Gs.',
 'These',
 'crooks',
 'actually',
 'had',
 'the',
 'nerve',
 'to',
 'charge',
 'us',
 '$69',
 'for',
 '3',
 'pills.',
 'I',
 'checked',
 'online',
 'the',
 'pills',
 'can',
 'be',
 'had',
 'for',
 '19',
 'cents',
 'EACH!',
 'Avoid',
 'Hospital',
 'ERs',
 'at',
 'all',
 'costs.']

In [12]:
' '.join(reviewwords)

'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.'

In [13]:
review.lower()

'total bill for this horrible service? over $8gs. these crooks actually had the nerve to charge us $69 for 3 pills. i checked online the pills can be had for 19 cents each! avoid hospital ers at all costs.'

In [14]:
review.upper()

'TOTAL BILL FOR THIS HORRIBLE SERVICE? OVER $8GS. THESE CROOKS ACTUALLY HAD THE NERVE TO CHARGE US $69 FOR 3 PILLS. I CHECKED ONLINE THE PILLS CAN BE HAD FOR 19 CENTS EACH! AVOID HOSPITAL ERS AT ALL COSTS.'

In [15]:
len(review)

204

In [16]:
len(reviewwords)

39

In [17]:
review[:10]

'Total bill'

In [18]:
reviewwords[:10]

['Total',
 'bill',
 'for',
 'this',
 'horrible',
 'service?',
 'Over',
 '$8Gs.',
 'These',
 'crooks']

In [19]:
reviewwords.index("checked")

22

In [20]:
review.find("for")

11

In [21]:
review.find("banada")  # "banada" does not occur in the review, so find() returns -1

-1

In [22]:
review.count("for")

3

In [23]:
review.lower().count("total")

1

## Removing punctuation characters

In [24]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [25]:
# Keep every character of the review that is not listed in
# string.punctuation. Iterating over a string yields its characters, so the
# result is a list of single-character strings.
[x for x in review if x not in string.punctuation]

['T',
 'o',
 't',
 'a',
 'l',
 ' ',
 'b',
 'i',
 'l',
 'l',
 ' ',
 'f',
 'o',
 'r',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 'h',
 'o',
 'r',
 'r',
 'i',
 'b',
 'l',
 'e',
 ' ',
 's',
 'e',
 'r',
 'v',
 'i',
 'c',
 'e',
 ' ',
 'O',
 'v',
 'e',
 'r',
 ' ',
 '8',
 'G',
 's',
 ' ',
 'T',
 'h',
 'e',
 's',
 'e',
 ' ',
 'c',
 'r',
 'o',
 'o',
 'k',
 's',
 ' ',
 'a',
 'c',
 't',
 'u',
 'a',
 'l',
 'l',
 'y',
 ' ',
 'h',
 'a',
 'd',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'n',
 'e',
 'r',
 'v',
 'e',
 ' ',
 't',
 'o',
 ' ',
 'c',
 'h',
 'a',
 'r',
 'g',
 'e',
 ' ',
 'u',
 's',
 ' ',
 '6',
 '9',
 ' ',
 'f',
 'o',
 'r',
 ' ',
 '3',
 ' ',
 'p',
 'i',
 'l',
 'l',
 's',
 ' ',
 'I',
 ' ',
 'c',
 'h',
 'e',
 'c',
 'k',
 'e',
 'd',
 ' ',
 'o',
 'n',
 'l',
 'i',
 'n',
 'e',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'p',
 'i',
 'l',
 'l',
 's',
 ' ',
 'c',
 'a',
 'n',
 ' ',
 'b',
 'e',
 ' ',
 'h',
 'a',
 'd',
 ' ',
 'f',
 'o',
 'r',
 ' ',
 '1',
 '9',
 ' ',
 'c',
 'e',
 'n',
 't',
 's',
 ' ',
 'E',
 'A',
 'C',
 'H',
 ' ',
 'A'

In [26]:
# Get rid of all the punctuation: filter the characters and glue the
# survivors back into a single string. join() accepts any iterable, so no
# explicit intermediate list is needed.
''.join(x for x in review if x not in string.punctuation)

'Total bill for this horrible service Over 8Gs These crooks actually had the nerve to charge us 69 for 3 pills I checked online the pills can be had for 19 cents EACH Avoid Hospital ERs at all costs'

# Section Two: Processing Times and Dates in Python
- time.strptime: convert a time string to a structured time object
- time.strftime: convert a time object to a string
- time.mktime/calendar.timegm: convert a time object to a number
- time.gmtime: convert a number to a time object

## Concept: unix time: the value is the number of seconds since Jan 1, 1970 in the UTC timezone

In [27]:
import time
import calendar

In [33]:
timeString = "2019-07-09 23:30:02"

In [34]:
timeStruct = time.strptime(timeString, "%Y-%m-%d %H:%M:%S")

In [35]:
timeStruct

time.struct_time(tm_year=2019, tm_mon=7, tm_mday=9, tm_hour=23, tm_min=30, tm_sec=2, tm_wday=1, tm_yday=190, tm_isdst=-1)

In [36]:
timeStruct.tm_wday

1

In [37]:
help(time.strptime)

Help on built-in function strptime in module time:

strptime(...)
    strptime(string, format) -> struct_time
    
    Parse a string to a time tuple according to a format specification.
    See the library reference manual for formatting codes (same as
    strftime()).
    
    Commonly used format codes:
    
    %Y  Year with century as a decimal number.
    %m  Month as a decimal number [01,12].
    %d  Day of the month as a decimal number [01,31].
    %H  Hour (24-hour clock) as a decimal number [00,23].
    %M  Minute as a decimal number [00,59].
    %S  Second as a decimal number [00,61].
    %z  Time zone offset from UTC.
    %a  Locale's abbreviated weekday name.
    %A  Locale's full weekday name.
    %b  Locale's abbreviated month name.
    %B  Locale's full month name.
    %c  Locale's appropriate date and time representation.
    %I  Hour (12-hour clock) as a decimal number [01,12].
    %p  Locale's equivalent of either AM or PM.
    
    Other codes may be available on yo

In [39]:
t1 = calendar.timegm(timeStruct)

In [41]:
t2 = time.mktime(timeStruct)

In [42]:
t1, t2

(1562715002, 1562729402.0)

In [43]:
# Unix timestamp five days after t1, with the offset expressed in seconds
# (5 days * 24 hours * 60 minutes * 60 seconds).
t1 + 5 * 24 * 60 * 60

1563147002

## Note:
### time.mktime assumes the structure is a local time whereas calendar.timegm assumes the structure is a UTC time (time.gmtime goes the other way: it converts a number back into a UTC structure)

In [44]:
time.gmtime(t1 + 60*60*24*5)

time.struct_time(tm_year=2019, tm_mon=7, tm_mday=14, tm_hour=23, tm_min=30, tm_sec=2, tm_wday=6, tm_yday=195, tm_isdst=0)

In [45]:
time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(t1 + 60*60*24*5))

'2019-07-14 23:30:02'

# Section Three: Time and Date Data

In [46]:
import json
# Same review file as in the first section.
# NOTE(review): absolute, user-specific path -- adjust for your machine.
path = "/Users/yaohanli/Downloads/yelp_dataset/review.json"
# Explicit UTF-8 so decoding does not depend on the platform default; the
# handle is read line by line by the loading loop that follows.
f = open(path, 'r', encoding = 'utf8')

In [47]:
# Load up to the first 50,000 reviews, one JSON document per line.
# readline() returns '' at end of file; stop there rather than passing an
# empty string to json.loads (which raises) when the file is shorter.
dataset = []
for _ in range(50000):
    line = f.readline()
    if not line:
        break
    dataset.append(json.loads(line))

In [48]:
dataset[0]

{'review_id': 'Q1sbwvVQXV2734tPgoKj4Q',
 'user_id': 'hG7b0MtEbXx5QzbzE6C_VA',
 'business_id': 'ujmEBvifdJM6h6RLv4wQIg',
 'stars': 1.0,
 'useful': 6,
 'funny': 1,
 'cool': 0,
 'text': 'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.',
 'date': '2013-05-07 04:34:36'}

In [49]:
timeString = dataset[0]['date']
print(timeString)

2013-05-07 04:34:36


In [50]:
import time

In [51]:
timeStruct = time.strptime(timeString, "%Y-%m-%d %H:%M:%S")
timeStruct

time.struct_time(tm_year=2013, tm_mon=5, tm_mday=7, tm_hour=4, tm_min=34, tm_sec=36, tm_wday=1, tm_yday=127, tm_isdst=-1)

In [52]:
help(time.strptime)

Help on built-in function strptime in module time:

strptime(...)
    strptime(string, format) -> struct_time
    
    Parse a string to a time tuple according to a format specification.
    See the library reference manual for formatting codes (same as
    strftime()).
    
    Commonly used format codes:
    
    %Y  Year with century as a decimal number.
    %m  Month as a decimal number [01,12].
    %d  Day of the month as a decimal number [01,31].
    %H  Hour (24-hour clock) as a decimal number [00,23].
    %M  Minute as a decimal number [00,59].
    %S  Second as a decimal number [00,61].
    %z  Time zone offset from UTC.
    %a  Locale's abbreviated weekday name.
    %A  Locale's full weekday name.
    %b  Locale's abbreviated month name.
    %B  Locale's full month name.
    %c  Locale's appropriate date and time representation.
    %I  Hour (12-hour clock) as a decimal number [01,12].
    %p  Locale's equivalent of either AM or PM.
    
    Other codes may be available on yo

In [53]:
time.strptime("21:36:18, 28/5/2019", "%H:%M:%S, %d/%m/%Y")

time.struct_time(tm_year=2019, tm_mon=5, tm_mday=28, tm_hour=21, tm_min=36, tm_sec=18, tm_wday=1, tm_yday=148, tm_isdst=-1)

In [54]:
# mktime interprets timeStruct as local time, so the resulting Unix
# timestamp depends on the timezone of the machine running the notebook
# (calendar.timegm is the variant that treats the structure as UTC).
timeInt = time.mktime(timeStruct)

In [55]:
timeInt

1367915676.0

In [57]:
timeInt2 = time.mktime(time.strptime(dataset[99]['date'], "%Y-%m-%d %H:%M:%S"))

In [58]:
timeInt2

1454134083.0

In [61]:
timeDiff = timeInt - timeInt2

In [62]:
timeDiff

-86218407.0

In [63]:
timeDiff/(60*60)

-23949.5575

In [64]:
timeDiff/(60*60*24)

-997.8982291666666

In [65]:
# timeInt was produced by mktime (local-time interpretation) while gmtime
# renders UTC, so the hour displayed is shifted from the parsed 04:34:36 by
# the local UTC offset (8 instead of 4 in the recorded output).
time.gmtime(timeInt)

time.struct_time(tm_year=2013, tm_mon=5, tm_mday=7, tm_hour=8, tm_min=34, tm_sec=36, tm_wday=1, tm_yday=127, tm_isdst=0)

In [66]:
# Same instant one week later: add 7 days' worth of seconds
# (7 days * 24 hours * 60 minutes * 60 seconds) before converting to UTC.
time.gmtime(timeInt + 7 * 24 * 60 * 60)

time.struct_time(tm_year=2013, tm_mon=5, tm_mday=14, tm_hour=8, tm_min=34, tm_sec=36, tm_wday=1, tm_yday=134, tm_isdst=0)