# Goal: Create clusters of 5 grams with different center
1. Investigate format of the .gz file
2. Filter the bundles by each word
2. Create first letter verification
Right now only focus on the 5 word bundles, excluding START and END

In [6]:
import os
import gzip
import json
import re
from tqdm import tqdm
#from nltk import WordNetLemmatizer
#lemmatizer = WordNetLemmatizer()
#import sys
#!{sys.executable} -m pip install Unidecode
from unidecode import unidecode
from collections import OrderedDict

### [NLTK POS Lemmatizer](https://www.nltk.org/_modules/nltk/stem/wordnet.html)

The Part Of Speech tag. Valid options are `"n"` for nouns,
            `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
            for [satellite adjectives](https://stackoverflow.com/questions/18817396/what-part-of-speech-does-s-stand-for-in-wordnet-synsets).  
  
  Syntax:
`lemmatizer.lemmatize(word)`

### [Google Tags](https://books.google.com/ngrams/info)
These tags can either stand alone (\_PRON\_) or can be appended to a word (she_PRON)
- _NOUN_		
- _VERB_	
- _ADJ_	adjective
- _ADV_	adverb
- _PRON_	pronoun
- _DET_	determiner or article
- _ADP_	an adposition: either a preposition or a postposition
- _NUM_	numeral
- _CONJ_	conjunction
- _PRT_	particle

In [None]:
import string
PUNCTUATION = set(char for char in string.punctuation).union({'“','”'})
DIGITS = set(string.digits)
VOWELS = set("aeiouyAEIOUY")
#Excluding '_' (underscore) from DASHES precludes the tagged 1grams "_NOUN", add it to also include the tagged 1grams
DASHES = {'—','–','—','―','‒','-'}
PUNCTUATION.difference_update(DASHES)
STOPS = PUNCTUATION.union(DIGITS)
#GOOGLE_TAGS = {'_NOUN','_VERB','_ADJ','_ADV','_PRON','_DET','_ADP','_NUM','_CONJ','_PRT'}

[How to open Gzip files](https://stackoverflow.com/questions/31028815/how-to-unzip-gz-file-using-python)

In [None]:
def open_gzip(directory,file_path):
    with gzip.open(directory+file_path,'r') as f_in:
        rows = [x.decode('utf8').strip() for x in f_in.readlines()]
    return rows

In [None]:
def csv2tuple(string):
    return int(string.split(',')[1])
    #Original
    #year,match_count,volume_count = tuple(string.split(','))
    #return int(year),int(match_count),int(volume_count)

In [None]:
def save_json(ngram_dict,directory,file_path):
    output = file_path[:-3]+'_CLUSTERED.json'
    if len(ngram_dict)>0:
        with open(directory+output, 'w') as f_out:
            json.dump(ngram_dict, f_out)
        print('SAVED: ',output,len(ngram_dict))
    else:
        print('unigram dict empty',output)

In [None]:
def pentagram_tests(pentagram_l):
    for gram in pentagram_l:
        #Checks each character in the gram against the characters in the STOP set. (character level filtering) - no punctuation or digits allowed
        if set(gram).intersection(STOPS):
            return False

        #Excluded all of the form _PRON_
        if gram[0] == '_' and gram[-1] == '_':
            return False

        #must have a vowel (presupposes that it must also have a letter of the alphabet inside)
        if not set(gram).intersection(VOWELS):
            return False #Rewrite the alphabet one, I think this is better

        #Words cannot start or end with dashes
        if gram[0] in DASHES or gram[-1] in DASHES:
            return False

        #must have 0 non-english letters
        test = unidecode(gram, errors='replace')
        if test != gram:
            return False

        #Can implement more tests here if you need to do more filtering
    return True

In [None]:
def clear_dict(dictionary):
    for k, v in list(dictionary.items()):
        #set the minumim number of bundles here
        if v['bundles']<2:
            del dictionary[k]

In [None]:
def preprocess_5grams(directory,file_path):
    
    rows = open_gzip(directory,file_path)
    pentagram_dict = dict()
    '''
    This implementation uses 
    pentagram_dict = {'cluster':{'bundles': # of bundles,
                                 'total_uses':total uses of all of the bundles,
                                 'words':{word1:usage1,
                                          word2:usage2,
                                          ...}
                                 ...},
                        ...}
    where 'cluster' is a string of form: word 1 2 '_' 4 5
    '''
    #FIRST ROW, FIRST COLUMN, FIRST WORD, LOWERED
    #prev_first_word = rows[0].split('\t')[0].split()[0].lower()
    
    for row in tqdm(rows):
        columns = row.split('\t')
        
        pentagram = columns[0]
        #pentagram_l is the list version of 
        pentagram_l = pentagram.lower().split()
        #If it passes the word tests continue parsing the pentagram
        if pentagram_tests(pentagram_l):
            word = pentagram_l[2]
            cluster = pentagram_l[0]+' '+pentagram_l[1]+' _ '+pentagram_l[3]+' '+pentagram_l[4]
            total_count = 0
            for entry in columns[1:]:
                match_count = csv2tuple(str(entry))
                total_count+=match_count
            
            if cluster in pentagram_dict.keys():
                pentagram_dict[cluster]['bundles']+=1
                pentagram_dict[cluster]['total_uses']+=total_count
                pentagram_dict[cluster]['words'][word]=total_count
            else:
                pentagram_dict[cluster] = {'total_uses':total_count,
                                           'bundles':1,
                                           'words':{word:total_count},
                                           }
                prev_first_word = word
    
    #Filter out the insignificant ones
    clear_dict(pentagram_dict)
    #Save as JSON
    save_json(pentagram_dict,directory,file_path)

In [None]:
%%time
preprocess_5grams('./5grams/','5-19384-of-19423.gz')

In [3]:
def open_json(directory,file_path):
    with open(directory+file_path,'r') as f:
        dictionary = json.load(f)
        f.close()
    return dictionary

In [4]:
pentagrams = open_json('./5grams/','5-19384-of-19423_CLUSTERED.json')

In [7]:
def top_counts(dictionary,count_type,num_hits=10,head = True):
    return OrderedDict(sorted(dictionary.items(), key=lambda x: x[1][count_type], reverse=head)[:num_hits])

In [8]:
top = top_counts(pentagrams,'total_uses')

In [9]:
top

OrderedDict([('you the _ of the',
              {'total_uses': 1083810,
               'bundles': 2194,
               'words': {'owner': 3072,
                'rewards': 66,
                'clerk': 289,
                'morning': 403,
                'beings': 70,
                'actions': 106,
                'recital': 233,
                'service': 691,
                'comforts': 50,
                'shelter': 43,
                'history': 201,
                'overview': 102,
                'liberation': 55,
                'decoration': 160,
                'conclusion': 684,
                'illustration': 169,
                'identity': 872,
                'copie': 80,
                'fulfillment': 168,
                'face': 566,
                'definitions': 72,
                'cover': 48,
                'sequence': 298,
                'paper': 100,
                'inspiration': 89,
                'purposes': 187,
                'award': 118,
                

This section should be for streaming the \*\.gz files from google ngrams server

In [10]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

In [11]:
url = 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/eng-5-ngrams_exports.html'
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')

links = soup.find_all('li')
url_pattern = re.compile('(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+')
urls = [url_pattern.findall(str(link))[0] for link in links]

In [12]:
urls[:10]

['http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-00000-of-19423.gz',
 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-00001-of-19423.gz',
 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-00002-of-19423.gz',
 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-00003-of-19423.gz',
 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-00004-of-19423.gz',
 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-00005-of-19423.gz',
 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-00006-of-19423.gz',
 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-00007-of-19423.gz',
 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-00008-of-19423.gz',
 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-00009-of-19423.gz']

In [18]:
from io import BytesIO, TextIOWrapper,StringIO

In [13]:
for url in urls[:10]:
    gzipped_file = requests.get(url)
    print(gzipped_file,type(gzipped_file),gzipped_file.raw,type(gzipped_file.raw))
    break

<Response [200]> <class 'requests.models.Response'> <urllib3.response.HTTPResponse object at 0x7f7f01b61820> <class 'urllib3.response.HTTPResponse'>


In [16]:
print(gzipped_file.raw.read(1000),type(gzipped_file.raw.read(),))

b'' <class 'bytes'>


In [19]:
response = requests.post(urls[0])
buffer_data = BytesIO(response.content)
# Lets decompress
f = gzip.GzipFile(fileobj=buffer_data)
for row in f.readlines():
    print(row,type(row))
    break

BadGzipFile: Not a gzipped file (b'<?')

In [None]:
i=1
for row in gzipped_file.readlines():
    # filter out keep-alive new lines
    if i==1:
        print(row,type(row))
    with gzip.open(BytesIO(row)) as f:
        #rows = [x.decode('utf8').strip() for x in f_in.readlines()]
        for line in f:
            string = line.decode('utf8').strip()
            print(string)
    break
    i+=1

In [None]:
for url in urls[:10]:
    r = requests.get(url, stream=True)
    for chunk in r.raw.stream(1024):
        #chunk.decode('utf8').strip()
        #BytesIO(chunk).decode('utf8').strip()
        wrapper = TextIOWrapper(BytesIO(chunk), encoding='utf-8')
        print(wrapper.read())
        #print(chunk,type(chunk))
        break
    break

In [None]:
for url in urls[:10]:
    r = requests.get(url, stream=True)
    for row in r.iter_lines():
        # filter out keep-alive new lines
        print(row,type(row))
        with gzip.open(BytesIO(row)) as f:
            #rows = [x.decode('utf8').strip() for x in f_in.readlines()]
            for line in f:
                string = line.decode('utf8').strip()
                print(string)
                break
            break
        break
    break

In [None]:
for url in urls[:10]:
    r = requests.get(url)
    for row in r.iter_lines():
        # filter out keep-alive new lines
        print(line,type(line))
        with gzip.open(BytesIO(line)) as f:
            #rows = [x.decode('utf8').strip() for x in f_in.readlines()]
            for line in f:
                string = line.decode('utf8').strip()
                print(string)
        break
    break

In [None]:
for url in urls[:10]:
    r = requests.get(url, stream=True)
    for chunk in r.iter_content(chunk_size=336):
        with gzip.open(BytesIO(chunk)) as f:
            #rows = [x.decode('utf8').strip() for x in f_in.readlines()]
            for line in f:
                string = line.decode('utf8').strip()
                print(string)
            #rows = [x.decode('utf8').strip() for x in f_in.readlines()]
            #break
    #for chunk in r.raw.stream(decode_content=False):
        #print(chunk,type(chunk))
    break
    '''
    with open(local_filename, 'wb') as f:
        for chunk in r.raw.stream(1024, decode_content=False):
            if chunk:
                f.write(chunk)
    '''

In [None]:
with gzip.open(directory+file_path,'r') as f_in:
    rows = [x.decode('utf8').strip() for x in f_in.readlines()]

In [None]:
directory = './5grams/'
files = os.listdir(directory)
for file_path in files:
    if '.gz' in file_path:
        rows = open_gzip(directory,file_path)
        #preprocess_ngrams(directory,file_path)
        break

In [None]:
rows[:10]