In [1]:
import csv
import glob
import gzip
import os
import string
import threading
import time
import urllib.request

import pandas as pd
import requests

# procedure

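Stream the small n-gram files (smaller than the size limit checked in `read_ngram`, currently 1000 MiB) directly from their URLs,
aggregate the counts per n-gram, and write the result to a two-column CSV (`Ngram`, `Freq`).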

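Download the big n-gram files (1000 MiB or larger, per the size check in `read_ngram`) to local disk,
then aggregate the counts per n-gram and write the result to a two-column CSV (`Ngram`, `Freq`).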

Clean the n-gram column: keep only the n-grams whose tokens are all alphanumeric (letters and digits).

In [2]:
def read_ngram(url, big_files, max_bytes=1000 * 1048576):
    """Stream one gzipped n-gram file from ``url`` and total its match counts.

    Each non-empty line is expected to hold four tab-separated fields:
    ``ngram``, ``year``, ``match_count``, ``volume_count``. Counts are summed
    per n-gram over all lines; n-grams containing ``"_"`` (POS-tagged entries)
    are skipped.

    Parameters
    ----------
    url : str
        Address of the ``.gz`` file.
    big_files : list
        Mutated in place: ``url`` is appended when the file is not streamed
        (too large, or size unknown) so the caller can download it instead.
    max_bytes : int, optional
        Files whose ``Content-Length`` is at or above this limit are deferred.
        Defaults to 1000 MiB.

    Returns
    -------
    dict
        Mapping ``ngram -> summed match_count``; empty when the file was
        deferred to ``big_files``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If a line does not have exactly four tab-separated fields.
    """
    ngram_dict = {}

    with requests.get(url, stream=True) as res:
        # Fail on 404/5xx instead of trying to gunzip an error page.
        res.raise_for_status()

        size = res.headers.get('Content-Length')
        if size is None or int(size) >= max_bytes:
            # Unknown or oversized payload: leave it to the download-to-disk path.
            big_files.append(url)
            return ngram_dict

        # Decompress straight from the response stream, one line at a time,
        # so neither the compressed nor the decompressed file is held in
        # memory as a whole.
        with gzip.GzipFile(fileobj=res.raw) as extracted:
            for raw_line in extracted:
                line = raw_line.decode('utf-8').rstrip('\r\n')
                if not line:
                    continue
                ngram, year, match_count, volume_count = line.split('\t')
                # remove POS tags
                if "_" not in ngram:
                    ngram_dict[ngram] = ngram_dict.get(ngram, 0) + int(match_count)
    return ngram_dict

def read_ngram_local(file):
    """Read a gzipped n-gram file from disk and total its match counts.

    Each non-empty line is expected to hold four tab-separated fields:
    ``ngram``, ``year``, ``match_count``, ``volume_count``. Counts are summed
    per n-gram over all lines; n-grams containing ``"_"`` (POS-tagged entries)
    are skipped.

    Parameters
    ----------
    file : str
        Path to the ``.gz`` file.

    Returns
    -------
    dict
        Mapping ``ngram -> summed match_count``.

    Raises
    ------
    ValueError
        If a line does not have exactly four tab-separated fields.
    """
    ngram_dict = {}
    # Decode explicitly as UTF-8: text mode otherwise falls back to the locale
    # encoding, which differs from the UTF-8 decoding used by read_ngram.
    with gzip.open(file, 'rt', encoding='utf-8') as f:
        for line in f:
            # Lines arrive with their newline attached, so strip it before
            # testing for emptiness (and keep it out of the last field).
            line = line.rstrip('\r\n')
            if not line:
                continue
            ngram, year, match_count, volume_count = line.split('\t')
            # remove POS tags
            if "_" not in ngram:
                ngram_dict[ngram] = ngram_dict.get(ngram, 0) + int(match_count)
    return ngram_dict

def write_ngram(d, n, code, out_dir="E:/google_ngram"):
    """Write an n-gram count mapping to ``{out_dir}/{n}gram/{n}gram-{code}.csv``.

    Parameters
    ----------
    d : dict
        Mapping ``ngram -> count``; becomes the ``Ngram`` and ``Freq`` columns.
    n : int
        N-gram order; selects the ``{n}gram`` sub-folder and file prefix.
    code : str
        Shard code used in the file name.
    out_dir : str, optional
        Root output folder. Defaults to the original hard-coded location.
    """
    target_dir = f"{out_dir}/{n}gram"
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(target_dir, exist_ok=True)
    df = pd.DataFrame(list(d.items()), columns=['Ngram', 'Freq'])
    df.to_csv(f"{target_dir}/{n}gram-{code}.csv", encoding="utf-8-sig", index=False)
    
def download_ngram(url, dest_dir="F:/big_file/"):
    """Download ``url`` into ``dest_dir`` and print how long the fetch took.

    The file keeps the last path component of ``url`` as its name.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    dest_dir : str, optional
        Folder to save into; created if missing. The default is the folder
        that the local-processing cell globs (``F:/big_file/``), so downloaded
        files are found by that cell.
    """
    # Time the download locally instead of reading a `start` global set by
    # whichever cell happened to run last.
    began = time.time()
    name = url.split("/")[-1]
    os.makedirs(dest_dir, exist_ok=True)
    urllib.request.urlretrieve(url, os.path.join(dest_dir, name))
    print("'%s' fetched in %ss" % (name, (time.time() - began)))

In [11]:
# Aggregate every locally stored n-gram archive of order `n` into a CSV.
n = 3
files = glob.glob(f"F:/big_file/*{n}gram*")

for file in files:
    start = time.time()
    # Shard code: the text after the last "-" up to the first "." that follows.
    tail = file.rsplit("-", 1)[-1]
    code = tail.split(".", 1)[0]
    print(f"Working on {file} ......")
    ngram_dict = read_ngram_local(file)
    write_ngram(ngram_dict, n, code)
    # Drop the counts before the next (potentially very large) file is read.
    ngram_dict.clear()
    end = time.time()
    print(f"It took me {end - start:.2f} seconds")

print(files)

Working on F:/big_file\googlebooks-eng-all-3gram-20120701-a_.gz ......
It took me 2170.08 seconds
Working on F:/big_file\googlebooks-eng-all-3gram-20120701-i_.gz ......
It took me 435.53 seconds
['F:/big_file\\googlebooks-eng-all-3gram-20120701-a_.gz', 'F:/big_file\\googlebooks-eng-all-3gram-20120701-i_.gz']


In [None]:
# read online google ngram data and download big files
letters = string.ascii_lowercase

for n in [2, 3, 4, 5]:
    # URLs that could not be streamed and must be downloaded to disk instead.
    big_files = []

    for i in letters:
        start = time.time()
        code = i + '_'
        url = f"http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-{n}gram-20120701-{code}.gz"
        print(f"Working on {url} ......")
        try:
            ngram_dict = read_ngram(url, big_files)
            write_ngram(ngram_dict, n, code)
            end = time.time()
            print(f"It took me {end - start:.2f} seconds")
            ngram_dict.clear()
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts
            # the run, and show why the file failed.
            # read_ngram may already have queued this URL; queueing it twice
            # would start two downloads writing to the same file.
            if url not in big_files:
                big_files.append(url)
            print(f"I can't open {url}: {exc!r}")

    print(big_files)

    print("Start downloading big files ......")
    start = time.time()
    # open multi threads: one download thread per queued URL
    threads = [threading.Thread(target=download_ngram, args=(url,)) for url in big_files]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    print("Elapsed Time: %s" % (time.time() - start))

In [None]:
# check if there is empty data
n = 4
files = glob.glob(f"E:/google_ngram/{n}gram/*.csv")

size = len(files)

for f in files:
    # Reading a single data row is enough to tell whether the file has any;
    # this avoids parsing each (potentially huge) CSV in full.
    df = pd.read_csv(f, nrows=1)
    if len(df) < 1:
        print(f)

In [20]:
# merge ngram files
def merge_ngram(i, n, files, out_dir="E:/google_ngram/clean"):
    """Concatenate several n-gram CSVs into ``{out_dir}/{n}gram/{n}gram_{i}.csv``.

    Rows are stacked in the order of ``files``; no de-duplication or
    re-aggregation is performed.

    Parameters
    ----------
    i : str
        Label used in the output file name (the initial letter being merged).
    n : int
        N-gram order; selects the ``{n}gram`` sub-folder and file prefix.
    files : list of str
        Paths of the CSV files to merge. If empty, nothing is written.
    out_dir : str, optional
        Root output folder. Defaults to the original hard-coded location.
    """
    if not files:
        # pd.concat raises on an empty sequence; report and skip instead.
        print(f"No files to merge for {n}gram_{i}; nothing written")
        return

    merge = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
    target_dir = f"{out_dir}/{n}gram"
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(target_dir, exist_ok=True)
    merge.to_csv(f"{target_dir}/{n}gram_{i}.csv", index=False, encoding="utf-8-sig")

letters = string.ascii_lowercase
n = 2

# Only the initial letter 'a' is merged here; loop over `letters` for a full run.
for i in ('a',):
    # Shard files for this letter end in the letter followed by one of a-z.
    files = glob.glob(f"E:/google_ngram/raw/{n}gram/*{i}[a-z].csv")
    print(f"Merging files with initial letter {i}")
    merge_ngram(i, n, files)

Merging files with initial letter a



In [None]:
def is_alnum(ngram):
    """Return True when every whitespace-separated token of ``ngram`` is alphanumeric.

    A string with no tokens (empty or whitespace only) yields True.
    """
    return all(token.isalnum() for token in ngram.split())

def clean_ngram(file):
    """Rewrite the CSV at ``file`` in place, keeping only rows whose ``Ngram`` passes ``is_alnum``."""
    frame = pd.read_csv(file)
    # The explicit comparison keeps the mask boolean even for an empty frame.
    keep = frame['Ngram'].apply(is_alnum) == True
    frame.loc[keep].to_csv(file, index = False, encoding = 'utf-8-sig')


for n in [2, 3, 4, 5]:
    # Restrict to CSV files, as the other cells do: clean_ngram overwrites
    # whatever it is given, so other entries in the folder must not be matched.
    files = glob.glob(f"E:/google_ngram/{n}gram/*.csv")
    for file in files:
        print(f"Working on {file} ...")
        start = time.time()
        clean_ngram(file)
        end = time.time()
        print(f"It took me {end - start:.2f} seconds")

In [None]:
# clean ngram and only retain alphabet and numerical
#
# NOTE: everything below is disabled. The two functions sit inside a
# triple-quoted string, so running this cell defines nothing.
#   * clean_ngram(df, n): filters rows with .str.isalnum() on the first n
#     '<Ordinal>_Word' columns. It has the same name as the clean_ngram(file)
#     function defined earlier and would replace it if re-enabled.
#   * split_ngram(df, n): splits 'Ngram' into one column per word, then splits
#     each on '_' into '<Ordinal>_Word' and '<Ordinal>_POS' columns.
'''

def clean_ngram(df, n):
    # select colnames based on ngram
    colnames = ['First_Word', 'Second_Word', 'Third_Word', 'Fourth_Word', 'Fifth_Word']
    colnames = colnames[:n]
    
    # keep rows only when all characters are alphabetic
    for col in colnames:
        df = df[df[col].str.isalnum()]
    
    return df

# split ngram into columns
def split_ngram(df, n):
    # select colnames based on ngram
    colnames = ['First', 'Second', 'Third', 'Fourth', 'Fifth']
    colnames = colnames[:n]
    
    # split ngram
    df[colnames] = df.Ngram.str.split(expand=True)
    del df['Ngram']
    
    # split POS tag with the word for each gram
    for col in colnames:  
        df[col + '_Word'] = df[col].str.split('_', expand = True)[0]
        df[col + '_POS'] = df[col].str.split('_', expand = True)[1]
        del df[col]
    
    return df

'''

In [16]:
def get_size(files):
    """Return the total number of data rows across the CSV files in ``files``."""
    return sum(len(pd.read_csv(path)) for path in files)

def check_data(files):
    """Print the first and last rows of every CSV file in ``files``."""
    for path in files:
        frame = pd.read_csv(path)
        for preview in (frame.head(), frame.tail()):
            print(preview)

In [None]:
# Preview the head and tail of every cleaned CSV for n-grams of order `n`.
n = 3
clean_pattern = f"E:/google_ngram/clean/{n}gram/*.csv"
files = glob.glob(clean_pattern)
check_data(files)