# Google Ngrams Analysis
## An Evolutionary Investigation



[How to open Gzip files](https://stackoverflow.com/questions/31028815/how-to-unzip-gz-file-using-python)
[Original Ngrams analysis](https://github.com/Aaronasnx/Google-preprocessing/blob/main/ngram%20project.ipynb)

In [1]:
import os
import gzip
import json

from nltk import WordNetLemmatizer
lem = WordNetLemmatizer()

In [2]:
import string
PUNCTUATION = set(char for char in string.punctuation).union({'“','”'})
DIGITS = {str(i) for i in range(10)}
#Excluding '_' (underscore) from DASHES precludes the tagged 1grams "_NOUN", add it to also include the tagged 1grams
DASHES = {'—','–','—','―','‒','-'}
PUNCTUATION.difference_update(DASHES)
STOPS = PUNCTUATION.union(DIGITS)

In [3]:
def csv2tuple(string):
    year,match_count,volume_count = tuple(string.split(','))
    return int(year),int(match_count),int(volume_count)

In [4]:
def preprocess_ngrams(directory,file_path):
    with gzip.open(directory+file_path,'r') as f_in:
        rows = [x.decode('utf8').strip() for x in f_in.readlines()]

    ngram_dict = dict()

    #This implementation uses {1gram:{year:match_count ...} ...}
    i=0
    for row in rows:
        columns = row.split('\t')
        #unigram is the first entry, the rest of the entries are of the form year,match_count,volume_count\t n times, where n is variable each line
        unigram = columns[0].lower().strip()
        #If it passes the initial word test continue parsing, checks each character in the unigram against the characters in the STOP set. (character level filtering)
        if not set(unigram).intersection(STOPS):
            #Parse the new entry and create a list of records in form [...[year, match_count]...]
            records = dict()
            for entry in columns[1:]:
                year,match_count,volume_count = csv2tuple(str(entry))
                if year>1800 and volume_count>1:
                    records[year] = match_count

            #Modify the dictionary if new entry is already there, else just add it as a new unigram:records to the dict
            if unigram in ngram_dict.keys():
                #accessing the ngram dictionary and seeing if each year is present, if so add match count, else add a new record entry to the dictionary.
                for yr, match_ct in records.items(): #each record should be of the form {year, match_count}
                    #If the year in the new record is in the dict for this 1gram, then find where it is.
                    if yr in ngram_dict[unigram].keys():
                        ngram_dict[unigram][yr] += match_ct
                    else:
                        #This just adds the record to the end, will need to sort later
                        ngram_dict[unigram][yr] = match_ct
            else:
                ngram_dict[unigram] = records

        i+=1
        if i%500000==0:
            print(i)
            break

    print(file_path,len(ngram_dict))
    
    #Save as JSON
    with open(directory+file_path[:-3]+'-untagged-testing.json', 'w') as f_out:
        json.dump(ngram_dict, f_out)

In [5]:
directory = './Ngrams/'
files = os.listdir(directory)
for file_path in files:
    if '.gz' in file_path:
        preprocess_ngrams(directory,file_path)

500000
1000000
1500000
2000000
1-00000-of-00024.gz 930
500000
1000000
1500000
2000000
2500000
3000000
1-00015-of-00024.gz 1004792
500000
1000000
1500000
2000000
2500000
3000000
1-00009-of-00024.gz 1034189
500000
1000000
1500000
2000000
2500000
3000000
1-00010-of-00024.gz 1135827
500000
1000000
1500000
2000000
2500000
3000000
1-00022-of-00024.gz 1159144
500000
1000000
1500000
2000000
2500000
3000000
1-00019-of-00024.gz 1075576
500000
1000000
1500000
2000000
2500000
3000000
1-00005-of-00024.gz 0
500000
1000000
1500000
2000000
2500000
3000000
1-00006-of-00024.gz 539327
500000
1000000
1500000
2000000
2500000
3000000
1-00021-of-00024.gz 1012273
500000
1000000
1500000
2000000
2500000
3000000
1-00013-of-00024.gz 1162887
500000
1000000
1500000
2000000
2500000
3000000
1-00016-of-00024.gz 1144314
500000
1000000
1500000
2000000
2500000
3000000
1-00003-of-00024.gz 0
500000
1000000
1500000
2000000
2500000
3000000
1-00004-of-00024.gz 0
500000
1000000
1500000
2000000
2500000
3000000
1-00018-of-00024.