In [42]:
import tokenize

def get_token_list(fname):
    with tokenize.open(fname) as f:
        tokens = tokenize.generate_tokens(f.readline)
        tokenlist = [token for token in tokens]
    return tokenlist

def get_token_ngrams(tokenlist, n):
    ngrams = []
    for i in range(len(tokenlist) - n + 1):
        ngram = tuple(tokenlist[i:i + n])
        ngrams.append(ngram)
    return ngrams

def get_all_token_ngrams(tokenlist, min_n, max_n):
    all_ngrams = []
    for n in range(min_n, max_n+1):
        ngrams = get_token_ngrams(tokenlist, n)
        all_ngrams.append(ngrams)
    return all_ngrams

tokenlist = get_token_list('bubblesort1.py')
ngrams = get_token_ngrams(tokenlist, 5)


In [43]:
from dataclasses import dataclass

@dataclass
class NGramData:
    file: str
    ngram: list

def make_ngram_dictionary(ngrams, filename):
    ngram_dict = {}
    for ngram in ngrams:
        tokentypes = tuple( [token.type for token in ngram] )
        data = NGramData(file=filename, ngram=ngrams)
        if ngram in ngram_dict:
            ngram_dict[tokentypes].append(data)
        else:
            ngram_dict[tokentypes] = [data]
    return ngram_dict

ngram_dict = make_ngram_dictionary(ngrams, 'bubblesort1.py')
print(ngram_dict)

{(1, 1, 54, 1, 54): [NGramData(file='bubblesort1.py', ngram=[(TokenInfo(type=1 (NAME), string='def', start=(1, 0), end=(1, 3), line='def bubble_sort(arr):\n'), TokenInfo(type=1 (NAME), string='bubble_sort', start=(1, 4), end=(1, 15), line='def bubble_sort(arr):\n'), TokenInfo(type=54 (OP), string='(', start=(1, 15), end=(1, 16), line='def bubble_sort(arr):\n'), TokenInfo(type=1 (NAME), string='arr', start=(1, 16), end=(1, 19), line='def bubble_sort(arr):\n'), TokenInfo(type=54 (OP), string=')', start=(1, 19), end=(1, 20), line='def bubble_sort(arr):\n')), (TokenInfo(type=1 (NAME), string='bubble_sort', start=(1, 4), end=(1, 15), line='def bubble_sort(arr):\n'), TokenInfo(type=54 (OP), string='(', start=(1, 15), end=(1, 16), line='def bubble_sort(arr):\n'), TokenInfo(type=1 (NAME), string='arr', start=(1, 16), end=(1, 19), line='def bubble_sort(arr):\n'), TokenInfo(type=54 (OP), string=')', start=(1, 19), end=(1, 20), line='def bubble_sort(arr):\n'), TokenInfo(type=54 (OP), string=':', 

In [47]:
def compare_ngram_dictionaries(dict1, dict2):
    shared_ngrams = {}
    for ngram in dict1:
        if ngram in dict2:
            shared_ngrams[ngram] = (dict1[ngram], dict2[ngram])
    frac_copies_dict1 = len(shared_ngrams) / len(dict1)
    frac_copies_dict2 = len(shared_ngrams) / len(dict2)
    return shared_ngrams, frac_copies_dict1, frac_copies_dict2

tokenlist2 = get_token_list('bubblesort2.py')
ngrams2 = get_token_ngrams(tokenlist2, 5)
ngram_dict2 = make_ngram_dictionary(ngrams2, 'bubblesort2.py')

shared_ngrams, frac_copies_dict1, frac_copies_dict2 = compare_ngram_dictionaries(ngram_dict, ngram_dict2)
print(frac_copies_dict1, frac_copies_dict2)


0.6888888888888889 0.5535714285714286
