In [1]:
import numpy as np
import re
from numba import cuda, types, uint32, int32, jit, njit
import itertools
from collections import Counter
import string
import time

In [2]:
with open("data/wikipedia_50GB/file9", 'r', encoding='utf-8') as file:
        text = file.read()

In [3]:
@cuda.jit(device=True)
def isalpha(c):
    if c >= ord('A') and c <= ord('Z'):
        return True
    elif c >= ord('a') and c <= ord('z'):
        return True
    else:
        return False

@cuda.jit(device=True)
def tolower(c):
    if c >= ord('A') and c <= ord('Z'):
        return c+32
    else:
        return c

@cuda.jit(device=True)
def hash_byte_array(input_array):
    hash_value = np.uint32(5381) 
    for i in range(input_array.size):
        hash_value = ((hash_value << 5) + hash_value) + np.uint32(input_array[i])
    return hash_value

@cuda.jit
def map(text, counts, hash_start, hash_length, hash_size, chunk_size):
    idx = cuda.grid(1)
    if idx*chunk_size >= text.shape[0]:
        return
    chunk_end = (idx+1)*chunk_size
    if chunk_end > text.shape[0]:
        chunk_end = text.shape[0]

    s = -1
    t = idx*chunk_size
    if idx == 0 or (not isalpha(text[t-1])):
        if isalpha(text[t]):
            s = t
    else:
        while (t < chunk_end and isalpha(text[t])):
            t += 1
    while (t < chunk_end):
        if isalpha(text[t]) and (s == -1):
                s = t
        elif (not isalpha(text[t])) and (s != -1):
            word = text[s:t]
            for i in range(len(word)):
                word[i] = tolower(word[i])
            hash_value = hash_byte_array(word) % hash_size
            cuda.atomic.add(counts, hash_value, 1)
            cuda.atomic.exch(hash_start, hash_value, s)
            cuda.atomic.exch(hash_length, hash_value, t-s)
            s = -1
        t += 1
    if s != -1:
        while (t < text.shape[0]) and (isalpha(text[t])):
            t += 1
        word = text[s:t]
        for i in range(len(word)):
            word[i] = tolower(word[i])
        hash_value = hash_byte_array(word) % hash_size
        cuda.atomic.add(counts, hash_value, 1)
        cuda.atomic.exch(hash_start, hash_value, s)
        cuda.atomic.exch(hash_length, hash_value, t-s)
        s = -1

In [4]:
start_time=time.time()
hash_size = 65536
chunk_size = 128

char_array = np.frombuffer(text.encode('utf-8'), dtype=np.byte)

d_text = cuda.to_device(char_array)

d_counts = cuda.device_array(hash_size, dtype=np.int32)
d_hash_start = cuda.device_array(hash_size, dtype=np.int32)
d_hash_length = cuda.device_array(hash_size, dtype=np.int32)

threads_per_block = 256
blocks_per_grid = (len(char_array) + threads_per_block - 1) // threads_per_block


map[blocks_per_grid, threads_per_block](d_text, d_counts, d_hash_start, d_hash_length, hash_size, chunk_size)



char_array = d_text.copy_to_host()
counts = d_counts.copy_to_host()
hash_start = d_hash_start.copy_to_host()
hash_length = d_hash_length.copy_to_host()
end_time = time.time()
print(end_time-start_time)

2.958202600479126


In [7]:
def cpu_word_count(text):
    start_time = time.time()
    text = text.lower()
    words = re.findall(r'[a-zA-Z]+', text)
    word_counts = {}
    for word in words:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
    end_time = time.time()
    cpu_time = end_time-start_time
    return word_counts, cpu_time
    
word_counts, cpu_time = cpu_word_count(text)
print(cpu_time)
print(dict(itertools.islice(word_counts.items(), 100)))
#print("Word Frequencies:", frequencies)

82.14207768440247
{'title': 3560369, 'wikipedia': 3134798, 'general': 72583, 'disclaimer': 214930, 'disclaimers': 71656, 'a': 10847552, 'li': 2597008, 'ul': 606332, 'div': 3232705, 'body': 144046, 'html': 3287356, 'doctype': 71756, 'public': 73034, 'w': 445402, 'c': 1289100, 'dtd': 215280, 'xhtml': 287033, 'transitional': 143534, 'en': 942504, 'http': 2947564, 'www': 1352862, 'org': 1598375, 'tr': 273747, 'xmlns': 71755, 'xml': 71850, 'lang': 144173, 'dir': 82642, 'ltr': 82634, 'head': 143805, 'meta': 149495, 'equiv': 143511, 'content': 668479, 'type': 1154485, 'text': 2251624, 'charset': 71756, 'utf': 143406, 'headlinks': 71635, 'removed': 110018, 'link': 179329, 'rel': 1077915, 'shortcut': 71657, 'icon': 95313, 'href': 4729256, 'misc': 214974, 'favicon': 71635, 'ico': 71635, 'user': 1922210, 'talk': 764100, 'the': 1229258, 'free': 383283, 'encyclopedia': 202963, 'style': 1097464, 'css': 1002903, 'cdata': 147010, 'import': 501458, 'skins': 788030, 'offline': 286572, 'main': 218436, 's

In [27]:
def fast_word_count(text):
    start_time = time.time()
    text = text.lower()
    
    # Use regular expressions to efficiently find all words
    words = re.findall(r'[a-zA-Z]+', text)
    
    # Count each word using Counter, which is highly optimized for this task
    word_counts = Counter(words)
    end_time = time.time()
    cpu_time = end_time-start_time
    return word_counts, cpu_time

word_counts, cpu_time =fast_word_count(text)
print(cpu_time)

51.444968700408936


In [28]:
print(word_counts.most_common(100))

[('a', 10847552), ('href', 4729256), ('title', 3560369), ('html', 3287356), ('div', 3232705), ('wikipedia', 3134798), ('class', 3060715), ('http', 2947564), ('id', 2903302), ('articles', 2744422), ('li', 2597008), ('text', 2251624), ('user', 1922210), ('org', 1598375), ('p', 1567121), ('n', 1536527), ('f', 1399088), ('www', 1352862), ('c', 1289100), ('b', 1237756), ('e', 1231657), ('the', 1229258), ('type', 1154485), ('h', 1130577), ('style', 1097464), ('ip', 1090719), ('rel', 1077915), ('script', 1010716), ('css', 1002903), ('external', 953003), ('en', 942504), ('nofollow', 940582), ('to', 932686), ('whois', 921209), ('skins', 788030), ('src', 765500), ('talk', 764100), ('o', 728243), ('you', 713455), ('net', 710697), ('wiki', 681324), ('svg', 678886), ('content', 668479), ('if', 667301), ('em', 665817), ('span', 662698), ('of', 641393), ('ul', 606332), ('amp', 597699), ('i', 593601), ('javascript', 576896), ('s', 576322), ('bin', 569015), ('d', 543913), ('page', 515208), ('query', 50

In [9]:
indices = np.argsort(counts)[::-1]
gpu_dict = {}
for i in range(100):
    idx = indices[i]
    word = char_array[hash_start[idx]:hash_start[idx]+hash_length[idx]]
    word = word.tobytes().decode('utf-8')
    gpu_dict[word] = counts[idx]
print(gpu_dict)

{'a': 10847555, 'href': 4729256, 'title': 3560732, 'html': 3287361, 'div': 3232825, 'wikipedia': 3134798, 'class': 3060769, 'http': 2947564, 'id': 2903468, 'articles': 2744422, 'li': 2598177, 'text': 2251698, 'user': 1922284, 'org': 1598757, 'p': 1568561, 'n': 1536527, 'f': 1399091, 'www': 1352867, 'c': 1289103, 'b': 1237757, 'e': 1231666, 'the': 1229258, 'type': 1154485, 'h': 1130580, 'style': 1097564, 'ip': 1091576, 'rel': 1077927, 'script': 1010716, 'css': 1002968, 'external': 953105, 'en': 942722, 'nofollow': 940637, 'to': 932689, 'whois': 921210, 'skins': 788071, 'src': 765510, 'talk': 764113, 'o': 728266, 'you': 713470, 'net': 710714, 'wiki': 681324, 'svg': 678887, 'content': 668687, 'if': 667302, 'em': 665865, 'span': 662704, 'of': 641398, 'ul': 606344, 'amp': 597718, 'i': 593581, 'about': 592784, 'javascript': 576901, 's': 576416, 'bin': 569015, 'd': 543928, 'page': 515262, 'query': 508237, 'ie': 501521, 'import': 501458, 'png': 468123, 'image': 464182, 'by': 462513, 'com': 459

In [24]:
print(char_array[2])

105
