In [1]:
import numpy as np
import os
import threading
import operator
import pickle
import re
import pandas as pd


# Bounds of the observation window. The magnitudes look like Unix epoch
# seconds -- TODO confirm against the trace files.
# NOTE(review): neither constant is referenced in the visible part of this
# notebook; verify they are still needed.
start_time = 1190146243
end_time = 1192994591

# Dump and load functions

In [2]:
def dump_data(data, output_name):
    """Serialise ``data`` with pickle into the file ``output_name``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Nothing is returned.
    """
    with open(output_name, 'wb') as sink:
        pickle.Pickler(sink).dump(data)

def load_data(input_name):
    """Read back and return the object pickled in the file ``input_name``.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files this project produced itself.
    """
    with open(input_name, 'rb') as source:
        return pickle.load(source)

# Dump word frequencies (Only English)

In [3]:
def get_lemma_distribution(file_name):
    """Return the lemmas requested in one trace file, in file order.

    Each line is split on single spaces and its third field is treated as
    the URL. A line contributes a lemma only when characters 7-8 of the URL
    are ``'en'`` (presumably the language subdomain right after
    ``http://`` -- TODO confirm) and the part after the last ``/`` contains
    no ``:``. Duplicates are kept, so the result reflects request counts.

    Raises ``IndexError`` on a line with fewer than three fields.
    """
    collected = []
    with open(file_name) as trace:
        for record in trace:
            fields = record.strip().split(' ')
            url = fields[2]
            if url[7:9] == 'en':
                # Everything after the final slash is the page title.
                title = url[url.rfind('/') + 1:]
                if ':' not in title:
                    collected.append(title)
    return collected


def count_lemma(dictionary, lemmas):
    """Add one to ``dictionary[lemma]`` for every item of ``lemmas``.

    ``dictionary`` is updated in place (missing keys start at zero) and
    nothing is returned.
    """
    for word in lemmas:
        dictionary[word] = dictionary.get(word, 0) + 1

            
def get_lemma_counter(num, nworkers = 4):
    """Count lemma occurrences over a random sample of the trace files.

    Parameters
    ----------
    num : int
        Number of files to sample from ``./data/only_lemma/``; capped at the
        number of files present in that directory.
    nworkers : int, optional
        Maximum number of files processed concurrently.

    Returns
    -------
    dict
        Maps each lemma to the number of times it was seen in the sample.

    Raises
    ------
    ValueError
        If ``nworkers`` is smaller than 1.

    Notes
    -----
    If a worker raises, its traceback is reported by ``threading`` and the
    file is skipped; the remaining files are still processed.
    """
    if nworkers < 1:
        raise ValueError("nworkers must be at least 1")

    data_root = "./data/only_lemma/"
    data_files = os.listdir(data_root)
    counter = {}

    num = min(num, len(data_files))
    # Sample WITHOUT replacement: the default (replace=True) can pick the
    # same file several times, which double-counts it and skips others.
    indices = np.random.choice(len(data_files), num, replace=False)

    # The semaphore bounds concurrency without a busy-wait loop; the lock
    # protects the read-modify-write increments on the shared dict.
    slots = threading.Semaphore(nworkers)
    counter_lock = threading.Lock()

    def target(filename):
        try:
            lemmas = get_lemma_distribution(filename)
            with counter_lock:
                count_lemma(counter, lemmas)
        finally:
            # Always free the slot, otherwise a failing file would stall
            # the scheduling loop forever.
            slots.release()

    threads = []
    for j, i in enumerate(indices, start=1):
        slots.acquire()
        print("Processing file {}".format(j))
        t = threading.Thread(target = target, args = [data_root + data_files[i]])
        threads.append(t)
        t.start()

    for thread in threads:
        thread.join()
    return counter

def word_freq_dataframe(counter, num = 2000):
    """Build a Series of the most frequent words, sorted by descending count.

    Parameters
    ----------
    counter : dict
        Maps word -> count.
    num : int, optional
        How many of the top entries to examine. Entries whose word is the
        empty string are dropped *after* taking the top ``num``, so the
        result may contain fewer than ``num`` rows.

    Returns
    -------
    pandas.Series
        Counts (as ``int``) indexed by word.
    """
    sorted_dict = sorted(counter.items(), key=operator.itemgetter(1), reverse=True)
    data = []
    index = []
    # Slice instead of indexing with range(num): the counter may hold fewer
    # than ``num`` entries, which used to raise IndexError.
    for word, count in sorted_dict[:max(num, 0)]:
        if len(word) == 0:
            continue
        index.append(word)
        data.append(int(count))
    return pd.Series(data= data, index = index)

In [4]:
#counter = get_lemma_counter(1000, nworkers = 50)
#dump_data(counter,"data/dump/hot_words_all.pkl")

In [5]:
#word_freq_series = word_freq_dataframe(counter, 100000)
#dump_data(word_freq_series, "data/dump/hot_words_100000.pkl")

# Dump access number per second

In [5]:
from ipywidgets import IntProgress 

def get_time_distribution(file_name):
    """Return the timestamps found in one trace file, in file order.

    Each line is split on single spaces and its second field is parsed as a
    float.

    Raises ``IndexError`` on a line with fewer than two fields and
    ``ValueError`` when the second field is not a number.
    """
    # Use a context manager so the handle is closed deterministically; the
    # bare open() used before leaked one descriptor per file.
    with open(file_name) as trace:
        return [float(line.strip().split(' ')[1]) for line in trace]

def count_timestamp(dictionary, times):
    """Tally how many entries of ``times`` fall into each whole second.

    Every value is truncated with ``int()`` and used as the key whose count
    in ``dictionary`` is incremented (missing keys start at zero). The
    dictionary is updated in place and nothing is returned.
    """
    for stamp in times:
        second = int(stamp)
        dictionary[second] = dictionary.get(second, 0) + 1

def get_timestamp_counter(num, nworkers = 4):
    """Count accesses per whole second over a random sample of trace files.

    Parameters
    ----------
    num : int
        Number of files to sample from ``./data/only_lemma/``; capped at the
        number of files present in that directory.
    nworkers : int, optional
        Maximum number of files processed concurrently.

    Returns
    -------
    dict
        Maps each integer second to the number of accesses seen in it.

    Raises
    ------
    ValueError
        If ``nworkers`` is smaller than 1.

    Notes
    -----
    An ``IntProgress`` widget is displayed and advanced once per scheduled
    file. If a worker raises, its traceback is reported by ``threading``
    and the file is skipped; the remaining files are still processed.
    """
    if nworkers < 1:
        raise ValueError("nworkers must be at least 1")

    data_root = "./data/only_lemma/"
    data_files = os.listdir(data_root)
    counter = {}

    num = min(num, len(data_files))
    # Sample WITHOUT replacement: the default (replace=True) can pick the
    # same file several times, which double-counts it and skips others.
    indices = np.random.choice(len(data_files), num, replace=False)

    # The semaphore bounds concurrency without a busy-wait loop; the lock
    # protects the read-modify-write increments on the shared dict.
    slots = threading.Semaphore(nworkers)
    counter_lock = threading.Lock()

    def target(filename):
        try:
            times = get_time_distribution(filename)
            with counter_lock:
                count_timestamp(counter, times)
        finally:
            # Always free the slot, otherwise a failing file would stall
            # the scheduling loop forever.
            slots.release()

    p = IntProgress(max = len(indices))
    display(p)
    threads = []
    for i in indices:
        slots.acquire()
        p.value += 1
        t = threading.Thread(target = target, args = [data_root + data_files[i]])
        threads.append(t)
        t.start()

    for thread in threads:
        thread.join()
    return counter

In [6]:
# Count accesses per whole second over up to 1000 sampled trace files
# (the function caps the request at the number of files available),
# then cache the resulting dict on disk.
time_counter = get_timestamp_counter(1000)
dump_data(time_counter, "data/dump/access_per_second.pkl")

IntProgress(value=0, max=792)

# Dump timestamps of single words (Only English)

In [6]:
def get_lemma_timestamps_from_file(file_name, entry_list, timestamps):
    """Append the access times of selected lemmas found in one trace file.

    Each line is split on single spaces; fields two and three are taken as
    the timestamp (kept as a string) and the URL. Lines whose URL does not
    have ``'en'`` at characters 7-8 are ignored. When the text after the
    URL's last ``/`` satisfies ``lemma in entry_list``, the timestamp is
    appended to ``timestamps[lemma]``.

    ``timestamps`` must already hold a list for every lemma that can match;
    it is mutated in place and also returned.

    Raises ``ValueError`` on a line with fewer than three fields and
    ``KeyError`` when a matching lemma has no entry in ``timestamps``.
    """
    with open(file_name) as trace:
        for record in trace:
            fields = record.strip().split(' ')
            timestamp, url = fields[1:3]
            if url[7:9] == 'en':
                title = url[url.rfind('/') + 1:]
                if title in entry_list:
                    timestamps[title].append(timestamp)
    return timestamps

def get_lemma_timestamps_from_file_reset(file_name, entry_list):
    """Collect the access times of selected lemmas from one trace file.

    Each line is split on single spaces; fields two and three are taken as
    the timestamp (kept as a string) and the URL. Lines whose URL does not
    have ``'en'`` at characters 7-8 are ignored. A lemma (the text after the
    URL's last ``/``) is recorded when ``lemma in entry_list`` holds, so the
    exact matching semantics depend on the container passed in.

    Returns a fresh dict mapping each matched lemma to its list of
    timestamps, in file order. Lemmas that never occur are absent.

    Raises ``ValueError`` on a line with fewer than three fields.
    """
    timestamps = {}
    with open(file_name) as f:
        for line in f:
            timestamp, url = line.strip().split(' ')[1:3]
            if url[7:9] != 'en':
                continue
            lemma = url[url.rfind('/')+1:]
            if lemma in entry_list:
                # setdefault replaces the former try / bare-except, which
                # would also have hidden unrelated errors.
                timestamps.setdefault(lemma, []).append(timestamp)
    return timestamps

def merge_timestamps_to_file(timestamps, output_dir):
    """Append each word's timestamps to ``<output_dir><word>.csv``.

    ``output_dir`` is concatenated with the file name as-is, so it must end
    with a path separator. Files are opened in append mode and receive one
    timestamp per line; a word with no timestamps still gets its file
    created. Prints ``done!`` when finished.
    """
    for word, stamps in timestamps.items():
        csv_path = output_dir + "{}.csv".format(word)
        with open(csv_path, 'a') as out:
            out.writelines("{}\n".format(stamp) for stamp in stamps)
    print("done!")

def dump_timestamps_to_file(entry_list, output_dir, nworkers = 4):
    """Extract per-lemma timestamps from every trace file and pickle them.

    For each file in ``./data/only_lemma/`` the timestamps of the lemmas in
    ``entry_list`` are collected and written to
    ``<output_dir><file name>_timestamp.pkl``. Progress is logged line by
    line to ``log.txt`` in the working directory.

    Parameters
    ----------
    entry_list : container
        Lemmas to keep; membership is tested with ``in``.
    output_dir : str
        Destination prefix; concatenated as-is, so it must end with a path
        separator.
    nworkers : int, optional
        Maximum number of files processed concurrently.

    Returns
    -------
    None
        The results live in the pickle files. The previous
        ``return timestamps`` referenced a name that does not exist in this
        scope and raised ``NameError`` (or returned a stale notebook
        global).

    Raises
    ------
    ValueError
        If ``nworkers`` is smaller than 1.
    """
    if nworkers < 1:
        raise ValueError("nworkers must be at least 1")

    data_root = "./data/only_lemma/"
    data_files = os.listdir(data_root)

    # Bounds concurrency without spinning on a flag list.
    slots = threading.Semaphore(nworkers)

    def target(file_name):
        try:
            timestamps = get_lemma_timestamps_from_file_reset(file_name, entry_list)
            dump_data(timestamps, output_dir + "{}_timestamp.pkl".format(file_name.split("/")[-1]))
        finally:
            # Free the slot even on failure so scheduling cannot stall.
            slots.release()

    threads = []
    # The context manager closes the log file, which used to be leaked.
    with open('log.txt', 'w') as f:
        for i, file in enumerate(data_files, start=1):
            slots.acquire()
            t = threading.Thread(target = target, args = [data_root + file])
            threads.append(t)
            t.start()
            f.write("Processing file {}\n".format(i))
            f.flush()

    for thread in threads:
        thread.join()

def get_lemma_timestamps(entry_list, nworkers = 4):
    """Gather the access timestamps of the given lemmas from all trace files.

    Every file in ``./data/only_lemma/`` is scanned by a worker thread that
    appends matching timestamps to a shared dict. Progress is logged line
    by line to ``log.txt`` in the working directory.

    Parameters
    ----------
    entry_list : iterable
        Lemmas to collect; each gets an (initially empty) list in the
        result.
    nworkers : int, optional
        Maximum number of files processed concurrently.

    Returns
    -------
    dict
        Maps each lemma of ``entry_list`` to its list of timestamps. Several
        threads append to the same lists, so the order of timestamps coming
        from different files is not deterministic.

    Raises
    ------
    ValueError
        If ``nworkers`` is smaller than 1.
    """
    if nworkers < 1:
        raise ValueError("nworkers must be at least 1")

    timestamps = {}
    for entry in entry_list:
        timestamps[entry] = []
    data_root = "./data/only_lemma/"
    data_files = os.listdir(data_root)

    # Bounds concurrency without spinning on a flag list.
    slots = threading.Semaphore(nworkers)

    def target(file_name):
        try:
            get_lemma_timestamps_from_file(file_name, entry_list, timestamps)
        finally:
            # Free the slot even on failure so scheduling cannot stall.
            slots.release()

    threads = []
    # The context manager closes the log file, which used to be leaked.
    with open('log.txt', 'w') as f:
        for i, file in enumerate(data_files, start=1):
            slots.acquire()
            t = threading.Thread(target = target, args = [data_root + file])
            threads.append(t)
            t.start()
            f.write("Processing file {}\n".format(i))
            f.flush()

    for thread in threads:
        thread.join()
    return timestamps
    
def get_lemma_timestamps_from_seperate(entry_list, nworkers = 4):
    """Merge the per-file timestamp pickles for the given lemmas.

    Every file in ``./data/timestamps/dump/`` is loaded with ``load_data``
    and, for each requested lemma present in it, its timestamps are added
    to the result.

    Parameters
    ----------
    entry_list : iterable
        Lemmas to collect; each gets an (initially empty) list in the
        result.
    nworkers : int, optional
        Maximum number of files loaded concurrently.

    Returns
    -------
    dict
        Maps each lemma of ``entry_list`` to its merged list of timestamps.
        Files are merged in completion order, so the order of timestamps
        coming from different files is not deterministic.

    Raises
    ------
    ValueError
        If ``nworkers`` is smaller than 1.

    Notes
    -----
    Each pickle is assumed to hold a mapping of lemma -> list of timestamps
    -- TODO confirm against whatever produced the dump directory.
    """
    if nworkers < 1:
        raise ValueError("nworkers must be at least 1")

    timestamps = {}
    for entry in entry_list:
        timestamps[entry] = []
    data_root = "./data/timestamps/dump/"
    data_files = os.listdir(data_root)

    # Bounds concurrency without spinning on a flag list.
    slots = threading.Semaphore(nworkers)

    def target(file_name):
        try:
            timestamps_of_file = load_data(file_name)
            for word in timestamps:
                # Only a missing word is expected here; the former
                # "except: pass" also hid every other failure.
                if word in timestamps_of_file:
                    timestamps[word] += timestamps_of_file[word]
        finally:
            # Free the slot even on failure so scheduling cannot stall.
            slots.release()

    threads = []
    for file in data_files:
        slots.acquire()
        t = threading.Thread(target = target, args = [data_root + file])
        threads.append(t)
        t.start()

    for thread in threads:
        thread.join()
    return timestamps

In [None]:
# Load the cached word-frequency Series, then walk its index in 20 slices
# of `delta` words each (20 * 5000 = 100000). For every slice, merge the
# per-file timestamp pickles and dump the result to its own file so that
# only one slice has to be held in memory at a time.
word_freq_series = load_data("data/dump/hot_words_100000.pkl")
delta = 5000
for j in range(20):
    # Print the target path first so progress is visible while the merge runs.
    print("data/dump/word_access_timestamps_{}-{}.pkl".format(delta * j, delta * (j+1)))
    timestamps = get_lemma_timestamps_from_seperate(word_freq_series.index[delta * j: delta * (j+1)])
    dump_data(timestamps, "data/dump/word_access_timestamps_{}-{}.pkl".format(delta * j, delta * (j+1)))

data/dump/word_access_timestamps_0-5000.pkl
data/dump/word_access_timestamps_5000-10000.pkl
data/dump/word_access_timestamps_10000-15000.pkl
data/dump/word_access_timestamps_15000-20000.pkl
data/dump/word_access_timestamps_20000-25000.pkl
data/dump/word_access_timestamps_25000-30000.pkl
data/dump/word_access_timestamps_30000-35000.pkl
data/dump/word_access_timestamps_35000-40000.pkl
data/dump/word_access_timestamps_40000-45000.pkl
data/dump/word_access_timestamps_45000-50000.pkl
data/dump/word_access_timestamps_50000-55000.pkl
data/dump/word_access_timestamps_55000-60000.pkl


In [None]:
#word_freq_series = load_data("data/dump/hot_words_100000.pkl")
#dump_timestamps_to_file(word_freq_series, "data/timestamps/dump/")

In [4]:
# Reload the timestamp dump covering index positions 0-5000 of the
# word-frequency Series.
times = load_data('data/dump/word_access_timestamps_0-5000.pkl')

In [7]:
# Materialise the keys as a list so they can be sliced by position.
word_list = list(times.keys())
# NOTE(review): the splitting loop in the following cell rebinds this name
# on every iteration, so this initial value looks redundant -- verify.
seperate_set = {}

In [9]:
# Split the loaded dump into five smaller pickles of (up to) 1000 words each,
# named after the position range they cover.
for i in range(5):
    # Start from an empty dict so each output file holds only its own slice.
    seperate_set = {}
    for word in word_list[i*1000:(i+1)*1000]:
        seperate_set[word] = times[word]
    dump_data(seperate_set, "data/dump/word_access_timestamps_{}-{}.pkl".format(i*1000,(i+1)*1000))