In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Input, Activation, GRU, Dense
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import scale, StandardScaler, RobustScaler
from collections import OrderedDict, defaultdict
# Default (width, height) applied to every matplotlib figure created after this point.
plt.rcParams['figure.figsize'] = [10, 8]

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Data Processing

In [6]:
# dimensions of train_2.csv are 145063 rows x 804 columns

In [3]:
# Keep only the least popular contents (roughly 75k of them): rank every row by
# the value in CSV column 1, treating missing values as 0, and drop the 70,000
# highest-ranked rows.
df = (
    pd.read_csv('wikipedia/web-traffic-time-series-forecasting/train_2.csv', usecols=[1])
    .fillna(0)
    .pipe(lambda frame: frame.sort_values(by=[frame.columns[0]], ascending=False))
    .iloc[70000:]
)
# Row labels of the retained contents; later cells use them to select the same rows.
ixs = df.index

### Traditional Caching Algorithms

### LRU

In [20]:
"""Least Recently Used Cache Policy"""
class LruContentStore():
    """Fixed-capacity cache that evicts the least recently used item.

    Entries live in an ``OrderedDict`` ordered from least recently used (front)
    to most recently used (back).

    Attributes:
        size: maximum number of items held; a falsy size (0) disables caching.
        store: ``OrderedDict`` mapping every cached item to itself.
        hits, misses: counters initialised to 0. The methods of this class never
            modify them; callers are expected to keep them up to date.
    """

    def __init__(self, size):
        self.size = size
        self.store = OrderedDict()
        self.hits = 0
        self.misses = 0

    def add(self, item):
        """Insert ``item`` as the most recently used entry.

        Does nothing when ``size`` is 0. If ``item`` is already cached it is only
        marked as most recently used; nothing is evicted. Otherwise, when the
        store is full, the least recently used entry is dropped first.
        """
        if not self.size:
            return
        if item in self.store:
            # Re-adding a cached item must not push an unrelated entry out.
            self.store.move_to_end(item)
            return
        if len(self.store) == self.size:
            self.store.popitem(last=False)  # front of the dict == least recently used
        self.store[item] = item

    def get(self, item):
        """Return the cached ``item`` and mark it most recently used.

        Returns ``None`` on a miss, so a cached ``None`` cannot be told apart
        from a miss. Only a missing key is treated as a miss; any other error
        (for example an unhashable ``item``) propagates to the caller.
        """
        try:
            self.store.move_to_end(item)
        except KeyError:
            return None
        return self.store[item]

### LFU

In [21]:
"""Least Frequently Used Cache Policy"""
class LfuContentStore():
    """Fixed-capacity cache that evicts the least frequently used item.

    Attributes:
        size: maximum number of items held; a falsy size (0) disables caching.
        store: dict mapping each cached item to ``[item, use_count]``.
        hits, misses: counters initialised to 0. The methods of this class never
            modify them; callers are expected to keep them up to date.
    """

    def __init__(self, size):
        self.size = size
        self.store = {}  # {item: [item, use_count]}
        self.hits = 0
        self.misses = 0

    def add(self, item):
        """Insert ``item`` with a use count of 1.

        Does nothing when ``size`` is 0 or when ``item`` is already cached (its
        use count is kept and nothing is evicted). When the store is full, the
        entry with the smallest use count is dropped first; ties go to the entry
        that was inserted earliest.
        """
        if not self.size or item in self.store:
            return
        if len(self.store) == self.size:
            # min() returns the first minimum in dict (insertion) order.
            victim = min(self.store, key=lambda key: self.store[key][1])
            del self.store[victim]
        self.store[item] = [item, 1]

    def get(self, item):
        """Return the cached ``item`` and increment its use count.

        Returns ``None`` on a miss. Only a missing key is treated as a miss; any
        other error (for example an unhashable ``item``) propagates.
        """
        try:
            entry = self.store[item]
        except KeyError:
            return None
        entry[1] += 1
        return entry[0]

### Random

In [22]:
"""Random Cache Policy"""
class RandomContentStore():
    """Fixed-capacity cache that evicts a uniformly random item when full.

    Attributes:
        size: maximum number of items held; a falsy size (0) disables caching.
        store: dict mapping every cached item to itself.
        hits, misses: counters initialised to 0. The methods of this class never
            modify them; callers are expected to keep them up to date.
    """

    def __init__(self, size):
        self.size = size
        self.store = {}
        self.hits = 0
        self.misses = 0

    def add(self, item):
        """Insert ``item``, evicting one randomly chosen entry if the store is full.

        Does nothing when ``size`` is 0 or when ``item`` is already cached, so
        re-adding a cached item never evicts anything and never draws from the
        random number generator.
        """
        if not self.size or item in self.store:
            return
        if len(self.store) == self.size:
            # The victim is drawn from numpy's global RNG, so np.random.seed()
            # controls eviction. NOTE(review): the keys pass through a numpy
            # array here, so they must form a 1-D array and still hash equal to
            # the original keys (true for plain ints/strings) - confirm before
            # caching other key types.
            self.store.pop(np.random.choice(list(self.store)))
        self.store[item] = item

    def get(self, item):
        """Return the cached ``item``, or ``None`` on a miss.

        Only a missing key is treated as a miss; any other error (for example an
        unhashable ``item``) propagates to the caller.
        """
        try:
            return self.store[item]
        except KeyError:
            return None

In [23]:
# Create one store per eviction policy, all with the same capacity:
# 1% of 75000 (the nominal number of contents kept by the data-processing cell).
cache_size = int(0.01 * 75000)
lru, lfu, rand = (
    policy(cache_size)
    for policy in (LruContentStore, LfuContentStore, RandomContentStore)
)

In [None]:
# Replay a synthetic request trace against the LRU, LFU and random caches.
# For each simulated day, contents are requested with probability proportional
# to that day's value in the traffic CSV.
seed = 123
np.random.seed(seed)  # the global RNG drives both request sampling and random eviction

first_day_col = 703        # position of the first CSV column that is replayed
n_days = 50                # number of consecutive columns (days) replayed
requests_per_day = 500000  # simulated requests drawn per day
progress_every = 100000    # print the request counter this often


def _record_request(cache, item):
    """Look ``item`` up in ``cache``; on a miss insert it. Update hit/miss counters."""
    if cache.get(item) is None:
        cache.add(item)
        cache.misses += 1
    else:
        cache.hits += 1


# Parse the CSV once for the whole window instead of once per simulated day.
window = pd.read_csv(
    'wikipedia/web-traffic-time-series-forecasting/train_2.csv',
    usecols=list(range(first_day_col, first_day_col + n_days)),
)
window = window.loc[ixs].fillna(0)  # same rows as the catalogue; missing values count as 0

for i in range(n_days):
    # `df`, `arr` and `weights` are still rebound each day, as before, so any
    # later cell that reads them sees the same values.
    df = window.iloc[:, [i]]
    arr = df.values
    weights = arr.flatten()
    weights = weights / weights.sum()
    # Draw the whole day in one call: a per-request np.random.choice rebuilds the
    # cumulative distribution over every content on each call. NOTE: this changes
    # how request draws interleave with the random cache's eviction draws, so the
    # counters follow the same distribution as before but are not bit-identical
    # to a per-request draw.
    requests = np.random.choice(ixs, requests_per_day, p=weights)
    for j, c in enumerate(requests):
        for cache in (lru, lfu, rand):
            _record_request(cache, c)
        if j % progress_every == 0:
            print(j)