In [1]:
import numpy as np
import pandas as pd
import json
import time
from traffic_emulator import TrafficEmulator
pd.set_option('mode.chained_assignment', None)

In [2]:
# Setting up data
session_df = pd.read_csv(filepath_or_buffer='./data/net_traffic_nonull.dat', sep=',', names=['uid','location','startTime_unix','duration_ms','domainProviders','domainTypes','domains','bytesByDomain','requestsByDomain'])
session_df.index.name = 'sessionID'
session_df['endTime_unix'] = session_df['startTime_unix'] + session_df['duration_ms']
session_df['startTime_datetime'] = pd.to_datetime(session_df['startTime_unix'], unit='ms')  # convert start time to readible date_time strings
session_df['endTime_datetime'] = pd.to_datetime(session_df['endTime_unix'], unit='ms')
session_df['totalBytes'] = session_df['bytesByDomain'].apply(lambda x: x.split(';')).map(lambda x: sum(map(float, x)))  # sum bytes across domains
session_df['totalRequests'] = session_df['requestsByDomain'].apply(lambda x: x.split(';')).map(lambda x: sum(map(float, x)))  # sum requests across domains
session_df.sort(['startTime_datetime'], ascending=True, inplace=True)  # get it sorted
session_df['interArrivalDuration_datetime'] = session_df.groupby('location')['startTime_datetime'].diff()  # group-wise diff
session_df['interArrivalDuration_ms'] = session_df.groupby('location')['startTime_unix'].diff()  # group-wise diff



# Initialization

In [3]:
# Empty session_df
print "=======Initialization: Empty session_df======="
try:
    te = TrafficEmulator()  # should raise ValueError
except ValueError:
    pass
finally:
    pass

TrafficEmulator Initialization: session_df passed in is empty or None.


In [4]:
# Default values
print "=======Initialization: Default values======="
te = TrafficEmulator(session_df)
print te.time_step
print te.head_datetime
print te.tail_datetime
print te.verbose

0 days 00:00:01
2014-08-31 16:00:04.181000
2015-01-31 15:56:59.142000
0


In [5]:
# Verbose
print "=======Initialization: Verbose======="
te = TrafficEmulator(session_df, verbose=1)

New TrafficEmulator with parameters:
  head=2014-08-31 16:00:04.181000
  tail=2015-01-31 15:56:59.142000
  time_step=0 days 00:00:01
  epoch=0
  verbose=1


In [6]:
# Head and tail datetime
print "=======Initialization: Head and tail datetime======="
head, tail = pd.datetime(year=2014, month=9, day=5), pd.datetime(year=2014, month=9, day=3)
try:
    te = TrafficEmulator(session_df, head_datetime=head, tail_datetime=tail, time_step=pd.Timedelta(days=0.5))
except ValueError:
    pass

head_datetime > tail_datetime


# Traffic & Service

When doing tests, do not initiate with a time_step that is too large, this may increase the duration for each epoch greatly. E.g. 1h takes more than 10 seconds, while 1m only taks less than 0.5 seconds.

In [7]:
# Head and tail range:
# Datetime range larger than dataset. should observe empty traffic at first, and warning in the end.
print "=======Traffic & Service: datetime range======="
head, tail, time_step = pd.datetime(year=2014, month=9, day=3), pd.datetime(year=2014, month=9, day=7), pd.Timedelta(minutes=1)
te = TrafficEmulator(session_df, head_datetime=head, tail_datetime=tail, time_step=time_step)
for i in range(0, 10):
    temp = time.time()
    print "{} to {}".format(head+i*time_step, head+(i+1)*time_step)
    t = te.get_traffic()
    if t is not None:
        print t.index
    else:
        pass
    print "Reward = {} ({} seconds)".format(te.serve(service_df=pd.DataFrame()), time.time()-temp)

2014-09-03 00:00:00 to 2014-09-03 00:01:00
Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
           dtype='int64')
Reward = -841 (0.485023975372 seconds)
2014-09-03 00:01:00 to 2014-09-03 00:02:00
Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
            51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
            68, 69, 70, 71, 72],
           dtype='int64')
Reward = -2909 (0.480963945389 seconds)
2014-09-03 00:02:00 to 2014-09-03 00:03:00
Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31

In [10]:
print "=======Traffic & Service: no service======="
head, tail, time_step = pd.datetime(year=2014, month=9, day=3), pd.datetime(year=2014, month=9, day=7), pd.Timedelta(minutes=1)
te = TrafficEmulator(session_df, head_datetime=head, tail_datetime=tail, time_step=time_step, verbose=1)
for i in range(0, 10):
    temp = time.time()
    print "{} to {}".format(head+i*time_step, head+(i+1)*time_step)
    t = te.get_traffic()
    if t is not None:
        print t.index
    else:
        pass
    service_df = pd.DataFrame(columns=['sessionID', 'service_per_request_per_domain'], index=t.index if t is not None else pd.Index([]))
    service_df['service_per_request_per_domain'] = json.dumps({})
    service_df['sessionID'] = t['sessionID']
    print te.serve(service_df=service_df)
    print "{} seconds".format(time.time()-temp)

New TrafficEmulator with parameters:
  head=2014-09-03 00:00:00
  tail=2014-09-07 00:00:00
  time_step=0 days 00:01:00
  epoch=0
  verbose=1
2014-09-03 00:00:00 to 2014-09-03 00:01:00
get_traffic(): locating incoming sessions.
get_traffic(): appending incoming sessions to buffer.
get_traffic(): generating traffic.
get_traffic(): finished.
                     bytesSent_per_request_per_domain  sessionID       uid
0   {"hupu.com": {"1": 3020}, "hoopchina.com.cn": ...   10555461  85025936
1                             {"qq.com": {"0": 1583}}    2278064  26043101
2   {"qq.com": {"0": 1245}, "renren.com": {"12": 1...     200945  12081120
3   {"xiaomi.net": {"0": 823}, "miui.com": {"8": 7...     506527  14068894
4   {"renren.com": {"40": 1798, "42": 1795, "4": 1...    6993484  59045188
5                             {"qq.com": {"0": 1536}}    9100301  74069356
6   {"youdao.com": {"0": 730}, "amap.com": {"0": 1...   11265633  91015093
7   {"weibo.cn": {"3": 3021}, "sinaimg.cn": {"0": ...   120

In [13]:
# Full service
print "=======Traffic & Service: full service======="
head, tail, time_step = pd.datetime(year=2014, month=9, day=3), pd.datetime(year=2014, month=9, day=7), pd.Timedelta(minutes=0.1)
te = TrafficEmulator(session_df, head_datetime=head, tail_datetime=tail, time_step=time_step,verbose=0)
for i in range(0, 10):
    temp = time.time()
    print "{} to {}".format(head+i*time_step, head+(i+1)*time_step)
    t = te.get_traffic()
    if t is not None:
        print t.index
        service_df = pd.DataFrame(columns=['sessionID', 'service_per_request_per_domain'], index=t.index)

        for idx in t.index:
            bytesSent_req_domain = json.loads(t.loc[idx, 'bytesSent_per_request_per_domain'])
            service_req_domain = {}
            for domain in bytesSent_req_domain:
                for reqID in bytesSent_req_domain[domain]:
                    if domain not in service_req_domain:
                        service_req_domain[domain] = {}
                    service_req_domain[domain][int(reqID)] = 'serve'
            service_df.loc[idx, 'service_per_request_per_domain'] = json.dumps(service_req_domain)
            service_df.loc[idx, 'sessionID'] = t.loc[idx, 'sessionID']
    else:
        service_df = pd.DataFrame(columns=['sessionID', 'service_per_request_per_domain'], index=pd.Index([]))
    print te.serve(service_df=service_df)
    print "{} seconds".format(time.time()-temp)

2014-09-03 00:00:00 to 2014-09-03 00:00:06
Int64Index([0, 1, 2, 3, 4], dtype='int64')
8
0.108529090881 seconds
2014-09-03 00:00:06 to 2014-09-03 00:00:12
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')
35
0.155853033066 seconds
2014-09-03 00:00:12 to 2014-09-03 00:00:18
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64')
40
0.172075986862 seconds
2014-09-03 00:00:18 to 2014-09-03 00:00:24
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64')
38
0.193950176239 seconds
2014-09-03 00:00:24 to 2014-09-03 00:00:30
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            19, 20, 21],
           dtype='int64')
50
0.21231508255 seconds
2014-09-03 00:00:30 to 2014-09-03 00:00:36
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], dtype='int64')
66
0.203326940536 seconds
2014-09-03 00:00:36 to 2014-09-03 00:00:42
Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  

In [None]:
# Partial service
print "=======Traffic & Service: random service======="
head, tail, time_step = pd.datetime(year=2014, month=9, day=3), pd.datetime(year=2014, month=9, day=7), pd.Timedelta(minutes=0.1)
te = TrafficEmulator(session_df, head_datetime=head, tail_datetime=tail, time_step=time_step,verbose=0)
for i in range(0, 10):
    temp = time.time()
    print "{} to {}".format(head+i*time_step, head+(i+1)*time_step)
    t = te.get_traffic()
    if t is not None:
        print t.index
        service_df = pd.DataFrame(columns=['sessionID', 'service_per_request_per_domain'], index=t.index)

        for idx in t.index:
            bytesSent_req_domain = json.loads(t.loc[idx, 'bytesSent_per_request_per_domain'])
            service_req_domain = {}
            for domain in bytesSent_req_domain:
                for reqID in bytesSent_req_domain[domain]:
                    if domain not in service_req_domain:
                        service_req_domain[domain] = {}
                    r = np.random.rand()
                    if r < 1.0/3:
                        service_req_domain[domain][int(reqID)] = 'serve'
                    elif r < 2.0/3:
                        service_req_domain[domain][int(reqID)] = 'queue'
                    else:
                        service_req_domain[domain][int(reqID)] = 'reject'
            service_df.loc[idx, 'service_per_request_per_domain'] = json.dumps(service_req_domain)
            service_df.loc[idx, 'sessionID'] = t.loc[idx, 'sessionID']
    else:
        service_df = pd.DataFrame(columns=['sessionID', 'service_per_request_per_domain'], index=pd.Index([]))
    print service_df
    print te.serve(service_df=service_df)
    print "{} seconds".format(time.time()-temp)

2014-09-03 00:00:00 to 2014-09-03 00:00:06
Int64Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')
     sessionID                     service_per_request_per_domain
0  1.05555e+07              {"hoopchina.com.cn": {"22": "queue"}}
1  2.27806e+06                        {"qq.com": {"0": "reject"}}
2       200945  {"sina.cn": {"20": "serve"}, "sinaimg.cn": {"1...
3       506527                      {"miui.com": {"0": "reject"}}
4  6.99348e+06  {"renren.com": {"15": "queue"}, "xnimg.cn": {"...
5   9.1003e+06                         {"qq.com": {"0": "serve"}}
6  1.20256e+07                       {"weibo.cn": {"2": "queue"}}
7  1.19306e+06                    {"renren.com": {"7": "reject"}}
-26
0.116399049759 seconds
2014-09-03 00:00:06 to 2014-09-03 00:00:12
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')
      sessionID                     service_per_request_per_domain
0        200945  {"sina.cn": {"32": "queue"}, "sinaimg.cn": {"1...
1   6.99348e+06  {"renren.com": {"40": "s