In [0]:
!curl -s https://course.fast.ai/setup/colab | bash
    
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.tabular import *


from google.colab import drive

# Mount Google Drive so the project folder under "My Drive" is reachable from the Colab VM.
drive.mount("/content/drive")
# Project root: the datasets and the submission file are addressed relative to this path.
path = Path('/content/drive/My Drive/Competitions/Timeseries')

Updating fastai...
Done.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# !unzip '/content/drive/My Drive/Competitions/Timeseries/datasets/aivivn_timeseries.zip' -d '/content/drive/My Drive/Competitions/Timeseries/datasets'

In [0]:
# Tet holiday windows keyed by year, as ISO date strings.
# prepare_data() removes rows whose UPDATE_TIME lies strictly between 'start' and 'end'.
tet_holidays = {
    '2018': {'start': '2018-02-11', 'end': '2018-02-24'},
    '2019': {'start': '2019-01-31', 'end': '2019-02-14'},
}

In [0]:
train_df = pd.read_csv(path/'datasets/train.csv')

In [0]:
# Leave SERVER_ZONE02_085 out of the statistic tables (prepare_data drops the same server from the training rows).
train_df = train_df[train_df.SERVER_NAME != 'SERVER_ZONE02_085']

In [0]:
def gen_statistic_col(df, target, groupby):
    """Summarise `target` within each `groupby` group.

    Returns a flat frame holding the `groupby` key columns followed by one
    column per statistic, named ``<stat>_<target>`` for mean, median, min,
    max, count and std (in that order).
    """
    stat_names = ['mean','median','min','max','count','std']
    summary = df.groupby(groupby)[target].agg(stat_names).reset_index()
    summary.columns = list(groupby) + [f'{stat}_{target}' for stat in stat_names]
    return summary


In [0]:
# BANDWIDTH_TOTAL statistics per server and per (server, hour); prepare_data() merges both tables into every row.
bandwidth_by_server = gen_statistic_col(train_df, 'BANDWIDTH_TOTAL',['SERVER_NAME'])
bandwidth_by_server_hour = gen_statistic_col(train_df, 'BANDWIDTH_TOTAL',['SERVER_NAME', "HOUR_ID"])

In [0]:
# MAX_USER statistics per server and per (server, hour), computed on the raw column
# (the /51 scaling in prepare_data is applied only to the per-row MAX_USER, not to these tables).
user_by_server = gen_statistic_col(train_df, 'MAX_USER',['SERVER_NAME'])
user_by_server_hour = gen_statistic_col(train_df, 'MAX_USER',['SERVER_NAME', "HOUR_ID"])

In [0]:
del train_df

In [0]:
def prepare_data(path, is_train = True):
    """Load a competition CSV and build the modelling frame.

    Steps: optional training-only filtering/scaling, per-timestamp activity
    counts, per-server statistics (read from the module-level
    `bandwidth_by_server`, `bandwidth_by_server_hour`, `user_by_server` and
    `user_by_server_hour` frames), date-part features, and removal of rows
    inside the Tet windows listed in `tet_holidays`.

    Args:
        path: location of the CSV file to read.
        is_train: when True, drop rows with MAX_USER == 0 and the
            SERVER_ZONE02_085 server, and divide MAX_USER by 51.

    Returns:
        DataFrame sorted by (UPDATE_TIME, HOUR_ID) with a fresh 0..n-1 index
        and an all-False IS_TET column.

    Note:
        The statistic merges are inner joins, so rows whose server (or
        server/hour pair) is missing from the statistic frames are dropped,
        for the test file as well. Rows inside a Tet window are likewise
        removed from both files.
    """
    df = pd.read_csv(path)
    if is_train:
        keep = (df.MAX_USER != 0) & (df.SERVER_NAME != 'SERVER_ZONE02_085')
        # .copy() so the assignment below writes to an owned frame, not to a filtered slice
        df = df[keep].copy()
        # scaling is undone at prediction time, where predictions are multiplied by 51
        df['MAX_USER'] = df['MAX_USER']/51

    # How many distinct servers / zones report in each (date, hour) slot.
    time_keys = ['UPDATE_TIME','HOUR_ID']
    per_slot = df.groupby(time_keys)
    num_active_server = per_slot.SERVER_NAME.nunique().reset_index()
    num_active_server.columns = [*time_keys, 'NUM_ACTIVE_SERVER']
    num_active_zone = per_slot.ZONE_CODE.nunique().reset_index()
    num_active_zone.columns = [*time_keys, 'NUM_ACTIVE_ZONE']

    df = pd.merge(df, num_active_server, on = time_keys)
    df = pd.merge(df, num_active_zone, on = time_keys)
    # Merge order matters: the per-server table ends up with the `_x` suffix,
    # the per-(server, hour) table with `_y`.
    df = pd.merge(df, bandwidth_by_server, on = ['SERVER_NAME'])
    df = pd.merge(df, bandwidth_by_server_hour, on = ['SERVER_NAME','HOUR_ID'])
    df = pd.merge(df, user_by_server, on = ['SERVER_NAME'])
    df = pd.merge(df, user_by_server_hour, on = ['SERVER_NAME','HOUR_ID'])
    df = df.sort_values(by = time_keys)
    df = add_datepart(df, 'UPDATE_TIME', drop= False)

    # Parse the timestamps once and flag every configured Tet window
    # (both bounds exclusive).
    update_time = pd.to_datetime(df['UPDATE_TIME'])
    is_tet = pd.Series(False, index=df.index)
    for window in tet_holidays.values():
        is_tet |= ((update_time > pd.to_datetime(window['start']))
                   & (update_time < pd.to_datetime(window['end'])))
    df['IS_TET'] = is_tet

    return df[~df['IS_TET']].reset_index(drop=True)

In [0]:
# Build the modelling frames for the train and test files, then free the statistic tables.
# NOTE(review): prepare_data reads these four tables as globals, so it cannot be called again
# after this cell without first re-running the cells that create them.
train = prepare_data(path/'datasets/train.csv')
test_df = prepare_data(path/'datasets/test_id.csv', is_train = False)
del bandwidth_by_server_hour
del bandwidth_by_server
del user_by_server
del user_by_server_hour

In [0]:
# Strictly positive training bandwidths as a NumPy array; these are used as MAPE targets (divisors) in the per-bucket search.
bandwidth_total = train[train.BANDWIDTH_TOTAL > 0].BANDWIDTH_TOTAL.values

In [0]:
def mape(pred, targ):
    """Mean absolute percentage error of `pred` against `targ`, in percent.

    Operates on NumPy arrays (or scalars that broadcast against them); every
    entry of `targ` is used as a divisor.
    """
    relative_error = np.abs(pred - targ)/targ
    return np.mean(relative_error)*100

from scipy.optimize import minimize

def find_optimal_mape(start_val,range1):
    """Return the single constant that minimises `mape` against `range1`.

    The search is run by `scipy.optimize.minimize` starting from `start_val`;
    the first component of the solution vector is returned.
    """
    result = minimize(lambda guess: mape(guess, targ = range1), start_val)
    return result.x[0]

In [0]:
def encode_range(value):
    """Map `value` to the index of the first cut in `threshold` it reaches.

    `threshold` is the module-level list of cuts in descending order (built
    by `target_encode`), so index 0 is the highest bucket. A value below
    every cut (or NaN, which fails every comparison) is placed in the last
    bucket instead of yielding None.

    Raises:
        ValueError: if `threshold` is empty, i.e. no cuts have been built yet.
    """
    if not threshold:
        raise ValueError("threshold is empty; run target_encode before encode_range")
    for idx, cut in enumerate(threshold):
        if value >= cut:
            return idx
    # below every cut: clamp to the lowest bucket rather than returning None
    return len(threshold) - 1
# otypes pins an integer result and lets the vectorised form accept empty input
encode_range = np.vectorize(encode_range, otypes=[int])
   

In [0]:
threshold = []
def target_encode(df, target_col, num_classes):
    """Bucket `target_col` into `num_classes` percentile classes.

    Rebuilds the module-level `threshold` list with the percentiles at
    0, 100/num_classes, 2*100/num_classes, ... stored highest first, then
    writes the class index of every row to `df['target_encoded']` via
    `encode_range`. `df` is modified in place and also returned.
    """
    global threshold
    step_size = 100/num_classes
    percents = [k*step_size for k in range(num_classes)]
    # highest percentile first, matching the order encode_range scans in
    threshold = [np.percentile(df[target_col], q) for q in reversed(percents)]
    df['target_encoded'] = encode_range(df[target_col])
    return df
    
    

In [0]:
# Bucket BANDWIDTH_TOTAL into 5 percentile classes (fills the global `threshold`, highest cut first).
train = target_encode(train, 'BANDWIDTH_TOTAL', 5)

# For every bucket, find the constant prediction with the lowest MAPE over the
# positive training bandwidths that fall in it. Bucket membership mirrors
# encode_range: lower cut inclusive, upper cut exclusive, so a value equal to
# a cut is counted in exactly one bucket.
optimals = []
for i in range(len(threshold)):
    in_bucket = bandwidth_total >= threshold[i]
    if i > 0:
        in_bucket &= bandwidth_total < threshold[i-1]
    range1 = bandwidth_total[in_bucket]
    start_val = threshold[i]
    # an empty bucket has no MAPE to minimise; fall back to its lower cut
    opt = find_optimal_mape(start_val, range1) if range1.size else start_val
    optimals.append(opt)
    del range1, in_bucket



In [0]:
# Continuous inputs. The statistic columns carry pandas merge suffixes from prepare_data:
# `_x` comes from the per-server table, `_y` from the per-(server, hour) table.
cont_vars = [
    'NUM_ACTIVE_SERVER', 'NUM_ACTIVE_ZONE', 'UPDATE_TIMEElapsed',
    *[f'{stat}_{measured}_{side}'
      for measured in ('BANDWIDTH_TOTAL', 'MAX_USER')
      for side in ('x', 'y')
      for stat in ('mean','median','min','max','count','std')],
]
# Categorical inputs (embedded by the tabular model).
cat_vars = [
    'HOUR_ID', 'ZONE_CODE', 'SERVER_NAME',
    'UPDATE_TIMEYear', 'UPDATE_TIMEMonth', 'UPDATE_TIMEWeek',
    'UPDATE_TIMEDay', 'UPDATE_TIMEDayofweek',
]
# Classification target produced by target_encode().
dep_var = 'target_encoded'

# fastai preprocessing steps applied to the tabular data.
procs = [FillMissing, Categorify, Normalize]


In [0]:
print(cat_vars)
print(cont_vars)

['HOUR_ID', 'ZONE_CODE', 'SERVER_NAME', 'UPDATE_TIMEYear', 'UPDATE_TIMEMonth', 'UPDATE_TIMEWeek', 'UPDATE_TIMEDay', 'UPDATE_TIMEDayofweek']
['NUM_ACTIVE_SERVER', 'NUM_ACTIVE_ZONE', 'UPDATE_TIMEElapsed', 'mean_BANDWIDTH_TOTAL_x', 'median_BANDWIDTH_TOTAL_x', 'min_BANDWIDTH_TOTAL_x', 'max_BANDWIDTH_TOTAL_x', 'count_BANDWIDTH_TOTAL_x', 'std_BANDWIDTH_TOTAL_x', 'mean_BANDWIDTH_TOTAL_y', 'median_BANDWIDTH_TOTAL_y', 'min_BANDWIDTH_TOTAL_y', 'max_BANDWIDTH_TOTAL_y', 'count_BANDWIDTH_TOTAL_y', 'std_BANDWIDTH_TOTAL_y', 'mean_MAX_USER_x', 'median_MAX_USER_x', 'min_MAX_USER_x', 'max_MAX_USER_x', 'count_MAX_USER_x', 'std_MAX_USER_x', 'mean_MAX_USER_y', 'median_MAX_USER_y', 'min_MAX_USER_y', 'max_MAX_USER_y', 'count_MAX_USER_y', 'std_MAX_USER_y']


In [0]:
# Time-based split: smallest row label whose UPDATE_TIME is after 2019-01-09.
# Relies on `train` being sorted by UPDATE_TIME with a 0..n-1 index (as returned by prepare_data),
# so every row from cut_idx to the end forms the validation set.
cut_idx = min(train[train.UPDATE_TIME > '2019-01-09'].index )

valid_idx = range(cut_idx, train.shape[0])

In [0]:
def inverse_exp_mape(pred:Tensor, targ:Tensor)->Rank0Tensor:
    """Percentage error between inverse square roots of the exponentiated inputs.

    Both tensors are flattened, passed through `exp`, and scored as
    mean(|sqrt(1/targ) - sqrt(1/pred)| * sqrt(targ)) * 100.
    """
    pred,targ = flatten_check(pred,targ)
    pred_exp = torch.exp(pred)
    targ_exp = torch.exp(targ)
    gap = torch.abs(torch.sqrt(1/targ_exp) - torch.sqrt(1/pred_exp))
    return (gap*torch.sqrt(targ_exp)).mean()*100

In [0]:
def exp_mape(pred:Tensor, targ:Tensor)->Rank0Tensor:
    """MAPE (in percent) measured after exponentiating both inputs.

    `pred` and `targ` are flattened and passed through `exp`; the absolute
    error is then divided by the exponentiated target and averaged.
    """
    pred,targ = flatten_check(pred,targ)
    actual = torch.exp(targ)
    relative_error = torch.abs(actual - torch.exp(pred))/actual
    return relative_error.mean()*100

In [0]:
def mape(pred:Tensor, targ:Tensor)->Rank0Tensor:
    """MAPE between `pred` and `targ`, in percent (no exp is applied).

    This definition replaces the earlier NumPy `mape` of the same name, which
    `find_optimal_mape` looks up at call time. Non-tensor inputs are therefore
    scored with the NumPy formula, so that search keeps working if it is
    re-run after this cell; tensor inputs take the flatten-and-average path.
    """
    if not (torch.is_tensor(pred) or torch.is_tensor(targ)):
        return np.mean(np.abs(pred - targ)/targ)*100
    pred,targ = flatten_check(pred,targ)
    pct_var = torch.abs(targ - pred)/targ
    return pct_var.mean()*100

In [0]:
def adj_accuracy(input:Tensor, targs:Tensor)->Rank0Tensor:
    """Share of rows whose predicted class index is <= the target class index.

    `input` is bs * n_classes; its arg-max over the last axis is compared
    with `targs` row by row, and the fraction of rows satisfying `<=` is returned.
    """
    batch = targs.shape[0]
    predicted = input.argmax(dim=-1).view(batch,-1)
    expected = targs.view(batch,-1)
    return (predicted<=expected).float().mean()

In [0]:
# Classification databunch: rows in valid_idx form the validation set, the label is the
# bandwidth bucket in `dep_var`, and test_df is attached as the unlabeled test set.
data = (TabularList.from_df(train, path=path, cat_names=cat_vars, cont_names=cont_vars, procs=procs,)
                .split_by_idx(valid_idx)
                .label_from_df(cols=dep_var)
                .add_test(TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=cont_vars))
                .databunch())

In [0]:
learn = tabular_learner(data, layers=[1000,500], ps=[0.001,0.01], emb_drop=0.04,  metrics=[adj_accuracy, accuracy])

In [0]:
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(25, 10)
    (1): Embedding(4, 3)
    (2): Embedding(535, 54)
    (3): Embedding(4, 3)
    (4): Embedding(13, 7)
    (5): Embedding(52, 15)
    (6): Embedding(32, 11)
    (7): Embedding(8, 5)
  )
  (emb_drop): Dropout(p=0.04)
  (bn_cont): BatchNorm1d(27, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=135, out_features=1000, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.001)
    (4): Linear(in_features=1000, out_features=500, bias=True)
    (5): ReLU(inplace)
    (6): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.01)
    (8): Linear(in_features=500, out_features=5, bias=True)
  )
)

In [0]:
learn.metrics=[accuracy, adj_accuracy]

In [0]:
learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,accuracy,adj_accuracy,time
0,0.901924,0.856521,0.623736,0.806091,13:22


In [32]:
learn.fit_one_cycle(1, 1e-4)

epoch,train_loss,valid_loss,accuracy,adj_accuracy,time
0,0.901291,0.860746,0.621584,0.811864,13:17


In [0]:
learn.save("BANDWIDTH_TOTAL_Category-stage1")

In [0]:
dep_var2 = 'MAX_USER'
# Regression databunch: same inputs and split as the classifier; the target is log(MAX_USER).
data2 = (TabularList.from_df(train, path=path, cat_names=cat_vars, cont_names=cont_vars, procs=procs,)
                .split_by_idx(valid_idx)
                .label_from_df(cols=dep_var2, label_cls=FloatList, log=True)
                .add_test(TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=cont_vars))
                .databunch())


# The model output is constrained to y_range, so the range has to contain every log-target.
# MAX_USER was divided by 51 in prepare_data, so values below 1 (negative logs) are possible;
# the lower bound stays at 0 unless the data needs it to go lower.
min_log_y2 = min(0.0, float(np.log(np.min(train['MAX_USER'])*0.8)))
max_log_y2 = np.log(np.max(train['MAX_USER'])*1.2)
y_range2 = torch.tensor([min_log_y2, max_log_y2], device=defaults.device)

learn2 = tabular_learner(data2, layers=[1000,500], ps=[0.001,0.01], emb_drop=0.04, 
                        y_range=y_range2, metrics=exp_mape)




In [0]:
data2.save("data-max-user")

In [0]:
learn2.fit_one_cycle(1, 1e-3)

In [0]:
learn2.save("MAX_USER-stage1")

In [0]:
max_user_preds=learn2.get_preds(DatasetType.Test)

In [0]:
bandwidth_total_preds = learn.get_preds(DatasetType.Test)

In [0]:
bandwidth_total_preds

In [0]:
probs = bandwidth_total_preds[0].numpy()

In [0]:
test_df['BANDWIDTH_TOTAL'] = np.argmax(probs, axis = 1)

In [0]:
test_df['prob'] = np.max(probs, axis = 1)

In [0]:
# Replace each predicted class index with the MAPE-optimal bandwidth value of that bucket.
# NOTE(review): not idempotent; a second run would index `optimals` with bandwidth values instead of class ids.
test_df.BANDWIDTH_TOTAL = test_df.BANDWIDTH_TOTAL.map(lambda i: optimals[int(i)])

In [0]:
test_df['adj_bandwidth_total'] = test_df['BANDWIDTH_TOTAL']

In [0]:
def adj_bandwidth(value, prob):
    """Keep `value` unless the classifier confidence `prob` is under 0.7.

    Low-confidence rows are replaced by the last entry of the module-level
    `optimals` list.
    """
    return optimals[-1] if prob < 0.7 else value

adj_bandwidth = np.vectorize(adj_bandwidth)

In [0]:
test_df['adj_bandwidth_total']= adj_bandwidth(test_df['BANDWIDTH_TOTAL'], test_df['prob'])

In [0]:
# Undo the log target and the /51 scaling from prepare_data, then cast to whole users.
# NOTE(review): astype(int) truncates toward zero rather than rounding; confirm that is intended.
test_df['MAX_USER'] = (np.exp(max_user_preds[0].data).numpy()*51).T[0].astype(int)


In [0]:
def convert_to_label(x1, x2):
    """Format one submission label: `x1` with two decimals, a space, then `x2` truncated to int."""
    return f'{x1:.2f} {int(x2)}'

convert_to_label = np.vectorize(convert_to_label)

In [0]:
test_df['label'] = convert_to_label(test_df.adj_bandwidth_total, test_df.MAX_USER)

In [0]:
test_df[['id','label']].to_csv(path/'submission.csv', index = False)

In [0]:
test_df[test_df.prob>0.7].head(100)

In [0]:
test_df.shape

In [0]:
test = pd.read_csv(path/'datasets/test_id.csv')

In [0]:
test.shape

In [0]:
# Per-server baseline: the lowest BANDWIDTH_TOTAL and MAX_USER seen in training.
bandwidth_baseline = train.groupby(['SERVER_NAME']).BANDWIDTH_TOTAL.min().reset_index()
# train.MAX_USER was divided by 51 in prepare_data; scale it back (rounding away float error)
# so the baseline label reports real user counts, as the model-based submission does.
max_user_baseline = train.groupby(['SERVER_NAME']).MAX_USER.min().mul(51).round().reset_index()



In [0]:
test_baseline = pd.merge(test[['id','SERVER_NAME','ZONE_CODE','HOUR_ID','UPDATE_TIME']], bandwidth_baseline, on = ['SERVER_NAME'], how = 'left')

In [0]:
test_baseline = pd.merge(test_baseline, max_user_baseline, on = ['SERVER_NAME'], how = 'left')

In [0]:
test_baseline.fillna(0, inplace = True)

In [0]:
test_baseline['label'] = convert_to_label(test_baseline.BANDWIDTH_TOTAL, test_baseline.MAX_USER)

In [0]:
train[train.SERVER_NAME=='SERVER_ZONE03_057']

In [0]:
# NOTE(review): writes to the same submission.csv as the model-based submission saved earlier, overwriting it.
test_baseline[['id','label']].to_csv(path/'submission.csv', index = False)

In [0]:
temp = pd.merge(test_df, test_baseline, on = 'id')

In [0]:
temp['diff'] = np.abs(temp.BANDWIDTH_TOTAL_x - temp.BANDWIDTH_TOTAL_y)/temp.BANDWIDTH_TOTAL_y*100

In [0]:
temp[temp.BANDWIDTH_TOTAL_y != 0]['diff'].mean()

In [0]:
train_df = train.iloc[:cut_idx]

In [0]:
valid_df = train.iloc[cut_idx:]

In [0]:
# Validation check of the "minimum per (server, hour)" baseline, fitted on rows before cut_idx only.
bandwidth_baseline = train_df.groupby(['SERVER_NAME','HOUR_ID']).BANDWIDTH_TOTAL.min().reset_index()
max_user_baseline = train_df.groupby(['SERVER_NAME','HOUR_ID']).MAX_USER.min().reset_index()

# Left joins keep every validation row; after the merges `_x` columns are the actual values
# from valid_df and `_y` columns are the baseline values.
valid_baseline = pd.merge(valid_df[['SERVER_NAME','ZONE_CODE','HOUR_ID','UPDATE_TIME','BANDWIDTH_TOTAL','MAX_USER']], bandwidth_baseline, on = ['SERVER_NAME','HOUR_ID'], how = 'left')
valid_baseline = pd.merge(valid_baseline, max_user_baseline, on = ['SERVER_NAME','HOUR_ID'], how = 'left')




In [0]:
valid_baseline.head()

In [0]:
# Rows whose baseline bandwidth (_y) exceeds 350.
small_mape = valid_baseline[valid_baseline.BANDWIDTH_TOTAL_y > 350]

# Mean absolute error of the baseline relative to the actual value (_x), as a fraction (not multiplied by 100).
np.mean(np.abs(small_mape['BANDWIDTH_TOTAL_y'] -small_mape['BANDWIDTH_TOTAL_x'])/small_mape['BANDWIDTH_TOTAL_x'])

In [0]:
# Daily total bandwidth of one server as an interactive figure.
# NOTE(review): neither `iplot` nor the `.iplot` accessor is imported in the visible cells
# (they look like plotly/cufflinks); this cell presumably needs those imports on a fresh kernel.
iplot(train[train.SERVER_NAME == 'SERVER_ZONE01_106'].groupby("UPDATE_TIME").BANDWIDTH_TOTAL.sum().iplot(asFigure = True))

In [0]:
set(train.SERVER_NAME).difference(test_df.SERVER_NAME)

In [0]:
set(test_df.SERVER_NAME).difference(train.SERVER_NAME)

In [0]:
set(train.SERVER_NAME.value_counts())

In [0]:
(valid_baseline.BANDWIDTH_TOTAL_y < 20).sum()

In [0]:
# Weighted average of 1 and 0.8905..., with weights 180038 and 370749.
# NOTE(review): the literals were entered by hand; where they come from is not visible in this notebook.
(180038*1 + 370749*0.8905040575809736)/(370749+180038)

In [0]:
# Same check as above with the baseline cut-off lowered to 20.
small_mape = valid_baseline[valid_baseline.BANDWIDTH_TOTAL_y > 20]

# Mean absolute error of the baseline (_y) relative to the actual value (_x), as a fraction.
np.mean(np.abs(small_mape['BANDWIDTH_TOTAL_y'] -small_mape['BANDWIDTH_TOTAL_x'] )/small_mape['BANDWIDTH_TOTAL_x'])

In [0]:
# Baseline variant: median per (server, hour) instead of the minimum, fitted on rows before cut_idx.
bandwidth_baseline = train_df.groupby(['SERVER_NAME','HOUR_ID']).BANDWIDTH_TOTAL.median().reset_index()
max_user_baseline = train_df.groupby(['SERVER_NAME','HOUR_ID']).MAX_USER.median().reset_index()

# Only the bandwidth baseline is merged here; max_user_baseline is recomputed but not joined in this cell.
valid_baseline = pd.merge(valid_df[['SERVER_NAME','ZONE_CODE','HOUR_ID','UPDATE_TIME','BANDWIDTH_TOTAL','MAX_USER']], bandwidth_baseline, on = ['SERVER_NAME','HOUR_ID'], how = 'left')

In [0]:

valid_baseline.head()