In [23]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split

In [24]:
dataset = pd.read_csv("data/clean_dataset.csv")

In [25]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35544 entries, 0 to 35543
Data columns (total 5 columns):
UPDATE_TIME        35544 non-null object
ZONE_CODE          35544 non-null object
HOUR_ID            35544 non-null int64
BANDWIDTH_TOTAL    35544 non-null float64
MAX_USER           35544 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.4+ MB


In [26]:
dataset.head(10)

Unnamed: 0,UPDATE_TIME,ZONE_CODE,HOUR_ID,BANDWIDTH_TOTAL,MAX_USER
0,2017-10-01,ZONE01,0,16096.710313,212415.0
1,2017-10-01,ZONE01,1,9374.207907,166362.0
2,2017-10-01,ZONE01,2,5606.22575,146370.0
3,2017-10-01,ZONE01,3,4155.654661,141270.0
4,2017-10-01,ZONE01,4,3253.978594,139689.0
5,2017-10-01,ZONE01,5,4310.774522,165342.0
6,2017-10-01,ZONE01,6,10015.599874,221901.0
7,2017-10-01,ZONE01,7,19780.995631,302736.0
8,2017-10-01,ZONE01,8,24344.875025,336600.0
9,2017-10-01,ZONE01,9,28651.427689,356235.0


In [27]:
zone_code_unique = pd.Series.unique(dataset.ZONE_CODE)
zone_code_unique

array(['ZONE01', 'ZONE03', 'ZONE02'], dtype=object)

In [28]:
hour_id_unique = pd.Series.unique(dataset.HOUR_ID)
hour_id_unique

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

In [29]:
update_time_unique = pd.Series.unique(dataset.UPDATE_TIME)
update_time_unique

array(['2017-10-01', '2017-10-02', '2017-10-03', '2017-10-04',
       '2017-10-05', '2017-10-06', '2017-10-07', '2017-10-08',
       '2017-10-09', '2017-10-10', '2017-10-11', '2017-10-12',
       '2017-10-13', '2017-10-14', '2017-10-15', '2017-10-16',
       '2017-10-17', '2017-10-18', '2017-10-19', '2017-10-20',
       '2017-10-21', '2017-10-22', '2017-10-23', '2017-10-24',
       '2017-10-25', '2017-10-26', '2017-10-27', '2017-10-28',
       '2017-10-29', '2017-10-30', '2017-10-31', '2017-11-01',
       '2017-11-02', '2017-11-03', '2017-11-04', '2017-11-05',
       '2017-11-06', '2017-11-07', '2017-11-08', '2017-11-09',
       '2017-11-10', '2017-11-11', '2017-11-12', '2017-11-13',
       '2017-11-14', '2017-11-15', '2017-11-16', '2017-11-17',
       '2017-11-18', '2017-11-19', '2017-11-20', '2017-11-21',
       '2017-11-22', '2017-11-23', '2017-11-24', '2017-11-25',
       '2017-11-26', '2017-11-27', '2017-11-28', '2017-11-29',
       '2017-11-30', '2017-12-01', '2017-12-02', '2017-

In [30]:
def convert_datetime_to_timestamp(strDate):
    #timestamp = datetime.strptime(strDate, "%Y-%m-%d")
    timestamp = pd.Timestamp(strDate)
    
    return timestamp.timestamp()

def get_date_in_month(strDate):
    #timestamp = datetime.strptime(strDate, "%Y-%m-%d")
    dt = pd.to_datetime(strDate)
    
    return dt.day

def get_date_in_week(strDate):
    #timestamp = datetime.strptime(strDate, "%Y-%m-%d")
    dt = pd.to_datetime(strDate)
    
    return dt.dayofweek

def hour_to_shift(hour):
    return hour // 8

dataset.ZONE_CODE = dataset.ZONE_CODE.map({'ZONE01' : 1, 'ZONE03' : 3, 'ZONE02' : 2})
dataset['DATE_IN_MONTH'] =  dataset.UPDATE_TIME.map(get_date_in_month).astype('int32')
dataset['DATE_IN_WEEK'] =  dataset.UPDATE_TIME.map(get_date_in_week).astype('int32')
dataset['DAY_SHIFT'] =  dataset.HOUR_ID.map(hour_to_shift).astype('int32')
dataset.UPDATE_TIME = dataset.UPDATE_TIME.map(convert_datetime_to_timestamp).astype('int32')
dataset.head(10)

Unnamed: 0,UPDATE_TIME,ZONE_CODE,HOUR_ID,BANDWIDTH_TOTAL,MAX_USER,DATE_IN_MONTH,DATE_IN_WEEK,DAY_SHIFT
0,1506816000,1,0,16096.710313,212415.0,1,6,0
1,1506816000,1,1,9374.207907,166362.0,1,6,0
2,1506816000,1,2,5606.22575,146370.0,1,6,0
3,1506816000,1,3,4155.654661,141270.0,1,6,0
4,1506816000,1,4,3253.978594,139689.0,1,6,0
5,1506816000,1,5,4310.774522,165342.0,1,6,0
6,1506816000,1,6,10015.599874,221901.0,1,6,0
7,1506816000,1,7,19780.995631,302736.0,1,6,0
8,1506816000,1,8,24344.875025,336600.0,1,6,1
9,1506816000,1,9,28651.427689,356235.0,1,6,1


In [31]:
dataset.corr()

Unnamed: 0,UPDATE_TIME,ZONE_CODE,HOUR_ID,BANDWIDTH_TOTAL,MAX_USER,DATE_IN_MONTH,DATE_IN_WEEK,DAY_SHIFT
UPDATE_TIME,1.0,0.03296,0.00146,0.233044,0.154116,-0.006745,0.00429,0.001073
ZONE_CODE,0.03296,1.0,4.1e-05,-0.611186,-0.755793,0.008337,-0.001165,3.6e-05
HOUR_ID,0.00146,4.1e-05,1.0,0.313592,0.181726,-0.00079,4.6e-05,0.943551
BANDWIDTH_TOTAL,0.233044,-0.611186,0.313592,1.0,0.925587,0.00347,0.035023,0.309379
MAX_USER,0.154116,-0.755793,0.181726,0.925587,1.0,0.008498,0.002566,0.183248
DATE_IN_MONTH,-0.006745,0.008337,-0.00079,0.00347,0.008498,1.0,-0.017459,-0.000486
DATE_IN_WEEK,0.00429,-0.001165,4.6e-05,0.035023,0.002566,-0.017459,1.0,-0.00012
DAY_SHIFT,0.001073,3.6e-05,0.943551,0.309379,0.183248,-0.000486,-0.00012,1.0


In [32]:
#data = dataset[['DATE_IN_MONTH', 'ZONE_CODE', 'HOUR_ID']]
data = dataset[['ZONE_CODE', 'HOUR_ID', 'DATE_IN_WEEK', "DAY_SHIFT"]]
bandwidth_total = dataset.BANDWIDTH_TOTAL
max_user = dataset.MAX_USER

In [33]:
zone_code_unique

array(['ZONE01', 'ZONE03', 'ZONE02'], dtype=object)

In [34]:
x_train_bandwidth, x_test_bandwidth, y_train_bandwidth, y_test_bandwidth = train_test_split(data, bandwidth_total,test_size=0.3)
x_train_max_user, x_test_max_user, y_train_max_user, y_test_max_user = train_test_split(data, max_user,test_size=0.3)

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
list_k = []
list_acc = []
for k in range(2,50):
    list_k.append(k)
    neigh = KNeighborsRegressor(n_neighbors = k, weights='distance', algorithm='kd_tree', n_jobs = -1)
    neigh.fit(x_train_bandwidth,  y_train_bandwidth)
    y_pred_brandwidth = neigh.predict(x_test_bandwidth)
    acc = r2_score(y_test_bandwidth, y_pred_brandwidth)
    list_acc.append(acc)
    print("The Accurcy is", acc * 100, "% for k-value", k)
index = list_acc.index(max(list_acc))
k = list_k[index]
print("\nThe optimal number of neighbors is %d with %f%%" % (k, list_acc[index]* 100))

The Accurcy is 62.62260482403252 % for k-value 2
The Accurcy is 66.46083040120257 % for k-value 3
The Accurcy is 68.06991030987976 % for k-value 4
The Accurcy is 69.07935675173135 % for k-value 5
The Accurcy is 69.93918129324332 % for k-value 6
The Accurcy is 69.76663954523382 % for k-value 7
The Accurcy is 70.1813795866617 % for k-value 8
The Accurcy is 70.45924254584767 % for k-value 9
The Accurcy is 70.97399072831304 % for k-value 10
The Accurcy is 71.2662482580534 % for k-value 11
The Accurcy is 71.56690826015326 % for k-value 12
The Accurcy is 71.62573531827611 % for k-value 13
The Accurcy is 71.5535188316895 % for k-value 14
The Accurcy is 71.8038743517114 % for k-value 15
The Accurcy is 71.96033584389505 % for k-value 16
The Accurcy is 72.00978969751914 % for k-value 17
The Accurcy is 72.08855915254854 % for k-value 18
The Accurcy is 72.24239941756834 % for k-value 19
The Accurcy is 72.20418069703258 % for k-value 20
The Accurcy is 72.22788404798355 % for k-value 21
The Accurcy 

In [36]:
neigh = KNeighborsRegressor(n_neighbors = k, weights='uniform', algorithm='auto')
neigh.fit(x_train_bandwidth, y_train_bandwidth)
y_pred_bandwidth = neigh.predict(x_test_bandwidth)

In [37]:
brandwidth_real_pred = pd.DataFrame({'y_test_brandwidth': y_test_bandwidth, 
                                     'y_pred_bandwidth': y_pred_bandwidth})

In [38]:
brandwidth_real_pred.head(10)

Unnamed: 0,y_test_brandwidth,y_pred_bandwidth
25296,5387.240972,4553.890197
23537,28769.89839,21375.014782
12057,51014.311245,42611.027368
17240,36404.445158,36663.63898
7715,4635.197335,4896.264134
28602,5313.246001,4402.566893
17910,4180.964073,5094.922251
22781,7171.092502,6021.841663
29748,5186.050554,4709.221627
7992,2628.164815,2257.628371


In [39]:
print("Bandwidth Mean squared error: %f"% mean_squared_error(y_test_bandwidth, y_pred_bandwidth))
print("Bandwidth Score: %f"% (r2_score(y_test_bandwidth, y_pred_bandwidth) * 100))
print("Bandwidth Score: %f"% (neigh.score(x_test_bandwidth, y_test_bandwidth) * 100))

Bandwidth Mean squared error: 152971265.543421
Bandwidth Score: 73.006439
Bandwidth Score: 73.006439


In [40]:
list_k = []
list_acc = []
for k in range(2,50):
    list_k.append(k)
    neigh = KNeighborsRegressor(n_neighbors = k, weights='uniform', algorithm='auto')
    neigh.fit(x_train_max_user,  y_train_max_user)
    y_pred_max_user = neigh.predict(x_test_max_user)
    acc = r2_score(y_test_max_user, y_pred_max_user)
    list_acc.append(acc)
    print("The Accurcy is", acc * 100, "% for k-value", k)
index = list_acc.index(max(list_acc))
k = list_k[index]
print("\nThe optimal number of neighbors is %d with %f%%" % (k, list_acc[index]* 100))

The Accurcy is 72.99030985024461 % for k-value 2
The Accurcy is 76.76786111190816 % for k-value 3
The Accurcy is 78.36272873584662 % for k-value 4
The Accurcy is 78.80212978426356 % for k-value 5
The Accurcy is 79.48289956697559 % for k-value 6
The Accurcy is 79.70661806867571 % for k-value 7
The Accurcy is 79.81534164856195 % for k-value 8
The Accurcy is 79.93477631818557 % for k-value 9
The Accurcy is 80.34720917217298 % for k-value 10
The Accurcy is 80.48425452872972 % for k-value 11
The Accurcy is 80.72812153994474 % for k-value 12
The Accurcy is 80.89274768237358 % for k-value 13
The Accurcy is 81.00793125410439 % for k-value 14
The Accurcy is 81.14125283747285 % for k-value 15
The Accurcy is 81.18392143648057 % for k-value 16
The Accurcy is 81.22645902399161 % for k-value 17
The Accurcy is 81.25997272605396 % for k-value 18
The Accurcy is 81.28759833348401 % for k-value 19
The Accurcy is 81.37473730955314 % for k-value 20
The Accurcy is 81.39632497730835 % for k-value 21
The Accu

In [41]:
maxUser_knn = KNeighborsRegressor(n_neighbors = k, weights='uniform', algorithm='auto')
maxUser_knn.fit(x_train_max_user, y_train_max_user)
y_pred_max_user = maxUser_knn.predict(x_test_max_user)
y_pred_max_user = np.round(y_pred_max_user)

In [42]:
maxUser_real_pred = pd.DataFrame({'y_pred_maxUser': y_pred_max_user, 'y_test_max_user': y_test_max_user})
maxUser_real_pred.head()

Unnamed: 0,y_pred_maxUser,y_test_max_user
11319,42732.0,33762.0
23413,29233.0,28713.0
33019,47502.0,66045.0
21055,731469.0,842520.0
9608,599179.0,433398.0


In [43]:
print("MaxUser Mean squared error: %f"% mean_squared_error(y_pred_max_user, y_test_max_user))
print("MaxUser Score: %f"% r2_score(y_test_max_user, y_pred_max_user))
print("MaxUser Score: %f"% maxUser_knn.score(x_test_max_user, y_test_max_user))

MaxUser Mean squared error: 13638313217.466618
MaxUser Score: 0.816364
MaxUser Score: 0.816364
