In [126]:
#Import packages
import random
import os, sys
import pytz
import pandas as pd
import numpy as np
from datetime import datetime, date, time, timedelta
import re
from IPython.display import Image
import itertools
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

In [3]:
# Make the local `power-mon` checkout importable (presumably where the `pypower` package lives).
# NOTE(review): machine-specific absolute path -- adjust for your environment.
POWER_MON_ROOT = '/Users/dmatekenya/PycharmProjects/power-mon'
# Guard the append so re-running this cell does not add duplicate entries to sys.path.
if POWER_MON_ROOT not in sys.path:
    sys.path.append(POWER_MON_ROOT)

In [33]:
#Pypower modules
from pypower import prediction_models as pred
from pypower import data_utils as ut

In [11]:
# Folder holding the processed SMS files. The trailing slash matters: the file name is
# appended to this string by plain concatenation when the CSV is read.
# NOTE(review): machine-specific absolute path -- adjust for your environment.
data_dir = "/Users/dmatekenya/Google Drive/World-Bank/electricity_monitoring/01.data/processed_sms/"

In [127]:
# Columns to read from sms_rect_hr.csv
cols_to_use = [
    'box_id', 'psu', 'lon', 'lat',
    'str_datetime_sent_hr', 'hour_sent',
    'event_type_str', 'power_state',
]

# Read the file, parse the timestamp column, and give that column its working name
sms2 = (
    pd.read_csv(data_dir + 'sms_rect_hr.csv', usecols=cols_to_use,
                parse_dates=['str_datetime_sent_hr'])
    .rename(columns={'str_datetime_sent_hr': 'datetime_sent_hr'})
)

# Remove missing values (rows whose power_state is -1)
sms2 = sms2.loc[sms2['power_state'] != -1]

**EVALUATING NEAREST NEIGHBOR MODEL**
=======================
==================================================================

1. * Pick a test box*
----------------------

In [128]:
# The box whose events will be held out and predicted
test_box_id = 1005

# Keep only the rows reported by that box
data_test_bx = sms2.loc[sms2['box_id'] == test_box_id]

# Every event timestamp for the box; the random test dates are sampled from this list
event_dates = data_test_bx['datetime_sent_hr'].tolist()

2. * Decide how many tests to do *
--------------------------------------
- A complete leave-one-out evaluation is not feasible because of the time it would take, so only a sample of events is tested.

In [100]:
# Share of the box's events to test, and the bounds on the resulting number of tests
prop_test = 0.2
max_test_cases = 250
min_test_cases = 50

n_box_events = data_test_bx.shape[0]
num_tests = int(n_box_events * prop_test)

# Too few events only triggers a message; execution carries on with this box
if num_tests < min_test_cases:
    print('Lets choose another box!!!')

# Never run more than max_test_cases tests
num_tests = min(num_tests, max_test_cases)

print('We will test {} events out of {} events for this box'.format(num_tests, n_box_events))

We will test 250 events out of 5530 events for this box


3. * Randomly pick test dates *
--------------------------------

In [129]:
# Fixed seed so the same test dates are drawn on every run (keeps the train/test split reproducible).
RANDOM_SEED = 42
# A dedicated generator keeps the draw independent of any other use of the global `random` state.
test_dates = random.Random(RANDOM_SEED).sample(event_dates, num_tests)

# Test events: the test-box rows whose timestamp is one of the sampled dates
test_df = data_test_bx[data_test_bx['datetime_sent_hr'].isin(test_dates)]

4. * Remove all test events from training data*
---------------------------------------------

In [130]:
# All events from every box other than the test box
sms2_without_test_box = sms2[sms2.box_id != test_box_id]

# Test-box dates that were NOT sampled for testing
to_keep = list(set(event_dates) - set(test_dates))

# Test-box events on those dates stay available for training
test_box_to_keep = data_test_bx[data_test_bx['datetime_sent_hr'].isin(to_keep)]

# Stack the two parts. pd.concat is used because DataFrame.append was removed in pandas 2.0;
# with default arguments it yields the same frame (original index labels are kept).
train_df = pd.concat([sms2_without_test_box, test_box_to_keep])

5. *Just to be sure, check that test events aren't in the training dataset*
-------------------------------------------------------------------------
Note: Only for the test-box

In [131]:
# Rows of the test box that remain in the training data
train_df_test_box = train_df[train_df.box_id == test_box_id]

# Timestamps of those rows, built once as a set for fast membership tests
# (instead of converting the column to a list for every test date).
train_dates_test_box = set(train_df_test_box.datetime_sent_hr)

# Test dates that still occur in the training data. The loop variable is named `test_date`
# so it does not shadow `date` imported from the datetime module.
leaked_dates = [test_date for test_date in test_dates if test_date in train_dates_test_box]

# Report success only when nothing leaked
if leaked_dates:
    print('WAIT A MINUTE, HOW COME TEST EVENTS ARE STILL IN TRAINIGNG DATA')
    print('{} test dates found in the training data'.format(len(leaked_dates)))
else:
    print('No leakages')

No leakages


6. * Create a nearest neighbor model*
--------------------------------------
This model has the following parameters:
 
 -*target*: what to predict(either power_state or event_type)
 
 -*neighbors*: number of nearby boxes to include, chosen by location (e.g., with 1 neighbor, 2 boxes are used: the nearest box plus the test box itself)
 
 -*time-window*: moving window to search from
 
 -*direction*: whether to pool forward-looking or backward-looking events only (centred on test date)
 
 -*how*: How to make prediction, default is frequent
 
 -*train_data*: the training data

In [132]:
# ----------------MODEL CONFIGURATION-----------------------------
predictor_params = {
    'neighbors': 1,
    'time-window': 7,
    'direction': 'both',
    'how': 'frequent',
    'target': 'power_state',
}

# Build the nearest-neighbor model object on the training data
clf = pred.ImputationNearestNeighbor(
    data=train_df,
    target=predictor_params['target'],
    neighbors=predictor_params['neighbors'],
    how=predictor_params['how'],
    time_window=predictor_params['time-window'],
    direction=predictor_params['direction'],
)

# Hand the boxes file (location details for boxes) to the model
box_file = "/Users/dmatekenya/Google Drive/World-Bank/electricity_monitoring/01.data/Boxes.csv"
clf.generate_box_metadata(box_file=box_file)

# [lat, lon] of the test box, read from its first row
test_box_rows = data_test_bx[data_test_bx.box_id == test_box_id]
box_lat_lon = [test_box_rows.lat.values[0], test_box_rows.lon.values[0]]

7. * We are now ready to make predictions using the model *
---------------------------------------------------------

7.1 Select neighbors
-------------------------

In [122]:
# Candidate neighbors: every box except the test box. Take an explicit copy so the `dist`
# column added below is written to an independent frame rather than to a slice of
# clf.BOX_METADATA (this replaces the old `is_copy = False` workaround, an attribute that
# no longer exists in pandas >= 1.0).
bx = clf.BOX_METADATA[clf.BOX_METADATA.box_id != test_box_id].copy()

# Distance between the target (test box) and each candidate box
target_loc = box_lat_lon
bx['dist'] = bx.apply(lambda row: ut.calculate_distance([row['lat'], row['lon']], target_loc), axis=1)

# Sort once by distance and reuse the result for both the selection and the printout
bx_by_dist = bx.sort_values(by=['dist'], ascending=True)

# Top-n nearest neighbors
nearest_n = bx_by_dist[:clf.neighbors]
print('The distances...')
print()
print(bx_by_dist.head())

neighbors = list(nearest_n.box_id.values)
print()

neighbors.append(test_box_id)  # since we also want to learn from the same box
print('Given number of neighbors = {}, we have these neighbors: {}'.format(clf.neighbors, neighbors))

print('COMPARE TO OUT TEST BOX DETAILS')
test_box_psu = train_df[train_df.box_id == test_box_id].iloc[0].psu
print('Test box id==> {}, test-box-psu==> {}, test-box-lat/lon==> {}'.format(test_box_id, test_box_psu, box_lat_lon))

The distances...

     box_id  psu        lon        lat      dist
267    1271   94  69.385761  37.529461  0.000000
253    1257   92  69.395822  37.491284  4.341864
209    1212   92  69.395822  37.491284  4.341864
107    1108   93  69.429172  37.575373  6.387610
135    1136   93  69.429172  37.575373  6.387610

Given number of neighbors = 1, we have these neighbors: [1271, 1005]
COMPARE TO OUT TEST BOX DETAILS
Test box id==> 1005, test-box-psu==> 94, test-box-lat/lon==> [37.529461095977027, 69.385761185057504]


7.2 Generate training data
---------------------------
This discards all data except events that fall within [test_date - window_length, test_date + window_length].
Also, we only keep data for the neighbors.

In [133]:
# Use the first test event as the worked example
first_test_event = test_df.iloc[0]
prediction_date = first_test_event['datetime_sent_hr']
print(first_test_event)

# Ask the model for the training rows relevant to that date and the chosen boxes
train_data = clf.generate_train_data(
    target_date=prediction_date,
    raw_data=train_df,
    boxes=neighbors,
)

box_id                             1005
psu                                  94
lon                             69.3858
lat                             37.5295
datetime_sent_hr    2017-01-12 06:00:00
hour_sent                             6
event_type_str                  pon_mon
power_state                           1
Name: 1541584, dtype: object


In [150]:
# Check 1: which boxes appear in the training data, and how many rows each contributes
rows_per_box = train_data.box_id.value_counts()
print('Checking that training data only has the 2 boxes')
print('---------------------------------------------------')
print(rows_per_box)

# Check 2: time span of the training data relative to the test date
window_days = predictor_params['time-window']
print()
print('Checking that training data is within the time window centred on test date')
print('-----------------------------------------------------------------------------')
print()
print('#### Test date ==> {}, window-length ==> {} days #######'.format(prediction_date, window_days))
print(train_data.datetime_sent_hr.describe())

Checking that training data only has the 2 boxes
---------------------------------------------------
1005    170
1271     62
Name: box_id, dtype: int64

Checking that training data is within the time window centred on test date
-----------------------------------------------------------------------------

#### Test date ==> 2017-01-12 06:00:00, window-length ==> 7 days #######
count                     232
unique                    232
top       2017-01-18 16:00:00
freq                        1
first     2017-01-05 06:00:00
last      2017-01-19 06:00:00
Name: datetime_sent_hr, dtype: object


7.2 * Generate Event Freqs*
--------------------------------

In [160]:
# Number of training events for each (hour of day, power state) pair, stored in a `count` column
hr_cnts = (
    train_data
    .groupby(['hour_sent', 'power_state'])['power_state']
    .count()
    .to_frame(name='count')
)
# Flatten the group keys back into ordinary columns
event_freqs = hr_cnts.reset_index()

7.3 * Return the event with most counts*
-------------------------------------

In [162]:
# Hour of day of the event being predicted
pred_hr = prediction_date.hour

# Frequency rows for that hour (one row per observed target value)
events_hr = event_freqs[event_freqs.hour_sent == pred_hr]

# Predict the target value with the highest count. A column-wise max() would instead return
# the largest target value present at this hour, regardless of how often it occurred.
if events_hr.empty:
    # No training events at this hour: keep the NaN that max() on an empty frame produced
    predicted_event = np.nan
else:
    # idxmax() gives the row label of the largest count (the first such row on ties)
    predicted_event = events_hr.loc[events_hr['count'].idxmax(), clf.target_var]

6

In [165]:
# Run the model's own predict() for the same test event, box and location.
# NOTE(review): presumably this performs steps 7.1-7.3 internally -- confirm in pypower.prediction_models.
# This rebinds `predicted_event`, replacing the value computed by hand in the previous cell.
predicted_event = clf.predict(prediction_date=prediction_date, box_id=test_box_id, target_loc=box_lat_lon)