In [1]:
import sys
import pandas as pd
import numpy as np
from time import time
import line_profiler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from IPython.core.interactiveshell import InteractiveShell
# Show the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Directory of the LTJK mapping app; presumably where the project-local helper
# modules imported further down live -- confirm.
app_dir = '/Users/dmatekenya/LTJK_mapping_app/LTJK_mapping_app'
# Only append when missing, so re-running this cell does not pile up duplicate
# entries on sys.path.
if app_dir not in sys.path:
    sys.path.append(app_dir)

In [3]:
#---------SET UP WORKING DIR-------------#
# Folder holding the CSV inputs. The trailing slash matters: file names are
# concatenated straight onto this string.
data_dir = '/Users/dmatekenya/Google Drive/World-Bank/electricity_monitoring/01.data/'

# Full paths of the three input files.
sms, sms2, boxes = [data_dir + csv_name for csv_name in ('sms.csv', 'sms2.csv', 'Boxes.csv')]

In [4]:
import data_utils as ut
import model_evaluation as mod_ev
import imputation_models as impute

**GENERATE REGULARISED DATASET**
=================================

In [None]:
# Box metadata: (column name in Boxes.csv, short name used from here on),
# listed in the column order wanted for the resulting frame.
box_fields = [('ClusterId', 'psu'), ('LONG', 'lon'), ('LAT', 'lat'), ('BoxID', 'device_id')]
raw_names = [raw for raw, short in box_fields]

# Read only those columns, put them in the order above, then rename them.
bx = pd.read_csv(boxes, usecols=raw_names)[raw_names].rename(columns=dict(box_fields))

# SMS records, with the receive timestamp parsed as a datetime.
sms2_df = pd.read_csv(data_dir + 'sms2.csv', parse_dates=['datetime_rcvd'])

In [None]:
# Passed as the third argument to generate_hr_regular_dataframe and reported
# below as the number of boxes processed.
test_num = 1

# Reload the SMS data *before* starting the clock, so the reported duration
# covers only the regular-dataset generation and not CSV parsing.
sms2_df = pd.read_csv(data_dir + 'sms2.csv', parse_dates=['datetime_rcvd'])

start = time()
df_reg = ut.generate_hr_regular_dataframe(sms2_df, bx, test_num)
end = time()
print('Generating regular dataset for %s boxes took %.3f seconds' % (test_num, end - start))

**Now that we have the dataset, we can evaluate some simple predictors**
==========================================================================
1. The target prediction variable here is message type

In [5]:
# Load the regularised dataset (all boxes, judging by the file name), parsing
# both timestamp columns as datetimes.
# NOTE(review): nothing in the visible notebook writes this file -- presumably it
# was saved from a full run of ut.generate_hr_regular_dataframe; confirm.
reg_all_boxes_csv = data_dir + 'reg_hr_all_boxes.csv'
df_reg = pd.read_csv(reg_all_boxes_csv, parse_dates=['datetime_rcvd', 'datetime_rcvd_hr'])

In [9]:
# Number of rows per value of the `msg` column.
df_reg['msg'].value_counts()

pon_mon      384081
pfail_mon    298384
pfail        154538
pback        138825
Name: msg, dtype: int64

In [14]:
# Keep only the rows whose msg is 'pfail' or 'pback'.
# .copy() makes df_reg2 an independent frame rather than a flagged slice of
# df_reg, so code that later adds columns to it does not trigger pandas'
# SettingWithCopyWarning.
df_reg2 = df_reg[df_reg['msg'].isin(['pfail', 'pback'])].copy()


**EVALUATE NEAREST BOX PREDICTOR**
=====================================

In [16]:
# Nearest-box predictor, second argument False.
num_psu, acc_all = mod_ev.batch_evaluation_nearest_predictor(df_reg2, False)

# Summary statistics of the returned accuracies.
worst, best = np.min(acc_all), np.max(acc_all)
middle, spread = np.median(acc_all), np.std(acc_all)

print('Total Clusters Evaluated: %s' % num_psu)
print('Accuracy (%)-summary across all boxes : ')
print('Worst case: %s' % worst,
      ' | Best case : %s' % best,
      ' | Median : %s' % middle,
      ' | Std deviation : %s' % spread)

Total Clusters Evaluated: 137
Accuracy (%)-summary across all boxes : 
Worst case: 28.5714285714  | Best case : 100.0  | Median : 60.7460404111  | Std deviation : 12.5684551195


In [18]:
# Nearest-box predictor again, this time with the second argument True.
num_psu, acc_all = mod_ev.batch_evaluation_nearest_predictor(df_reg2, True)

# Summary statistics of the returned accuracies.
worst, best = np.min(acc_all), np.max(acc_all)
middle, spread = np.median(acc_all), np.std(acc_all)

print('Total Clusters Evaluated: %s' % num_psu)
print('Accuracy (%)-summary across all boxes : ')
print('Worst case: %s' % worst,
      ' | Best case : %s' % best,
      ' | Median : %s' % middle,
      ' | Std deviation : %s' % spread)

Total Clusters Evaluated: 137
Accuracy (%)-summary across all boxes : 
Worst case: 16.6666666667  | Best case : 50.0  | Median : 26.8382352941  | Std deviation : 6.44994014914


**Although using the nearest box doesn't give very good accuracy, it's better than random guessing**

**EVALUATE MOVING WINDOW PREDICTOR**
====================================

In [21]:
# Temporal-window predictor; 50, 0.25 and 2 are passed through positionally
# (their meaning is defined by the helper in model_evaluation).
# NOTE(review): the recorded output of this cell shows 'division by zero' and
# 'invalid type comparison' messages and a worst case of 0.0 -- presumably some
# clusters failed inside the helper; confirm how those are scored.
num_psu, acc_all_wind = mod_ev.batch_evaluation_temporal_window_predictor(df_reg2, 50, 0.25, 2)

# Summary statistics of the returned accuracies.
worst, best = np.min(acc_all_wind), np.max(acc_all_wind)
middle, spread = np.median(acc_all_wind), np.std(acc_all_wind)

print('Total Clusters Evaluated: %s' % num_psu)
print('Accuracy (%)-summary across all boxes : ')
print('Worst case: %s' % worst,
      ' | Best case : %s' % best,
      ' | Median : %s' % middle,
      ' | Std deviation : %s' % spread)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['hr'] = df['datetime_rcvd_hr'].apply(lambda x: x.hour)


invalid type comparison
invalid type comparison
invalid type comparison
invalid type comparison
division by zero
invalid type comparison
invalid type comparison
invalid type comparison
invalid type comparison
invalid type comparison
division by zero
invalid type comparison
division by zero
invalid type comparison
division by zero
division by zero
Total Clusters Evaluated: 140
Accuracy (%)-summary across all boxes : 
Worst case: 0.0  | Best case : 74.358974359  | Median : 41.4818945159  | Std deviation : 22.6070503613


In [22]:
# Number of rows per value of the `pwr_state` column.
df_reg['pwr_state'].value_counts()

 1.0    473867
-1.0    296320
 0.0    205641
Name: pwr_state, dtype: int64

**EVALUATE OTHER PREDICTORS USING ALL FEATURES (TEMPORAL)**
===========================================================

*Logistic regression was used to predict missing values, with an accuracy of around 80 percent.*

**OTHER OUTPUTS**
===================
1. Computed variables for visualisation in Tableau