In [1]:
import subprocess
import pandas as pd
import numpy as np
import requests
import json
import os
import paho.mqtt.client as client
from pathlib import Path
from tqdm import tqdm
import time
import traceback

In [2]:
def get_prereq_results():
#     print("prereq")
    body = {"algorithm": "function_approximation"}
    url = 'http://0.0.0.0:5000/pre_req'
    r = requests.post(url, json=body)
    prereq_res = r.json()
#     print(prereq_res.keys())
    return prereq_res

In [3]:
def get_phase0_results():
    body = {"algorithm": "function_approximation","chunk_count": 30}
    url = 'http://0.0.0.0:5000/phase0'
    r = requests.post(url, json=body)
    phase0_res = r.json()
#     print(phase0_res.keys())
    return phase0_res

In [4]:
def get_phase1_results():
    body = {
        "algorithm": "function_approximation",
        "partition_size": 100,
        "chunk_size": 100,
        "net_sizes": [5],
        "lr": [0.1],
        "no_data_passes": 20,
        "SIZE_OF_PARTITIONS": 10,
        "no_chunk_passes": 20,
        "neigh_rate": 0.8
    }
    url = 'http://0.0.0.0:5000/phase1'
    r = requests.post(url, json=body)
    phase1_res = r.json()
#     print(phase1_res)
    return phase1_res
    

In [5]:
def get_phase2_results(
    dataset,
    train_dataset_path,
    validation_dataset_path,
    test_dataset_path,
    partition_size=100,
                       chunk_size=100,
                       net_sizes=[3,4,5],
                       lr=[0.1, 0.05, 0.075],
                       num_passes=100,
                       top_ranks=100,
                       select_fs_cnt=100,
                       train_split=0.9,
                       convergence_threshold=0.05,
                       stack_passes=100
                      ):
    body = {
        "algorithm": "function_approximation",
        "partition_size": partition_size,
        "chunk_size": chunk_size,
        "net_sizes": net_sizes,
        "lr": lr,
        "no_passes": num_passes,
        "top_ranks": top_ranks,
        "select_fs_cnt": select_fs_cnt,
        "train_split": train_split,
        "boost_trials": 1,
        "convergence_threshold": convergence_threshold,
        "lambda": [1],
        "stack_passes": stack_passes,
        "weight_init_std": 0.2,
        'train_dataset_path': train_dataset_path,
        'validation_dataset_path': validation_dataset_path,
        'test_dataset_path': test_dataset_path,
        'dataset': dataset
    }
    
    url = 'http://0.0.0.0:5000/phase2'
    r = requests.post(url, json=body)
    phase2_res = r.json()
#     print(phase2_res.keys())
    return phase2_res


In [8]:
def process_results(data_dir, dataset, seed, df):
    proc, gateway_proc, details = None, None, {}
    try:
        gateway_proc = subprocess.Popen(['python', os.path.join(data_directory, 'gateway_simulation.py')])
        # backend_proc = subprocess.Popen(['python', '/home/asim/ssriva59/MainAppCuda/main.py'])
        proc = subprocess.Popen(['python', os.path.join(data_directory, 'ECG_Stream_V2.py'), dataset, str(seed)])
        time.sleep(10)
        
        prereq_res = get_prereq_results()
        time.sleep(10)
        
        phase0_res = get_phase0_results()
        time.sleep(10)
        
        phase1_res = get_phase1_results()
        time.sleep(10)
        
        phase2_res = get_phase2_results(
            dataset,
            train_dataset_path=os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv'),
            validation_dataset_path=os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv'),
            test_dataset_path=os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv'),
            net_sizes=[3], #, 4, 5, 6, 7],
            lr=[0.1] # , 0.05, 0.075, 0.5]
        )
        time.sleep(10)
        
        features = min(len(phase1_res['rankings']), 15)
        details = {
            'dataset': dataset,
            'seed': seed,
            'train_rmse': phase2_res['trainError'],
            'test_rmse': phase2_res['testError'],
            'KernelTime': phase1_res['kernel_time'] + phase2_res['kernel_time'],
            'hostTime': phase0_res['host_time'] + phase1_res['host_time'] + phase2_res['host_time']
        }
        
        details['val_rmse'] = phase2_res['global_min_val_rmse']
        details['phase1_chunks_for_convergence'] = phase2_res['chunks_for_convergence_phase1']
        details['phase2_chunks_for_convergence'] = phase2_res['chunks_for_convergence_phase2']
        details['phase3_chunks_for_convergence'] = phase2_res['chunks_for_convergence_phase3']
        details['best_feature_list'] = phase2_res['best_feature_list']
        details['best_val_lr'] = phase2_res['best_val_lr']
        details['best_fs_features'] = phase2_res['best_fs_num_features']
        details['val_rmses'] = phase2_res['val_rmses']
        details['train_rmses'] = phase2_res['train_rmses']
        details['test_rmses'] = phase2_res['test_rmses']
        details['best_chunk'] = phase2_res['best_chunk']
        details['best_model_active_gaussian_weights'] = phase2_res['best_model_active_gaussian_weights']
        details['best_model_active_linear_weights'] = phase2_res['best_model_active_linear_weights']
        details['bound_hitting_gaussian_weights'] = phase2_res['bound_hitting_gaussian_weights']
        details['score'] = phase2_res['score']
        details['scores'] =  phase2_res['scores']
        details['min_test_rmse'] = phase2_res['min_test_rmse']
        details['min_train_rmse'] = phase2_res['min_train_rmse']
        details['min_validation_rmse'] = phase2_res['min_validation_rmse']
        details['min_test_rmse_model_score'] = phase2_res['min_test_rmse_model_score']
        details['min_test_rmse_lr'] = phase2_res['min_test_rmse_lr']
        details['min_test_rmse_fs'] = phase2_res['min_test_rmse_fs']
        details['min_test_rmse_chunk'] = phase2_res['min_test_rmse_chunk']
        
        imp_features = []
        for rank in phase1_res['rankings']:
            imp_features.append(rank['Feature'])
        details['important_features'] = imp_features
        
        # backend_proc.kill()
        proc.kill()
        gateway_proc.kill()
    except:
        traceback.print_exc()
        # if backend_proc:
        #     backend_proc.kill()
        if proc:
            proc.kill()
        if gateway_proc:
            gateway_proc.kill()
    return details

train_data, test_data, list_of_feature_names = [], [], []
data_directory = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"
datasets = [
    "Facebook_data", # -> done
    # "Features_TestSet", # -> done
    # "House_Price_Adv_Regression", # -> done
    # "Instant_Liking",
    # "Insurance", # -> done
#     # "Isolet" need to redo cuz of some error in RF,
    # "new_data_trans", # -> done
    # "OnlineNewsPopularity", # -> done
    # "ParkinsonData", # -> done
    # "Sberbank_Russian_Housing_Market", # -> done
    # "slice_localization_data", # -> done
    # "Telecom_data", # -> done
    # "yearMSD_new", # -> done
    # "arrhythmia", # -> done
#     # "Big_mart_sales",
    # "blogData", # -> done
    # "communities", # -> done
    # "dengue_features", # -> done
    # "ECG0_p02",
    # "ENERGY_DATA_COMPLETE" # -> done
]

df_column_order = [
    'dataset',
    'seed',
    'hostTime',
    'KernelTime',
    'best_chunk',
    'test_rmse',
    'test_rmses',
    'score',
    'scores',
    'train_rmse',
    'train_rmses',
    'val_rmse',
    'val_rmses',
    'best_fs_features',
    'best_val_lr',
    'best_feature_list',
    'important_features',
    'best_model_active_gaussian_weights',
    'best_model_active_linear_weights',
    'bound_hitting_gaussian_weights',
    'phase1_chunks_for_convergence',
    'phase2_chunks_for_convergence',
    'phase3_chunks_for_convergence',
    'min_test_rmse',
    'min_train_rmse',
    'min_validation_rmse',
    'min_test_rmse_model_score',
    'min_test_rmse_lr',
    'min_test_rmse_fs',
    'min_test_rmse_chunk'
]

for index, dataset in enumerate(datasets):
    print(dataset)
    seeds = [50] #, 50, 100, 150, 200, 250, 300, 350, 400, 450]
    df = pd.DataFrame()
    for s in tqdm(seeds):
        details = process_results(data_directory, dataset, s, df)
        df = df.append(details, ignore_index=True)
        filepath = Path('%s_NL45_conv4_FeaturesMax_RandomInit_Epochs20.csv' % dataset)
        filepath.parent.mkdir(parents=True, exist_ok=True)
        df[df_column_order].to_csv(filepath, index=False)
        
# Only for ECG

# dataset = 'ECG'
# print(dataset)
# seeds = [1]
# df = pd.DataFrame()
# for s in tqdm(seeds):
#     details = process_results(data_directory, dataset, s, df)
#     df = df.append(details, ignore_index=True)
#     filepath = Path('%s-4-5.csv' % dataset)
#     filepath.parent.mkdir(parents=True, exist_ok=True)
#     df.to_csv(filepath, index=False)


Facebook_data


  0%|                                                                                                                                                                                                             | 0/1 [00:00<?, ?it/s]

['Page.total.likes', 'Category', 'Post.Month', 'Post.Weekday', 'Post.Hour', 'Paid', 'Lifetime.Post.Total.Reach', 'Lifetime.Post.Total.Impressions', 'Lifetime.Engaged.Users', 'Lifetime.Post.Consumers', 'Lifetime.Post.Consumptions', 'Lifetime.Post.Impressions.by.people.who.have.liked.your.Page', 'Lifetime.Post.reach.by.people.who.like.your.Page', 'Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post', 'comment', 'like', 'share', 'Type_Link', 'Type_Photo', 'Type_Status', 'Type_Video', 'Total.Interactions']
ECG connected
connecting to broker
Subscribing to all topics
adding topic to latest_elements 


Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 665, in urlopen
    httplib_response = self._make_request(
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 421, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 416, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/lib/python3.8/http/client.py", line 1348, in getresponse
    response.begin()
  File "/usr/lib/python3.8/http/client.py", line 316, in begin
    version, status, reason = self._read_status()
  File "/usr/lib/python3.8/http/client.py", line 285, in _read_status
    raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/u

KeyError: "None of [Index(['dataset', 'seed', 'hostTime', 'KernelTime', 'best_chunk', 'test_rmse',\n       'test_rmses', 'score', 'scores', 'train_rmse', 'train_rmses',\n       'val_rmse', 'val_rmses', 'best_fs_features', 'best_val_lr',\n       'best_feature_list', 'important_features',\n       'best_model_active_gaussian_weights',\n       'best_model_active_linear_weights', 'bound_hitting_gaussian_weights',\n       'phase1_chunks_for_convergence', 'phase2_chunks_for_convergence',\n       'phase3_chunks_for_convergence', 'min_test_rmse', 'min_train_rmse',\n       'min_validation_rmse', 'min_test_rmse_model_score', 'min_test_rmse_lr',\n       'min_test_rmse_fs', 'min_test_rmse_chunk'],\n      dtype='object')] are in the [columns]"

In [None]:
def process_missing_results(data_dir, dataset, seed, df):
    proc, gateway_proc, details = None, None, {}
    try:
        gateway_proc = subprocess.Popen(['python', os.path.join(data_directory, 'gateway_simulation.py')])
        backend_proc = subprocess.Popen(['python', '/home/asim/ssriva59/MainAppCuda/main.py'])
        proc = subprocess.Popen(['python', os.path.join(data_directory, 'ECG_Stream_V2.py'), dataset, str(seed)])
        time.sleep(10)
        
        prereq_res = get_prereq_results()
        time.sleep(10)
        
        phase0_res = get_phase0_results()
        time.sleep(10)
        
        phase1_res = get_phase1_results()
        time.sleep(10)
        
        phase2_res = get_phase2_results(
            train_dataset_path=os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_train.csv'),
            validation_dataset_path=os.path.join(data_dir, dataset, "Train", dataset + '_seed_' + str(seed) + '_validation.csv'),
            test_dataset_path=os.path.join(data_dir, dataset, "Test", dataset + '_Test_seed' + str(seed) + '_modified.csv'),
            net_sizes=[3,4,5,6]
        )
        time.sleep(10)
        
        features = min(len(phase1_res), 15)
        details = {
            'dataset': dataset,
            'seed': seed,
            'train_rmse': phase2_res['trainError'],
            'test_rmse': phase2_res['testError'],
            'KernelTime': phase2_res['kernelTime'],
            'hostTime': phase2_res['hostTime']
        }
        
        details['global_min_val_rmse'] = phase2_res['global_min_val_rmse']
        details['phase1_chunks_for_convergence'] = phase2_res['chunks_for_convergence_phase1']
        details['phase2_chunks_for_convergence'] = phase2_res['chunks_for_convergence_phase2']
        details['phase3_chunks_for_convergence'] = phase2_res['chunks_for_convergence_phase3']
        details['best_feature_list'] = phase2_res['best_feature_list']
        details['best_val_lr'] = phase2_res['best_val_lr']
        details['best_val_fs'] = phase2_res['best_val_fs']
        details['val_rmses'] = phase2_res['val_rmses']
        details['train_rmses'] = phase2_res['train_rmses']
        details['test_rmses'] = phase2_res['test_rmses']
        details['best_chunk'] = phase2_res['best_chunk']
        
        imp_features = []
        for i in range(features):
            imp_features.append(phase1_res[i]['Feature'])
        details['important_features'] = imp_features
        proc.kill()
        backend_proc.kill()
        gateway_proc.kill()
    except:
        traceback.print_exc()
        if backend_proc:
            backend_proc.kill()
        if proc:
            proc.kill()
        if gateway_proc:
            gateway_proc.kill()
    return details


train_data, test_data, list_of_feature_names = [], [], []
data_directory = "/home/asim/ssriva59/setup-stuff/gateway_and_dataset"
datasets = [
    # "Facebook_data",
    # "Features_TestSet",
    # "House_Price_Adv_Regression",
    # "Instant_Liking",
    # "Insurance",
    # "Isolet",
    # "new_data_trans",
    # "OnlineNewsPopularity",
    # "ParkinsonData",
    # "Sberbank_Russian_Housing_Market",
    "slice_localization_data",
    # "Telecom_data",
    # "yearMSD_new",
    # "arrhythmia",
    # "Big_mart_sales",
    "blogData",
    # "communities",
    # "dengue_features",
    # "ECG0_p02",
    # "ENERGY_DATA_COMPLETE"
]

dataset_seed = {
    "Facebook_data": [150],
    "Features_TestSet": [400],
    "House_Price_Adv_Regression": [100],
    "Insurance": [100],
    "OnlineNewsPopularity": [300],
    "new_data_trans": [50, 250],
    "ParkinsonData": [400],
    "arrhythmia": [50, 100, 150],
    "communities": [450],
    "dengue_features": [50, 300],
    "new_data_trans": [450],
    "Insurance": [200],
    "slice_localization_data": [300, 350, 400, 450],
    "Telecom_data": [250, 400],
    "blogData": [200, 250, 300, 350, 400, 450]
}

# missing seed here
# seeds = [250]
for dataset in datasets:
    print(dataset)
    df = pd.DataFrame()
    for s in tqdm(dataset_seed[dataset]):
        details = process_missing_results(data_directory, dataset, s, df)
        df = df.append(details, ignore_index=True)
        filepath = Path('%s_missing-4-5.csv' % dataset)
        filepath.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filepath, index=False)