In [None]:
from sklearn import linear_model
import numpy as np
from metrics import nse, kge
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import json
import glob
import os

In [None]:
# Root of the benchmark tree. The default is the original location; set the
# BENCHMARK_ROOT environment variable to run the notebook on another machine
# without editing the cells below.
BENCHMARK_ROOT = os.environ.get("BENCHMARK_ROOT", "/home/bdemiray/storage/Benchmark")

# The trailing separator is kept on purpose so that code which appends a file
# name to DATASET_PATH with plain string concatenation still gets a valid path.
DATASET_PATH = os.path.join(BENCHMARK_ROOT, "Data", "")

# Output directories for the per-station results of each linear model.
RIDGE_PATH = os.path.join(BENCHMARK_ROOT, "Results", "SingleStation", "Ridge")
LASSO_PATH = os.path.join(BENCHMARK_ROOT, "Results", "SingleStation", "Lasso")

# File names of the JSON summaries.
RIDGE_SUMMARY = "ridgeSummary.json"
LASSO_SUMMARY = "lassoSummary.json"

In [None]:
def createSummaryFile(filename):
    """
    Write an empty summary skeleton to disk as JSON

    The skeleton is a three-level dictionary, split -> metric -> statistic,
    whose leaves are empty dictionaries that are later filled per gauge.

    Parameters:
        filename, str:
            filename(path) of the summary file; an existing file is overwritten

    """

    splits = ("Train", "Test")
    metrics = ("NSE", "KGE")
    statistics = ("max", "min", "median", "mean")

    # Tuples (not sets) keep the key order of the written file stable.
    skeleton = {
        split: {
            metric: {statistic: {} for statistic in statistics}
            for metric in metrics
        }
        for split in splits
    }

    with open(filename, "w") as outfile:
        json.dump(skeleton, outfile)


In [None]:
def updateJSON(station_id, NSE_train, NSE_test, KGE_train, KGE_test, jsonFilePath):
    """
    Updates the summary file with results of a gauge

    For every split/metric pair the max, min, median and mean of the given
    scores are stored under the gauge id. An existing entry for the same
    gauge is overwritten.

    Parameters:
        station_id, int or str:
            id of the gauge which results will be added to summary file;
            it is stored as a string because JSON object keys are strings
        NSE_train, list:
            NSE scores of train set
        NSE_test, list:
            NSE scores of test set
        KGE_train, list:
            KGE scores of train set
        KGE_test, list:
            KGE scores of test set
        jsonFilePath, str:
            path of the updated summary file

    """

    score_sets = (
        ("Train", "NSE", NSE_train),
        ("Train", "KGE", KGE_train),
        ("Test", "NSE", NSE_test),
        ("Test", "KGE", KGE_test),
    )
    reducers = (
        ("max", np.max),
        ("min", np.min),
        ("median", np.median),
        ("mean", np.mean),
    )

    # json.load always returns string keys, so an int id would end up next to
    # its own string form and be written as a duplicate key on a second update.
    station_key = str(station_id)

    # Compute everything before the file is touched. float() turns NumPy
    # scalars (e.g. float32, which json cannot serialise) into plain floats.
    statistics = [
        (split, metric, stat_name, float(reducer(scores)))
        for split, metric, scores in score_sets
        for stat_name, reducer in reducers
    ]

    with open(jsonFilePath, "r") as jsonFile:
        Results = json.load(jsonFile)

    for split, metric, stat_name, value in statistics:
        Results[split][metric][stat_name][station_key] = value

    # Serialise first: opening the file in "w" mode truncates it, so a failure
    # during encoding must not happen while the file is open.
    payload = json.dumps(Results)
    with open(jsonFilePath, "w") as jsonFile:
        jsonFile.write(payload)
    
    

In [None]:
def Ridge_main(station_id, DATASET_PATH, RESULT_PATH=None, summaryFile=None):
    """
    Main function to get results of the Ridge regression model for given gauge

    Reads the train/test csv files of the gauge, min-max scales inputs and
    targets, fits a Ridge model and scores every target column with NSE and KGE
    on both splits.

    Parameters:
        station_id, str:
            id of gauge which model will use its training and test data
        DATASET_PATH, str:
            path to dataset which contains test and training data for each gauge as csv
        RESULT_PATH, str:
            directory to store the results as <station_id>.csv (created when
            missing); when None the csv file is not written
        summaryFile, str:
            path of the summary file updated through updateJSON; when None the
            summary is not updated

    Returns:
        pandas.DataFrame with one row per target column and the columns
        NSEsTrain, NSEsTest, KGEsTrain, KGEsTest

    """

    station = str(station_id)

    def _read_split(suffix):
        # os.path.join works whether or not DATASET_PATH ends with a separator.
        return pd.read_csv(os.path.join(DATASET_PATH, station + suffix), index_col='datetime')

    train_x = _read_split('_train_x.csv')
    train_y = _read_split('_train_y.csv')
    test_x = _read_split('_test_x.csv')
    test_y = _read_split('_test_y.csv')

    # The last 7 feature columns are not used as model inputs.
    # NOTE(review): the reason is not visible in this notebook - confirm against the dataset layout.
    train_x = train_x.values[:, :-7]
    test_x = test_x.values[:, :-7]

    # Both scalers are fitted on the training split only and reused for the test split.
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    scaler_x.fit(train_x)
    scaler_y.fit(train_y)

    train_x_scaled = scaler_x.transform(train_x)
    test_x_scaled = scaler_x.transform(test_x)
    train_y_scaled = scaler_y.transform(train_y)

    clf = linear_model.Ridge()
    clf.fit(train_x_scaled, train_y_scaled)

    # Predictions are mapped back to the original target scale before scoring.
    y_train_pred = scaler_y.inverse_transform(clf.predict(train_x_scaled))
    y_test_pred = scaler_y.inverse_transform(clf.predict(test_x_scaled))

    train_obs = train_y.values
    test_obs = test_y.values

    NSEs_train = []
    NSEs_test = []
    KGEs_train = []
    KGEs_test = []

    # One score per target column; the count is taken from the data instead of
    # a fixed 120 so that files with another number of target columns also work.
    for i in range(y_train_pred.shape[1]):
        NSEs_train.append(nse(train_obs[:, i], y_train_pred[:, i]))
        NSEs_test.append(nse(test_obs[:, i], y_test_pred[:, i]))
        KGEs_train.append(kge(train_obs[:, i], y_train_pred[:, i]))
        KGEs_test.append(kge(test_obs[:, i], y_test_pred[:, i]))

    if summaryFile is not None:
        updateJSON(station_id, NSEs_train, NSEs_test, KGEs_train, KGEs_test, summaryFile)

    combined = pd.DataFrame({
        "NSEsTrain": NSEs_train,
        "NSEsTest": NSEs_test,
        "KGEsTrain": KGEs_train,
        "KGEsTest": KGEs_test,
    })

    if RESULT_PATH is not None:
        os.makedirs(RESULT_PATH, exist_ok=True)
        combined.to_csv(os.path.join(RESULT_PATH, station + ".csv"), index=True)

    return combined

In [None]:
# Start from an empty Ridge summary stored next to the per-station results.
# Re-running this cell resets the summary file.
os.makedirs(RIDGE_PATH, exist_ok=True)
createSummaryFile(os.path.join(RIDGE_PATH, RIDGE_SUMMARY))

In [None]:
# DATASET_PATH (set in the configuration cell) must point to the folder into
# which the dataset files were unzipped. Fail early with a clear message when
# that folder does not exist instead of failing later while reading a csv.
if not os.path.isdir(DATASET_PATH):
    raise FileNotFoundError("Dataset folder not found: %s" % DATASET_PATH)

In [None]:
# Dataset files are named "<station>_<split>_<x|y>.csv", so the station id is
# the part before the first underscore. Only the training-input files are
# inspected so that other files in the folder (e.g. the zip archive) do not
# turn into station ids. Sorting makes the processing order reproducible.
dataset_files = os.listdir(DATASET_PATH)
sensors = sorted({
    file_name.split("_")[0]
    for file_name in dataset_files
    if file_name.endswith("_train_x.csv")
})

In [None]:
# Fit and score one Ridge model per gauge; the per-gauge csv files go to
# RIDGE_PATH and the aggregated statistics to the Ridge summary file.
for sensor in sensors:
    Ridge_main(sensor, DATASET_PATH, RIDGE_PATH, os.path.join(RIDGE_PATH, RIDGE_SUMMARY))