# Basic Functions and Settings

In [1]:
%load_ext autoreload
%autoreload

#%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))

import pandas as pd
pd.set_option("display.max_rows",1000)
pd.set_option("display.max_columns",500)
pd.set_option('precision', 3)

from os.path import join
from os import listdir
from pandas import read_csv, read_excel, Series, DataFrame, to_numeric, read_parquet, to_datetime, read_pickle
from pandas import date_range, concat
from sklearn.externals import joblib
from sklearn import metrics

import matplotlib.pyplot as plt
plt.rcParams.update({'figure.figsize':(9,7), 'figure.dpi':120})

import warnings
warnings.filterwarnings('ignore')

from pmdarima.arima import auto_arima
import tsfresh as tsf

# ML Training Functions

In [None]:
def trainLogReg(X_train, y_train):
    name = 'logreg'
    glm = LogisticRegression()
    model = trainEstimator(name, glm, X_train, y_train)
    return model

def trainRF(X_train, y_train):
    name = 'rf'
    rf = RandomForestClassifier(n_estimators=50)
    model = trainEstimator(name, rf, X_train, y_train)
    return model

def trainXGboost(X_train, y_train):
    name = 'xgb'
    xgb = xgboost.XGBClassifier(n_jobs=1)
    model = trainEstimator(name, xgb, X_train, y_train)
    return model

In [None]:
def predAll(models, X_test):
    perfs = {}
    for modelname, model in models.items():
        y_probs  = model.predict_proba(X_test)[:,1]
        y_probs  = Series(data=y_probs, name="predicted")
        y_preds  = model.predict(X_test)
        y_preds  = Series(data=y_preds, name="predicted")
        perfs[modelname] = {"Probs": y_probs, "Preds": y_preds}
    return perfs
    
    
def trainAll(X_train, y_train):
    logreg = trainLogReg(X_train, y_train)
    rf     = trainRF(X_train, y_train)
    xgb    = trainXGboost(X_train, y_train)
    models = {"logreg": logreg, "xgb": xgb, "rf": rf}
    return models

def perfAll(perfs, y_test):
    modelperfdata = {}
    for modelname,modeldata in perfs.items():
        retval = {}
        y_truth = y_test
        y_probs = modeldata['Probs']
        y_preds = modeldata['Preds']
        precision, recall, pr_thresholds = precision_recall_curve(y_truth, y_probs)
        retval["PR"] = {"precision": precision, "recall": recall, "thresholds": pr_thresholds}

        fpr, tpr, roc_thresholds = roc_curve(y_truth, y_probs)
        retval["ROC"] = {"fpr": fpr, "tpr": tpr, "thresholds": roc_thresholds}

        auc = roc_auc_score(y_truth, y_probs)
        retval["AUC"] = auc

        cfm = confusion_matrix(y_truth, y_preds)
        tn, fp, fn, tp = cfm.ravel()
        retval["Confusion"] = {"matrix": cfm, "tn": tn, "tp": tp, "fn": fn, "fp": fp}

        modelperfdata[modelname] = retval
    return modelperfdata

# Load Data

In [2]:
def cutData(data, startYear):
    salesdata = data.copy()
    salesdata = salesdata[(salesdata.index <= to_datetime('2019-05-01')) & (salesdata.index >= to_datetime('{0}-01-01'.format(startYear)))]
    for colname in salesdata.columns:
        salesdata[colname] = salesdata[colname]/1000
    return salesdata

In [8]:
data["Date"].values

array(['2013-01-01', '2013-02-01', '2013-03-01', '2013-04-01',
       '2013-05-01', '2013-06-01', '2013-07-01', '2013-08-01',
       '2013-09-01', '2013-10-01', '2013-11-01', '2013-12-01',
       '2014-01-01', '2014-02-01', '2014-03-01', '2014-04-01',
       '2014-05-01', '2014-06-01', '2014-07-01', '2014-08-01',
       '2014-09-01', '2014-10-01', '2014-11-01', '2014-12-01',
       '2015-01-01', '2015-02-01', '2015-03-01', '2015-04-01',
       '2015-05-01', '2015-06-01', '2015-07-01', '2015-08-01',
       '2015-09-01', '2015-10-01', '2015-11-01', '2015-12-01',
       '2016-01-01', '2016-02-01', '2016-03-01', '2016-04-01',
       '2016-05-01', '2016-06-01', '2016-07-01', '2016-08-01',
       '2016-09-01', '2016-10-01', '2016-11-01', '2016-12-01',
       '2017-01-01', '2017-02-01', '2017-03-01', '2017-04-01',
       '2017-05-01', '2017-06-01', '2017-07-01', '2017-08-01',
       '2017-09-01', '2017-10-01', '2017-11-01', '2017-12-01',
       '2018-01-01', '2018-02-01', '2018-03-01', '2018-

In [13]:
#data = read_pickle("Aggdata/TotalSales.p")
data = read_csv("Aggdata/TotalSales.csv")
data.columns = ["Date", "Total"]
data.index = to_datetime(data["Date"].values)
data.drop("Date", axis=1, inplace=True)
data["Total"] *= 1000
startYear = 2013
salesdata = cutData(data, startYear)
salesdata.head()

Unnamed: 0,Total
2013-01-01,211.842
2013-02-01,239.873
2013-03-01,189.759
2013-04-01,188.456
2013-05-01,242.191


# Feature Creation

In [None]:
##########################################################################################
## Data <-> Row Functions
##########################################################################################
def getPos(salesdata, currDate):
    try:
        ipos = salesdata.index.get_loc(currDate)
        return ipos
    except:
        raise ValueError("Date {0} not found!".format(currDate))
        return None
    
def getDateFromPos(salesdata, ipos):
    try:
        currDate = salesdata.index[ipos]
    except:
        raise ValueError("Date for position {0} doesn't exist".format(ipos))
    return currDate


##########################################################################################
## Window Functions
##########################################################################################
def getWindow(salesdata, currDate, entries):
    ipos = getPos(salesdata, currDate)
    if entries <= ipos:
        windowData = salesdata[(ipos-entries):ipos]
        return windowData
    else:
        #print("Not enough entries before {0}".format(currDate))
        return None
    
def getPrevYearWindow(salesdata, currDate, entries, years):
    pyData = {}
    ipos = getPos(salesdata, currDate)
    for iyear in range(1,years+1):
        iypos    = ipos - 12*iyear
        if iypos < 0:
            pyData[iyear] = None
            continue
        prevDate = getDateFromPos(salesdata, iypos)
        windowData = getWindow(salesdata, prevDate, entries)
        pyData[iyear] = windowData

    return pyData


##########################################################################################
## Window --> Number Function
##########################################################################################
def getWindowValue(window, valtype="avg"):
    if window is None:
        return None
    
    if valtype in ["mean", "avg"]:
        retval = window.mean()
    elif valtype in ["max"]:
        retval = window.max()
    elif valtype in ["min"]:
        retval = window.min()
    elif valtype in ["diff"]:
        retval = window.diff().mean()
    return float(retval.values)

def getMultiWindowValue(multiwindow, valtype="avg"):
    mvwin  = Series([getWindowValue(multiwindow[x], valtype) for x in reversed(list(multiwindow.keys()))])
    if len(multiwindow) == 1:
        retval = float(mvwin.mean())
    else:
        if sum([1 for x in mvwin.values if x is not None]) < 2:
            retval = None
        else:
            retval = float(mvwin.diff().mean())
    return retval


##########################################################################################
## Current Month Functions
##########################################################################################
def getCurrQuarter(salesdata, currDate):
    ipos = getPos(salesdata, currDate)
    currData = salesdata[ipos:(ipos+1)]
    quarter  = "Q{0}".format(currData.index.quarter[0])
    return quarter

def getCurrValue(salesdata, currDate):
    ipos = getPos(salesdata, currDate)
    currData = salesdata[ipos:(ipos+1)]
    return float(currData.values)

        
if getPos(salesdata, salesdata.index[12]) != 12:
    raise ValueError("getPos() doesn't work")

In [None]:
win = getWindow(salesdata, '2019-05-01', 3)
win

In [None]:
valtype = "avg"
multiwindow = getPrevYearWindow(salesdata, '2019-05-01', 3, 3)
mvwin  = Series([getWindowValue(multiwindow[x], valtype) for x in reversed(list(multiwindow.keys()))])

sum([1 for x in pwin.values() if x is not None]) < 2

In [None]:
qrtr = getCurrQuarter(salesdata, '2019-05-01')
qrtr

In [None]:
regdata = {}
for i,ts in enumerate(list(salesdata.index)):
    y = getCurrValue(salesdata, ts)
    win  = getWindow(salesdata, ts, 3)
    pwin = getPrevYearWindow(salesdata, ts, 3, 1)
    
    winval   = getWindowValue(win)
    mvwinval = getMultiWindowValue(pwin)
    
    quarter = getCurrQuarter(salesdata, ts)
    
    mondata = {"Y": y, "Quarter": quarter, "PrevMonths": winval, "PrevYears": mvwinval}
    regdata[ts] = mondata
    #print(i,'\t',ts,'\t',y,'\t',winval,'\t',mvwinval)
    
DataFrame(regdata).T

In [None]:
pwin = getPrevYearWindow(salesdata, '2016-01-01', 3, 1)

In [None]:
pwin

In [None]:
win.diff().mean()

In [None]:
vtypes = ["mean", "min", "max", "diff"]
for vtype in vtypes:
    print(vtype,'\t',getWindowValue(win, vtype))

In [None]:
win.ewm(alpha=0.01).mean().mean()

In [None]:
tsfdata.dtypes

In [16]:
tsfdata = salesdata.reset_index(drop=True)
tsfdata["Time"] = range(len(salesdata.index))
tsfdata["ID"] = 1
tsfdata

Unnamed: 0,Total,Time,ID
0,211.842,0,1
1,239.873,1,1
2,189.759,2,1
3,188.456,3,1
4,242.191,4,1
5,202.538,5,1
6,219.153,6,1
7,211.684,7,1
8,221.814,8,1
9,311.817,9,1


In [17]:
#tsfdata = salesdata.reset_index(drop=True)
#tsfdata["Time"] = str(tsfdata.index)
#tsfdata["ID"] = 1
from tsfresh import extract_features
extracted_features = extract_features(tsfdata, column_id='ID')


Feature Extraction:   0%|          | 0/2 [00:00<?, ?it/s][A
Feature Extraction:  50%|█████     | 1/2 [00:00<00:00,  5.78it/s][A
Feature Extraction: 100%|██████████| 2/2 [00:00<00:00, 11.32it/s][A

In [19]:
extracted_features.T

id,1
variable,Unnamed: 1_level_1
Time__abs_energy,1.492e+05
Time__absolute_sum_of_changes,7.600e+01
"Time__agg_autocorrelation__f_agg_""mean""__maxlag_40",2.807e-01
"Time__agg_autocorrelation__f_agg_""median""__maxlag_40",3.256e-01
"Time__agg_autocorrelation__f_agg_""var""__maxlag_40",2.128e-01
"Time__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""",9.500e+00
"Time__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""",9.993e-01
"Time__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""",9.750e+00
"Time__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""",1.443e-01
"Time__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""",4.900e+01


In [None]:
pwin

In [None]:
Series([getWindowValue(pwin[x]) for x in reversed(list(pwin.keys()))]).diff().mean()

In [None]:
getWindowValue(pwin[1])

In [None]:
sum([1 for x in pwin.values() if x is not None])

In [None]:
getMultiWindowValue(pwin)

In [None]:
timeseries = salesdata.reset_index()

In [None]:
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, \
    load_robot_execution_failures
download_robot_execution_failures()
timeseries, y = load_robot_execution_failures()

In [None]:
timeseries

In [None]:
y

# Testing

In [None]:
pip install fbprophet