# In [None]:
# -*- coding: utf-8 -*-
"""
Created on Mon May 14 01:30:12 2018

@author: xshitova
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn import datasets, linear_model, metrics 
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
from sklearn.preprocessing import Imputer
from statsmodels.tsa.ar_model import AR
import functools as ft
import datetime

# function returns the volatility accumulated during the day
# as SUM(V*P), where V and P are the vectors of volatilities and price movements for the morning ticks of the day
def DayVolatility(ser):
    """Return the accumulated volatility of one day, weighted by price moves.

    The row is read as two halves of equal length: the first half holds the
    per-tick volatilities V, the second half the matching per-tick price
    movements P.  The result is sum(V * P).

    Missing values are NOT skipped here: a NaN in any product makes the
    result NaN.

    :param ser: pandas Series of even length (volatilities, then movements).
    :return: sum(V * P); 0 for an empty row; NaN (after printing a
        diagnostic) when the length is odd.
    """
    n = ser.size
    if n % 2 == 1:
        print('DayVolatility: error: serie length is not even:', n)
        return np.nan  # np.NaN was removed in NumPy 2.0
    half = n // 2
    values = ser.values
    v = values[:half]  # vector of volatilities
    p = values[half:]  # vector of price movements
    # ndarray.sum() yields 0 for an empty row, where functools.reduce
    # without an initial value raised TypeError.
    return (v * p).sum()

# function returns the maximal volatility jump over a sequential price increase (adjacent ticks with price movements +1, 0)
def DayPriceUpMax(ser):
    """Return the largest volatility accumulated over one run of up ticks.

    The row is read as two halves of equal length: per-tick volatilities
    first, then the matching per-tick price movements.  Volatility is summed
    over ticks whose movement is positive; a negative movement resets the
    running sum, and a zero movement neither adds nor resets.

    Ticks whose volatility is missing are skipped entirely, so a down tick
    with a missing volatility does not reset the run (original behaviour,
    kept as is).

    :param ser: pandas Series of even length (volatilities, then movements).
    :return: the maximal running sum (0 if there is no up tick); NaN (after
        printing a diagnostic) when the length is odd.
    """
    n = ser.size
    if n % 2 == 1:
        # The diagnostic now names this function instead of the copy-pasted
        # 'DayVolatility'.
        print('DayPriceUpMax: error: serie length is not even:', n)
        return np.nan  # np.NaN was removed in NumPy 2.0
    half = n // 2
    values = ser.values
    vol = values[:half]  # vector of volatilities
    pm = values[half:]   # vector of price movements
    best = 0
    run = 0
    for v, move in zip(vol, pm):
        if pd.isnull(v):
            continue
        if move > 0:        # price goes up: extend the current run
            run += v
            if run > best:
                best = run
        elif move < 0:      # price falls: the run is over
            run = 0
    return best

# function returns the maximal volatility jump over a sequential price decrease (adjacent ticks with price movements -1, 0)
def DayPriceDownMax(ser):
    """Return the largest volatility accumulated over one run of down ticks.

    The row is read as two halves of equal length: per-tick volatilities
    first, then the matching per-tick price movements.  Volatility is summed
    over ticks whose movement is negative; a positive movement resets the
    running sum, and a zero movement neither adds nor resets.

    Ticks whose volatility is missing are skipped entirely, so an up tick
    with a missing volatility does not reset the run (original behaviour,
    kept as is).

    :param ser: pandas Series of even length (volatilities, then movements).
    :return: the maximal running sum (0 if there is no down tick); NaN
        (after printing a diagnostic) when the length is odd.
    """
    n = ser.size
    if n % 2 == 1:
        # The diagnostic now names this function instead of the copy-pasted
        # 'DayVolatility'.
        print('DayPriceDownMax: error: serie length is not even:', n)
        return np.nan  # np.NaN was removed in NumPy 2.0
    half = n // 2
    values = ser.values
    vol = values[:half]  # vector of volatilities
    pm = values[half:]   # vector of price movements
    best = 0
    run = 0
    for v, move in zip(vol, pm):
        if pd.isnull(v):
            continue
        if move < 0:        # price goes down: extend the current run
            run += v
            if run > best:
                best = run
        elif move > 0:      # price rises: the run is over
            run = 0
    return best

# function returns the total volatility over price up ticks
def DayPriceUpTot(ser):
    """Return the total volatility over all ticks where the price goes up.

    The row is read as two halves of equal length: per-tick volatilities
    first, then the matching per-tick price movements.  Ticks with a missing
    volatility are ignored.

    :param ser: pandas Series of even length (volatilities, then movements).
    :return: sum of the volatilities of ticks with a positive movement
        (0 if there is none); NaN (after printing a diagnostic) when the
        length is odd.
    """
    n = ser.size
    if n % 2 == 1:
        # The diagnostic now names this function instead of the copy-pasted
        # 'DayVolatility'.
        print('DayPriceUpTot: error: serie length is not even:', n)
        return np.nan  # np.NaN was removed in NumPy 2.0
    half = n // 2
    values = ser.values
    vol = values[:half]  # vector of volatilities
    pm = values[half:]   # vector of price movements
    return sum(v for v, move in zip(vol, pm)
               if move > 0 and not pd.isnull(v))

# function returns the total volatility over price down ticks
def DayPriceDownTot(ser):
    """Return the total volatility over all ticks where the price goes down.

    The row is read as two halves of equal length: per-tick volatilities
    first, then the matching per-tick price movements.  Ticks with a missing
    volatility are ignored.

    :param ser: pandas Series of even length (volatilities, then movements).
    :return: sum of the volatilities of ticks with a negative movement
        (0 if there is none); NaN (after printing a diagnostic) when the
        length is odd.
    """
    n = ser.size
    if n % 2 == 1:
        # The diagnostic now names this function instead of the copy-pasted
        # 'DayVolatility'.
        print('DayPriceDownTot: error: serie length is not even:', n)
        return np.nan  # np.NaN was removed in NumPy 2.0
    half = n // 2
    values = ser.values
    vol = values[:half]  # vector of volatilities
    pm = values[half:]   # vector of price movements
    return sum(v for v, move in zip(vol, pm)
               if move < 0 and not pd.isnull(v))

# function returns the average volatility over price up ticks
def DayPriceUpTotAvg(ser):
    """Return the average volatility over the ticks where the price goes up.

    The row is read as two halves of equal length: per-tick volatilities
    first, then the matching per-tick price movements.  Ticks with a missing
    volatility are ignored, both in the sum and in the tick count.

    :param ser: pandas Series of even length (volatilities, then movements).
    :return: mean volatility of ticks with a positive movement (0.0 if there
        is none); NaN (after printing a diagnostic) when the length is odd.
    """
    n = ser.size
    if n % 2 == 1:
        # The diagnostic now names this function instead of the copy-pasted
        # 'DayVolatility'.
        print('DayPriceUpTotAvg: error: serie length is not even:', n)
        return np.nan  # np.NaN was removed in NumPy 2.0
    half = n // 2
    values = ser.values
    vol = values[:half]  # vector of volatilities
    pm = values[half:]   # vector of price movements
    picked = [v for v, move in zip(vol, pm)
              if move > 0 and not pd.isnull(v)]
    # With no qualifying tick, divide by 1 so the result is 0.0 instead of
    # a division by zero.
    return sum(picked) / max(len(picked), 1)

# function returns the average volatility over price down ticks
def DayPriceDownTotAvg(ser):
    """Return the average volatility over the ticks where the price goes down.

    The row is read as two halves of equal length: per-tick volatilities
    first, then the matching per-tick price movements.  Ticks with a missing
    volatility are ignored, both in the sum and in the tick count.

    :param ser: pandas Series of even length (volatilities, then movements).
    :return: mean volatility of ticks with a negative movement (0.0 if there
        is none); NaN (after printing a diagnostic) when the length is odd.
    """
    n = ser.size
    if n % 2 == 1:
        # The diagnostic now names this function instead of the copy-pasted
        # 'DayVolatility'.
        print('DayPriceDownTotAvg: error: serie length is not even:', n)
        return np.nan  # np.NaN was removed in NumPy 2.0
    half = n // 2
    values = ser.values
    vol = values[:half]  # vector of volatilities
    pm = values[half:]   # vector of price movements
    picked = [v for v, move in zip(vol, pm)
              if move < 0 and not pd.isnull(v)]
    # With no qualifying tick, divide by 1 so the result is 0.0 instead of
    # a division by zero.
    return sum(picked) / max(len(picked), 1)

# Load the challenge data; all three files are semicolon-delimited CSVs.
_CSV_OPTIONS = {'delimiter': ';'}

input_train = pd.read_csv(
    'C:/Users/xshitova/Downloads/training_input.csv',
    **_CSV_OPTIONS)
output_train = pd.read_csv(
    'C:/Users/xshitova/Downloads/challenge_output_data_training_file_volatility_prediction_in_financial_markets.csv',
    **_CSV_OPTIONS)

input_test = pd.read_csv(
    'C:/Users/xshitova/Downloads/testing_input.csv',
    **_CSV_OPTIONS)

# note to self - the train and the test are actually of the same size

# Column names of the input files.
# "ID" is the name of the line and the primary key.
# The per-tick columns follow a fixed pattern, "<kind> HH:MM:00", with one
# tick every five minutes from 09:30:00 up to and including 13:55:00.
_tick_labels = ["%02d:%02d:00" % divmod(minute, 60)
                for minute in range(9 * 60 + 30, 14 * 60, 5)]
cols = (["ID", "date", "product_id"]
        + ["volatility " + label for label in _tick_labels]
        + ["return " + label for label in _tick_labels])

# Column names of the output (target) file.
cols1 = ["ID", "TARGET"]

# Baseline 1: the mean of all volatility ticks of each line, stored both as
# the prediction and as an extra input column.
col = input_train.loc[:, "volatility 09:30:00":"volatility 13:55:00"]
_row_mean = col.mean(axis=1)
output_train['PREDICTED'] = _row_mean
input_train['MEAN'] = _row_mean

# Baseline 2: plain OLS on every input column (non-binary output); lines
# with missing values are dropped when fitting.
model = sm.OLS(endog=output_train['TARGET'],
               exog=input_train[cols],
               missing='drop')
results = model.fit()
output_train['TARGET1'] = results.predict(exog=input_train[cols])

start_time = datetime.datetime.now()
print('Starting to calculate variables')
print(datetime.datetime.now()-start_time)

# Create a new dataframe with features calculated from the original data.
nf = pd.DataFrame()
vol_ticks = input_train.loc[:, "volatility 09:30:00":"volatility 13:55:00"]
nf['TARGET'] = output_train['TARGET'].values     # target volatility
nf['VOL_MEAN'] = vol_ticks.mean(axis=1).values   # mean volatility of the line
nf['VOL_MAX'] = vol_ticks.max(axis=1).values     # max volatility of the line

# Price movement data, with missing movements treated as "no movement".
# fillna() returns a new frame, so the result has to be kept; the original
# code discarded it, which made every missing value count as both an up
# tick and a down tick below.
pmv = input_train.loc[:, "return 09:30:00":"return 13:55:00"].fillna(0)
n_ticks = pmv.shape[1]  # number of ticks per line (was hard-coded as 54)
# Share of ticks where the price goes up / down.
nf['rp_up'] = (pmv == 1.0).sum(axis=1).values / float(n_ticks)
nf['rp_down'] = (pmv == -1.0).sum(axis=1).values / float(n_ticks)

# Per-day features computed from the volatility and price movement ticks of
# each line.  The slice is taken once and reused for every feature.
day_ticks = input_train.loc[:, "volatility 09:30:00":"return 13:55:00"]
day_features = (
    ('v_acc', DayVolatility),                # accumulated volatility per day
    ('v_up_max', DayPriceUpMax),             # max volatility in a single price up spike
    ('v_down_max', DayPriceDownMax),         # max volatility in a single price down spike
    ('v_up_tot', DayPriceUpTot),             # total volatility in price up ticks
    ('v_down_tot', DayPriceDownTot),         # total volatility in price down ticks
    ('v_up_tot_avg', DayPriceUpTotAvg),      # average volatility in price up ticks
    ('v_down_tot_avg', DayPriceDownTotAvg),  # average volatility in price down ticks
)
for feature_name, day_func in day_features:
    nf[feature_name] = day_ticks.agg(day_func, axis=1).values

# Asymmetry parameters: (up - down) / (up + down).  A line with neither up
# nor down contribution gives 0/0 = NaN here.
nf['rp_asym']=(nf['rp_up'] - nf['rp_down'])/(nf['rp_up'] + nf['rp_down'])
nf['v_max_asym']=(nf['v_up_max'] - nf['v_down_max'])/(nf['v_up_max'] + nf['v_down_max'])
nf['v_tot_asym']=(nf['v_up_tot'] - nf['v_down_tot'])/(nf['v_up_tot'] + nf['v_down_tot'])
nf['v_tot_avg_asym']=(nf['v_up_tot_avg'] - nf['v_down_tot_avg'])/(nf['v_up_tot_avg'] + nf['v_down_tot_avg'])

cols_new=['VOL_MEAN','VOL_MAX','rp_up','rp_down','v_acc','v_up_max','v_down_max',
          'v_up_tot','v_down_tot','v_up_tot_avg','v_down_tot_avg','rp_asym','v_max_asym','v_tot_asym','v_tot_avg_asym'] #all columns for prediction

print('Variables calculated, prediction starting....')
print(datetime.datetime.now()-start_time)

# Fill all empty feature values by zeros.  fillna() returns a new frame, so
# the result must be assigned back; the original call discarded it and left
# the NaNs in place.
nf[cols_new] = nf[cols_new].fillna(0)

# Linear regression model on the aggregated variables.
model_linreg = LinearRegression()
model_linreg.fit(nf[cols_new], nf['TARGET'])
nf['TARGET_4VAR'] = model_linreg.predict(nf[cols_new])

# Simple OLS model with a non-binary output on the aggregated variables.
model1 = sm.OLS(nf['TARGET'], nf[cols_new])
# Fit model1; the original fitted `model`, the earlier OLS on the raw input
# columns, so the prediction below did not belong to this model.
results1 = model1.fit()
nf['TARGET_OLS_4VAR'] = results1.predict(nf[cols_new])

def _mape(actual, predicted):
    """Return the mean absolute percentage error of *predicted*, in percent."""
    return np.mean(np.abs((actual - predicted) / actual)) * 100


print("")
# Training error of each model, printed one model per line.
model_error = _mape(output_train['TARGET'], output_train['PREDICTED'])
print("Error is "+str(model_error)+" for mean")

model_error_ols = _mape(output_train['TARGET'], output_train['TARGET1'])
print("Error is "+str(model_error_ols)+" for OLS, missing values ommitted")

model_error_lr4 = _mape(nf['TARGET'], nf['TARGET_4VAR'])
print("Error is "+str(model_error_lr4)+" for LR, fifteen aggregated variables")

model_error_ols4 = _mape(nf['TARGET'], nf['TARGET_OLS_4VAR'])
print("Error is "+str(model_error_ols4)+" for OLS, fifteen aggregated variables")


print(datetime.datetime.now()-start_time) #final time



#autoregression model
#input_train.index = pd.to_datetime(pd.Index(input_train.index))
#model = AR(input_train, missing='drop') 
#results_ar = model.fit()
#output_train['TARGET_AR']  = results_ar.predict(dymanic=False)



