In [1]:
# Evaluate the bare name so the notebook displays the object's repr.
# NOTE(review): `spark` is not defined anywhere in this notebook; presumably it
# is the SparkSession pre-created by the PySpark kernel -- TODO confirm.
spark

In [2]:
import ts.flint
from ts.flint import FlintContext
# Build the Flint entry point used below (flintContext.read.dataframe).
# NOTE(review): `sqlContext` is not defined in this notebook; presumably it is
# injected by the PySpark kernel/shell -- TODO confirm before a fresh run.
flintContext = FlintContext(sqlContext)

In [3]:
# Read the CSV (first line is the header, column types inferred) and rename
# the 'Date' column to 'time' before handing the frame to Flint.
sp500 = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('sp500.csv')
    .withColumnRenamed('Date', 'time')
)
sp500 = flintContext.read.dataframe(sp500)

# Open-to-close return scaled by 10000, i.e. expressed in basis points.
# The multiplication is applied before the division, as in the original.
return_bps = 10000 * (sp500['Close'] - sp500['Open']) / sp500['Open']
sp500_return = (
    sp500
    .withColumn('return', return_bps)
    .select('time', 'return')
)
sp500_return.show()

+-------------------+--------------------+
|               time|              return|
+-------------------+--------------------+
|2017-07-26 00:00:00|  -8.628705286851352|
|2017-07-27 00:00:00|  -29.56422678968445|
|2017-07-28 00:00:00|  12.068999719708462|
|2017-07-31 00:00:00| -22.778791628209955|
|2017-08-01 00:00:00| -3.0277339240571943|
|2017-08-02 00:00:00| -11.328163960923687|
|2017-08-03 00:00:00| -15.630331436501073|
|2017-08-04 00:00:00|-0.20107959349155807|
|2017-08-07 00:00:00|   15.21924139469683|
|2017-08-08 00:00:00| -13.840562730697165|
|2017-08-09 00:00:00|   35.16710266437803|
|2017-08-10 00:00:00| -110.20582340007698|
|2017-08-11 00:00:00|  1.1471708596583703|
|2017-08-14 00:00:00|  44.318959057759976|
|2017-08-15 00:00:00|  -16.40487205351478|
|2017-08-16 00:00:00|  -2.105524216406371|
|2017-08-17 00:00:00| -133.74182039966306|
|2017-08-18 00:00:00|   -8.60854200833519|
|2017-08-21 00:00:00|   11.83309420737966|
|2017-08-22 00:00:00|   77.08273240883396|
+----------

In [4]:
from ts.flint import windows

# Move every timestamp one day into the future and relabel the value column,
# so a row's return can be joined onto the following calendar day.
one_day_ahead = windows.future_absolute_time('1day')
sp500_previous_day_return = (
    sp500_return
    .shiftTime(one_day_ahead)
    .toDF('time', 'previous_day_return')
)

# Left join with no tolerance: rows without a match keep a null
# previous_day_return (see 2017-07-31 and 2017-08-07 in the output below).
sp500_joined_return = sp500_return.leftJoin(sp500_previous_day_return)
sp500_joined_return.show()

+-------------------+--------------------+-------------------+
|               time|              return|previous_day_return|
+-------------------+--------------------+-------------------+
|2017-07-26 00:00:00|  -8.628705286851352|               null|
|2017-07-27 00:00:00|  -29.56422678968445| -8.628705286851352|
|2017-07-28 00:00:00|  12.068999719708462| -29.56422678968445|
|2017-07-31 00:00:00| -22.778791628209955|               null|
|2017-08-01 00:00:00| -3.0277339240571943|-22.778791628209955|
|2017-08-02 00:00:00| -11.328163960923687|-3.0277339240571943|
|2017-08-03 00:00:00| -15.630331436501073|-11.328163960923687|
|2017-08-04 00:00:00|-0.20107959349155807|-15.630331436501073|
|2017-08-07 00:00:00|   15.21924139469683|               null|
|2017-08-08 00:00:00| -13.840562730697165|  15.21924139469683|
|2017-08-09 00:00:00|   35.16710266437803|-13.840562730697165|
|2017-08-10 00:00:00| -110.20582340007698|  35.16710266437803|
|2017-08-11 00:00:00|  1.1471708596583703|-110.20582340

In [5]:
# Redo the join with a 3-day tolerance so rows such as 2017-07-31 pick up the
# last shifted value (12.068... in the output below), then drop the rows that
# still contain nulls.
sp500_joined_return = (
    sp500_return
    .leftJoin(sp500_previous_day_return, tolerance='3days')
    .dropna()
)
sp500_joined_return.show()

+-------------------+--------------------+--------------------+
|               time|              return| previous_day_return|
+-------------------+--------------------+--------------------+
|2017-07-27 00:00:00|  -29.56422678968445|  -8.628705286851352|
|2017-07-28 00:00:00|  12.068999719708462|  -29.56422678968445|
|2017-07-31 00:00:00| -22.778791628209955|  12.068999719708462|
|2017-08-01 00:00:00| -3.0277339240571943| -22.778791628209955|
|2017-08-02 00:00:00| -11.328163960923687| -3.0277339240571943|
|2017-08-03 00:00:00| -15.630331436501073| -11.328163960923687|
|2017-08-04 00:00:00|-0.20107959349155807| -15.630331436501073|
|2017-08-07 00:00:00|   15.21924139469683|-0.20107959349155807|
|2017-08-08 00:00:00| -13.840562730697165|   15.21924139469683|
|2017-08-09 00:00:00|   35.16710266437803| -13.840562730697165|
|2017-08-10 00:00:00| -110.20582340007698|   35.16710266437803|
|2017-08-11 00:00:00|  1.1471708596583703| -110.20582340007698|
|2017-08-14 00:00:00|  44.31895905775997

In [6]:
from ts.flint import summarizers

# Summarize previous_day_return over a 7-day look-back window with Flint's
# built-in EWMA summarizer (alpha=0.5); the result lands in the
# 'previous_day_return_ewma' column shown in the output below.
past_week = windows.past_absolute_time('7day')
ewma_summarizer = summarizers.ewma('previous_day_return', alpha=0.5)
sp500_decayed_return = sp500_joined_return.summarizeWindows(
    window=past_week,
    summarizer=ewma_summarizer,
)

sp500_decayed_return.show()

+-------------------+--------------------+--------------------+------------------------+
|               time|              return| previous_day_return|previous_day_return_ewma|
+-------------------+--------------------+--------------------+------------------------+
|2017-07-27 00:00:00|  -29.56422678968445|  -8.628705286851352|      -8.628705286851352|
|2017-07-28 00:00:00|  12.068999719708462|  -29.56422678968445|     -33.878579433110126|
|2017-07-31 00:00:00| -22.778791628209955|  12.068999719708462|       7.834177290569695|
|2017-08-01 00:00:00| -3.0277339240571943| -22.778791628209955|      -18.86170298292511|
|2017-08-02 00:00:00| -11.328163960923687| -3.0277339240571943|      -12.45858541551975|
|2017-08-03 00:00:00| -15.630331436501073| -11.328163960923687|     -17.557456668683564|
|2017-08-04 00:00:00|-0.20107959349155807| -15.630331436501073|     -24.375353890816093|
|2017-08-07 00:00:00|   15.21924139469683|-0.20107959349155807|     -3.2191275146192693|
|2017-08-08 00:00:00|

In [7]:
from ts.flint import summarizers

# NOTE(review): this cell repeats the EWMA computation of the preceding cell
# (same window, same summarizer, same result name); consider removing one.
sp500_decayed_return = sp500_joined_return.summarizeWindows(
    window=windows.past_absolute_time('7day'),
    summarizer=summarizers.ewma('previous_day_return', alpha=0.5),
)

sp500_decayed_return.show()

+-------------------+--------------------+--------------------+------------------------+
|               time|              return| previous_day_return|previous_day_return_ewma|
+-------------------+--------------------+--------------------+------------------------+
|2017-07-27 00:00:00|  -29.56422678968445|  -8.628705286851352|      -8.628705286851352|
|2017-07-28 00:00:00|  12.068999719708462|  -29.56422678968445|     -33.878579433110126|
|2017-07-31 00:00:00| -22.778791628209955|  12.068999719708462|       7.834177290569695|
|2017-08-01 00:00:00| -3.0277339240571943| -22.778791628209955|      -18.86170298292511|
|2017-08-02 00:00:00| -11.328163960923687| -3.0277339240571943|      -12.45858541551975|
|2017-08-03 00:00:00| -15.630331436501073| -11.328163960923687|     -17.557456668683564|
|2017-08-04 00:00:00|-0.20107959349155807| -15.630331436501073|     -24.375353890816093|
|2017-08-07 00:00:00|   15.21924139469683|-0.20107959349155807|     -3.2191275146192693|
|2017-08-08 00:00:00|

In [8]:
from ts.flint import udf
import numpy as np

@udf('double', arg_type='numpy')
def decayed(columns):
    """Return a decay-weighted sum of the first input column.

    The last element is weighted by 1, the one before it by 0.5, then 0.25,
    and so on: element i of n gets weight 0.5 ** (n - 1 - i).

    `columns` is indexed positionally; only columns[0] is used.
    NOTE(review): assumes the values arrive oldest-first, so the newest row
    carries the largest weight -- confirm against Flint's window ordering.
    """
    values = columns[0]
    # Powers 0.5**0 .. 0.5**(n-1), reversed so the final element gets 0.5**0.
    weights = np.power(0.5, np.arange(len(values)))[::-1]
    return (values * weights).sum()

# Apply the custom `decayed` UDF over the same 7-day look-back window; the
# dict key names the output column.
past_week = windows.past_absolute_time('7day')
decayed_sum = decayed(sp500_joined_return[['previous_day_return']])
sp500_decayed_return = sp500_joined_return.summarizeWindows(
    window=past_week,
    summarizer={'previous_day_return_decayed_sum': decayed_sum},
)

sp500_decayed_return.show()

+-------------------+--------------------+--------------------+-------------------------------+
|               time|              return| previous_day_return|previous_day_return_decayed_sum|
+-------------------+--------------------+--------------------+-------------------------------+
|2017-07-27 00:00:00|  -29.56422678968445|  -8.628705286851352|             -8.628705286851352|
|2017-07-28 00:00:00|  12.068999719708462|  -29.56422678968445|            -33.878579433110126|
|2017-07-31 00:00:00| -22.778791628209955|  12.068999719708462|             -4.870289996846601|
|2017-08-01 00:00:00| -3.0277339240571943| -22.778791628209955|            -25.213936626633256|
|2017-08-02 00:00:00| -11.328163960923687| -3.0277339240571943|            -15.634702237373823|
|2017-08-03 00:00:00| -15.630331436501073| -11.328163960923687|            -19.145515079610597|
|2017-08-04 00:00:00|-0.20107959349155807| -15.630331436501073|             -25.06826545619932|
|2017-08-07 00:00:00|   15.2192413946968

In [9]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Pack the two predictor columns into a single 'features' vector column.
feature_columns = ["previous_day_return", "previous_day_return_decayed_sum"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Keep only the target and the feature vector, renaming 'return' to 'label'.
assembled = assembler.transform(sp500_decayed_return)
output = assembled.select('return', 'features').toDF('label', 'features')

# Linear regression with elastic-net regularization, fitted on the full frame.
lr = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

model = lr.fit(output)