Permalink
Browse files

Add example notebook and data (#52)

  • Loading branch information...
icexelloss committed Sep 20, 2018
1 parent 055da75 commit ee1dc08b5a7f2c84e41bfbc7a02e069d23d02c72
Showing with 640 additions and 0 deletions.
  1. +387 −0 example/Flint Example.ipynb
  2. +253 −0 example/sp500.csv
@@ -0,0 +1,387 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <p><b>SparkSession - in-memory</b></p>\n",
" \n",
" <div>\n",
" <p><b>SparkContext</b></p>\n",
"\n",
" <p><a href=\"http://10.200.4.40:4040\">Spark UI</a></p>\n",
"\n",
" <dl>\n",
" <dt>Version</dt>\n",
" <dd><code>v2.3.3-SNAPSHOT</code></dd>\n",
" <dt>Master</dt>\n",
" <dd><code>local[*]</code></dd>\n",
" <dt>AppName</dt>\n",
" <dd><code>PySparkShell</code></dd>\n",
" </dl>\n",
" </div>\n",
" \n",
" </div>\n",
" "
],
"text/plain": [
"<pyspark.sql.session.SparkSession at 0x109a86dd8>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spark"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import ts.flint\n",
"from ts.flint import FlintContext\n",
"flintContext = FlintContext(sqlContext)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------+--------------------+\n",
"| time| return|\n",
"+-------------------+--------------------+\n",
"|2017-07-26 00:00:00| -8.628705286851352|\n",
"|2017-07-27 00:00:00| -29.56422678968445|\n",
"|2017-07-28 00:00:00| 12.068999719708462|\n",
"|2017-07-31 00:00:00| -22.778791628209955|\n",
"|2017-08-01 00:00:00| -3.0277339240571943|\n",
"|2017-08-02 00:00:00| -11.328163960923687|\n",
"|2017-08-03 00:00:00| -15.630331436501073|\n",
"|2017-08-04 00:00:00|-0.20107959349155807|\n",
"|2017-08-07 00:00:00| 15.21924139469683|\n",
"|2017-08-08 00:00:00| -13.840562730697165|\n",
"|2017-08-09 00:00:00| 35.16710266437803|\n",
"|2017-08-10 00:00:00| -110.20582340007698|\n",
"|2017-08-11 00:00:00| 1.1471708596583703|\n",
"|2017-08-14 00:00:00| 44.318959057759976|\n",
"|2017-08-15 00:00:00| -16.40487205351478|\n",
"|2017-08-16 00:00:00| -2.105524216406371|\n",
"|2017-08-17 00:00:00| -133.74182039966306|\n",
"|2017-08-18 00:00:00| -8.60854200833519|\n",
"|2017-08-21 00:00:00| 11.83309420737966|\n",
"|2017-08-22 00:00:00| 77.08273240883396|\n",
"+-------------------+--------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"sp500 = spark.read.option('header', True).option('inferSchema', True).csv('sp500.csv').withColumnRenamed('Date', 'time')\n",
"sp500 = flintContext.read.dataframe(sp500)\n",
"sp500_return = sp500.withColumn('return', 10000 * (sp500['Close'] - sp500['Open']) / sp500['Open']).select('time', 'return')\n",
"sp500_return.show()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------+--------------------+-------------------+\n",
"| time| return|previous_day_return|\n",
"+-------------------+--------------------+-------------------+\n",
"|2017-07-26 00:00:00| -8.628705286851352| null|\n",
"|2017-07-27 00:00:00| -29.56422678968445| -8.628705286851352|\n",
"|2017-07-28 00:00:00| 12.068999719708462| -29.56422678968445|\n",
"|2017-07-31 00:00:00| -22.778791628209955| null|\n",
"|2017-08-01 00:00:00| -3.0277339240571943|-22.778791628209955|\n",
"|2017-08-02 00:00:00| -11.328163960923687|-3.0277339240571943|\n",
"|2017-08-03 00:00:00| -15.630331436501073|-11.328163960923687|\n",
"|2017-08-04 00:00:00|-0.20107959349155807|-15.630331436501073|\n",
"|2017-08-07 00:00:00| 15.21924139469683| null|\n",
"|2017-08-08 00:00:00| -13.840562730697165| 15.21924139469683|\n",
"|2017-08-09 00:00:00| 35.16710266437803|-13.840562730697165|\n",
"|2017-08-10 00:00:00| -110.20582340007698| 35.16710266437803|\n",
"|2017-08-11 00:00:00| 1.1471708596583703|-110.20582340007698|\n",
"|2017-08-14 00:00:00| 44.318959057759976| null|\n",
"|2017-08-15 00:00:00| -16.40487205351478| 44.318959057759976|\n",
"|2017-08-16 00:00:00| -2.105524216406371| -16.40487205351478|\n",
"|2017-08-17 00:00:00| -133.74182039966306| -2.105524216406371|\n",
"|2017-08-18 00:00:00| -8.60854200833519|-133.74182039966306|\n",
"|2017-08-21 00:00:00| 11.83309420737966| null|\n",
"|2017-08-22 00:00:00| 77.08273240883396| 11.83309420737966|\n",
"+-------------------+--------------------+-------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"from ts.flint import windows\n",
"\n",
"sp500_previous_day_return = sp500_return.shiftTime(windows.future_absolute_time('1day')).toDF('time', 'previous_day_return')\n",
"sp500_joined_return = sp500_return.leftJoin(sp500_previous_day_return)\n",
"sp500_joined_return.show()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------+--------------------+--------------------+\n",
"| time| return| previous_day_return|\n",
"+-------------------+--------------------+--------------------+\n",
"|2017-07-27 00:00:00| -29.56422678968445| -8.628705286851352|\n",
"|2017-07-28 00:00:00| 12.068999719708462| -29.56422678968445|\n",
"|2017-07-31 00:00:00| -22.778791628209955| 12.068999719708462|\n",
"|2017-08-01 00:00:00| -3.0277339240571943| -22.778791628209955|\n",
"|2017-08-02 00:00:00| -11.328163960923687| -3.0277339240571943|\n",
"|2017-08-03 00:00:00| -15.630331436501073| -11.328163960923687|\n",
"|2017-08-04 00:00:00|-0.20107959349155807| -15.630331436501073|\n",
"|2017-08-07 00:00:00| 15.21924139469683|-0.20107959349155807|\n",
"|2017-08-08 00:00:00| -13.840562730697165| 15.21924139469683|\n",
"|2017-08-09 00:00:00| 35.16710266437803| -13.840562730697165|\n",
"|2017-08-10 00:00:00| -110.20582340007698| 35.16710266437803|\n",
"|2017-08-11 00:00:00| 1.1471708596583703| -110.20582340007698|\n",
"|2017-08-14 00:00:00| 44.318959057759976| 1.1471708596583703|\n",
"|2017-08-15 00:00:00| -16.40487205351478| 44.318959057759976|\n",
"|2017-08-16 00:00:00| -2.105524216406371| -16.40487205351478|\n",
"|2017-08-17 00:00:00| -133.74182039966306| -2.105524216406371|\n",
"|2017-08-18 00:00:00| -8.60854200833519| -133.74182039966306|\n",
"|2017-08-21 00:00:00| 11.83309420737966| -8.60854200833519|\n",
"|2017-08-22 00:00:00| 77.08273240883396| 11.83309420737966|\n",
"|2017-08-23 00:00:00| -3.435113544185862| 77.08273240883396|\n",
"+-------------------+--------------------+--------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"sp500_joined_return = sp500_return.leftJoin(sp500_previous_day_return, tolerance='3days').dropna()\n",
"sp500_joined_return.show()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------+--------------------+--------------------+------------------------+\n",
"| time| return| previous_day_return|previous_day_return_ewma|\n",
"+-------------------+--------------------+--------------------+------------------------+\n",
"|2017-07-27 00:00:00| -29.56422678968445| -8.628705286851352| -8.628705286851352|\n",
"|2017-07-28 00:00:00| 12.068999719708462| -29.56422678968445| -33.878579433110126|\n",
"|2017-07-31 00:00:00| -22.778791628209955| 12.068999719708462| 7.834177290569695|\n",
"|2017-08-01 00:00:00| -3.0277339240571943| -22.778791628209955| -18.86170298292511|\n",
"|2017-08-02 00:00:00| -11.328163960923687| -3.0277339240571943| -12.45858541551975|\n",
"|2017-08-03 00:00:00| -15.630331436501073| -11.328163960923687| -17.557456668683564|\n",
"|2017-08-04 00:00:00|-0.20107959349155807| -15.630331436501073| -24.375353890816093|\n",
"|2017-08-07 00:00:00| 15.21924139469683|-0.20107959349155807| -3.2191275146192693|\n",
"|2017-08-08 00:00:00| -13.840562730697165| 15.21924139469683| 13.562533107232085|\n",
"|2017-08-09 00:00:00| 35.16710266437803| -13.840562730697165| -6.970316522283428|\n",
"|2017-08-10 00:00:00| -110.20582340007698| 35.16710266437803| 31.69377148887716|\n",
"|2017-08-11 00:00:00| 1.1471708596583703| -110.20582340007698| -94.31468701516604|\n",
"|2017-08-14 00:00:00| 44.318959057759976| 1.1471708596583703| -10.626901021693929|\n",
"|2017-08-15 00:00:00| -16.40487205351478| 44.318959057759976| 39.00629401407509|\n",
"|2017-08-16 00:00:00| -2.105524216406371| -16.40487205351478| 3.03882479182473|\n",
"|2017-08-17 00:00:00| -133.74182039966306| -2.105524216406371| -0.5320471223272201|\n",
"|2017-08-18 00:00:00| -8.60854200833519| -133.74182039966306| -134.1452154556094|\n",
"|2017-08-21 00:00:00| 11.83309420737966| -8.60854200833519| -25.269071065872232|\n",
"|2017-08-22 00:00:00| 77.08273240883396| 11.83309420737966| -0.8059224617269969|\n",
"|2017-08-23 00:00:00| -3.435113544185862| 77.08273240883396| 76.50665024415109|\n",
"+-------------------+--------------------+--------------------+------------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"from ts.flint import summarizers\n",
"\n",
"sp500_decayed_return = sp500_joined_return.summarizeWindows(\n",
" window = windows.past_absolute_time('7day'),\n",
" summarizer = summarizers.ewma('previous_day_return', alpha=0.5)\n",
")\n",
"\n",
"sp500_decayed_return.show()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------+--------------------+--------------------+------------------------+\n",
"| time| return| previous_day_return|previous_day_return_ewma|\n",
"+-------------------+--------------------+--------------------+------------------------+\n",
"|2017-07-27 00:00:00| -29.56422678968445| -8.628705286851352| -8.628705286851352|\n",
"|2017-07-28 00:00:00| 12.068999719708462| -29.56422678968445| -33.878579433110126|\n",
"|2017-07-31 00:00:00| -22.778791628209955| 12.068999719708462| 7.834177290569695|\n",
"|2017-08-01 00:00:00| -3.0277339240571943| -22.778791628209955| -18.86170298292511|\n",
"|2017-08-02 00:00:00| -11.328163960923687| -3.0277339240571943| -12.45858541551975|\n",
"|2017-08-03 00:00:00| -15.630331436501073| -11.328163960923687| -17.557456668683564|\n",
"|2017-08-04 00:00:00|-0.20107959349155807| -15.630331436501073| -24.375353890816093|\n",
"|2017-08-07 00:00:00| 15.21924139469683|-0.20107959349155807| -3.2191275146192693|\n",
"|2017-08-08 00:00:00| -13.840562730697165| 15.21924139469683| 13.562533107232085|\n",
"|2017-08-09 00:00:00| 35.16710266437803| -13.840562730697165| -6.970316522283428|\n",
"|2017-08-10 00:00:00| -110.20582340007698| 35.16710266437803| 31.69377148887716|\n",
"|2017-08-11 00:00:00| 1.1471708596583703| -110.20582340007698| -94.31468701516604|\n",
"|2017-08-14 00:00:00| 44.318959057759976| 1.1471708596583703| -10.626901021693929|\n",
"|2017-08-15 00:00:00| -16.40487205351478| 44.318959057759976| 39.00629401407509|\n",
"|2017-08-16 00:00:00| -2.105524216406371| -16.40487205351478| 3.03882479182473|\n",
"|2017-08-17 00:00:00| -133.74182039966306| -2.105524216406371| -0.5320471223272201|\n",
"|2017-08-18 00:00:00| -8.60854200833519| -133.74182039966306| -134.1452154556094|\n",
"|2017-08-21 00:00:00| 11.83309420737966| -8.60854200833519| -25.269071065872232|\n",
"|2017-08-22 00:00:00| 77.08273240883396| 11.83309420737966| -0.8059224617269969|\n",
"|2017-08-23 00:00:00| -3.435113544185862| 77.08273240883396| 76.50665024415109|\n",
"+-------------------+--------------------+--------------------+------------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"from ts.flint import summarizers\n",
"\n",
"sp500_decayed_return = sp500_joined_return.summarizeWindows(\n",
" window = windows.past_absolute_time('7day'),\n",
" summarizer = summarizers.ewma('previous_day_return', alpha=0.5)\n",
")\n",
"\n",
"sp500_decayed_return.show()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------+--------------------+--------------------+-------------------------------+\n",
"| time| return| previous_day_return|previous_day_return_decayed_sum|\n",
"+-------------------+--------------------+--------------------+-------------------------------+\n",
"|2017-07-27 00:00:00| -29.56422678968445| -8.628705286851352| -8.628705286851352|\n",
"|2017-07-28 00:00:00| 12.068999719708462| -29.56422678968445| -33.878579433110126|\n",
"|2017-07-31 00:00:00| -22.778791628209955| 12.068999719708462| -4.870289996846601|\n",
"|2017-08-01 00:00:00| -3.0277339240571943| -22.778791628209955| -25.213936626633256|\n",
"|2017-08-02 00:00:00| -11.328163960923687| -3.0277339240571943| -15.634702237373823|\n",
"|2017-08-03 00:00:00| -15.630331436501073| -11.328163960923687| -19.145515079610597|\n",
"|2017-08-04 00:00:00|-0.20107959349155807| -15.630331436501073| -25.06826545619932|\n",
"|2017-08-07 00:00:00| 15.21924139469683|-0.20107959349155807| -12.2732712780024|\n",
"|2017-08-08 00:00:00| -13.840562730697165| 15.21924139469683| 8.894027635075187|\n",
"|2017-08-09 00:00:00| 35.16710266437803| -13.840562730697165| -9.037630293968792|\n",
"|2017-08-10 00:00:00| -110.20582340007698| 35.16710266437803| 30.695595859957024|\n",
"|2017-08-11 00:00:00| 1.1471708596583703| -110.20582340007698| -94.68102290820903|\n",
"|2017-08-14 00:00:00| 44.318959057759976| 1.1471708596583703| -45.94911666575082|\n",
"|2017-08-15 00:00:00| -16.40487205351478| 44.318959057759976| 21.34754259353287|\n",
"|2017-08-16 00:00:00| -2.105524216406371| -16.40487205351478| -5.968901403540482|\n",
"|2017-08-17 00:00:00| -133.74182039966306| -2.105524216406371| -4.873716125509468|\n",
"|2017-08-18 00:00:00| -8.60854200833519| -133.74182039966306| -136.7281644415487|\n",
"|2017-08-21 00:00:00| 11.83309420737966| -8.60854200833519| -75.25065823848334|\n",
"|2017-08-22 00:00:00| 77.08273240883396| 11.83309420737966| -25.81015945654417|\n",
"|2017-08-23 00:00:00| -3.435113544185862| 77.08273240883396| 63.48516894528438|\n",
"+-------------------+--------------------+--------------------+-------------------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"from ts.flint import udf\n",
"import numpy as np\n",
"\n",
"@udf('double', arg_type='numpy')\n",
"def decayed(columns): \n",
" v = columns[0]\n",
" decay = np.power(0.5, np.arange(len(v)))[::-1]\n",
" return (v * decay).sum()\n",
"\n",
"sp500_decayed_return = sp500_joined_return.summarizeWindows(\n",
" window = windows.past_absolute_time('7day'),\n",
" summarizer = {'previous_day_return_decayed_sum': decayed(sp500_joined_return[['previous_day_return']])}\n",
")\n",
"\n",
"sp500_decayed_return.show()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.ml.regression import LinearRegression\n",
"from pyspark.ml.feature import VectorAssembler\n",
"\n",
"assembler = VectorAssembler(\n",
" inputCols=[\"previous_day_return\", \"previous_day_return_decayed_sum\"],\n",
" outputCol=\"features\")\n",
"\n",
"output = assembler.transform(sp500_decayed_return).select('return', 'features').toDF('label', 'features')\n",
"\n",
"lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)\n",
"\n",
"model = lr.fit(output)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Oops, something went wrong.

0 comments on commit ee1dc08

Please sign in to comment.