visualize.py simultaneous raw-scale-denoise output; separate scale/denoise_periods func
timothyyu committed Mar 3, 2019
1 parent 8073c42 commit b715d88
Showing 27 changed files with 2,118 additions and 67 deletions.
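
The headline change in `visualize.py` — rendering the raw, scaled, and denoised series in one figure — is collapsed in this view. A minimal sketch of what such a plot helper could look like (the function name, signature, and `close` column are assumptions, not the repository's actual code):

```python
import matplotlib.pyplot as plt

def plot_raw_scaled_denoised(raw, scaled, denoised, column="close"):
    # Hypothetical helper mirroring the commit message: three stacked
    # panels showing the raw, scaled, and wavelet-denoised series.
    fig, axes = plt.subplots(3, 1, sharex=True, figsize=(10, 8))
    series = [("raw", raw), ("scaled", scaled), ("denoised", denoised)]
    for ax, (name, df) in zip(axes, series):
        ax.plot(df.index, df[column])
        ax.set_title(f"{name} {column}")
    fig.tight_layout()
    return fig
```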
10 changes: 5 additions & 5 deletions README.md
@@ -4,6 +4,9 @@ Repository that aims to implement the WSAE-LSTM model and replicate the results

https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0180944

This implementation of the WSAE-LSTM model aims to address potential issues in the implementation of the model as defined by Bao et al. (2017), while also addressing issues in previous attempts to implement and replicate its results (i.e. [mlpanda/DeepLearning_Financial](https://github.com/mlpanda/DeepLearning_Financial)).


## Source journal (APA)

Bao W, Yue J, Rao Y (2017). "A deep learning framework for financial time series using stacked autoencoders and long-short term memory". PLOS ONE 12(7): e0180944. https://doi.org/10.1371/journal.pone.0180944
@@ -24,8 +27,5 @@ Repository package requirements/dependencies are defined in `requirements.txt` f

### `mlpanda/DeepLearning_Financial`:

Repository of an existing attempt to replicate above paper in PyTorch: [mlpanda/DeepLearning_Financial:](https://github.com/mlpanda/DeepLearning_Financial)
, checked out as a `git-subrepo` for reference in `submodules` folder.



Repository of an existing attempt to replicate above paper in PyTorch: [mlpanda/DeepLearning_Financial](https://github.com/mlpanda/DeepLearning_Financial)
, checked out as a `git-subrepo` for reference in `submodules` folder.
Binary file modified data/interim/cdii_tvt_split.pickle
Binary file not shown.
Binary file modified data/interim/cdii_tvt_split_scaled_denoised.pickle
Binary file not shown.
Binary file modified data/interim/clean_data_index_interval.pickle
Binary file not shown.
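
The modified pickles above carry the per-index train/validation/test splits that the notebook cells below inspect. A minimal sketch of loading one for inspection (assuming `cdii_tvt_split.pickle` stores the nested dict that the notebook calls `dict_dataframes_index`):

```python
import pickle

# Path taken from the file list above; the stored structure is assumed
# to be the nested dict explored in the notebook cells later in this diff.
with open("data/interim/cdii_tvt_split.pickle", "rb") as f:
    dict_dataframes_index = pickle.load(f)

print(dict_dataframes_index.keys())
```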
2 changes: 1 addition & 1 deletion notebooks/3a scaling exploration.ipynb
@@ -455,7 +455,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
"version": "3.6.8"
}
},
"nbformat": 4,

Large diffs are not rendered by default.

@@ -948,7 +948,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
"version": "3.6.8"
}
},
"nbformat": 4,
@@ -2046,7 +2046,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
"version": "3.6.8"
}
},
"nbformat": 4,
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -33,7 +33,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -42,7 +49,7 @@
"dict_keys(['csi300 index data', 'nifty 50 index data', 'hangseng index data', 'nikkei 225 index data', 's&p500 index data', 'djia index data'])"
]
},
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -52,6 +59,261 @@
"dict_dataframes_index.keys()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(dict_dataframes_index['djia index data'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"24"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dict_dataframes_index['djia index data'])"
]
},
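(The two cells above show that each index maps to a plain dict of 24 walk-forward periods; the cells that follow index each period with `[1]`, `[2]`, and `[3]`. A sketch of that assumed layout — keys 1 = train, 2 = validation, 3 = test, inferred from the loops below:)

```python
# Inferred layout: index name -> 24 periods -> {1: train, 2: val, 3: test}.
djia = dict_dataframes_index['djia index data']
for period in djia:
    train_df, val_df, test_df = djia[period][1], djia[period][2], djia[period][3]
    print(period, train_df.index[0], val_df.index[0], test_df.index[0])
```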
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TRAIN: 2008-07-01 00:00:00\n",
"TRAIN: 2008-10-01 00:00:00\n",
"TRAIN: 2009-01-01 00:00:00\n",
"TRAIN: 2009-04-01 00:00:00\n",
"TRAIN: 2009-07-01 00:00:00\n",
"TRAIN: 2009-10-01 00:00:00\n",
"TRAIN: 2010-01-04 00:00:00\n",
"TRAIN: 2010-04-01 00:00:00\n",
"TRAIN: 2010-07-01 00:00:00\n",
"TRAIN: 2010-10-01 00:00:00\n",
"TRAIN: 2011-01-03 00:00:00\n",
"TRAIN: 2011-04-01 00:00:00\n",
"TRAIN: 2011-07-01 00:00:00\n",
"TRAIN: 2011-10-03 00:00:00\n",
"TRAIN: 2012-01-02 00:00:00\n",
"TRAIN: 2012-04-02 00:00:00\n",
"TRAIN: 2012-07-02 00:00:00\n",
"TRAIN: 2012-10-01 00:00:00\n",
"TRAIN: 2013-01-01 00:00:00\n",
"TRAIN: 2013-04-01 00:00:00\n",
"TRAIN: 2013-07-01 00:00:00\n",
"TRAIN: 2013-10-01 00:00:00\n",
"TRAIN: 2014-01-01 00:00:00\n",
"TRAIN: 2014-04-01 00:00:00\n"
]
}
],
"source": [
"for item in dict_dataframes_index['djia index data']:\n",
" print(\"TRAIN:\",dict_dataframes_index['nifty 50 index data'][item][1].index[0])\n",
" #print(\"VAL:\",dict_dataframes_index['nifty 50 index data'][item][2].index[0])\n",
" #print(\"TEST:\",dict_dataframes_index['nifty 50 index data'][item][3].index[0])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"VAL: 2010-07-01 00:00:00\n",
"VAL: 2010-10-01 00:00:00\n",
"VAL: 2011-01-03 00:00:00\n",
"VAL: 2011-04-01 00:00:00\n",
"VAL: 2011-07-01 00:00:00\n",
"VAL: 2011-10-03 00:00:00\n",
"VAL: 2012-01-04 00:00:00\n",
"VAL: 2012-04-02 00:00:00\n",
"VAL: 2012-07-02 00:00:00\n",
"VAL: 2012-10-01 00:00:00\n",
"VAL: 2013-01-03 00:00:00\n",
"VAL: 2013-04-01 00:00:00\n",
"VAL: 2013-07-01 00:00:00\n",
"VAL: 2013-10-03 00:00:00\n",
"VAL: 2014-01-02 00:00:00\n",
"VAL: 2014-04-02 00:00:00\n",
"VAL: 2014-07-02 00:00:00\n",
"VAL: 2014-10-01 00:00:00\n",
"VAL: 2015-01-01 00:00:00\n",
"VAL: 2015-04-01 00:00:00\n",
"VAL: 2015-07-01 00:00:00\n",
"VAL: 2015-10-01 00:00:00\n",
"VAL: 2016-01-01 00:00:00\n",
"VAL: 2016-04-01 00:00:00\n"
]
}
],
"source": [
"for item in dict_dataframes_index['djia index data']:\n",
" #print(\"TRAIN:\",dict_dataframes_index['nifty 50 index data'][item][1].index[0])\n",
" print(\"VAL:\",dict_dataframes_index['djia index data'][item][2].index[0])\n",
" #print(\"TEST:\",dict_dataframes_index['nifty 50 index data'][item][3].index[0])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TEST: 2010-10-01 00:00:00\n",
"TEST: 2011-01-03 00:00:00\n",
"TEST: 2011-04-01 00:00:00\n",
"TEST: 2011-07-01 00:00:00\n",
"TEST: 2011-10-03 00:00:00\n",
"TEST: 2012-01-02 00:00:00\n",
"TEST: 2012-04-04 00:00:00\n",
"TEST: 2012-07-02 00:00:00\n",
"TEST: 2012-10-01 00:00:00\n",
"TEST: 2013-01-01 00:00:00\n",
"TEST: 2013-04-03 00:00:00\n",
"TEST: 2013-07-01 00:00:00\n",
"TEST: 2013-10-01 00:00:00\n",
"TEST: 2014-01-03 00:00:00\n",
"TEST: 2014-04-02 00:00:00\n",
"TEST: 2014-07-02 00:00:00\n",
"TEST: 2014-10-07 00:00:00\n",
"TEST: 2015-01-01 00:00:00\n",
"TEST: 2015-04-01 00:00:00\n",
"TEST: 2015-07-01 00:00:00\n",
"TEST: 2015-10-01 00:00:00\n",
"TEST: 2016-01-01 00:00:00\n",
"TEST: 2016-04-01 00:00:00\n",
"TEST: 2016-07-01 00:00:00\n"
]
}
],
"source": [
"for item in dict_dataframes_index['djia index data']:\n",
" #print(\"TRAIN:\",dict_dataframes_index['nifty 50 index data'][item][1].index[0])\n",
" #print(\"VAL:\",dict_dataframes_index['nifty 50 index data'][item][2].index[0])\n",
" print(\"TEST:\",dict_dataframes_index['djia index data'][item][3].index[0])"
]
},
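(Reading the three printouts together: training starts advance one quarter at a time from 2008-07-01, validation begins two years after each training start, and testing three months after validation, with exact dates rolled forward to the next trading day. A sketch reproducing those offsets with pandas — the offsets are read off the output above, not taken from the repository's code:)

```python
import pandas as pd

# Quarter-start training dates matching the TRAIN printout above;
# the actual splits roll weekends/holidays to the next trading day.
train_starts = pd.date_range("2008-07-01", periods=24, freq="QS")
for t in train_starts:
    print("TRAIN:", t,
          "VAL:", t + pd.DateOffset(years=2),
          "TEST:", t + pd.DateOffset(years=2, months=3))
```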
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'preprocessing' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-17-0911614acaaf>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 33\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mddi_scaled\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mddi_denoised\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 35\u001b[1;33m \u001b[0mddi_scaled\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mddi_denoised\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdenoise_periods\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdict_dataframes_index\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m<ipython-input-17-0911614acaaf>\u001b[0m in \u001b[0;36mdenoise_periods\u001b[1;34m(dict_dataframes)\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mddi_scaled\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mindex_name\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeepcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdict_dataframes\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mindex_name\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex_name\u001b[0m \u001b[1;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mddi_denoised\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mscaler\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpreprocessing\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mRobustScaler\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mvalue\u001b[0m \u001b[1;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mddi_denoised\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mindex_name\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'preprocessing' is not defined"
]
}
],
"source": [
"def denoise_periods(dict_dataframes):\n",
" \n",
" ddi_scaled = dict()\n",
" ddi_denoised= dict() \n",
" for key, index_name in enumerate(dict_dataframes):\n",
" ddi_denoised[index_name] = copy.deepcopy(dict_dataframes[index_name])\n",
" ddi_scaled[index_name] = copy.deepcopy(dict_dataframes[index_name])\n",
" for key, index_name in enumerate(ddi_denoised): \n",
" scaler = preprocessing.RobustScaler()\n",
"\n",
" for index,value in enumerate(ddi_denoised[index_name]):\n",
" \n",
" X_train = ddi_denoised[index_name][value][1]\n",
" X_train_scaled = scaler.fit_transform(X_train)\n",
" X_train_scaled = pd.DataFrame(X_train_scaled,columns=list(X_train.columns))\n",
" \n",
" X_val = ddi_denoised[index_name][value][2]\n",
" X_val_scaled = scaler.transform(X_val)\n",
" X_val_scaled = pd.DataFrame(X_val_scaled,columns=list(X_val.columns))\n",
" \n",
" X_test = ddi_denoised[index_name][value][3]\n",
" X_test_scaled = scaler.transform(X_test)\n",
" X_test_scaled = pd.DataFrame(X_test_scaled,columns=list(X_test.columns))\n",
" \n",
" ddi_scaled[index_name][value][1] = X_train_scaled\n",
" ddi_scaled[index_name][value][2] = X_val_scaled\n",
" ddi_scaled[index_name][value][3] = X_test_scaled\n",
" \n",
" ddi_denoised[index_name][value][1] = waveletSmooth(X_train_scaled)\n",
" ddi_denoised[index_name][value][2] = waveletSmooth(X_val_scaled)\n",
" ddi_denoised[index_name][value][3] = waveletSmooth(X_test_scaled)\n",
" \n",
" return ddi_scaled,ddi_denoised\n",
"\n",
"ddi_scaled,ddi_denoised = denoise_periods(dict_dataframes_index)"
]
},
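(The `NameError` above is just a missing import: the cell calls `preprocessing.RobustScaler()` without ever importing scikit-learn's `preprocessing` module. A sketch of the setup the cell needs — the import path for the repository's `waveletSmooth` helper is a guess and would need to match wherever it actually lives in this repo:)

```python
import copy

import pandas as pd
from sklearn import preprocessing  # supplies RobustScaler, the missing name

# waveletSmooth is this repository's own denoising helper; the path below
# is hypothetical -- adjust it to the module that actually defines it.
from wsae_lstm.utils import waveletSmooth
```

(Note the leakage-safe pattern already in the cell: `RobustScaler` is fit on each period's training split only, then applied unchanged to the validation and test splits.)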
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
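(`waveletSmooth` itself is not shown in this diff. For orientation, a hedged sketch of what a wavelet-threshold denoiser of this kind typically looks like, using PyWavelets — the wavelet, level, and thresholding rule here are generic assumptions, not this repository's exact implementation:)

```python
import numpy as np
import pandas as pd
import pywt

def wavelet_smooth_sketch(df, wavelet="haar", level=2):
    """Hypothetical stand-in for waveletSmooth: soft-threshold the detail
    coefficients of each column and reconstruct the series."""
    out = {}
    for col in df.columns:
        coeffs = pywt.wavedec(df[col].values, wavelet, mode="per", level=level)
        # Universal threshold, with the noise scale estimated from the
        # finest detail coefficients (median absolute deviation).
        sigma = np.median(np.abs(coeffs[-1])) / 0.6745
        thresh = sigma * np.sqrt(2 * np.log(len(df)))
        coeffs[1:] = [pywt.threshold(c, thresh, mode="soft") for c in coeffs[1:]]
        rec = pywt.waverec(coeffs, wavelet, mode="per")
        out[col] = rec[: len(df)]  # waverec can return one extra sample
    return pd.DataFrame(out, index=df.index)
```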
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 12,
@@ -640,7 +902,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
"version": "3.6.8"
}
},
"nbformat": 4,
