In [7]:
import warnings
# warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')

import math
import os
# import Quandl
import pathlib
import time

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression
# deprecated: cross validation is used for splitting up data sets
# svm = support vector machine. svm is able to perform regression
# from sklearn import preprocessing, cross_validation, svm
from sklearn.model_selection import train_test_split

from machine_learning_with_python.utils.file_functions import get_dataframe_from_csv

from rich import print as rich_print
from rich.console import Console
import matplotlib.pyplot as plt
from matplotlib import style

import datetime


CONSOLE = Console()

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# https://stackoverflow.com/questions/39125532/file-does-not-exist-in-jupyter-notebook
# /Users/<USER>/dev/universityofprofessorex/machine-learning-with-python
current_folder = globals()['_dh'][0]

HERE = os.path.abspath('')

_dir = pathlib.Path(HERE).resolve()

rich_print(_dir.parent)
rich_print(current_folder)

parent_dir = _dir.parent

# csv_file = f"{_dir.parent}/data/WIKI_PRICES_212b326a081eacca455e13140d7bb9db.csv"
# parquet_file = (
#     f"{_dir.parent}/data/WIKI_PRICES_212b326a081eacca455e13140d7bb9db.parquet"
# )


In [8]:

input_dir = f"{parent_dir}/data/"

CONSOLE.log(os.listdir(input_dir))


In [9]:
csv_file = f"{parent_dir}/data/WIKI_PRICES_212b326a081eacca455e13140d7bb9db.csv"

csv_file


'/Users/malcolm/dev/universityofprofessorex/machine-learning-with-python/data/WIKI_PRICES_212b326a081eacca455e13140d7bb9db.csv'

In [10]:
parquet_file = (
    f"{parent_dir}/data/WIKI_PRICES_212b326a081eacca455e13140d7bb9db.parquet"
)

parquet_file


'/Users/malcolm/dev/universityofprofessorex/machine-learning-with-python/data/WIKI_PRICES_212b326a081eacca455e13140d7bb9db.parquet'

In [45]:
# Pandas: Read Parquet
t1 = time.time()
df = pd.read_parquet(parquet_file, engine="pyarrow")
t2 = time.time()
delta_t = round((t2 - t1), 3)
print(f"Time it took = {delta_t} seconds\n")

rich_print(df.head())

TypeError: read_table() got an unexpected keyword argument 'index_col'

In [12]:
# We only need some of these categories for linear regression
df = df[
    [
        "adj_open",
        "adj_high",
        "adj_low",
        "adj_close",
        "adj_volume",
    ]
]

df

Unnamed: 0,adj_open,adj_high,adj_low,adj_close,adj_volume
0,31.041951,34.112034,27.289627,30.018590,44739900.0
1,29.295415,29.336350,27.160002,27.548879,10897100.0
2,28.183363,30.018590,27.330562,30.018590,4705200.0
3,28.995229,29.766161,27.460188,27.460188,4274400.0
4,27.378319,28.613174,27.289627,28.012803,3464400.0
...,...,...,...,...,...
15389309,23.800000,24.600000,23.605800,23.950000,354092.0
15389310,23.900000,24.350000,23.300000,23.350000,269607.0
15389311,23.550000,24.200000,23.450000,23.550000,301584.0
15389312,23.750000,24.800000,23.700000,24.650000,375320.0


In [13]:
# high minus low column
df["HL_PCT"] = (df["adj_high"] - df["adj_close"]) / df["adj_close"] * 100.0

df


Unnamed: 0,adj_open,adj_high,adj_low,adj_close,adj_volume,HL_PCT
0,31.041951,34.112034,27.289627,30.018590,44739900.0,13.636364
1,29.295415,29.336350,27.160002,27.548879,10897100.0,6.488361
2,28.183363,30.018590,27.330562,30.018590,4705200.0,0.000000
3,28.995229,29.766161,27.460188,27.460188,4274400.0,8.397516
4,27.378319,28.613174,27.289627,28.012803,3464400.0,2.143205
...,...,...,...,...,...,...
15389309,23.800000,24.600000,23.605800,23.950000,354092.0,2.713987
15389310,23.900000,24.350000,23.300000,23.350000,269607.0,4.282655
15389311,23.550000,24.200000,23.450000,23.550000,301584.0,2.760085
15389312,23.750000,24.800000,23.700000,24.650000,375320.0,0.608519


In [14]:
# daily percent change
df["PCT_change"] = (df["adj_close"] - df["adj_open"]) / df["adj_open"] * 100.0

df

Unnamed: 0,adj_open,adj_high,adj_low,adj_close,adj_volume,HL_PCT,PCT_change
0,31.041951,34.112034,27.289627,30.018590,44739900.0,13.636364,-3.296703
1,29.295415,29.336350,27.160002,27.548879,10897100.0,6.488361,-5.961807
2,28.183363,30.018590,27.330562,30.018590,4705200.0,0.000000,6.511740
3,28.995229,29.766161,27.460188,27.460188,4274400.0,8.397516,-5.294118
4,27.378319,28.613174,27.289627,28.012803,3464400.0,2.143205,2.317468
...,...,...,...,...,...,...,...
15389309,23.800000,24.600000,23.605800,23.950000,354092.0,2.713987,0.630252
15389310,23.900000,24.350000,23.300000,23.350000,269607.0,4.282655,-2.301255
15389311,23.550000,24.200000,23.450000,23.550000,301584.0,2.760085,0.000000
15389312,23.750000,24.800000,23.700000,24.650000,375320.0,0.608519,3.789474


In [15]:
# We define a new datafram

df = df[["adj_close", "HL_PCT", "PCT_change", "adj_volume"]]

df

Unnamed: 0,adj_close,HL_PCT,PCT_change,adj_volume
0,30.018590,13.636364,-3.296703,44739900.0
1,27.548879,6.488361,-5.961807,10897100.0
2,30.018590,0.000000,6.511740,4705200.0
3,27.460188,8.397516,-5.294118,4274400.0
4,28.012803,2.143205,2.317468,3464400.0
...,...,...,...,...
15389309,23.950000,2.713987,0.630252,354092.0
15389310,23.350000,4.282655,-2.301255,269607.0
15389311,23.550000,2.760085,0.000000,301584.0
15389312,24.650000,0.608519,3.789474,375320.0


In [16]:
forecast_col = "adj_close"


In [17]:
# fill columns with NaN, but replace it with a real value. better than getting rid of data
df.fillna(-99999, inplace=True)

df

Unnamed: 0,adj_close,HL_PCT,PCT_change,adj_volume
0,30.018590,13.636364,-3.296703,44739900.0
1,27.548879,6.488361,-5.961807,10897100.0
2,30.018590,0.000000,6.511740,4705200.0
3,27.460188,8.397516,-5.294118,4274400.0
4,28.012803,2.143205,2.317468,3464400.0
...,...,...,...,...
15389309,23.950000,2.713987,0.630252,354092.0
15389310,23.350000,4.282655,-2.301255,269607.0
15389311,23.550000,2.760085,0.000000,301584.0
15389312,24.650000,0.608519,3.789474,375320.0


In [19]:
# round everything up to the nearest show number. We are trying to perdict 10% of the dataframe ( that's what the 0.1 is )
forecast_out = int(math.ceil(0.1 * len(df)))
print(forecast_out)


1538932


In [20]:
forecast_out


1538932

In [21]:
# classifier ( the shift is forcasting the columns out negatively)
df["label"] = df[forecast_col].shift(-forecast_out)

df

Unnamed: 0,adj_close,HL_PCT,PCT_change,adj_volume,label
0,30.018590,13.636364,-3.296703,44739900.0,27.130393
1,27.548879,6.488361,-5.961807,10897100.0,27.400187
2,30.018590,0.000000,6.511740,4705200.0,26.956626
3,27.460188,8.397516,-5.294118,4274400.0,23.380701
4,28.012803,2.143205,2.317468,3464400.0,22.553025
...,...,...,...,...,...
15389309,23.950000,2.713987,0.630252,354092.0,
15389310,23.350000,4.282655,-2.301255,269607.0,
15389311,23.550000,2.760085,0.000000,301584.0,
15389312,24.650000,0.608519,3.789474,375320.0,


In [22]:
# features = capital X
X = np.array(df.drop(["label"], 1))  # get everything except for label

X

array([[ 3.00185902e+01,  1.36363636e+01, -3.29670330e+00,
         4.47399000e+07],
       [ 2.75488789e+01,  6.48836057e+00, -5.96180717e+00,
         1.08971000e+07],
       [ 3.00185902e+01,  0.00000000e+00,  6.51174050e+00,
         4.70520000e+06],
       ...,
       [ 2.35500000e+01,  2.76008493e+00,  0.00000000e+00,
         3.01584000e+05],
       [ 2.46500000e+01,  6.08519270e-01,  3.78947368e+00,
         3.75320000e+05],
       [ 2.36000000e+01,  4.44915254e+00, -4.25963489e+00,
         4.03884000e+05]])

In [23]:
X = X[:-forecast_out]  # the point of where we were able to forecast the out plus

X

array([[ 3.00185902e+01,  1.36363636e+01, -3.29670330e+00,
         4.47399000e+07],
       [ 2.75488789e+01,  6.48836057e+00, -5.96180717e+00,
         1.08971000e+07],
       [ 3.00185902e+01,  0.00000000e+00,  6.51174050e+00,
         4.70520000e+06],
       ...,
       [ 1.49750000e+01,  2.00333890e+00,  1.52542373e+00,
         8.73200000e+05],
       [ 1.47300000e+01,  2.64765784e+00, -1.43860823e+00,
         4.88400000e+05],
       [ 1.46050000e+01,  2.63608353e+00, -1.51719488e+00,
         4.27000000e+05]])

In [24]:
X_lately = X[-forecast_out:]  # this is the stuff we are going to predict against


X_lately


array([[ 1.88700000e+01,  4.50450450e-01,  1.06100796e-01,
         2.97390000e+04],
       [ 1.82400000e+01,  4.16666667e+00, -4.00000000e+00,
         4.38880000e+04],
       [ 1.87300000e+01,  1.60170849e-01,  2.74273176e+00,
         2.82520000e+04],
       ...,
       [ 1.49750000e+01,  2.00333890e+00,  1.52542373e+00,
         8.73200000e+05],
       [ 1.47300000e+01,  2.64765784e+00, -1.43860823e+00,
         4.88400000e+05],
       [ 1.46050000e+01,  2.63608353e+00, -1.51719488e+00,
         4.27000000e+05]])

In [26]:
# Now we are going to scale x
# in order to properly scale it, you need to scale them alongside all your other values (when training)
# SOURCE: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.scale.html
# SOURCE: https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler
# Standardize a dataset along any axis.
# Center to the mean and component wise scale to unit variance.
# Read more in the User Guide.
X = preprocessing.scale(X)

X


array([[-1.99443063e-02,  2.96335272e-02,  8.37553117e-04,
         6.42933796e+00],
       [-2.06040974e-02,  1.22269166e-02, -3.43211417e-03,
         1.40656005e+00],
       [-1.99443063e-02, -3.57335173e-03,  1.65513090e-02,
         4.87589446e-01],
       ...,
       [-2.39632485e-02,  1.30512129e-03,  8.56290979e-03,
        -8.11366917e-02],
       [-2.40287011e-02,  2.87414818e-03,  3.81434055e-03,
        -1.38246769e-01],
       [-2.40620952e-02,  2.84596276e-03,  3.68843970e-03,
        -1.47359448e-01]])

In [27]:
df.dropna(inplace=True)  # Remove missing values.
# labels = lowercase y

df

Unnamed: 0,adj_close,HL_PCT,PCT_change,adj_volume,label
0,30.018590,13.636364,-3.296703,44739900.0,27.130393
1,27.548879,6.488361,-5.961807,10897100.0,27.400187
2,30.018590,0.000000,6.511740,4705200.0,26.956626
3,27.460188,8.397516,-5.294118,4274400.0,23.380701
4,28.012803,2.143205,2.317468,3464400.0,22.553025
...,...,...,...,...,...
13850377,14.640000,0.717213,0.273973,456800.0,23.950000
13850378,14.750000,0.000000,0.067843,771200.0,23.350000
13850379,14.975000,2.003339,1.525424,873200.0,23.550000
13850380,14.730000,2.647658,-1.438608,488400.0,24.650000


In [28]:
y = np.array(df["label"])

y

array([27.13039259, 27.40018749, 26.95662638, ..., 23.55      ,
       24.65      , 23.6       ])

In [29]:
y = np.array(df["label"])

y


array([27.13039259, 27.40018749, 26.95662638, ..., 23.55      ,
       24.65      , 23.6       ])

In [31]:
# training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)  # 20% of the data





In [32]:
X_train


array([[-1.48716757e-02, -1.29575324e-03,  5.47947582e-03,
        -4.03684372e-02],
       [-6.63271859e-03, -3.51735792e-03,  8.94945267e-03,
         1.92763122e-01],
       [-2.38614846e-02,  3.35130548e-03,  3.13849877e-03,
        -1.93457136e-01],
       ...,
       [-2.14987609e-02, -8.56425277e-04,  4.41615320e-03,
         6.49087184e+00],
       [-2.00959704e-02, -1.46628154e-03,  7.06528206e-03,
        -1.79624861e-01],
       [-1.60472866e-02, -9.96729300e-04,  4.67917962e-03,
        -1.89655635e-01]])

In [33]:
X_test

array([[-2.58283001e-02, -1.57345275e-03,  1.33166453e-02,
         5.03144409e-02],
       [-1.98907591e-03, -1.86959385e-03,  9.98143112e-03,
        -1.42180063e-01],
       [-1.68456798e-02, -2.22198816e-04,  3.94433550e-03,
        -8.00826487e-02],
       ...,
       [-2.02431411e-02, -2.02872675e-04,  5.56664727e-03,
        -2.07378459e-01],
       [-2.52923354e-02,  2.27105895e-03,  2.36424540e-03,
        -2.10043806e-01],
       [-9.43134812e-03, -3.57335173e-03,  9.41959443e-03,
         9.97825247e-01]])

In [34]:
y_train

array([ 5.14187187, 13.53113114,  4.12384416, ..., 17.28071293,
       17.24950048,  4.54588789])

In [35]:
y_test


array([ 45.46090467, 360.        , 493.53878309, ...,  50.83496256,
         2.99      ,  44.24496773])

In [36]:
# classifier definition and fit it
clf = LinearRegression(n_jobs=-1)  # choice A
# clf = svm.SVR() # change algorithm to   # choice B

clf

LinearRegression(n_jobs=-1)

In [37]:
clf.fit(X_train, y_train)  # train
accuracy = clf.score(
    X_train, y_train
)  # test ( on seperate data, you want to use different data for this to make sure it actually works )


accuracy


0.0005596973589790943

In [38]:
print(f"accuracy = {accuracy}\n")  # 0.000595491194672948 ( not very accurate )


accuracy = 0.0005596973589790943



In [39]:
forecast_set = clf.predict(X_lately)

print(
    f"forecast_set,accuracy,forecast_out  = {forecast_set},{accuracy},{forecast_out}\n"
)  # 0.000595491194672948 ( not very accurate )



forecast_set,accuracy,forecast_out  = [  462362.42352536   680981.08844161   439361.33161246 ...
 13499600.67609166  7551540.45292207  6602436.76293474],0.0005596973589790943,1538932



In [40]:
forecast_set


array([  462362.42352536,   680981.08844161,   439361.33161246, ...,
       13499600.67609166,  7551540.45292207,  6602436.76293474])

In [41]:
accuracy


0.0005596973589790943

In [42]:
forecast_out


1538932

In [43]:
df["Forecast"] = np.nan


df

Unnamed: 0,adj_close,HL_PCT,PCT_change,adj_volume,label,Forecast
0,30.018590,13.636364,-3.296703,44739900.0,27.130393,
1,27.548879,6.488361,-5.961807,10897100.0,27.400187,
2,30.018590,0.000000,6.511740,4705200.0,26.956626,
3,27.460188,8.397516,-5.294118,4274400.0,23.380701,
4,28.012803,2.143205,2.317468,3464400.0,22.553025,
...,...,...,...,...,...,...
13850377,14.640000,0.717213,0.273973,456800.0,23.950000,
13850378,14.750000,0.000000,0.067843,771200.0,23.350000,
13850379,14.975000,2.003339,1.525424,873200.0,23.550000,
13850380,14.730000,2.647658,-1.438608,488400.0,24.650000,


In [44]:
last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400
next_unix = last_unix + one_day


AttributeError: 'numpy.int64' object has no attribute 'timestamp'