In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("BTC-USD.csv")
btcDF = pd.read_csv(file_path)

# Display sample data
btcDF.head() 

# Review the DataFrame
btcDF

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2015-01-01,320.434998,320.434998,314.002991,314.248993,314.248993,8036550
1,2015-01-02,314.079010,315.838989,313.565002,315.032013,315.032013,7860650
2,2015-01-03,314.846008,315.149994,281.082001,281.082001,281.082001,33054400
3,2015-01-04,281.145996,287.230011,257.612000,264.195007,264.195007,55629100
4,2015-01-05,265.084015,278.341003,265.084015,274.473999,274.473999,43962800
...,...,...,...,...,...,...,...
3062,2023-05-21,27118.423828,27265.917969,26706.921875,26753.826172,26753.826172,8647416921
3063,2023-05-22,26749.892578,27045.734375,26549.734375,26851.277344,26851.277344,11056770492
3064,2023-05-23,26855.960938,27434.683594,26816.179688,27225.726563,27225.726563,13697203143
3065,2023-05-24,27224.603516,27224.603516,26106.576172,26334.818359,26334.818359,16299104428


In [3]:
btcDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3067 entries, 0 to 3066
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       3067 non-null   object 
 1   Open       3067 non-null   float64
 2   High       3067 non-null   float64
 3   Low        3067 non-null   float64
 4   Close      3067 non-null   float64
 5   Adj Close  3067 non-null   float64
 6   Volume     3067 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 167.9+ KB


In [4]:

# Convert 'date' column to datetime
btcDF['Date'] = pd.to_datetime(btcDF['Date'])

# Extract year, month, and day into separate columns
btcDF['year'] = btcDF['Date'].dt.year.astype(int)
btcDF['month'] = btcDF['Date'].dt.month.astype(int)
btcDF['day'] = btcDF['Date'].dt.day.astype(int)

btcDF.set_index("Date", inplace=True)

# Display the updated DataFrame
btcDF


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,year,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-01,320.434998,320.434998,314.002991,314.248993,314.248993,8036550,2015,1,1
2015-01-02,314.079010,315.838989,313.565002,315.032013,315.032013,7860650,2015,1,2
2015-01-03,314.846008,315.149994,281.082001,281.082001,281.082001,33054400,2015,1,3
2015-01-04,281.145996,287.230011,257.612000,264.195007,264.195007,55629100,2015,1,4
2015-01-05,265.084015,278.341003,265.084015,274.473999,274.473999,43962800,2015,1,5
...,...,...,...,...,...,...,...,...,...
2023-05-21,27118.423828,27265.917969,26706.921875,26753.826172,26753.826172,8647416921,2023,5,21
2023-05-22,26749.892578,27045.734375,26549.734375,26851.277344,26851.277344,11056770492,2023,5,22
2023-05-23,26855.960938,27434.683594,26816.179688,27225.726563,27225.726563,13697203143,2023,5,23
2023-05-24,27224.603516,27224.603516,26106.576172,26334.818359,26334.818359,16299104428,2023,5,24


In [5]:
btcDF["TomorrowClose"] = btcDF["Close"].shift(-1)
btcDF.dropna(inplace=True)
btcDF

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,year,month,day,TomorrowClose
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-01-01,320.434998,320.434998,314.002991,314.248993,314.248993,8036550,2015,1,1,315.032013
2015-01-02,314.079010,315.838989,313.565002,315.032013,315.032013,7860650,2015,1,2,281.082001
2015-01-03,314.846008,315.149994,281.082001,281.082001,281.082001,33054400,2015,1,3,264.195007
2015-01-04,281.145996,287.230011,257.612000,264.195007,264.195007,55629100,2015,1,4,274.473999
2015-01-05,265.084015,278.341003,265.084015,274.473999,274.473999,43962800,2015,1,5,286.188995
...,...,...,...,...,...,...,...,...,...,...
2023-05-20,26888.841797,27155.158203,26843.277344,27129.585938,27129.585938,7044911360,2023,5,20,26753.826172
2023-05-21,27118.423828,27265.917969,26706.921875,26753.826172,26753.826172,8647416921,2023,5,21,26851.277344
2023-05-22,26749.892578,27045.734375,26549.734375,26851.277344,26851.277344,11056770492,2023,5,22,27225.726563
2023-05-23,26855.960938,27434.683594,26816.179688,27225.726563,27225.726563,13697203143,2023,5,23,26334.818359


In [6]:
btcDF.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3066 entries, 2015-01-01 to 2023-05-24
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Open           3066 non-null   float64
 1   High           3066 non-null   float64
 2   Low            3066 non-null   float64
 3   Close          3066 non-null   float64
 4   Adj Close      3066 non-null   float64
 5   Volume         3066 non-null   int64  
 6   year           3066 non-null   int64  
 7   month          3066 non-null   int64  
 8   day            3066 non-null   int64  
 9   TomorrowClose  3066 non-null   float64
dtypes: float64(6), int64(4)
memory usage: 263.5 KB


In [7]:
# Separate the data into labels and features

y = btcDF["TomorrowClose"]
X = btcDF.drop(columns=["TomorrowClose"])



In [8]:
# Review the y variable Series
y

Date
2015-01-01      315.032013
2015-01-02      281.082001
2015-01-03      264.195007
2015-01-04      274.473999
2015-01-05      286.188995
                  ...     
2023-05-20    26753.826172
2023-05-21    26851.277344
2023-05-22    27225.726563
2023-05-23    26334.818359
2023-05-24    26481.371094
Name: TomorrowClose, Length: 3066, dtype: float64

In [9]:
# Review the X variable DataFrame
X

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,year,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-01,320.434998,320.434998,314.002991,314.248993,314.248993,8036550,2015,1,1
2015-01-02,314.079010,315.838989,313.565002,315.032013,315.032013,7860650,2015,1,2
2015-01-03,314.846008,315.149994,281.082001,281.082001,281.082001,33054400,2015,1,3
2015-01-04,281.145996,287.230011,257.612000,264.195007,264.195007,55629100,2015,1,4
2015-01-05,265.084015,278.341003,265.084015,274.473999,274.473999,43962800,2015,1,5
...,...,...,...,...,...,...,...,...,...
2023-05-20,26888.841797,27155.158203,26843.277344,27129.585938,27129.585938,7044911360,2023,5,20
2023-05-21,27118.423828,27265.917969,26706.921875,26753.826172,26753.826172,8647416921,2023,5,21
2023-05-22,26749.892578,27045.734375,26549.734375,26851.277344,26851.277344,11056770492,2023,5,22
2023-05-23,26855.960938,27434.683594,26816.179688,27225.726563,27225.726563,13697203143,2023,5,23


In [10]:
# Check the balance of our target values
y.value_counts()

6741.750000     2
236.153000      2
1179.969971     2
11784.137695    1
11205.892578    1
               ..
5930.319824     1
5526.640137     1
5750.799805     1
5904.830078     1
26481.371094    1
Name: TomorrowClose, Length: 3063, dtype: int64

In [11]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    )


In [12]:
from sklearn.linear_model import LinearRegression

# Instantiate the Linear Regression model
regressor = LinearRegression()

# Fit the model using training data
regressor.fit(X_train, y_train)

# Predict the future values using test data
y_pred = regressor.predict(X_test)

In [13]:
X_train

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,year,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-04-04,7456.410156,7469.879883,6803.879883,6853.839844,6853.839844,4936000000,2018,4,4
2016-04-21,441.415985,450.548004,440.951996,449.424988,449.424988,68204704,2016,4,21
2018-05-09,9223.730469,9374.759766,9031.620117,9325.179688,9325.179688,7226890240,2018,5,9
2017-06-28,2553.030029,2603.979980,2484.419922,2574.790039,2574.790039,1183869952,2017,6,28
2020-03-10,7922.146973,8136.945313,7814.763184,7909.729492,7909.729492,42213940994,2020,3,10
...,...,...,...,...,...,...,...,...,...
2022-07-26,21361.121094,21361.121094,20776.816406,21239.753906,21239.753906,28624673855,2022,7,26
2017-06-24,2738.520020,2757.939941,2583.189941,2608.719971,2608.719971,982750016,2017,6,24
2018-01-01,14112.200195,14112.200195,13154.700195,13657.200195,13657.200195,10291200000,2018,1,1
2015-08-24,228.112000,228.139008,210.442993,210.494995,210.494995,59220700,2015,8,24


In [14]:


# Evaluate the model's performance
from sklearn.metrics import mean_squared_error, r2_score

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Calculate the coefficient of determination (R^2 score)
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)


Mean Squared Error: 501820.09116117447
R^2 Score: 0.9978538522918803


In [15]:
print("Bitcoin (BTC)")
highest_value = btcDF['High'].max()
highest_date = btcDF.loc[btcDF['High'] == highest_value, ['month', 'day', 'year']].astype(str).agg('/'.join, axis=1)
rounded_highest_value = round(highest_value, 2)
print("Highest Value: $", rounded_highest_value, "on", highest_date.iloc[0])

lowest_value = btcDF['High'].min()
lowest_date = btcDF.loc[btcDF['High'] == lowest_value, ['month', 'day', 'year']].astype(str).agg('/'.join, axis=1)
rounded_lowest_value = round(lowest_value, 2)
print("Lowest Value: $", rounded_lowest_value, "on", lowest_date.iloc[0])


Bitcoin (BTC)
Highest Value: $ 68789.62 on 11/10/2021
Lowest Value: $ 211.73 on 1/17/2015


In [16]:
import math
math.sqrt(mse)

708.3926108883227

In [17]:
math.sqrt(mse) / X["Close"].mean()

0.05111096715350391

In [18]:
import statsmodels.api as sm

model = sm.OLS(y_train,X_train)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:          TomorrowClose   R-squared:                       0.997
Model:                            OLS   Adj. R-squared:                  0.997
Method:                 Least Squares   F-statistic:                 1.263e+05
Date:                Fri, 02 Jun 2023   Prob (F-statistic):               0.00
Time:                        16:44:03   Log-Likelihood:                -18720.
No. Observations:                2299   AIC:                         3.746e+04
Df Residuals:                    2291   BIC:                         3.750e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Open           0.0336      0.047      0.719      0.4

In [19]:
y_test

Date
2017-06-25     2478.449951
2015-01-07      283.348999
2019-02-18     3947.094482
2015-02-15      233.843002
2020-01-28     9316.629883
                  ...     
2022-09-28    19573.050781
2016-07-09      649.359985
2015-07-06      266.207001
2015-06-16      249.283997
2017-10-17     5590.689941
Name: TomorrowClose, Length: 767, dtype: float64

In [20]:
y_pred

array([ 2613.68714341,   232.38516949,  3996.58387523,   183.46647593,
        9441.46259967,  9994.05304954,  2842.69967082, 26750.54255451,
         201.25625347,  1003.84972627,   402.91426076,   318.65222906,
        9445.95610605, 10589.13526705,  6738.65119018,   748.92023925,
         301.70661997,  2618.57262829, 18753.6292859 , 29443.80782643,
        8756.2065411 ,  9666.39113208, 12591.72257608,  8010.00567643,
       38047.92787909, 16812.58254898,  9237.45453936, 19856.66800687,
         210.48192304,   293.19708986, 10226.34230156, 11183.60809738,
       17906.04469904,  5937.47033616,   261.31073398, 36887.72010898,
       65152.57190127, 27893.39321889,   188.26878707, 46064.42935915,
       29065.67311906, 43068.39115759,   232.18714153,  2825.8902122 ,
       19684.41011849,   204.41292047,   767.2099946 ,  7654.06648093,
       46711.76226724, 20182.14623297,  3456.83426639,   669.89746207,
        7816.18399965,  8275.45304447,  7274.31051823, 51405.59163842,
      

In [21]:
# Make a prediction using the testing data

predictions = regressor.predict(X_test)
predictions

array([ 2613.68714341,   232.38516949,  3996.58387523,   183.46647593,
        9441.46259967,  9994.05304954,  2842.69967082, 26750.54255451,
         201.25625347,  1003.84972627,   402.91426076,   318.65222906,
        9445.95610605, 10589.13526705,  6738.65119018,   748.92023925,
         301.70661997,  2618.57262829, 18753.6292859 , 29443.80782643,
        8756.2065411 ,  9666.39113208, 12591.72257608,  8010.00567643,
       38047.92787909, 16812.58254898,  9237.45453936, 19856.66800687,
         210.48192304,   293.19708986, 10226.34230156, 11183.60809738,
       17906.04469904,  5937.47033616,   261.31073398, 36887.72010898,
       65152.57190127, 27893.39321889,   188.26878707, 46064.42935915,
       29065.67311906, 43068.39115759,   232.18714153,  2825.8902122 ,
       19684.41011849,   204.41292047,   767.2099946 ,  7654.06648093,
       46711.76226724, 20182.14623297,  3456.83426639,   669.89746207,
        7816.18399965,  8275.45304447,  7274.31051823, 51405.59163842,
      