In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("ETH-USD.csv")
adaDF = pd.read_csv(file_path)

# Display sample data
adaDF.head() 

# Review the DataFrame
adaDF

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-01-01,755.757019,782.530029,742.004028,772.640991,772.640991,2595760128
1,2018-01-02,772.346008,914.830017,772.346008,884.443970,884.443970,5783349760
2,2018-01-03,886.000000,974.471008,868.450989,962.719971,962.719971,5093159936
3,2018-01-04,961.713013,1045.079956,946.085999,980.921997,980.921997,6502859776
4,2018-01-05,975.750000,1075.390015,956.325012,997.719971,997.719971,6683149824
...,...,...,...,...,...,...,...
1966,2023-05-21,1820.138062,1827.921997,1799.946899,1804.531372,1804.531372,3386768865
1967,2023-05-22,1804.841919,1826.696533,1793.223999,1817.534790,1817.534790,4534841049
1968,2023-05-23,1817.781128,1869.343994,1816.287964,1854.380615,1854.380615,6820047160
1969,2023-05-24,1854.299683,1854.299683,1780.925537,1800.099976,1800.099976,7101647419


In [3]:
adaDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1971 entries, 0 to 1970
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1971 non-null   object 
 1   Open       1971 non-null   float64
 2   High       1971 non-null   float64
 3   Low        1971 non-null   float64
 4   Close      1971 non-null   float64
 5   Adj Close  1971 non-null   float64
 6   Volume     1971 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 107.9+ KB


In [4]:

# Convert 'date' column to datetime
adaDF['Date'] = pd.to_datetime(adaDF['Date'])

# Extract year, month, and day into separate columns
adaDF['year'] = adaDF['Date'].dt.year.astype(int)
adaDF['month'] = adaDF['Date'].dt.month.astype(int)
adaDF['day'] = adaDF['Date'].dt.day.astype(int)

adaDF.set_index("Date", inplace=True)

# Display the updated DataFrame
adaDF


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,year,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01,755.757019,782.530029,742.004028,772.640991,772.640991,2595760128,2018,1,1
2018-01-02,772.346008,914.830017,772.346008,884.443970,884.443970,5783349760,2018,1,2
2018-01-03,886.000000,974.471008,868.450989,962.719971,962.719971,5093159936,2018,1,3
2018-01-04,961.713013,1045.079956,946.085999,980.921997,980.921997,6502859776,2018,1,4
2018-01-05,975.750000,1075.390015,956.325012,997.719971,997.719971,6683149824,2018,1,5
...,...,...,...,...,...,...,...,...,...
2023-05-21,1820.138062,1827.921997,1799.946899,1804.531372,1804.531372,3386768865,2023,5,21
2023-05-22,1804.841919,1826.696533,1793.223999,1817.534790,1817.534790,4534841049,2023,5,22
2023-05-23,1817.781128,1869.343994,1816.287964,1854.380615,1854.380615,6820047160,2023,5,23
2023-05-24,1854.299683,1854.299683,1780.925537,1800.099976,1800.099976,7101647419,2023,5,24


In [5]:
adaDF["TomorrowClose"] = adaDF["Close"].shift(-1)
adaDF.dropna(inplace=True)
adaDF

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,year,month,day,TomorrowClose
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-01,755.757019,782.530029,742.004028,772.640991,772.640991,2595760128,2018,1,1,884.443970
2018-01-02,772.346008,914.830017,772.346008,884.443970,884.443970,5783349760,2018,1,2,962.719971
2018-01-03,886.000000,974.471008,868.450989,962.719971,962.719971,5093159936,2018,1,3,980.921997
2018-01-04,961.713013,1045.079956,946.085999,980.921997,980.921997,6502859776,2018,1,4,997.719971
2018-01-05,975.750000,1075.390015,956.325012,997.719971,997.719971,6683149824,2018,1,5,1041.680054
...,...,...,...,...,...,...,...,...,...,...
2023-05-20,1812.766113,1829.009644,1808.046753,1820.478027,1820.478027,2951655969,2023,5,20,1804.531372
2023-05-21,1820.138062,1827.921997,1799.946899,1804.531372,1804.531372,3386768865,2023,5,21,1817.534790
2023-05-22,1804.841919,1826.696533,1793.223999,1817.534790,1817.534790,4534841049,2023,5,22,1854.380615
2023-05-23,1817.781128,1869.343994,1816.287964,1854.380615,1854.380615,6820047160,2023,5,23,1800.099976


In [6]:
adaDF.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1970 entries, 2018-01-01 to 2023-05-24
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Open           1970 non-null   float64
 1   High           1970 non-null   float64
 2   Low            1970 non-null   float64
 3   Close          1970 non-null   float64
 4   Adj Close      1970 non-null   float64
 5   Volume         1970 non-null   int64  
 6   year           1970 non-null   int64  
 7   month          1970 non-null   int64  
 8   day            1970 non-null   int64  
 9   TomorrowClose  1970 non-null   float64
dtypes: float64(6), int64(4)
memory usage: 169.3 KB


In [7]:
# Separate the data into labels and features

y = adaDF["TomorrowClose"]
X = adaDF.drop(columns=["TomorrowClose"])



In [8]:
# Review the y variable Series
y

Date
2018-01-01     884.443970
2018-01-02     962.719971
2018-01-03     980.921997
2018-01-04     997.719971
2018-01-05    1041.680054
                 ...     
2023-05-20    1804.531372
2023-05-21    1817.534790
2023-05-22    1854.380615
2023-05-23    1800.099976
2023-05-24    1810.353027
Name: TomorrowClose, Length: 1970, dtype: float64

In [9]:
# Review the X variable DataFrame
X

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,year,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01,755.757019,782.530029,742.004028,772.640991,772.640991,2595760128,2018,1,1
2018-01-02,772.346008,914.830017,772.346008,884.443970,884.443970,5783349760,2018,1,2
2018-01-03,886.000000,974.471008,868.450989,962.719971,962.719971,5093159936,2018,1,3
2018-01-04,961.713013,1045.079956,946.085999,980.921997,980.921997,6502859776,2018,1,4
2018-01-05,975.750000,1075.390015,956.325012,997.719971,997.719971,6683149824,2018,1,5
...,...,...,...,...,...,...,...,...,...
2023-05-20,1812.766113,1829.009644,1808.046753,1820.478027,1820.478027,2951655969,2023,5,20
2023-05-21,1820.138062,1827.921997,1799.946899,1804.531372,1804.531372,3386768865,2023,5,21
2023-05-22,1804.841919,1826.696533,1793.223999,1817.534790,1817.534790,4534841049,2023,5,22
2023-05-23,1817.781128,1869.343994,1816.287964,1854.380615,1854.380615,6820047160,2023,5,23


In [10]:
# Check the balance of our target values
y.value_counts()

884.443970     1
3014.845947    1
3310.504150    1
3265.443359    1
3322.211670    1
              ..
176.013504     1
181.406067     1
186.960907     1
182.075150     1
1810.353027    1
Name: TomorrowClose, Length: 1970, dtype: int64

In [11]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    )


In [12]:
from sklearn.linear_model import LinearRegression

# Instantiate the Linear Regression model
regressor = LinearRegression()

# Fit the model using training data
regressor.fit(X_train, y_train)

# Predict the future values using test data
y_pred = regressor.predict(X_test)

In [13]:
X_train

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,year,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-03-26,136.190674,138.830566,134.886032,138.361557,138.361557,11367261176,2020,3,26
2018-09-28,229.041000,231.748001,218.669006,222.401993,222.401993,2018120000,2018,9,28
2022-11-22,1107.895996,1136.442627,1081.138184,1135.173462,1135.173462,12040670755,2022,11,22
2023-05-15,1800.371216,1845.695435,1787.536377,1817.549927,1817.549927,6748889346,2023,5,15
2019-09-01,172.458405,173.696854,169.531281,171.629425,171.629425,5554799576,2019,9,1
...,...,...,...,...,...,...,...,...,...
2022-11-27,1205.905273,1220.349487,1195.039063,1195.126953,1195.126953,4486976868,2022,11,27
2021-01-01,737.708374,749.201843,719.792236,730.367554,730.367554,13652004358,2021,1,1
2023-04-17,2120.001221,2120.113525,2063.038574,2076.242920,2076.242920,9648882546,2023,4,17
2018-08-24,278.110992,283.302002,273.859985,282.967010,282.967010,1450170000,2018,8,24


In [14]:


# Evaluate the model's performance
from sklearn.metrics import mean_squared_error, r2_score

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Calculate the coefficient of determination (R^2 score)
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)




Mean Squared Error: 6002.692963165724
R^2 Score: 0.9955045192100591


In [15]:
print("Ethereum (ETH)")
highest_value = adaDF['High'].max()
highest_date = adaDF.loc[adaDF['High'] == highest_value, ['month', 'day', 'year']].astype(str).agg('/'.join, axis=1)
rounded_highest_value = round(highest_value, 2)
print("Highest Value: $", rounded_highest_value, "on", highest_date.iloc[0])

lowest_value = adaDF['High'].min()
lowest_date = adaDF.loc[adaDF['High'] == lowest_value, ['month', 'day', 'year']].astype(str).agg('/'.join, axis=1)
rounded_lowest_value = round(lowest_value, 2)
print("Lowest Value: $", rounded_lowest_value, "on", lowest_date.iloc[0])


Ethereum (ETH)
Highest Value: $ 4891.7 on 11/16/2021
Lowest Value: $ 85.34 on 12/15/2018


In [16]:
import statsmodels.api as sm

model = sm.OLS(y_train,X_train)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:          TomorrowClose   R-squared:                       0.996
Model:                            OLS   Adj. R-squared:                  0.996
Method:                 Least Squares   F-statistic:                 4.760e+04
Date:                Fri, 02 Jun 2023   Prob (F-statistic):               0.00
Time:                        16:46:27   Log-Likelihood:                -8519.6
No. Observations:                1477   AIC:                         1.706e+04
Df Residuals:                    1469   BIC:                         1.710e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Open           0.3173      0.057      5.580      0.0

In [17]:
y_test

Date
2019-11-12     188.258743
2021-04-25    2534.481689
2021-03-09    1799.166260
2021-06-04    2630.576904
2020-10-05     340.815857
                 ...     
2018-01-20    1049.579956
2021-06-21    1874.950073
2022-05-11    1961.701538
2019-03-09     136.758682
2022-09-22    1328.259521
Name: TomorrowClose, Length: 493, dtype: float64

In [18]:
y_pred

array([ 188.30602522, 2309.17154524, 1872.36143383, 2719.08373302,
        356.28200208, 1845.28615868,  223.24884935,  134.37720348,
        146.23304793,  133.637577  ,  478.04887708,  287.31587267,
        121.22047463,  171.21730646, 3313.62329007,  217.03104077,
        168.54042042,  151.08411074,  433.62009692, 1634.22580869,
        148.76023822,  145.26558419,  274.13653626, 1938.96014462,
        582.04468735,  244.32053054, 1760.53613294, 3021.50374713,
       1627.97067464,  192.60099109,  105.24587057, 1338.80684904,
       1314.74175336, 1635.75666608, 1902.99765879,  432.60903437,
        471.05150619, 2130.75814967, 2776.71592889,  862.21617677,
       3588.51175837,  217.68552713,  378.02625779,  389.87053594,
        188.80318866, 2835.74787901,  394.61630517, 1538.93117083,
       4331.89775315,  247.56139085, 1391.1521351 ,  192.04726825,
        197.0234726 ,  220.13728885, 1820.47830515, 1781.03657178,
        615.75377432, 1576.94825591,  381.6750535 ,  387.85069

In [19]:
# Make a prediction using the testing data

predictions = regressor.predict(X_test)
predictions

array([ 188.30602522, 2309.17154524, 1872.36143383, 2719.08373302,
        356.28200208, 1845.28615868,  223.24884935,  134.37720348,
        146.23304793,  133.637577  ,  478.04887708,  287.31587267,
        121.22047463,  171.21730646, 3313.62329007,  217.03104077,
        168.54042042,  151.08411074,  433.62009692, 1634.22580869,
        148.76023822,  145.26558419,  274.13653626, 1938.96014462,
        582.04468735,  244.32053054, 1760.53613294, 3021.50374713,
       1627.97067464,  192.60099109,  105.24587057, 1338.80684904,
       1314.74175336, 1635.75666608, 1902.99765879,  432.60903437,
        471.05150619, 2130.75814967, 2776.71592889,  862.21617677,
       3588.51175837,  217.68552713,  378.02625779,  389.87053594,
        188.80318866, 2835.74787901,  394.61630517, 1538.93117083,
       4331.89775315,  247.56139085, 1391.1521351 ,  192.04726825,
        197.0234726 ,  220.13728885, 1820.47830515, 1781.03657178,
        615.75377432, 1576.94825591,  381.6750535 ,  387.85069