In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("USDT-USD.csv")
adaDF = pd.read_csv(file_path)

# Display sample data
adaDF.head() 

# Review the DataFrame
adaDF

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-01-01,1.012550,1.015360,1.001530,1.007280,1.007280,1685299968
1,2018-01-02,1.005740,1.009880,0.993250,1.004900,1.004900,2635859968
2,2018-01-03,1.006600,1.023280,1.002640,1.013440,1.013440,2471689984
3,2018-01-04,1.013200,1.016180,0.993822,1.002530,1.002530,3200130048
4,2018-01-05,1.001750,1.005010,0.985915,0.998634,0.998634,3096620032
...,...,...,...,...,...,...,...
1966,2023-05-21,1.000476,1.000705,1.000090,1.000183,1.000183,13532622119
1967,2023-05-22,1.000142,1.000834,0.999855,1.000201,1.000201,17216429735
1968,2023-05-23,1.000245,1.000628,0.999703,1.000104,1.000104,21420969303
1969,2023-05-24,1.000061,1.000449,0.999595,1.000039,1.000039,25306791660


In [3]:
adaDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1971 entries, 0 to 1970
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1971 non-null   object 
 1   Open       1971 non-null   float64
 2   High       1971 non-null   float64
 3   Low        1971 non-null   float64
 4   Close      1971 non-null   float64
 5   Adj Close  1971 non-null   float64
 6   Volume     1971 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 107.9+ KB


In [4]:

# Convert 'date' column to datetime
adaDF['Date'] = pd.to_datetime(adaDF['Date'])

# Extract year, month, and day into separate columns
adaDF['year'] = adaDF['Date'].dt.year.astype(int)
adaDF['month'] = adaDF['Date'].dt.month.astype(int)
adaDF['day'] = adaDF['Date'].dt.day.astype(int)

adaDF.set_index("Date", inplace=True)

# Display the updated DataFrame
adaDF


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,year,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01,1.012550,1.015360,1.001530,1.007280,1.007280,1685299968,2018,1,1
2018-01-02,1.005740,1.009880,0.993250,1.004900,1.004900,2635859968,2018,1,2
2018-01-03,1.006600,1.023280,1.002640,1.013440,1.013440,2471689984,2018,1,3
2018-01-04,1.013200,1.016180,0.993822,1.002530,1.002530,3200130048,2018,1,4
2018-01-05,1.001750,1.005010,0.985915,0.998634,0.998634,3096620032,2018,1,5
...,...,...,...,...,...,...,...,...,...
2023-05-21,1.000476,1.000705,1.000090,1.000183,1.000183,13532622119,2023,5,21
2023-05-22,1.000142,1.000834,0.999855,1.000201,1.000201,17216429735,2023,5,22
2023-05-23,1.000245,1.000628,0.999703,1.000104,1.000104,21420969303,2023,5,23
2023-05-24,1.000061,1.000449,0.999595,1.000039,1.000039,25306791660,2023,5,24


In [5]:
adaDF["TomorrowClose"] = adaDF["Close"].shift(-1)
adaDF.dropna(inplace=True)
adaDF

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,year,month,day,TomorrowClose
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-01,1.012550,1.015360,1.001530,1.007280,1.007280,1685299968,2018,1,1,1.004900
2018-01-02,1.005740,1.009880,0.993250,1.004900,1.004900,2635859968,2018,1,2,1.013440
2018-01-03,1.006600,1.023280,1.002640,1.013440,1.013440,2471689984,2018,1,3,1.002530
2018-01-04,1.013200,1.016180,0.993822,1.002530,1.002530,3200130048,2018,1,4,0.998634
2018-01-05,1.001750,1.005010,0.985915,0.998634,0.998634,3096620032,2018,1,5,1.008990
...,...,...,...,...,...,...,...,...,...,...
2023-05-20,1.000250,1.000578,1.000016,1.000399,1.000399,11851408172,2023,5,20,1.000183
2023-05-21,1.000476,1.000705,1.000090,1.000183,1.000183,13532622119,2023,5,21,1.000201
2023-05-22,1.000142,1.000834,0.999855,1.000201,1.000201,17216429735,2023,5,22,1.000104
2023-05-23,1.000245,1.000628,0.999703,1.000104,1.000104,21420969303,2023,5,23,1.000039


In [6]:
adaDF.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1970 entries, 2018-01-01 to 2023-05-24
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Open           1970 non-null   float64
 1   High           1970 non-null   float64
 2   Low            1970 non-null   float64
 3   Close          1970 non-null   float64
 4   Adj Close      1970 non-null   float64
 5   Volume         1970 non-null   int64  
 6   year           1970 non-null   int64  
 7   month          1970 non-null   int64  
 8   day            1970 non-null   int64  
 9   TomorrowClose  1970 non-null   float64
dtypes: float64(6), int64(4)
memory usage: 169.3 KB


In [7]:
# Separate the data into labels and features

y = adaDF["TomorrowClose"]
X = adaDF.drop(columns=["TomorrowClose"])



In [8]:
# Review the y variable Series
y

Date
2018-01-01    1.004900
2018-01-02    1.013440
2018-01-03    1.002530
2018-01-04    0.998634
2018-01-05    1.008990
                ...   
2023-05-20    1.000183
2023-05-21    1.000201
2023-05-22    1.000104
2023-05-23    1.000039
2023-05-24    1.000011
Name: TomorrowClose, Length: 1970, dtype: float64

In [9]:
# Review the X variable DataFrame
X

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,year,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01,1.012550,1.015360,1.001530,1.007280,1.007280,1685299968,2018,1,1
2018-01-02,1.005740,1.009880,0.993250,1.004900,1.004900,2635859968,2018,1,2
2018-01-03,1.006600,1.023280,1.002640,1.013440,1.013440,2471689984,2018,1,3
2018-01-04,1.013200,1.016180,0.993822,1.002530,1.002530,3200130048,2018,1,4
2018-01-05,1.001750,1.005010,0.985915,0.998634,0.998634,3096620032,2018,1,5
...,...,...,...,...,...,...,...,...,...
2023-05-20,1.000250,1.000578,1.000016,1.000399,1.000399,11851408172,2023,5,20
2023-05-21,1.000476,1.000705,1.000090,1.000183,1.000183,13532622119,2023,5,21
2023-05-22,1.000142,1.000834,0.999855,1.000201,1.000201,17216429735,2023,5,22
2023-05-23,1.000245,1.000628,0.999703,1.000104,1.000104,21420969303,2023,5,23


In [10]:
# Check the balance of our target values
y.value_counts()

1.000077    5
1.000092    5
1.000559    4
1.000132    4
1.000111    4
           ..
1.004573    1
1.003744    1
1.002090    1
0.996918    1
1.000248    1
Name: TomorrowClose, Length: 1603, dtype: int64

In [11]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    )


In [12]:
from sklearn.linear_model import LinearRegression

# Instantiate the Linear Regression model
regressor = LinearRegression()

# Fit the model using training data
regressor.fit(X_train, y_train)

# Predict the future values using test data
y_pred = regressor.predict(X_test)

In [13]:
X_train

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,year,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-03-26,0.998597,1.012367,0.995058,0.998483,0.998483,42020889460,2020,3,26
2018-09-28,0.994787,1.013500,0.988149,1.001420,1.001420,3447280000,2018,9,28
2022-11-22,0.999081,0.999309,0.998874,0.999157,0.999157,44056204568,2022,11,22
2023-05-15,1.000455,1.000837,0.999935,1.000262,1.000262,21594744662,2023,5,15
2019-09-01,1.003919,1.006601,0.997522,1.004573,1.004573,13692453817,2019,9,1
...,...,...,...,...,...,...,...,...,...
2022-11-27,0.999568,0.999654,0.999503,0.999604,0.999604,27588594951,2022,11,27
2021-01-01,1.000614,1.002052,1.000608,1.001916,1.001916,60509986287,2021,1,1
2023-04-17,1.000410,1.001024,0.999914,1.000530,1.000530,32406207872,2023,4,17
2018-08-24,0.998429,1.008960,0.994293,1.001780,1.001780,2850500000,2018,8,24


In [14]:


# Evaluate the model's performance
from sklearn.metrics import mean_squared_error, r2_score

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Calculate the coefficient of determination (R^2 score)
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)




Mean Squared Error: 8.421787382629662e-06
R^2 Score: 0.5655266380657396


In [15]:
print("Tether (USDT)")
highest_value = adaDF['High'].max()
highest_date = adaDF.loc[adaDF['High'] == highest_value, ['month', 'day', 'year']].astype(str).agg('/'.join, axis=1)
rounded_highest_value = round(highest_value, 2)
print("Highest Value: $", rounded_highest_value, "on", highest_date.iloc[0])

lowest_value = adaDF['High'].min()
lowest_date = adaDF.loc[adaDF['High'] == lowest_value, ['month', 'day', 'year']].astype(str).agg('/'.join, axis=1)
rounded_lowest_value = round(lowest_value, 2)
print("Lowest Value: $", rounded_lowest_value, "on", lowest_date.iloc[0])

Tether (USDT)
Highest Value: $ 1.08 on 3/13/2020
Lowest Value: $ 0.98 on 10/17/2018


In [16]:
import statsmodels.api as sm

model = sm.OLS(y_train,X_train)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:          TomorrowClose   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.541
Method:                 Least Squares   F-statistic:                     249.1
Date:                Fri, 02 Jun 2023   Prob (F-statistic):          2.29e-244
Time:                        17:03:09   Log-Likelihood:                 6274.3
No. Observations:                1477   AIC:                        -1.253e+04
Df Residuals:                    1469   BIC:                        -1.249e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Open           0.2110      0.031      6.732      0.0

In [17]:
y_test

Date
2019-11-12    1.003175
2021-04-25    1.000042
2021-03-09    1.000526
2021-06-04    1.000313
2020-10-05    1.000448
                ...   
2018-01-20    0.999861
2021-06-21    1.001002
2022-05-11    0.997609
2019-03-09    1.008822
2022-09-22    0.999981
Name: TomorrowClose, Length: 493, dtype: float64

In [18]:
y_pred

array([1.001932  , 1.00017492, 1.00084107, 1.00097151, 1.00111167,
       1.00070574, 0.9959939 , 1.00771939, 0.99994004, 1.00732706,
       1.0005606 , 1.00011036, 1.00896049, 1.00061441, 1.00010411,
       1.00459369, 1.00423476, 1.01785372, 1.00140397, 1.00045929,
       1.0051647 , 1.00268827, 1.00109806, 0.99928044, 0.99998888,
       1.00078621, 1.00024192, 1.00044394, 1.0003662 , 0.99424708,
       1.00545495, 0.99987087, 1.00031572, 1.00036942, 1.00008425,
       1.00255121, 1.0041433 , 1.00042562, 1.00070959, 0.99987117,
       1.00035963, 1.00063777, 1.00210224, 1.00412776, 1.00207053,
       1.00072162, 1.00030121, 1.00041336, 1.00014874, 1.00406704,
       1.00098888, 1.00043006, 0.99260286, 1.00376138, 1.0000946 ,
       1.00065721, 0.99966968, 1.00017244, 1.00098476, 0.99997759,
       1.0003424 , 1.00514169, 1.00102854, 1.00269216, 1.00039779,
       0.99976999, 0.99963893, 1.00179843, 1.00006042, 1.00500888,
       1.00086366, 1.00213558, 1.00073672, 1.00068173, 1.00209

In [19]:
# Make a prediction using the testing data

predictions = regressor.predict(X_test)
predictions

array([1.001932  , 1.00017492, 1.00084107, 1.00097151, 1.00111167,
       1.00070574, 0.9959939 , 1.00771939, 0.99994004, 1.00732706,
       1.0005606 , 1.00011036, 1.00896049, 1.00061441, 1.00010411,
       1.00459369, 1.00423476, 1.01785372, 1.00140397, 1.00045929,
       1.0051647 , 1.00268827, 1.00109806, 0.99928044, 0.99998888,
       1.00078621, 1.00024192, 1.00044394, 1.0003662 , 0.99424708,
       1.00545495, 0.99987087, 1.00031572, 1.00036942, 1.00008425,
       1.00255121, 1.0041433 , 1.00042562, 1.00070959, 0.99987117,
       1.00035963, 1.00063777, 1.00210224, 1.00412776, 1.00207053,
       1.00072162, 1.00030121, 1.00041336, 1.00014874, 1.00406704,
       1.00098888, 1.00043006, 0.99260286, 1.00376138, 1.0000946 ,
       1.00065721, 0.99966968, 1.00017244, 1.00098476, 0.99997759,
       1.0003424 , 1.00514169, 1.00102854, 1.00269216, 1.00039779,
       0.99976999, 0.99963893, 1.00179843, 1.00006042, 1.00500888,
       1.00086366, 1.00213558, 1.00073672, 1.00068173, 1.00209