# Logistic Regression Model (with PCA) - Historical Bitcoin Data

### Importing Essential Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns

### Reading Data

In [None]:
df = pd.read_pickle('./dfe.pkl')    # reading 5 yr cleaned data pickle file

In [None]:
df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,"Oct 16, 2015",262.9,254.4,267.3,253.9,95180.0,3.31
1,"Oct 17, 2015",269.6,262.9,275.4,261.7,113840.0,2.55
2,"Oct 18, 2015",261.7,269.6,272.4,259.5,52270.0,-2.93
3,"Oct 19, 2015",263.8,261.7,266.0,259.9,58170.0,0.82
4,"Oct 20, 2015",269.8,263.8,272.1,262.7,75400.0,2.25


In [None]:
df.shape

(2193, 7)

In [None]:
print("Number of data points:", df.shape[0])

Number of data points: 2193


In [None]:
df["Date"] = pd.to_datetime(df["Date"], infer_datetime_format=True)
df = df.set_index("Date")

In [None]:
df.head()

Unnamed: 0_level_0,Price,Open,High,Low,Vol.,Change %
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-10-16,262.9,254.4,267.3,253.9,95180.0,3.31
2015-10-17,269.6,262.9,275.4,261.7,113840.0,2.55
2015-10-18,261.7,269.6,272.4,259.5,52270.0,-2.93
2015-10-19,263.8,261.7,266.0,259.9,58170.0,0.82
2015-10-20,269.8,263.8,272.1,262.7,75400.0,2.25


### Encoding Change % Column in 0/1

In [None]:
def partition(x):
  if x<0:
    return 1
  return 0

In [None]:
bit_pred=df['Change %']

In [None]:
up_down=bit_pred.map(partition)

In [None]:
df['Change %']=up_down

In [None]:
df.head()

Unnamed: 0_level_0,Price,Open,High,Low,Vol.,Change %
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-10-16,262.9,254.4,267.3,253.9,95180.0,0
2015-10-17,269.6,262.9,275.4,261.7,113840.0,0
2015-10-18,261.7,269.6,272.4,259.5,52270.0,1
2015-10-19,263.8,261.7,266.0,259.9,58170.0,0
2015-10-20,269.8,263.8,272.1,262.7,75400.0,0


In [None]:
df['Change %'].value_counts()

0    1208
1     985
Name: Change %, dtype: int64

In [None]:
y=df['Change %']

In [None]:
df.drop(['Change %'], axis=1, inplace=True)
print(df.shape)

(2193, 5)


### Scaling Data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler=StandardScaler()
scaler.fit(df)

StandardScaler()

In [None]:
scaled_data=scaler.transform(df)
scaled_data

array([[-0.75071195, -0.75134418, -0.75001256, -0.75202752, -0.31790091],
       [-0.75024567, -0.75075109, -0.74946467, -0.75146385, -0.30352852],
       [-0.75079546, -0.75028359, -0.74966759, -0.75162284, -0.35095124],
       ...,
       [ 3.22191766,  3.23396813,  3.18937757,  3.33910891, -0.34512064],
       [ 3.52303019,  3.2324191 ,  3.48607372,  3.33968703, -0.29764401],
       [ 3.46656159,  3.53392016,  3.44795094,  3.57722962, -0.35275356]])

### Performing Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca=PCA(n_components=2)
pca.fit(scaled_data)

PCA(n_components=2)

In [None]:
x_pca=pca.transform(scaled_data)

In [None]:
print('Shape of scaled data:', scaled_data.shape)
print('Shape of PCA data:', x_pca.shape)

Shape of scaled data: (2193, 5)
Shape of PCA data: (2193, 2)


### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x_pca, y, stratify = y, test_size = 0.3)
print('X-Train Shape:', X_train.shape)
print('Y-Train Shape:', y_train.shape)
print('X-Test Shape:', X_test.shape)
print('Y-Test Shape:', y_test.shape)

X-Train Shape: (1535, 2)
Y-Train Shape: (1535,)
X-Test Shape: (658, 2)
Y-Test Shape: (658,)


In [None]:
X_train

array([[-1.45636245e+00, -4.30066464e-01],
       [ 3.99700670e+00,  1.78196717e-03],
       [-1.44731227e+00, -3.76024444e-01],
       ...,
       [-2.65544712e-01,  1.52737694e-01],
       [-1.43256104e+00, -3.44801291e-01],
       [-4.18229370e-01, -3.59679866e-01]])

In [None]:
y_train

Date
2016-01-12    1
2021-01-08    0
2016-05-31    1
2019-06-16    0
2018-04-21    0
             ..
2020-09-29    0
2019-02-11    1
2019-06-18    1
2016-07-07    1
2017-11-19    0
Name: Change %, Length: 1535, dtype: int64

In [None]:
X_test

array([[-0.2167594 , -0.05823246],
       [-1.01677173, -0.35402218],
       [-1.06030993, -0.08116417],
       ...,
       [ 0.1311542 , -0.31220849],
       [-0.07619402,  0.1657587 ],
       [-0.18105675,  0.00570893]])

In [None]:
y_test

Date
2020-06-17    1
2017-09-22    1
2019-02-07    1
2018-02-20    0
2017-06-15    1
             ..
2021-07-01    1
2019-05-27    0
2020-10-20    0
2019-07-19    1
2020-06-05    1
Name: Change %, Length: 658, dtype: int64

### Performing Hyperparameter Tuning Using Grid SearchCV

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
pipeline_lr = Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('lr_classifier',LogisticRegression(random_state=0))])

In [None]:
from sklearn.tree import DecisionTreeClassifier
pipeline_dt=Pipeline([('scalar2',StandardScaler()),
                     ('pca2',PCA(n_components=2)),
                     ('dt_classifier',DecisionTreeClassifier())])

In [None]:
from sklearn.ensemble import RandomForestClassifier
pipeline_randomforest=Pipeline([('scalar3',StandardScaler()),
                     ('pca3',PCA(n_components=2)),
                     ('rf_classifier',RandomForestClassifier())])

In [None]:
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

In [None]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [None]:
# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [None]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i], model.score(X_test,y_test)))

Logistic Regression Test Accuracy: 0.5547112462006079
Decision Tree Test Accuracy: 0.47112462006079026
RandomForest Test Accuracy: 0.48024316109422494


* Logistic Regression seems to be a better Model

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# define models and parameters
model=LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l1','l2','elasticnet', 'none']
c = np.logspace(0, 4, 10)

# define grid search
grid = dict(solver=solvers, penalty=penalty, C=c)
gridsearch = GridSearchCV(model, grid, cv=5, verbose=0, n_jobs=-1)
best_model = gridsearch.fit(X_train,y_train)

# summarize results
means = best_model.cv_results_['mean_test_score']
stds = best_model.cv_results_['std_test_score']
params = best_model.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("accuracy: %f (%f) using: %r" % (mean, stdev, param))

accuracy: nan (nan) using: {'C': 1.0, 'penalty': 'l1', 'solver': 'newton-cg'}
accuracy: nan (nan) using: {'C': 1.0, 'penalty': 'l1', 'solver': 'lbfgs'}
accuracy: 0.537459 (0.021607) using: {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
accuracy: nan (nan) using: {'C': 1.0, 'penalty': 'l1', 'solver': 'sag'}
accuracy: 0.538111 (0.020332) using: {'C': 1.0, 'penalty': 'l1', 'solver': 'saga'}
accuracy: 0.538111 (0.020332) using: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
accuracy: 0.538111 (0.020332) using: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
accuracy: 0.537459 (0.021607) using: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
accuracy: 0.538111 (0.020332) using: {'C': 1.0, 'penalty': 'l2', 'solver': 'sag'}
accuracy: 0.538111 (0.020332) using: {'C': 1.0, 'penalty': 'l2', 'solver': 'saga'}
accuracy: nan (nan) using: {'C': 1.0, 'penalty': 'elasticnet', 'solver': 'newton-cg'}
accuracy: nan (nan) using: {'C': 1.0, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
accuracy: na

 0.53811075 0.53745928 0.53811075 0.53811075        nan        nan
        nan        nan        nan 0.53745928 0.53745928        nan
 0.53745928 0.53745928        nan        nan 0.53745928        nan
 0.53811075 0.53745928 0.53745928 0.53745928 0.53745928 0.53745928
        nan        nan        nan        nan        nan 0.53745928
 0.53745928        nan 0.53745928 0.53745928        nan        nan
 0.53745928        nan 0.53811075 0.53745928 0.53745928 0.53745928
 0.53745928 0.53745928        nan        nan        nan        nan
        nan 0.53745928 0.53745928        nan 0.53745928 0.53745928
        nan        nan 0.53745928        nan 0.53811075 0.53745928
 0.53745928 0.53745928 0.53745928 0.53745928        nan        nan
        nan        nan        nan 0.53745928 0.53745928        nan
 0.53745928 0.53745928        nan        nan 0.53745928        nan
 0.53745928 0.53745928 0.53745928 0.53745928 0.53745928 0.53745928
        nan        nan        nan        nan        nan 0.5374

In [None]:
print("The mean accuracy of the model is:", best_model.score(X_test,y_test))
print("The model best paramters are:", best_model.best_params_)

The mean accuracy of the model is: 0.5531914893617021
The model best paramters are: {'C': 1.0, 'penalty': 'l1', 'solver': 'saga'}


### Model Creation and Prediction

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(C=1, penalty='l1', solver='liblinear')
classifier.fit(X_train, y_train)

LogisticRegression(C=1, penalty='l1', solver='liblinear')

In [None]:
y_pred_train = classifier.predict(X_train)
y_pred_test = classifier.predict(X_test)

### Confusion Matrix / Accuracy / Precision

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_test)
  
print ("Confusion Matrix : \n", cm)

Confusion Matrix : 
 [[362   0]
 [293   3]]


In [None]:
print(metrics.classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.55      1.00      0.71       362
           1       1.00      0.01      0.02       296

    accuracy                           0.55       658
   macro avg       0.78      0.51      0.37       658
weighted avg       0.75      0.55      0.40       658



In [None]:
from sklearn.metrics import accuracy_score, precision_score
print("Train Accuracy:", accuracy_score(y_train,y_pred_train))
print ("Test Accuracy : ", accuracy_score(y_test, y_pred_test))

Train Accuracy: 0.5517915309446254
Test Accuracy :  0.5547112462006079


In [None]:
print("Train Precision Score:", precision_score(y_train, y_pred_train))
print("Test Precision SCore:", precision_score(y_test, y_pred_test))

Train Precision Score: 0.6
Test Precision SCore: 1.0


## Implementing without PCA

### Train-Test Split

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(scaled_data, y, stratify = y, test_size = 0.3)
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

(1535, 5) (658, 5) (1535,) (658,)


In [None]:
xtrain

array([[-0.72662545, -0.72642028, -0.72664917, -0.72654685, -0.37196296],
       [-0.44906354, -0.46275862, -0.45370872, -0.45600802, -0.31357225],
       [-0.52046703, -0.51528581, -0.52047094, -0.51414529,  0.02844589],
       ...,
       [ 0.14000129,  0.14029234,  0.12678435,  0.15406834, -0.32229889],
       [-0.1098622 , -0.10856291, -0.12651975, -0.09325894, -0.11010207],
       [-0.22762923, -0.23282656, -0.23473948, -0.22185467,  0.19642413]])

In [None]:
ytrain

Date
2016-10-04    1
2017-08-29    0
2019-01-15    1
2015-11-01    0
2021-03-23    0
             ..
2018-08-18    1
2015-11-18    0
2020-10-26    0
2020-06-13    0
2019-06-05    0
Name: Change %, Length: 1535, dtype: int64

In [None]:
xtest

array([[-0.73892272, -0.73771   , -0.73645719, -0.74120221, -0.22835462],
       [-0.6850291 , -0.68506419, -0.68591542, -0.6840333 , -0.36039419],
       [-0.65164483, -0.6421521 , -0.64481641, -0.65177416, -0.30699453],
       ...,
       [-0.36819527, -0.37295024, -0.37609004, -0.36741825, -0.3439499 ],
       [-0.1747308 , -0.14765826, -0.163195  , -0.16493869, -0.24034701],
       [-0.3309903 , -0.32806349, -0.33574185, -0.32308361,  2.15052962]])

In [None]:
ytrain

Date
2016-10-04    1
2017-08-29    0
2019-01-15    1
2015-11-01    0
2021-03-23    0
             ..
2018-08-18    1
2015-11-18    0
2020-10-26    0
2020-06-13    0
2019-06-05    0
Name: Change %, Length: 1535, dtype: int64

### Performing Hyperparameter Tuning Using Grid SearchCV

In [None]:
# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l1','l2','elasticnet', 'none']
c = np.logspace(0, 4, 10)

# define grid search
grid = dict(solver=solvers, penalty=penalty, C=c)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1)
grid_result = grid_search.fit(xtrain, ytrain)

# summarize results
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("accuracy: %f (%f) using: %r" % (mean, stdev, param))

accuracy: nan (nan) using: {'C': 1.0, 'penalty': 'l1', 'solver': 'newton-cg'}
accuracy: nan (nan) using: {'C': 1.0, 'penalty': 'l1', 'solver': 'lbfgs'}
accuracy: 0.844951 (0.013758) using: {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
accuracy: nan (nan) using: {'C': 1.0, 'penalty': 'l1', 'solver': 'sag'}
accuracy: 0.637785 (0.016787) using: {'C': 1.0, 'penalty': 'l1', 'solver': 'saga'}
accuracy: 0.637785 (0.015608) using: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
accuracy: 0.637785 (0.015608) using: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
accuracy: 0.637785 (0.015608) using: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
accuracy: 0.635179 (0.016609) using: {'C': 1.0, 'penalty': 'l2', 'solver': 'sag'}
accuracy: 0.622150 (0.013351) using: {'C': 1.0, 'penalty': 'l2', 'solver': 'saga'}
accuracy: nan (nan) using: {'C': 1.0, 'penalty': 'elasticnet', 'solver': 'newton-cg'}
accuracy: nan (nan) using: {'C': 1.0, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
accuracy: na

 0.63778502 0.63778502 0.63517915 0.62214984        nan        nan
        nan        nan        nan 0.99348534 0.99413681        nan
 0.68403909 0.64234528        nan        nan 0.88859935        nan
 0.64039088 0.69250814 0.69250814 0.69250814 0.65863192 0.63713355
        nan        nan        nan        nan        nan 0.99348534
 0.99413681        nan 0.68403909 0.64234528        nan        nan
 0.90162866        nan 0.64234528 0.7465798  0.7465798  0.7465798
 0.67035831 0.63908795        nan        nan        nan        nan
        nan 0.99348534 0.99413681        nan 0.68403909 0.64234528
        nan        nan 0.91205212        nan 0.64169381 0.79348534
 0.79348534 0.79348534 0.68013029 0.64104235        nan        nan
        nan        nan        nan 0.99348534 0.99413681        nan
 0.68403909 0.64234528        nan        nan 0.90553746        nan
 0.64234528 0.83843648 0.83843648 0.83843648 0.68273616 0.64169381
        nan        nan        nan        nan        nan 0.99348

In [None]:
print("The mean accuracy of the model is:", grid_result.score(xtest,ytest))
print("The model best paramters are:", grid_result.best_params_)

The mean accuracy of the model is: 0.9954407294832827
The model best paramters are: {'C': 1.0, 'penalty': 'none', 'solver': 'lbfgs'}


### Model Creation and Prediction

In [None]:
classifier = LogisticRegression(C=1, penalty= 'none', solver= 'lbfgs')
classifier.fit(xtrain, ytrain)

LogisticRegression(C=1, penalty='none')

In [None]:
ypred_train = classifier.predict(xtrain)
ypred_test = classifier.predict(xtest)

### Confusion Matrix / Accuracy / Precision

In [None]:
cm = confusion_matrix(ytest, ypred_test)
print ("Confusion Matrix : \n", cm)

Confusion Matrix : 
 [[359   3]
 [  0 296]]


In [None]:
print(metrics.classification_report(ytest, ypred_test))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       362
           1       0.99      1.00      0.99       296

    accuracy                           1.00       658
   macro avg       0.99      1.00      1.00       658
weighted avg       1.00      1.00      1.00       658



In [None]:
print ("Train Accuracy : ", accuracy_score(ytrain, ypred_train))
print ("Test Accuracy : ", accuracy_score(ytest, ypred_test))

Train Accuracy :  0.9980456026058632
Test Accuracy :  0.9954407294832827


In [None]:
print("Train Precision Score:", precision_score(ytrain, ypred_train))
print("Test Precision Score:", precision_score(ytest, ypred_test))

Train Precision Score: 0.9971014492753624
Test Precision Score: 0.9899665551839465
