In [6]:
import pandas as pd

In [18]:
# Load the MSFT dataset; the first CSV column (the dates) becomes the row index.
df = pd.read_csv(
    'MSFT_final.csv',
    index_col=0,
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,score,day,Movement
2020-04-01,0.270791,2,2.070864
2020-04-02,0.237726,3,-0.921028
2020-04-05,0.438362,6,7.436775
2020-04-06,0.096384,0,-1.077021
2020-04-07,0.141523,1,1.003119


In [19]:
# Label each row by the sign of its movement: strictly positive is "up",
# anything else (zero, negative, or missing) is "down".
df['Direction'] = df['Movement'].gt(0).map({True: "up", False: "down"})

In [20]:
# Columns to treat as categorical, so that get_dummies one-hot encodes them
# (without the cast, the integer `day` column would be left as a number).
cvar = ["day", "Direction"]
df = df.astype({name: 'category' for name in cvar})

In [21]:
# One-hot encode every categorical column as `<column>_<level>`.
# The dummy dtype is pinned to uint8: pandas >= 2.0 defaults to bool, which
# would turn the 0/1 indicator columns shown below into True/False on a re-run.
# uint8 is what older pandas produced by default, so results are unchanged there.
df = pd.get_dummies(df, prefix_sep='_', dtype='uint8')

In [22]:
# Preview the frame after one-hot encoding of the categorical columns.
df.head()

Unnamed: 0,score,Movement,day_0,day_1,day_2,day_3,day_6,Direction_down,Direction_up
2020-04-01,0.270791,2.070864,0,0,1,0,0,0,1
2020-04-02,0.237726,-0.921028,0,0,0,1,0,1,0
2020-04-05,0.438362,7.436775,0,0,0,0,1,0,1
2020-04-06,0.096384,-1.077021,1,0,0,0,0,1,0
2020-04-07,0.141523,1.003119,0,1,0,0,0,0,1


In [39]:
# Dummy columns to remove: one level from each encoded categorical variable.
rdummies = ["day_0", "Direction_down"]

# df2: the encoded frame without those levels (drop() already returns a new
# frame, so `df` itself is left untouched).
df2 = df.drop(columns=rdummies)

# df3: the modelling frame. Movement is removed as well, because the
# Direction_up target is derived from the sign of Movement.
df3 = df2.drop(columns="Movement")

In [41]:
# Preview the modelling frame: score, the remaining day dummies, and Direction_up.
df3.head()

Unnamed: 0,score,day_1,day_2,day_3,day_6,Direction_up
2020-04-01,0.270791,0,1,0,0,1
2020-04-02,0.237726,0,0,1,0,0
2020-04-05,0.438362,0,0,0,1,1
2020-04-06,0.096384,0,0,0,0,0
2020-04-07,0.141523,1,0,0,0,1


In [42]:
from sklearn.model_selection import train_test_split

In [51]:
# Fraction of rows held out as the test partition.
testpart_size = 0.2

# Split into non-test and test partitions; the fixed seed makes the split repeatable.
# NOTE(review): the rows are dated observations and train_test_split shuffles
# by default -- confirm that a random (rather than chronological) split is intended.
df_nontestData, df_testData = train_test_split(
    df3,
    test_size=testpart_size,
    random_state=1,
)

In [52]:
# (rows, columns) of the test partition.
df_testData.shape

(25, 6)

## Logistic Regression with L1 Penalty

In [53]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [54]:
# Name of the dependent variable: the "up" indicator column.
DV = 'Direction_up'

# Predictors are every other column of the non-test partition; the response
# is the DV column on its own.
X = df_nontestData.drop(columns=DV)
y = df_nontestData.loc[:, DV]

In [55]:
# Number of cross-validation folds passed to LogisticRegressionCV (cv=).
kfolds = 5

# Lower and upper bounds of the penalty-strength (alpha) grid to search.
min_alpha = 0.001
max_alpha = 100

# Number of evenly spaced alpha values generated between the two bounds.
n_candidates = 1000

In [56]:
import numpy as np

# LogisticRegressionCV expects inverse penalty strengths (C = 1/alpha) for its
# Cs parameter, so build the alpha grid first and invert it element-wise.
alpha_list = np.linspace(min_alpha, max_alpha, num=n_candidates)
C_list = list(1 / alpha_list)

# L1-penalised logistic regression with C chosen by k-fold cross-validation.
# n_jobs=-1 runs the search on all CPU cores.
clf_optimal = LogisticRegressionCV(
    Cs=C_list,
    cv=kfolds,
    penalty='l1',
    solver='saga',
    max_iter=5000,
    random_state=1,
    n_jobs=-1,
).fit(X, y)

In [57]:
# C_ is the inverse penalty strength selected by cross-validation; invert it
# to report the corresponding alpha.
# NOTE(review): the recorded output (0.001) equals min_alpha, i.e. the edge of
# the search grid -- consider extending the grid toward smaller alphas.
print(1/clf_optimal.C_)

[0.001]


In [58]:
def summary_coef(model_object, feature_names=None):
    """Tabulate a fitted linear model's coefficients and intercept.

    Parameters
    ----------
    model_object : fitted estimator
        Must expose ``coef_`` (NumPy array) and ``intercept_``.
    feature_names : sequence of str, optional
        Labels for the predictors, in the order the model was fitted on.
        When omitted, the model's own ``feature_names_in_`` is used if it
        exists; otherwise the columns of the notebook-level ``X`` are used
        (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per predictor followed by an ``Intercept`` row, and one
        column per coefficient vector (a single column ``0`` for a binary
        classifier).

    Raises
    ------
    ValueError
        If the number of feature names does not match the coefficients.
    """
    if feature_names is None:
        # Prefer the names recorded on the model itself, so the table stays
        # correct even if the global X is later rebound to other data.
        feature_names = getattr(model_object, 'feature_names_in_', None)
    if feature_names is None:
        # Backward-compatible fallback: the notebook-level training predictors.
        feature_names = X.columns.values
    feature_names = list(feature_names)
    n_predictors = len(feature_names)

    coef = model_object.coef_
    if n_predictors == 0 or coef.size % n_predictors != 0:
        raise ValueError(
            "got %d feature names for %d coefficients" % (n_predictors, coef.size)
        )

    # reshape(-1, n) accepts 1-D coef_ as well as (n_classes, n) coef_.
    model_coef = pd.DataFrame(coef.reshape(-1, n_predictors), columns=feature_names)
    model_coef['Intercept'] = model_object.intercept_
    return model_coef.transpose()

# Show the fitted coefficients and intercept of the cross-validated model.
summary_coef(clf_optimal)

Unnamed: 0,0
score,0.772881
day_1,0.744015
day_2,0.421698
day_3,-0.181121
day_6,1.014082
Intercept,-0.319739


In [59]:
# Hold-out predictors: every column of the test partition except the DV.
X_test = df_testData.drop(columns=DV)

# Observed DV values for the same test rows.
y_test_actual = df_testData.loc[:, DV]

# Class labels that the cross-validated model predicts for the test rows.
y_test_predicted = clf_optimal.predict(X_test)

In [60]:
# Import the metrics package
from sklearn import metrics

# Display the confusion matrix over the test partition.
# In scikit-learn's convention, rows are the actual classes and columns are
# the predicted classes.
metrics.confusion_matrix(y_test_actual, y_test_predicted)

array([[ 2,  3],
       [ 8, 12]], dtype=int64)

In [61]:
# Display the accuracy over the test partition
# NOTE(review): the recorded confusion matrix suggests 20 of the 25 test rows
# are actually "up", so always predicting "up" would score 0.80 -- compare
# the model against that baseline before relying on this figure.
clf_optimal.score(X_test, y_test_actual)

0.56