In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
import tweepy

In [18]:
def logisticRegression(ticker, weighted=True):
    """Fit and evaluate an L1-penalised logistic regression of price direction.

    The function reads ``<ticker>_weighted_joined.csv`` (or
    ``<ticker>_avg_joined.csv`` when ``weighted`` is falsy), derives a binary
    target from the ``Movement`` column ("up" when ``Movement > 0``, otherwise
    "down"), one-hot encodes ``day`` and the target, and drops ``day_0`` and
    ``Direction_down`` as reference levels. 20% of the rows are held out as a
    test partition (``random_state=1``); the regularisation strength is picked
    by 5-fold cross-validation on the remaining rows.

    Side effects: prints the coefficient table (one row per predictor plus the
    intercept), the confusion matrix on the test partition and the test
    accuracy.

    Parameters
    ----------
    ticker : str
        Prefix of the CSV file to load, e.g. ``"MSFT"``.
    weighted : bool, default True
        Truthy selects the ``_weighted_joined.csv`` file, falsy selects
        ``_avg_joined.csv``.

    Returns
    -------
    LogisticRegressionCV
        The fitted estimator; its target is ``Direction_up`` encoded as 0/1.

    Raises
    ------
    KeyError
        If the CSV lacks a ``Movement`` or ``day`` column, or if no row has
        ``day == 0`` (the ``day_0`` reference dummy is then absent).
    """
    DV = 'Direction_up'

    # Plain truthiness instead of `== True`, so any truthy flag picks the weighted file.
    source_suffix = '_weighted_joined.csv' if weighted else '_avg_joined.csv'
    df = pd.read_csv(ticker + source_suffix, index_col=0)

    # Fixing the category set guarantees that both Direction_down and
    # Direction_up dummies exist even if one label never occurs in the file,
    # so the column drop / lookup below cannot fail for that reason.
    df['Direction'] = pd.Categorical(
        np.where(df['Movement'] > 0, "up", "down"), categories=["down", "up"]
    )
    df['day'] = df['day'].astype('category')

    # Pin the dummy dtype: the pandas default changed from uint8 to bool in 2.0,
    # which would turn the 0/1 labels and predictions into True/False.
    df = pd.get_dummies(df, prefix_sep='_', dtype=np.uint8)

    # Drop the reference levels of the dummies and the raw Movement column
    # (the target is derived from it, so it must not be used as a predictor).
    model_data = df.drop(columns=["day_0", "Direction_down", "Movement"])

    testpart_size = 0.2
    df_nontestData, df_testData = train_test_split(
        model_data, test_size=testpart_size, random_state=1
    )

    y = df_nontestData[DV]
    X = df_nontestData.drop(columns=[DV])

    kfolds = 5
    min_alpha = 0.001
    max_alpha = 100
    n_candidates = 1000

    # C_list is the element-wise inverse of the alpha grid. It is required as
    # one of the parameter values for LogisticRegressionCV.
    C_list = list(1 / np.linspace(min_alpha, max_alpha, num=n_candidates))

    # Set n_jobs to be -1 to run LogisticRegressionCV on all CPU cores.
    clf_optimal = LogisticRegressionCV(
        Cs=C_list,
        cv=kfolds,
        penalty='l1',
        solver='saga',
        max_iter=5000,
        random_state=1,
        n_jobs=-1,
    ).fit(X, y)

    # Actual target values and predictor values in the test partition.
    y_test_actual = df_testData[DV]
    X_test = df_testData.drop(columns=[DV])

    # Predicted target values for the test partition.
    y_test_predicted = clf_optimal.predict(X_test)

    # Coefficient table: one row per predictor, with the intercept appended last.
    coef_table = pd.DataFrame(
        clf_optimal.coef_.reshape(1, X.shape[1]), columns=X.columns.values
    )
    coef_table['Intercept'] = clf_optimal.intercept_
    print(coef_table.transpose())

    print(metrics.confusion_matrix(y_test_actual, y_test_predicted))
    print(clf_optimal.score(X_test, y_test_actual))

    return clf_optimal

In [5]:
# Fit and evaluate the MSFT model on the weighted dataset (weighted=True is the
# default); the returned estimator is echoed as this cell's output.
logisticRegression("MSFT")

                  0
score      0.372495
day_1      0.000000
day_2      0.000000
day_3     -0.356276
day_6      0.223397
Intercept -0.320488
[[29  2]
 [18  1]]
0.6


LogisticRegressionCV(Cs=[1000.0, 9.891284975940117, 4.970223436170689,
                         3.3189809831359884, 2.491302884075861,
                         1.9940358567168468, 1.6622489779415066,
                         1.4251232539030403, 1.2472050247755593,
                         1.1087803416242135, 0.9980129651774395,
                         0.9073668377856979, 0.8318158314786088,
                         0.7678791316739766, 0.713069733080654,
                         0.6655633904158872, 0.6239916351391614,
                         0.5873078021989651, 0.5546976897590813,
                         0.5255184168165894, 0.4992556143767625,
                         0.4754928419050557, 0.4538893409608551,
                         0.4341635897114964, 0.41608096710711273,
                         0.39944437647092695, 0.38408703204531536,
                         0.3698668479347435, 0.35666202898923266,
                         0.3443675736046909, ...],
                     cv=5, max_

In [19]:
# Same fit/evaluate run for AAPL, reading AAPL_weighted_joined.csv.
logisticRegression("AAPL")

                  0
score      0.000000
day_1      0.423441
day_2     -0.287589
day_3      0.000000
day_6      1.120805
Intercept -0.223063
[[18  9]
 [13 10]]
0.56


LogisticRegressionCV(Cs=[1000.0, 9.891284975940117, 4.970223436170689,
                         3.3189809831359884, 2.491302884075861,
                         1.9940358567168468, 1.6622489779415066,
                         1.4251232539030403, 1.2472050247755593,
                         1.1087803416242135, 0.9980129651774395,
                         0.9073668377856979, 0.8318158314786088,
                         0.7678791316739766, 0.713069733080654,
                         0.6655633904158872, 0.6239916351391614,
                         0.5873078021989651, 0.5546976897590813,
                         0.5255184168165894, 0.4992556143767625,
                         0.4754928419050557, 0.4538893409608551,
                         0.4341635897114964, 0.41608096710711273,
                         0.39944437647092695, 0.38408703204531536,
                         0.3698668479347435, 0.35666202898923266,
                         0.3443675736046909, ...],
                     cv=5, max_

In [8]:
# Same fit/evaluate run for TSLA, reading TSLA_weighted_joined.csv.
logisticRegression("TSLA")

                  0
score     -0.432180
day_1      0.000000
day_2      0.383076
day_3      0.000000
day_6      0.647180
Intercept -0.068511
[[17  6]
 [ 9 14]]
0.6739130434782609


LogisticRegressionCV(Cs=[1000.0, 9.891284975940117, 4.970223436170689,
                         3.3189809831359884, 2.491302884075861,
                         1.9940358567168468, 1.6622489779415066,
                         1.4251232539030403, 1.2472050247755593,
                         1.1087803416242135, 0.9980129651774395,
                         0.9073668377856979, 0.8318158314786088,
                         0.7678791316739766, 0.713069733080654,
                         0.6655633904158872, 0.6239916351391614,
                         0.5873078021989651, 0.5546976897590813,
                         0.5255184168165894, 0.4992556143767625,
                         0.4754928419050557, 0.4538893409608551,
                         0.4341635897114964, 0.41608096710711273,
                         0.39944437647092695, 0.38408703204531536,
                         0.3698668479347435, 0.35666202898923266,
                         0.3443675736046909, ...],
                     cv=5, max_

In [9]:
# Same fit/evaluate run for NIO, reading NIO_weighted_joined.csv.
logisticRegression("NIO")

                  0
score     -0.575841
day_1     -0.770558
day_2     -0.708939
day_3     -1.515383
day_6     -0.002853
Intercept  0.513239
[[16 10]
 [14 11]]
0.5294117647058824


LogisticRegressionCV(Cs=[1000.0, 9.891284975940117, 4.970223436170689,
                         3.3189809831359884, 2.491302884075861,
                         1.9940358567168468, 1.6622489779415066,
                         1.4251232539030403, 1.2472050247755593,
                         1.1087803416242135, 0.9980129651774395,
                         0.9073668377856979, 0.8318158314786088,
                         0.7678791316739766, 0.713069733080654,
                         0.6655633904158872, 0.6239916351391614,
                         0.5873078021989651, 0.5546976897590813,
                         0.5255184168165894, 0.4992556143767625,
                         0.4754928419050557, 0.4538893409608551,
                         0.4341635897114964, 0.41608096710711273,
                         0.39944437647092695, 0.38408703204531536,
                         0.3698668479347435, 0.35666202898923266,
                         0.3443675736046909, ...],
                     cv=5, max_

In [13]:
def logisticRegressionPredict(ticker):
    """Refit the ticker's model and predict directions for its hold-out file.

    Calls ``logisticRegression(ticker)`` (which refits the model and prints its
    evaluation summary), reads ``<ticker>_test.csv``, one-hot encodes ``day``,
    and predicts with the fitted estimator. The predictions are printed and
    written to ``<ticker>_logReg.xlsx``.

    The hold-out features are aligned with the columns the estimator was fitted
    on whenever the estimator exposes ``feature_names_in_``: dummy columns
    missing from the hold-out file are added as all-zero, columns the model
    never saw are dropped, and the column order is made to match the training
    order. Without that attribute the frame is passed through as-is, apart
    from removing ``day_0``.

    Parameters
    ----------
    ticker : str
        Prefix of the input (``_test.csv``) and output (``_logReg.xlsx``)
        file names, e.g. ``"TSLA"``.

    Returns
    -------
    None
    """
    clf_optimal = logisticRegression(ticker)

    df = pd.read_csv(ticker + "_test.csv", index_col=0)
    cvar = ["day"]
    df[cvar] = df[cvar].astype('category')
    df = pd.get_dummies(df, prefix_sep='_')

    # day_0 is the reference level dropped at training time. A short hold-out
    # file may contain no day-0 rows at all, so its absence must not be an error.
    features = df.drop(columns=["day_0"], errors='ignore')

    # Align with the training design matrix. NOTE: a day level that appears only
    # in the hold-out file is dropped here and is therefore scored like the
    # reference level, because the model has no coefficient for it.
    train_columns = getattr(clf_optimal, "feature_names_in_", None)
    if train_columns is not None:
        features = features.reindex(columns=train_columns, fill_value=0)

    predicted_results = clf_optimal.predict(features)
    print(predicted_results)

    logReg = pd.DataFrame(predicted_results)
    logReg.to_excel(ticker + "_logReg.xlsx")

In [14]:
# Refit the TSLA model, predict on TSLA_test.csv and write TSLA_logReg.xlsx.
logisticRegressionPredict("TSLA")

                  0
score     -0.432180
day_1      0.000000
day_2      0.383076
day_3      0.000000
day_6      0.647180
Intercept -0.068511
[[17  6]
 [ 9 14]]
0.6739130434782609
[1 0 1 0 0 1 0 1 0 0 1 0 1 1 0 1 1 1 0 0 1 0]


In [15]:
# Refit the NIO model, predict on NIO_test.csv and write NIO_logReg.xlsx.
logisticRegressionPredict("NIO")

                  0
score     -0.575841
day_1     -0.770558
day_2     -0.708939
day_3     -1.515383
day_6     -0.002853
Intercept  0.513239
[[16 10]
 [14 11]]
0.5294117647058824
[0 0 1 1 0 0 0 1 1 0 0 0 1 1 0 0 0 1 1 0 0 0]


In [16]:
# Refit the MSFT model, predict on MSFT_test.csv and write MSFT_logReg.xlsx.
logisticRegressionPredict("MSFT")

                  0
score      0.372495
day_1      0.000000
day_2      0.000000
day_3     -0.356276
day_6      0.223397
Intercept -0.320488
[[29  2]
 [18  1]]
0.6
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [20]:
# Refit the AAPL model, predict on AAPL_test.csv and write AAPL_logReg.xlsx.
logisticRegressionPredict("AAPL")

                  0
score      0.000000
day_1      0.423441
day_2     -0.287589
day_3      0.000000
day_6      1.120805
Intercept -0.223063
[[18  9]
 [13 10]]
0.56
[0 0 1 0 1 0 0 1 0 1 0 0 1 0 1 0 0 1 0 1 0 0]
