In [105]:
import os
import numpy as np
import pandas as pd
from re import match
from scipy.sparse import coo_matrix
from joblib import load
from sklearn.model_selection import cross_validate
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [23]:
MODELING_DIR = './modeling/'
LDA_DIR = './data/'
RAW_DATA_DIR = '../twitter/tokenized_corpus/'


def get_lda_model(ticker):
    lda = load(LDA_DIR + ticker + '/' + ticker + '.pickle')
    return lda


def get_word_idx_map(filename):
    word_idx_map = {}
    for line in open(filename):
        word, idx = line[:-1].split(',')
        word_idx_map[word] = int(idx)
    return word_idx_map


def get_dataframe(ticker):
    word_idx_map = get_word_idx_map(LDA_DIR + ticker + '/wordidx.dat')
    label_idx = max([x for x in word_idx_map.values()]) + 1

    records = []
    columns = []
    ticker_rows = 0
    rows = 0
    for filename in [ticker_filename, 'random.dat']:
        for line in open(RAW_DATA_DIR + filename):
            rows += 1
            if filename == ticker_filename:
                ticker_rows += 1
            else:
                if rows > ticker_rows * 2:
                    break
            line = line[:-1]
            tokens = line.split(',')
            data = []
            j = []
            word_count = {}
            for token in tokens:
                try:
                    word_count[token] += 1
                except KeyError:
                    word_count[token] = 1
            for token in tokens:
                try:
                    tok_idx = word_idx_map[token]
                    data.append(word_count[token])
                    j.append(tok_idx)
                except KeyError:
                    pass
            data.append(1 if filename != 'random.dat' else 0)
            j.append(label_idx)
            columns.append(j)
            records.append(data)

    data = []
    i = []
    j = []
    for row in range(len(records)):
        data += records[row]
        j += columns[row]
        i += [row] * len(records[row])
    wc_sparse_vector = coo_matrix((data, (i, j)))
    wc_data = pd.DataFrame(data=wc_sparse_vector.toarray())
    return wc_data

In [35]:
model_scores = {}
model_dict = {}
for ticker_filename in [x for x in os.listdir(RAW_DATA_DIR) if x != 'random.dat']:
    ticker = match('(.*)\.dat', ticker_filename).group(1)
    raw_data = get_dataframe(ticker)
    raw_data = raw_data.sample(frac=1).reset_index(drop=True)
    raw_input = raw_data[raw_data.columns[:-1]].values
    labels = raw_data[raw_data.columns[-1]].values
    lda = get_lda_model(ticker)
    clean_data = lda.transform(raw_input)
    models = [
        ('Naive Bayes', GaussianNB()),
        ('Logistic', LogisticRegression(solver='lbfgs')),
        ('ADABoost', AdaBoostClassifier()),
        ('Random Forest', RandomForestClassifier(n_estimators=100))
    ]
    print(ticker)
    for model_name, model in models:
        results = cross_validate(mymodel,
                             clean_data,
                             labels,
                             return_estimator=True,
                             cv=5)
        print(model_name)
        print(np.mean(results['test_score']))
        ret_model = results['estimator'][0]
        if model_name == 'Logistic':
            model_dict[ticker] = ret_model
        try:
            model_scores[model_name].append(np.mean(results['test_score']))
        except KeyError:
            model_scores[model_name] = [np.mean(results['test_score'])]

BA
Naive Bayes
0.8503113681783768
Logistic
0.8516021990164448
ADABoost
0.8486515336502718
Random Forest
0.8514207665182012
WMT
Naive Bayes
0.8291510963829938
Logistic
0.8298821862524888
ADABoost
0.8289662674630595
Random Forest
0.8296990360693386
DIS
Naive Bayes
0.9151
Logistic
0.9153
ADABoost
0.9167000000000002
Random Forest
0.9153499999999999
JPM
Naive Bayes
0.7826325848064979
Logistic
0.7826206402293359
ADABoost
0.7727902532250359
Random Forest
0.7815695174390827
CAT
Naive Bayes
0.8678801986343887
Logistic
0.8673215394165116
ADABoost
0.8600837988826816
Random Forest
0.8639726877715704
JNJ
Naive Bayes
0.6265151515151515
Logistic
0.625
ADABoost
0.6174242424242424
Random Forest
0.6
AXP
Naive Bayes
0.8198795180722891
Logistic
0.8271084337349398
ADABoost
0.8216867469879519
Random Forest
0.819277108433735
KO
Naive Bayes
0.8444214876033058
Logistic
0.8429752066115703
ADABoost
0.8413223140495868
Random Forest
0.846900826446281
GS
Naive Bayes
0.7126984126984126
Logistic
0.7063492063492063
AD

In [36]:
for model, scores in model_scores.items():
    print(model)
    print(np.mean([x for x in scores if x >= .8]))

Naive Bayes
0.8530178895301146
Logistic
0.8524433207740411
ADABoost
0.8489344682944097
Random Forest
0.851697642550215


In [67]:
DATA_DIR = '../twitter/processed_tweet_data/'
proc_data = {}
for datafile in os.listdir(DATA_DIR):
    ticker = match('(.*)\.dat', datafile).group(1)
    tweet_data = pd.read_csv(DATA_DIR + datafile, header=None).values
    ticker_model = model_dict[ticker]
    dated_data = {}
    tweet_dates = tweet_data[:,-1]
    tweet_data = tweet_data[:,:-1]
    preds = ticker_model.predict_proba(tweet_data)[:,1].reshape(-1, 1)
    mod_data = tweet_data * preds
    final_data = []
    for i in range(len(tweet_data)):
        tdate = str(tweet_dates[i])[:8]
        try:
            dated_data[tdate].append(mod_data[i])
        except KeyError:
            dated_data[tdate] = [mod_data[i]]
    for key in dated_data.keys():
        dated_data[key] = np.array(dated_data[key]).sum(0)
        final_data.append([key] + list(dated_data[key]))
    proc_data[ticker] = final_data

In [122]:
STK_DATA_DIR = '../stocks/stock_data/'
for datafile in os.listdir(STK_DATA_DIR):
    ticker = match('(.*)\.dat', datafile).group(1)
    stk_data = pd.read_csv(STK_DATA_DIR + datafile, delimiter='|').values
    stk_dates = stk_data[:,0]
    tik_dates = []
    for tdate in stk_dates:
        tik_dates.append(tdate[:4] + tdate[5:7] + tdate[8:10])
    tik_dates = np.array(tik_dates).reshape(-1, 1)
    stk_data = np.concatenate((stk_data, tik_dates), 1)
    stk_data = pd.DataFrame(data=stk_data[:,[4, 2]])
    tweet_data = pd.DataFrame(data=proc_data[ticker])
    
    all_data = pd.merge(tweet_data, stk_data, on=tweet_data.columns[0])
    all_data['1_y'] = abs(all_data['1_y'])
    labels = all_data['1_y'].values
    input_data = all_data.drop(columns=[0, '1_y']).values
    coefs = LinearRegression(normalize=True).fit(input_data, labels).coef_
    max_vector = list(coefs).index(max(coefs))
    vector_comps = get_lda_model(ticker).components_[max_vector]
    word_idx_map = get_word_idx_map(LDA_DIR + ticker + '/wordidx.dat')
    word_scores = []
    for word, idx in sorted(word_idx_map.items(), key=lambda x: x[1]):
        word_scores.append((word, vector_comps[idx]))
    print(ticker)
    for word, score in sorted(word_scores, key=lambda x: -x[1])[:10]:
        print(word + ': ' + str(round(score, 4)))
    print('\n')

BA
life: 20.4371
720: 9.7903
anniversari: 9.01
promis: 9.01
one: 8.9519
get: 8.6884
candid: 8.01
suck: 8.01
sit: 7.01
ram: 6.01


WMT
vote: 126.9048
dog: 94.01
hot: 75.01
popular: 57.01
retweet: 37.48
room: 25.2643
repli: 12.9933
respect: 12.7317
check: 9.5048
mickey: 8.01


DIS
one: 738.9738
song: 119.6681
student: 42.7314
perform: 31.9433
red: 28.6973
make: 23.4708
best: 20.7862
show: 17.5114
mama: 15.7537
want: 14.7091


JPM
vehicl: 6.01
track: 5.8213
explor: 2.01
use: 2.01
news: 1.01
lend: 1.01
sourc: 1.01
fed: 1.01
check: 1.01
thread: 1.01


CAT
old: 4.01
becam: 4.01
fashion: 4.01
part: 3.01
impeach: 3.01
commun: 2.01
call: 2.01
report: 2.01
show: 2.01
hous: 2.01


JNJ
world: 2.01
better: 1.01
present: 1.01
global: 0.01
medic: 0.01
box: 0.01
market: 0.01
research: 0.01
report: 0.01
20122024: 0.01


AXP
hand: 3.01
visa: 2.01
thing: 2.01
puppi: 2.01
dick: 2.01
time: 1.01
get: 1.01
payment: 1.01
new: 1.01
say: 1.01


KO
plastic: 14.4967
doctor: 14.01
pollut: 12.0854
world: 7.9427
dri

[[4.13571485e-01 6.25769885e-02 4.20055599e-02 ... 1.86722266e-02
  1.95092219e-01 6.28951989e-02]
 [1.78000886e+00 8.47687160e-01 5.76358518e-01 ... 5.88824217e-01
  5.61517512e-01 8.74036243e-01]
 [4.61342217e-01 3.06137280e-01 1.33130300e-01 ... 3.10580776e-02
  3.32608919e-01 9.08080776e-02]
 ...
 [4.00000000e-04 4.00000000e-04 4.00000000e-04 ... 4.00000000e-04
  4.00000000e-04 4.00000000e-04]
 [1.00000000e-04 1.00000000e-04 1.00000000e-04 ... 1.00000000e-04
  1.00000000e-04 1.00000000e-04]
 [7.98885862e-03 6.00000000e-05 6.00000000e-05 ... 6.00000000e-05
  6.00000000e-05 6.00000000e-05]]
