In [1]:
import os
import numpy as np
import pandas as pd
from re import match
from scipy.sparse import coo_matrix
from joblib import load
from sklearn.model_selection import cross_validate
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [2]:
MODELING_DIR = './modeling/'
LDA_DIR = './data/'
RAW_DATA_DIR = '../twitter/tokenized_corpus/'


def get_lda_model(ticker):
    lda = load(LDA_DIR + ticker + '/' + ticker + '.pickle')
    return lda


def get_word_idx_map(filename):
    word_idx_map = {}
    for line in open(filename):
        word, idx = line[:-1].split(',')
        word_idx_map[word] = int(idx)
    return word_idx_map


def get_dataframe(ticker):
    word_idx_map = get_word_idx_map(LDA_DIR + ticker + '/wordidx.dat')
    label_idx = max([x for x in word_idx_map.values()]) + 1

    records = []
    columns = []
    ticker_rows = 0
    rows = 0
    for filename in [ticker_filename, 'random.dat']:
        for line in open(RAW_DATA_DIR + filename):
            rows += 1
            if filename == ticker_filename:
                ticker_rows += 1
            else:
                if rows > ticker_rows * 2:
                    break
            line = line[:-1]
            tokens = line.split(',')
            data = []
            j = []
            word_count = {}
            for token in tokens:
                try:
                    word_count[token] += 1
                except KeyError:
                    word_count[token] = 1
            for token in tokens:
                try:
                    tok_idx = word_idx_map[token]
                    data.append(word_count[token])
                    j.append(tok_idx)
                except KeyError:
                    pass
            data.append(1 if filename != 'random.dat' else 0)
            j.append(label_idx)
            columns.append(j)
            records.append(data)

    data = []
    i = []
    j = []
    for row in range(len(records)):
        data += records[row]
        j += columns[row]
        i += [row] * len(records[row])
    wc_sparse_vector = coo_matrix((data, (i, j)))
    wc_data = pd.DataFrame(data=wc_sparse_vector.toarray())
    return wc_data

In [3]:
model_scores = {}
model_dict = {}
for ticker_filename in [x for x in os.listdir(RAW_DATA_DIR) if x != 'random.dat']:
    ticker = match('(.*)\.dat', ticker_filename).group(1)
    raw_data = get_dataframe(ticker)
    raw_data = raw_data.sample(frac=1).reset_index(drop=True)
    raw_input = raw_data[raw_data.columns[:-1]].values
    labels = raw_data[raw_data.columns[-1]].values
    lda = get_lda_model(ticker)
    clean_data = lda.transform(raw_input)
    models = [
        ('Naive Bayes', GaussianNB()),
        ('Logistic', LogisticRegression(solver='lbfgs')),
        ('ADABoost', AdaBoostClassifier()),
        ('Random Forest', RandomForestClassifier(n_estimators=100))
    ]
    print(ticker)
    for model_name, model in models:
        results = cross_validate(model,
                             clean_data,
                             labels,
                             return_estimator=True,
                             cv=5)
        print(model_name)
        print(np.mean(results['test_score']))
        ret_model = results['estimator'][0]
        if model_name == 'Logistic':
            model_dict[ticker] = ret_model
        try:
            model_scores[model_name].append(np.mean(results['test_score']))
        except KeyError:
            model_scores[model_name] = [np.mean(results['test_score'])]

BA
Naive Bayes
0.7456694927392897
Logistic
0.7753886134055425
ADABoost
0.7783379146175935
Random Forest
0.8506895799087382
WMT
Naive Bayes
0.7174798467649064
Logistic
0.7328682393072861
ADABoost
0.7501818071937228
Random Forest
0.8292401533693926
DIS
Naive Bayes
0.79485
Logistic
0.8530999999999999
ADABoost
0.8657499999999999
Random Forest
0.9151999999999999
JPM
Naive Bayes
0.6854634495938844
Logistic
0.7368609651218346
ADABoost
0.7543717152412804
Random Forest
0.7859173435260391
CAT
Naive Bayes
0.7770328988206083
Logistic
0.8171818746120423
ADABoost
0.8489447548106768
Random Forest
0.8511421477343266
JNJ
Naive Bayes
0.6265151515151516
Logistic
0.6439393939393939
ADABoost
0.6787878787878788
Random Forest
0.6522727272727271
AXP
Naive Bayes
0.7054216867469879
Logistic
0.7740963855421688
ADABoost
0.7734939759036145
Random Forest
0.8150602409638555
KO
Naive Bayes
0.6830578512396694
Logistic
0.7495867768595041
ADABoost
0.7904958677685949
Random Forest
0.8421487603305785
GS
Naive Bayes
0.5809

In [4]:
for model, scores in model_scores.items():
    print(model)
    print(np.mean([x for x in scores if x >= .8]))

Naive Bayes
0.8510509804795381
Logistic
0.8486025643112954
ADABoost
0.8485652923461268
Random Forest
0.8483943113089528


In [5]:
DATA_DIR = '../twitter/processed_tweet_data/'
proc_data = {}
for datafile in os.listdir(DATA_DIR):
    ticker = match('(.*)\.dat', datafile).group(1)
    tweet_data = pd.read_csv(DATA_DIR + datafile, header=None).values
    ticker_model = model_dict[ticker]
    dated_data = {}
    tweet_dates = tweet_data[:,-1]
    tweet_data = tweet_data[:,:-1]
    preds = ticker_model.predict_proba(tweet_data)[:,1].reshape(-1, 1)
    mod_data = tweet_data * preds
    final_data = []
    for i in range(len(tweet_data)):
        tdate = str(tweet_dates[i])[:8]
        try:
            dated_data[tdate].append(mod_data[i])
        except KeyError:
            dated_data[tdate] = [mod_data[i]]
    for key in dated_data.keys():
        dated_data[key] = np.array(dated_data[key]).sum(0)
        final_data.append([key] + list(dated_data[key]))
    proc_data[ticker] = final_data

In [6]:
STK_DATA_DIR = '../stocks/stock_data/'
for datafile in os.listdir(STK_DATA_DIR):
    ticker = match('(.*)\.dat', datafile).group(1)
    stk_data = pd.read_csv(STK_DATA_DIR + datafile, delimiter='|').values
    stk_dates = stk_data[:,0]
    tik_dates = []
    for tdate in stk_dates:
        tik_dates.append(tdate[:4] + tdate[5:7] + tdate[8:10])
    tik_dates = np.array(tik_dates).reshape(-1, 1)
    stk_data = np.concatenate((stk_data, tik_dates), 1)
    stk_data = pd.DataFrame(data=stk_data[:,[4, 2]])
    tweet_data = pd.DataFrame(data=proc_data[ticker])
    
    all_data = pd.merge(tweet_data, stk_data, on=tweet_data.columns[0])
    all_data['1_y'] = abs(all_data['1_y'])
    labels = all_data['1_y'].values
    input_data = all_data.drop(columns=[0, '1_y']).values
    coefs = LinearRegression(normalize=True).fit(input_data, labels).coef_
    max_vector = list(coefs).index(max(coefs))
    vector_comps = get_lda_model(ticker).components_[max_vector]
    word_idx_map = get_word_idx_map(LDA_DIR + ticker + '/wordidx.dat')
    word_scores = []
    for word, idx in sorted(word_idx_map.items(), key=lambda x: x[1]):
        word_scores.append((word, vector_comps[idx]))
    print(ticker)
    for word, score in sorted(word_scores, key=lambda x: -x[1])[:10]:
        print(word + ': ' + str(round(score, 4)))
    print('\n')

BA
cockpit: 59.9735
film: 44.5506
seen: 17.8834
airbu: 15.5023
save: 13.209
aviat: 13.1904
air: 13.1527
30: 13.01
hope: 12.1458
giveaway: 11.01


WMT
walk: 59.6552
know: 49.1175
support: 46.0081
anyon: 40.5524
pic: 26.8303
race: 19.8468
site: 17.7868
smile: 16.7154
reveal: 15.9311
believ: 13.5009


DIS
kid: 300.3857
start: 154.3203
thought: 135.9112
hate: 119.5582
fight: 103.6547
need: 96.3888
special: 74.6914
class: 63.6087
take: 45.2403
one: 44.6747


JPM
vehicl: 6.01
track: 5.8213
explor: 2.01
use: 2.01
news: 1.01
lend: 1.01
sourc: 1.01
fed: 1.01
check: 1.01
thread: 1.01


CAT
educ: 4.01
toy: 4.01
rattl: 4.01
start: 2.01
ask: 2.01
question: 2.01
manag: 2.01
march: 2.01
cat: 1.081
money: 1.0737


JNJ
must: 1.01
forgot: 1.01
famou: 1.01
global: 0.01
medic: 0.01
box: 0.01
market: 0.01
research: 0.01
report: 0.01
20122024: 0.01


AXP
decor: 55.5299
art: 52.2292
wall: 31.8453
photographi: 12.01
artistri: 8.01
natur: 6.9395
sale: 4.9813
pack: 4.01
owe: 4.01
50: 4.01


KO
tri: 57.1703
know