In [61]:
import pickle
import numpy as np
# Location of the pickled dataset, relative to the working directory.
PATH = 'dependencies/meta'

# NOTE(review): pickle.load can run arbitrary code from the file; only load
# files from a trusted source.
with open(PATH, 'rb') as fp:
    data_raw = pickle.load(fp)

# Per the original note, each row of X is: source, title, headline, content.
X = data_raw["X"]
y = data_raw["y"]
print(len(y))

1242


In [62]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [133]:
import nltk
from nltk.collocations import *
# Join every field of every article into one long string, run it through
# Cleaner as a single one-field "article", then split the result into tokens.
plain_text = Cleaner().fit_transform(
    [[' '.join(' '.join(article) for article in X)]]
)[0].split()

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(plain_text)
# Discard bigrams that occur fewer than 3 times before ranking.
finder.apply_freq_filter(3)
# The 20 best bigrams by pointwise mutual information (this cell's displayed value).
finder.nbest(bigram_measures.pmi, 20)

[('adversarial', 'narrative'),
 ('asx', 'coh'),
 ('bme', 'vid'),
 ('changi', 'airport'),
 ('conventional', 'wisdom'),
 ('ed', 'yardeni'),
 ('hunt', 'showdown'),
 ('leons', 'furniture'),
 ('mary', 'altaffer'),
 ('nintendos', 'lawyers'),
 ('proshares', 'ultrashort'),
 ('royal', 'caribbean'),
 ('sophie', 'turner'),
 ('susa', 'ventures'),
 ('tse', 'lnf'),
 ('venkat', 'subramaniam'),
 ('wells', 'fargo'),
 ('angela', 'merkel'),
 ('crosstown', 'brooklyn'),
 ('declining', 'num')]

In [63]:
import re
from html import unescape
import string
def html_to_plain_text(html: str) -> str:
    """Return the text of ``html`` with the markup stripped.

    The input is parsed with BeautifulSoup's ``'html.parser'`` backend and the
    result of ``get_text()`` is returned unchanged.
    """
    # bs4 is imported when the function is called, not when it is defined.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')
    return soup.get_text()
def drop(s):
    """Replace every run of non-word characters in ``s`` with a single space."""
    non_word_run = re.compile(r'\W+', flags=re.M)
    return non_word_run.sub(' ', s)

In [5]:
# Preview field 3 of article 1 after stripping markup and collapsing punctuation.
s = drop(html_to_plain_text(X[1][3]))
print(s)

ap mary altaffer nike is scheduled to release fourth quarter earnings after the bell on thursday wall street is maintaining its overwhelmingly bullish view on the shoe giant amid investors fears around business in china where the broader e


In [64]:
import nltk
from sklearn.pipeline import BaseEstimator, TransformerMixin, Pipeline
import urlextract 
class Cleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns raw articles into normalised text.

    Each article is a sequence of strings. ``transform`` joins the selected
    fields, strips HTML with ``html_to_plain_text`` and then applies the
    enabled normalisation steps in this order: lower-casing, URL replacement,
    number replacement, punctuation removal.

    Parameters
    ----------
    include_subj : bool
        If True all fields of an article are joined; if False only
        ``article[1:3]`` is used.
    lower_case : bool
        Lower-case the text.
    remove_punctuation : bool
        Delete apostrophes (``'`` and ``’``) and collapse every run of
        non-word characters into a single space.
    replace_urls : bool
        Replace each URL found by ``urlextract`` with ``" URL "``.
    replace_numbers : bool
        Replace each number with ``" NUMBER "``.
    """

    # Integer part, optional fractional part, optional exponent. The fraction
    # and the exponent are independently optional so that a plain decimal such
    # as "3.14" is matched as ONE number instead of "3" and "14" separately.
    _NUMBER_PATTERN = r'\d+(?:\.\d+)?(?:[eE][+-]?\d+)?'

    def __init__(self, include_subj=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True):
        self.include_subj = include_subj
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``.

        ``y`` is accepted (and ignored) so the transformer also works when a
        pipeline is fitted with labels.
        """
        return self

    def transform(self, X):
        """Return a list with one cleaned string per article in ``X``."""
        # One extractor is shared by all articles instead of being rebuilt
        # inside the loop; it is not created at all when URLs are left alone.
        url_extractor = urlextract.URLExtract() if self.replace_urls else None

        X_transformed = []
        for article in X:
            text = " ".join(article) if self.include_subj else " ".join(article[1:3])
            text = html_to_plain_text(text)
            if self.lower_case:
                text = text.lower()
            if url_extractor is not None:
                # Longest URLs first, so a URL that is a substring of a longer
                # one cannot break the longer one before it is replaced.
                urls = sorted(set(url_extractor.find_urls(text)), key=len, reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(self._NUMBER_PATTERN, ' NUMBER ', text)
            if self.remove_punctuation:
                # Apostrophes are deleted (not spaced) so "don't" stays one token.
                text = text.replace("\'", "")
                text = text.replace("’", "")
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            X_transformed.append(text)
        return X_transformed

In [10]:
from collections import Counter
class CleanWordToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Turn cleaned text strings into ``Counter`` objects of word n-grams.

    Parameters
    ----------
    num_poly_features : int
        Largest n-gram order to generate; all orders from 1 up to this value
        are counted together.
    stemming : bool
        If True every word is passed through ``nltk.PorterStemmer`` before
        the n-grams are built.
    """

    def __init__(self, num_poly_features:int = 1, stemming:bool = True):
        self.num_poly_features = num_poly_features
        self.stemming = stemming

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    @staticmethod
    def _tokens(s: str, stemming: bool) -> list:
        """Split ``s`` on whitespace and optionally stem each word."""
        tokens = s.split()
        if stemming:
            # The stemmer is only built when it is actually needed.
            stemmer = nltk.PorterStemmer()
            tokens = [stemmer.stem(word) for word in tokens]
        return tokens

    @staticmethod
    def _ngrams(tokens: list, span: int) -> list:
        """Return every run of ``span`` consecutive tokens, space-joined."""
        return [' '.join(tokens[start:start + span])
                for start in range(len(tokens) - span + 1)]

    def __push_pop__(self, s:str, span:int = 1, stemming:bool = True) -> list:
        """Return the list of ``span``-grams of ``s`` (a list of str).

        A string with fewer than ``span`` words yields an empty list.
        """
        return self._ngrams(self._tokens(s, stemming), span)

    def __poly_features_str__(self, s:str, n_splits:int = 1, stemming:bool = True) -> list:
        '''
        instead of splitting a str of "this is a string" into ["this","is","a", "string"] we can
        manufacture additional features ex if n_splits=2 it becomes 
        ["this", "is", "a", "string", "this is", "is a", "a string"] 
        '''
        # Tokenise (and stem) once, then reuse the tokens for every n-gram
        # order rather than re-stemming the whole text per order.
        tokens = self._tokens(s, stemming)
        ret = []
        for span in range(1, n_splits + 1):
            ret += self._ngrams(tokens, span)
        return ret

    def transform(self, X, y=None):
        """Return one ``Counter`` of n-gram occurrences per string in ``X``."""
        return [
            Counter(self.__poly_features_str__(
                article, n_splits=self.num_poly_features, stemming=self.stemming))
            for article in X
        ]

In [65]:
from scipy.sparse import csr_matrix
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Convert per-document word ``Counter`` objects into a sparse count matrix.

    ``fit`` keeps the ``vocab_size`` tokens with the largest capped totals and
    numbers them from 1. ``transform`` writes each document's counts into the
    matching columns; tokens outside the vocabulary all go to column 0.
    """

    def __init__(self, vocab_size=1000):
        self.vocab_size = vocab_size

    def fit(self, X, y=None):
        """Learn ``most_common_`` and ``vocabulary_`` from the Counters in ``X``."""
        totals = Counter()
        for doc_counts in X:
            for token, occurrences in doc_counts.items():
                # A single document contributes at most 10 per token.
                totals[token] += min(occurrences, 10)
        self.most_common_ = totals.most_common()[:self.vocab_size]
        # Index 0 is left free for out-of-vocabulary tokens.
        self.vocabulary_ = {
            token: position
            for position, (token, _) in enumerate(self.most_common_, start=1)
        }
        return self

    def transform(self, X, y=None):
        """Return a CSR matrix of shape ``(len(X), vocab_size + 1)``."""
        row_idx, col_idx, values = [], [], []
        for doc_number, doc_counts in enumerate(X):
            for token, occurrences in doc_counts.items():
                row_idx.append(doc_number)
                col_idx.append(self.vocabulary_.get(token, 0))
                values.append(occurrences)
        n_columns = self.vocab_size + 1
        return csr_matrix((values, (row_idx, col_idx)), shape=(len(X), n_columns))

In [66]:
def bayesian_prior_set(X, y, prior_strength:int=1_000) -> tuple:
    """Return copies of ``X`` and ``y`` extended with synthetic sentiment rows.

    Two artificial articles are built, one holding a fixed list of positive
    words and one holding a fixed list of negative words, each in the fourth
    field with the first three fields empty. The positive article is appended
    ``pos_weight`` times with label 2, the negative one ``neg_weight`` times
    with label 0. Each weight is ``prior_strength`` scaled by one minus that
    word string's share of the combined character length of both strings.

    Parameters
    ----------
    X : iterable of articles
    y : iterable of labels
    prior_strength : int
        Scale factor for the number of synthetic rows.

    Returns
    -------
    tuple
        ``(X_new, y_new)`` as new lists. The arguments are not modified, so
        calling the function repeatedly never stacks priors onto the caller's
        data.
    """
    # TODO (original author's note): fix the prior so it just appends the counter and returns
    # NOTE(review): 'lower' sits in the positive list next to 'higher' — confirm this is intended.
    pos_set = ' '.join(['jumped', 'hike', 'rise', 'growth', 'bullish', 'optimistic', 'rally', 'surge', 'soared', 'buy', 'higher', 'gains', 'outperform','lower'])
    neg_set = ' '.join(['slumped', 'fell', 'worry', 'bearish', 'miss', 'sell', 'losses', 'warn', 'plummet', 'bad', 'down'])

    # NOTE(review): len() here measures characters of the joined strings, not
    # the number of words — confirm that is the intended weighting.
    pos_weight = int(prior_strength * (1-(len(pos_set)/(len(pos_set) + len(neg_set)))))
    neg_weight = int(prior_strength * (1-(len(neg_set)/ (len(pos_set) + len(neg_set)))))

    X_out = list(X)
    y_out = list(y)
    # A fresh list per synthetic row, so rows do not alias one shared object.
    X_out.extend(["", "", "", pos_set] for _ in range(pos_weight))
    y_out.extend([2] * pos_weight)  # 2 is good
    X_out.extend(["", "", "", neg_set] for _ in range(neg_weight))
    y_out.extend([0] * neg_weight)  # 0 is bad
    return X_out, y_out

In [100]:
# Optional prior augmentation, currently disabled:
# X_bayesian, y_bayesian = bayesian_prior_set(X_train, y_train, prior_strength=500)

# Raw articles -> cleaned text -> per-article word Counters -> sparse count matrix.
data_pipeline = Pipeline(steps=[
    ("text_cleaner", Cleaner()),
    ("text_to_count", CleanWordToWordCounterTransformer(stemming=False)),
    ("word_cout_to_vect", WordCounterToVectorTransformer(vocab_size=10_000)),
])

In [101]:
# Fit every pipeline step on the training articles and keep the resulting feature matrix.
X_train_transformed = data_pipeline.fit_transform(X_train)

In [102]:
# Keep only the training rows whose label is not 1 (presumably the neutral
# class; 0 and 2 are marked bad/good elsewhere in this notebook — confirm).
y_train = np.array(y_train)
y_train_dec = (y_train != 1)
X_train_transformed_yn = X_train_transformed[y_train_dec]
y_train_yn = y_train[y_train_dec]
print(X_train_transformed_yn.shape, y_train_yn.shape)

(577, 10001) (577,)


In [686]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV as gscv
# RandomForestClassifier is imported here because, read top to bottom, this
# cell comes before the one that imports it; without this line a fresh-kernel
# run stops with a NameError.
from sklearn.ensemble import RandomForestClassifier

log_clf = LogisticRegression(solver="liblinear", random_state=42, n_jobs=-1)
# Single-point grid: one random-forest configuration, scored by F1 with 3-fold CV.
param_grid = [
    {'max_leaf_nodes':[None],'n_estimators':[10]},
  ]
grid_search = gscv(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring = "f1")

In [687]:
# NOTE(review): `y_important` is not defined in any cell visible here — confirm
# where it comes from before running this on a fresh kernel.
grid_search.fit(X_train_transformed, y_important)
# Report the best cross-validated score and the parameters that produced it.
print("Acheaved score of:\t", grid_search.best_score_)
print("With following paramaters:\t", grid_search.best_params_)

Acheaved score of:	 0.7712078639249641
With following paramaters:	 {'max_leaf_nodes': None, 'n_estimators': 10}


In [115]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# Base models.
log_clf = LogisticRegression(
    solver="liblinear",
    random_state=42,
    n_jobs=-1,
)
lin_svc = SVC(kernel='linear', probability=True)
forest_clf = RandomForestClassifier(random_state=42)  # not part of the ensemble below

# Hard-voting ensemble of the logistic regression and the linear-kernel SVC.
# Despite its name, the 'sgd_clf' entry is the SVC.
voting_clf = VotingClassifier(
    estimators=[
        ('log_reg', log_clf),
        ('sgd_clf', lin_svc),
    ],
    voting='hard',
)

In [116]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from warnings import filterwarnings as warn
# Suppress DeprecationWarning messages from here on.
warn("ignore", category=DeprecationWarning)

# 10-fold cross-validated accuracy of each model on the filtered training rows.
for clf in (log_clf, lin_svc, voting_clf):
    score = cross_val_score(
        clf,
        X_train_transformed_yn,
        y_train_yn,
        cv=10,
        verbose=3,
        scoring="accuracy",
    )
    print(type(clf).__name__, score.mean())

[CV]  ................................................................
[CV] ....................................... , score=0.745763 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.830508 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.810345 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.706897 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.862069 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.824561 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.701754 -   0.0s
[CV]  

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished


[CV] ....................................... , score=0.754386 -   0.0s
LogisticRegression 0.7867862233022651
[CV]  ................................................................
[CV] ....................................... , score=0.745763 -   0.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] ....................................... , score=0.830508 -   0.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s


[CV] ....................................... , score=0.810345 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.706897 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.844828 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.842105 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.701754 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.824561 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.807018 -   0.0s
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.8s finished


[CV] ....................................... , score=0.677966 -   0.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ....................................... , score=0.677966 -   0.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV] ....................................... , score=0.689655 -   0.1s
[CV]  ................................................................
[CV] ....................................... , score=0.706897 -   0.1s
[CV]  ................................................................
[CV] ....................................... , score=0.655172 -   0.1s
[CV]  ................................................................
[CV] ....................................... , score=0.666667 -   0.1s
[CV]  ................................................................
[CV] ....................................... , score=0.701754 -   0.1s
[CV]  ................................................................
[CV] ....................................... , score=0.631579 -   0.1s
[CV]  ................................................................
[CV] ....................................... , score=0.649123 -   0.1s
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.8s finished


[CV] ....................................... , score=0.745763 -   0.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] ....................................... , score=0.847458 -   0.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.2s remaining:    0.0s


[CV] ....................................... , score=0.810345 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.706897 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.862069 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.824561 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.684211 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.807018 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.824561 -   0.0s
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.1s finished


In [94]:
from sklearn.model_selection import cross_val_predict
# Transform the held-out articles with data_pipeline; this call does not fit anything.
X_test_transformed = data_pipeline.transform(X_test)

In [95]:
# Keep only the test rows whose label is not 1, using the same rule as for the training rows.
y_test = np.array(y_test)
y_test_dec = (y_test != 1)
X_test_transformed_yn = X_test_transformed[y_test_dec]
y_test_yn = y_test[y_test_dec]
print(X_test_transformed_yn.shape, y_test_yn.shape)

(63, 10001) (63,)


In [96]:
# Train the voting ensemble on the filtered training rows.
# NOTE(review): the saved output for this cell reports voting='soft', while the
# definition visible in this notebook uses voting='hard' — the output looks
# stale; re-run to confirm.
voting_clf.fit(X_train_transformed_yn, y_train_yn)

VotingClassifier(estimators=[('log_reg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('sgd_clf'...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         n_jobs=1, voting='soft', weights=None)

In [97]:
# Predict labels for the filtered test rows with the fitted ensemble.
pred = voting_clf.predict(X_test_transformed_yn)

In [98]:
# accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the value is
# the same as with the arguments reversed, but the documented order is used.
accuracy_score(y_test_yn, pred)

0.8412698412698413

In [113]:
# Thresholds on column 1: >= upper -> 2, <= lower -> 0, strictly between -> 1.
upper = 0.5
lower =1-upper
# NOTE(review): `pred_num` is not defined in any visible cell; presumably a 2-D
# array of class probabilities (e.g. predict_proba output) — confirm.
pred_bin = pred_num.copy()
# Each assignment overwrites whole rows, so the later comparisons on column 1
# see the values already written (2 or 0) for rows handled by earlier lines.
pred_bin[pred_bin[:,1]>=upper] = 2
pred_bin[pred_bin[:,1]<=lower] = 0
pred_bin[((pred_bin[:,1] >lower) & (pred_bin[:,1] < upper))] = 1
# Keep only column 1, which carries the values written above.
pred_bin =pred_bin[:,1]


In [114]:
# NOTE(review): `y_bayesian` is only assigned in a commented-out line in this
# notebook, and 503 is an unexplained row count — confirm both. The first
# positional argument of accuracy_score is y_true, so predictions and labels
# are passed in swapped roles here.
accuracy_score(pred_bin[:503], y_pred=y_bayesian[:503])

0.7117296222664016

In [56]:
# One hand-made article with four fields, run through the pipeline and the ensemble.
ex = [['v','a','so so', '']]
# Stored as `ex_features` rather than `pred`, so the test-set predictions held
# in `pred` are not overwritten by a feature matrix.
ex_features = data_pipeline.transform(ex).toarray()
voting_clf.predict(ex_features)

['Business Insider',
 "nike analysts are brushing off china growth fears ahead of the shoe giant's quarterly numbers (nke)",
 "nike is scheduled to release fourth-quarter earnings after the bell on thursday. wall street is maintaining its overwhelmingly bullish view on the shoe giant amid investors' fears around business in china, where the broader economy is slowing. a declining num…",
 "ap/mary altaffer\r\n<ul><li>nike is scheduled to release fourth-quarter earnings after the bell on thursday.</li><li>wall street is maintaining its overwhelmingly bullish view on the shoe giant amid investors' fears around business in china, where the broader e"]

In [85]:
# Display the raw fields of article 19.
X[19]

['Seekingalpha.com',
 'gold weekly: rally set to continue',
 'gldm has overshot our june target, reaching a high of $14.34 per share this week. speculators are the most bullish on gold since november 2017, having lifted th',
 "introduction\r\nwelcome to orchid's gold weekly report. we discuss gold prices through the lenses of the world gold shares spdr gold minishares trust etf (gldm) because we think that is the best pure-play etf to assert exposure to spot gold prices.\r\ngldm has ov"]