In [2]:
import os
data_path = '../../dependencies/labeled_data/meta'

In [3]:
import pickle 
import numpy as np
with open(data_path, 'rb') as f:
    data_raw = pickle.load(f)
X,y = np.array(data_raw['X']), np.array(data_raw['y'])

In [4]:
len(X)

1384

In [5]:
from sklearn.pipeline import BaseEstimator, TransformerMixin, Pipeline
import urlextract
import re
class Cleaner(BaseEstimator, TransformerMixin):
    def __init__(self, include_subj=True, replace_html=True, remove_punctuation=True, replace_urls=True, replace_numbers=True):
        self.include_subj = include_subj; self.replace_html = replace_html
        self.remove_punctuation = remove_punctuation; self.replace_urls = replace_urls;
        self.replace_numbers = replace_numbers  
    def fit(self, X, y=None):
        return self
    def __html_to_plain_text__(self, html: str) -> str:
        from bs4 import BeautifulSoup
        return BeautifulSoup(html, 'html.parser').get_text()
    def transform(self, X, y=None):
        X_transformed = []
        for article in X:
            text = " ".join(article) if self.include_subj else " ".join(article[1:])
            if self.replace_html:
                text = self.__html_to_plain_text__(text)
            if self.replace_urls:
                url_extractor = urlextract.URLExtract() 
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' NUMBER ', text)
            if self.remove_punctuation:
                text = text.replace("\'", "").replace("’", "") #Because we dont want these to be replaced by spaces
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            X_transformed.append(text)
        return X_transformed

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
class CountVectorizerWithStemming(CountVectorizer):
    def build_analyzer(self):
        lemm = WordNetLemmatizer()
        analyzer = super(CountVectorizerWithStemming, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))

In [7]:
class filter_to_fin():
    def __init__(self, model_path:str = '../models/fin_not_fin_v2.model', inverse_recall_rate:float = 0.4):
        self.model_path = model_path
        self.inverse_recall_rate = inverse_recall_rate
    def transform(self, X, y=None):
        with open(self.model_path, 'rb') as f:
            m_t = pickle.load(f)
        #we want a pretty lose recall here, as most of the things coming will be financial news
        indexes = m_t['model'].predict_proba(m_t['transformer'].transform(X))[:,1]>self.inverse_recall_rate
        return X[indexes], y[indexes]

In [8]:
#seeding guided LDA
bias_list = ['jumped', 'hike', 'trend-line', 'earnings','candle','ipo', 'fibonacci', 'sma', 'rise', 'growth',\
             'bulls', 'bears', 'bullish', 'optimistic', 'rally', 'surge', \
                'soared', 'growth' 'buy', 'higher', 'gains', 'outperform','lower',\
             'slumped', 'fell', 'worry', 'bearish', 'miss', 'sell', 'losses', 'warn', \
                 'plummet', 'bad', 'down', 'low', 'disappointed', 'weak', 'worry']
              

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=42)

In [14]:
from scipy.sparse import csr_matrix
class SetBias(BaseEstimator, TransformerMixin):
    def __init__(self, bias_strength:float, bias_list_path:str = '../dependencies/word_list.sav'):
        with open(bias_list_path, 'rb') as f:
            self.bias_list = pickle.load(f)
        self.bias_strength = bias_strength
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        if type(X) is csr_matrix:
            X = X.toarray()
        return csr_matrix(np.add(X, self.bias_list*self.bias_strength, where=X!=0))

In [34]:
class RemoveNan(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return np.nan_to_num(X)

In [12]:
X_train, y_train = X,y

In [35]:
from sklearn.pipeline import Pipeline
from sklearn import exceptions
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

cvws = CountVectorizerWithStemming(stop_words="english", max_features=20_000)
data_transformer = Pipeline([
    ('clean', Cleaner()), #cleans text
    ('vect', cvws), #turns words to counts 
    ('tfidf', TfidfTransformer()), #turns counts to tf-idf
])
bias_set = Pipeline([
    ('bias', SetBias(bias_strength = 1.2)), #setting bias for values
    ('remove_nan', RemoveNan())
])
all_transform = Pipeline([
    ('data_trans', data_transformer),
    ('bias_trans', bias_set)
])

In [25]:
#using the fin_not_fin model to filter out non-financial news
X_filtered, y_filtered = filter_to_fin(inverse_recall_rate=0.1).transform(X_train,y_train)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [36]:
#preprocessing the data
X_train_prepared = all_transform.fit_transform(X_filtered)
X_train_prepared

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


<1160x7943 sparse matrix of type '<class 'numpy.float64'>'
	with 39985 stored elements in Compressed Sparse Row format>

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
log_clf = LogisticRegression(solver="liblinear", random_state=42, n_jobs=-1)
score = cross_val_score(grid_search_svc.best_estimator_, X_train_prepared , y_filtered, cv=20, verbose=1)
print("{}+={}".format(100*score.mean(), 100*score.std()))

74.7703994791186+=11.050536467819928


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.3s finished


In [37]:
print(len(X_filtered))
len(X)

1160


1384

In [40]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
log_clf = LogisticRegression(solver="liblinear",C = 1.4, dual= False, penalty = 'l2', random_state=42, n_jobs=-1)
lin_svc = LinearSVC(random_state=42)
#lin_svc = SVC(kernel='linear',probability=True)
#boost_clf = XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1)
forest_clf = RandomForestClassifier(random_state=42)
#extra_clf = ExtraTreesClassifier(n_estimators=100, max_leaf_nodes=16, n_jobs=-1) #random thresholds set

voting_clf = VotingClassifier(
    estimators = [('log_reg', log_clf), ('forest_clf', forest_clf), ('sgd_clf', lin_svc)],
    voting = 'hard',
)

  from numpy.core.umath_tests import inner1d


In [41]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from warnings import filterwarnings as warn
warn("ignore", category=DeprecationWarning)
for clf in (log_clf, lin_svc, forest_clf, voting_clf):
    score = cross_val_score(clf, X_train_prepared , y_filtered, cv=20, verbose=1)
    print(clf.__class__.__name__, score.mean())

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.4s finished


LogisticRegression 0.721497098239462


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.6s finished


LinearSVC 0.7528779209859833


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    3.0s finished


RandomForestClassifier 0.7161148963876671
VotingClassifier 0.7518827606714037


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    3.0s finished


In [265]:
lin_svc.decision_function

<bound method LinearClassifierMixin.decision_function of LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)>

In [47]:
from sklearn.model_selection import GridSearchCV
#Optimizing hyperparams for svc
param_grid = [
    {'C': np.arange(0.2, 0.8,0.1), 'dual': [False, True], 'penalty': ['l2']},
    #{'C': np.arange(0.4, 0.5,0.1), 'dual': [False], 'penalty': ['l1']},
]
grid_search_svc = GridSearchCV(lin_svc, param_grid, cv=50, return_train_score = True, verbose=1)
grid_search_svc.fit(X_train_prepared, y_filtered)
print("Acheaved score of:\t", grid_search_svc.best_score_)
print("With following paramaters:\t", grid_search_svc.best_params_)

Fitting 50 folds for each of 14 candidates, totalling 700 fits
Acheaved score of:	 0.7637931034482759
With following paramaters:	 {'C': 0.30000000000000004, 'dual': False, 'penalty': 'l2'}


[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    9.1s finished


In [333]:
#Optimizing hyperparams for log reg
param_grid = [
    {'C': np.arange(1.8,4,0.1), 'dual': [False], 'penalty': ['l2']},
]
grid_search_log = GridSearchCV(log_clf, param_grid, cv=10, return_train_score = True, verbose=1)
grid_search_log.fit(X_train_prepared, y_filtered)
print("Acheaved score of:\t", grid_search_log.best_score_)
print("With following paramaters:\t", grid_search_log.best_params_)

Fitting 20 folds for each of 22 candidates, totalling 440 fits
Acheaved score of:	 0.7377495462794919
With following paramaters:	 {'C': 2.500000000000001, 'dual': False, 'penalty': 'l2'}


[Parallel(n_jobs=1)]: Done 440 out of 440 | elapsed:    6.4s finished


In [336]:
#Optimizing hyperparams for random forest
from sklearn.model_selection import RandomizedSearchCV
param_grid = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': np.arrange(200,2200,200)}

random_search_forest = RandomizedSearchCV(forest_clf, param_grid, cv=10, return_train_score = True, verbose=1)
random_search_forest.fit(X_train_prepared, y_filtered)
print("Acheaved score of:\t", random_search_forest.best_score_)
print("With following paramaters:\t", random_search_forest.best_params_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 19.2min finished


Acheaved score of:	 0.7323049001814882
With following paramaters:	 {'n_estimators': 1200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}


In [337]:
models = [grid_search_log.best_estimator_, \
          random_search_forest.best_estimator_, \
          grid_search_svc.best_estimator_]

In [380]:
X_test_filtered, y_test_filtered = filter_to_fin().transform(X_test, y_test)
X_test_prepared = all_transform.transform(X_test_filtered)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [381]:
from vecstack import stacking
S_train, S_test = stacking(models,                   
                           X_train_prepared, y_filtered, X_test_prepared,   
                           regression=False, 
                           mode='oof_pred_bag', 
                           needs_proba=False,
                           save_dir=None, 
                           metric=accuracy_score,
                           n_folds=4, 
                           stratified=True,
                           shuffle=False,  
                           random_state=42,    
                           verbose=2)

task:         [classification]
n_classes:    [3]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [LogisticRegression]
    fold  0:  [0.73195876]
    fold  1:  [0.79310345]
    fold  2:  [0.70689655]
    fold  3:  [0.71280277]
    ----
    MEAN:     [0.73619038] + [0.03413983]
    FULL:     [0.73620690]

model  1:     [RandomForestClassifier]
    fold  0:  [0.70790378]
    fold  1:  [0.77586207]
    fold  2:  [0.70344828]
    fold  3:  [0.70242215]
    ----
    MEAN:     [0.72240907] + [0.03092982]
    FULL:     [0.72241379]

model  2:     [LinearSVC]
    fold  0:  [0.73195876]
    fold  1:  [0.78965517]
    fold  2:  [0.70689655]
    fold  3:  [0.72318339]
    ----
    MEAN:     [0.73792347] + [0.03119169]
    FULL:     [0.73793103]



In [339]:
S_train.shape


(1102, 3)

In [382]:
modelStacking = XGBClassifier(random_state=42, n_jobs=-1)
param_grid = {'learning_rate':np.arange(0.01, 1, 0.1), 
              'n_estimators':np.arange(50,500,50), 
              'max_depth':[1,2,3]}
random_search_stack = RandomizedSearchCV(modelStacking, param_grid, cv=10, return_train_score = True, verbose=1)
random_search_stack.fit(S_train, y_filtered)
print("Acheaved score of:\t", random_search_stack.best_score_)
print("With following paramaters:\t", random_search_stack.best_params_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   21.4s finished


Acheaved score of:	 0.7482758620689656
With following paramaters:	 {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.51}


In [345]:
#best off going with the 0.74 of the linearSVC

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.1s finished


LogisticRegression 0.7311537999037999


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.4s finished


LinearSVC 0.7311537999037999


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.2s finished


RandomForestClassifier 0.7302934102934102


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.7s finished


VotingClassifier 0.7311537999037999
XGBClassifier 0.737502405002405


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    1.3s finished


In [48]:
data_transformer.transform([bias_list]).toarray()

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


array([[0., 0., 0., ..., 0., 0., 0.]])

In [49]:
import pickle
with open('../models/news_sentiment.model', 'wb') as f:
    pickle.dump({'transformer':all_transform, 'model':grid_search_svc.best_estimator_}, f)

In [50]:
transformed_bias_list = data_transformer.transform([bias_list]).toarray()
with open('../dependencies/word_list.sav', 'wb') as f:
    pickle.dump(transformed_bias_list, f)

In [69]:
with open('../../dependencies/labled_data/test', 'rb') as f:
    data_raw = pickle.load(f)
X_test,y_test = np.array(data_raw['X']), np.array(data_raw['y'])

In [119]:
X_test_transform = all_transform.transform(X_test)
pred = grid_search_svc.best_estimator_.predict(X_test_transform)

In [120]:
accuracy_score(y_test,pred)

0.72