In [1]:
import os
import json
from sys import stdout
data_path = './datasets/fin_not_fin/'

In [2]:
def sweep_dir(folder, recursive=True, max_size=20_000):
    '''
    sweep a given folder for all images, process and load them into a np dataset which gets returned
    '''
    dataset = []
    for file in os.listdir(folder):
        if len(dataset) >= max_size:
            return dataset
        if file.endswith(".json"):
            with open(''.join([folder,file]), 'r') as fp:  
                dataset.append(json.load(fp))
        elif os.path.isdir(folder+file) and recursive is True: #if this is a directory
            dataset+= sweep_dir(folder+file+'/')
    return dataset
raw_financial_fin, raw_financial_pol, raw_financial_tech,\
raw_not_financial_sport, raw_not_financial_entertain = \
sweep_dir(data_path+'financial/fin/', max_size=20_000),\
sweep_dir(data_path+'financial/pol/', max_size=5_000),\
sweep_dir(data_path+'financial/tech/', max_size=5_000),\
sweep_dir(data_path+'not_financial/sport/', max_size=10_000), \
sweep_dir(data_path+'not_financial/entertain/', max_size=5_000)

In [3]:
raw_not_financial = raw_not_financial_entertain + raw_not_financial_sport
raw_financial = raw_financial_fin + raw_financial_pol + raw_financial_tech

In [4]:
X = raw_financial + raw_not_financial
y = [1]*len(raw_financial) + [0]*len(raw_not_financial)
print(len(X), len(y))

51473 51473


In [5]:
import nltk
from sklearn.pipeline import BaseEstimator, TransformerMixin, Pipeline
import urlextract
import re
from html import unescape
import string
from nltk.stem import WordNetLemmatizer

def html_to_plain_text(html: str) -> str:
    from bs4 import BeautifulSoup
    return BeautifulSoup(html, 'html.parser').get_text()

class Cleaner(BaseEstimator, TransformerMixin):
    def __init__(self, include_subj=True, english_only=True, replace_html=True, lower_case=True, stemming = False, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True):
        self.include_subj = include_subj
        self.lower_case = lower_case
        self.english_only = english_only
        self.replace_html = replace_html
        self.stemming = stemming
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
    def fit(self, X):
        return self
    def transform(self, X):
        X_transformed = []
        for article in X:
            if self.english_only and (article['language'] != "english"):
                continue
            text = " ".join([article['title'], article['text']]) if self.include_subj else article['text']
            if self.replace_html:
                text = html_to_plain_text(text)
            if self.lower_case:
                text = text.lower()
            if self.replace_urls:
                url_extractor = urlextract.URLExtract() 
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' NUMBER ', text)
            if self.remove_punctuation:
                text = text.replace("\'", "")
                text = text.replace("’", "")
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            if self.stemming:
                stemmer = WordNetLemmatizer()
                text = ' '.join([stemmer.lemmatize(word) for word in text.split()])
            X_transformed.append(text)
        return X_transformed

In [6]:
import numpy as np
import random
#shuffling the 2 lists together
combined = list(zip(X, y))
random.shuffle(combined)
X[:], y[:] = zip(*combined)
print(len(X), len(y))

51473 51473


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
text_clf = Pipeline([
    ('clean', Cleaner(stemming=True, include_subj=True, english_only=True)), #cleans text
    ('vect', CountVectorizer(stop_words="english", max_features=20_000)), #turns words to counts 
    ('tfidf', TfidfTransformer()), #turns counts to tf-idf
])
X_train_prepared = text_clf.fit_transform(X_train)
X_train_prepared

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


<41178x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 4876325 stored elements in Compressed Sparse Row format>

In [43]:
bias_list = {
        'title': "",
        'language': "english",
        'text': "jumped hike trend-line candle ipo fibonacci sma rise growth bulls bears bullish rally surge \
                soared growth buy higher gains outperform lower \
              slumped fell worry bearish miss sell losses warn \
                 plummet bad down low disappointed weak worry"}

In [68]:
from scipy.sparse import csr_matrix
def set_bias(X, bias_list, bias_strength:int=1):
    return csr_matrix(np.add(X, text_clf.transform([bias_list]).toarray()*bias_strength, where=X!=0))

X_train_prepared_biased = set_bias(X_train_prepared.toarray(),
                                   bias_list,
                                   bias_strength=5)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [9]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
log_clf = LogisticRegression(solver="liblinear", random_state=42, n_jobs=-1)
lin_svc = LinearSVC(random_state=42)
#lin_svc = SVC(kernel='linear',probability=True)
#boost_clf = XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1)
forest_clf = RandomForestClassifier(random_state=42)
#extra_clf = ExtraTreesClassifier(n_estimators=100, max_leaf_nodes=16, n_jobs=-1) #random thresholds set

voting_clf = VotingClassifier(
    estimators = [('log_reg', log_clf), ('forest_clf', forest_clf), ('sgd_clf', lin_svc)],
    voting = 'hard',
)

  from numpy.core.umath_tests import inner1d


In [69]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from warnings import filterwarnings as warn
warn("ignore", category=DeprecationWarning)
for clf in (log_clf, lin_svc):
    score = cross_val_score(clf, X_train_prepared_biased , y_train, cv=10, verbose=1, scoring="f1")
    print(clf.__class__.__name__, score.mean())

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   27.7s finished


LogisticRegression 0.9017142166516054
LinearSVC 0.8947798501751711


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   16.0s finished


In [53]:
log_clf.

'l2'

In [70]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'C': [0.8,0.9,1], 'dual': [False, True], 'penalty': ['l2']},
    {'C': [0.8,0.9,1], 'dual': [False], 'penalty': ['l1']}
]
grid_search = GridSearchCV(log_clf, param_grid, cv=10, return_train_score = True, verbose=1, scoring="f1")
grid_search.fit(X_train_prepared_biased, y_train)
print("Acheaved score of:\t", grid_search.best_score_)
print("With following paramaters:\t", grid_search.best_params_)
#print("The model:",random_search.best_estimator_)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  1.6min finished


Acheaved score of:	 0.9019109671740846
With following paramaters:	 {'C': 0.9, 'dual': False, 'penalty': 'l2'}


In [71]:
optam_clf = grid_search.best_estimator_
optam_clf.fit(X=X_train_prepared,y=y_train)

LogisticRegression(C=0.9, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [62]:
X_test_prepared = text_clf.transform(X_test)

KeyboardInterrupt: 

In [73]:
X_test_prepared_biased = set_bias(X_train_prepared.toarray(),
                                   bias_list,
                                   bias_strength=1.2)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [81]:
pred = optam_clf.predict_proba(X_test_prepared)[:,1]>0.5

In [75]:
import pickle
with open('dependencies/fin_not_fin.model', 'wb') as f:
    pickle.dump({'transformer':text_clf, 'model':optam_clf}, f)

In [82]:
from sklearn.metrics import accuracy_score, f1_score, recall_score
print(recall_score(y_test, pred))
accuracy_score(y_test,pred)

0.9264682476523953


0.8799417192812045