In [30]:
import pickle
import numpy as np
# Relative path of the pickled dataset.
PATH = '../../ConsensusIO/dependencies/meta'
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
with open(PATH, mode='rb') as meta_file:
    data_raw = pickle.load(meta_file)
# The payload is indexed by "X" and "y" (presumably articles and labels -- TODO confirm).
X = data_raw["X"]
y = data_raw["y"]
print(len(y))

1233


In [57]:
import nltk
from sklearn.pipeline import BaseEstimator, TransformerMixin, Pipeline
import urlextract
import re
from html import unescape
import string

def html_to_plain_text(html: str) -> str:
    """Return the text content of ``html`` as extracted by BeautifulSoup's ``html.parser``."""
    # Function-local import: bs4 is only looked up when this helper is actually called.
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    return soup.get_text()
def drop(s):
    """Replace every run of non-word characters in ``s`` with a single space."""
    non_word_run = re.compile(r'\W+', flags=re.M)
    return non_word_run.sub(' ', s)

class Cleaner(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns raw articles into cleaned plain-text strings.

    Each article must be a sequence of strings (NOTE(review): presumably the
    subject comes first, followed by body fields -- confirm against the data).
    The selected parts are joined with spaces, stripped of HTML, and then the
    enabled steps run in this fixed order: lower-casing, URL masking, number
    masking, punctuation removal, stemming.

    Parameters
    ----------
    include_subj : bool, default True
        If True, join every element of the article; otherwise join only
        ``article[1:3]``.
    lower_case : bool, default True
        Lower-case the text (applied before the ``URL``/``NUMBER`` placeholders
        are inserted, so the placeholders stay upper-case).
    stemming : bool, default True
        Stem each whitespace-separated token with NLTK's Porter stemmer.
    remove_punctuation : bool, default True
        Delete apostrophes and collapse every run of non-word characters into
        a single space.
    replace_urls : bool, default True
        Replace every URL found by ``urlextract`` with `` URL ``.
    replace_numbers : bool, default True
        Replace every numeric literal with `` NUMBER ``.
    """

    # Integer, decimal or scientific-notation literal. The fraction and the
    # exponent are independently optional so that "3.14" and "1e5" are each
    # masked as ONE number; the fraction requires at least one digit so a
    # sentence-ending period after a number is not swallowed.
    _NUMBER_PATTERN = re.compile(r'\d+(?:\.\d+)?(?:[eE][+-]?\d+)?')

    def __init__(self, include_subj=True, lower_case=True, stemming=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True):
        self.include_subj = include_subj
        self.lower_case = lower_case
        self.stemming = stemming
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless.

        ``y`` is accepted (and ignored) so the transformer also works when a
        surrounding pipeline is fitted with labels.
        """
        return self

    def transform(self, X):
        """Clean every article in ``X`` and return the results as a list of strings."""
        # These helpers do not depend on the article, so build them once per
        # call instead of once per article.
        url_extractor = urlextract.URLExtract() if self.replace_urls else None
        stemmer = nltk.PorterStemmer() if self.stemming else None
        return [self._clean_article(article, url_extractor, stemmer) for article in X]

    def _clean_article(self, article, url_extractor, stemmer):
        """Apply the configured cleaning steps to a single article."""
        parts = article if self.include_subj else article[1:3]
        text = html_to_plain_text(" ".join(parts))
        if self.lower_case:
            text = text.lower()
        if url_extractor is not None:
            # Longest first, so a URL that is a prefix of another one cannot
            # clobber the longer match.
            for url in sorted(set(url_extractor.find_urls(text)), key=len, reverse=True):
                text = text.replace(url, " URL ")
        if self.replace_numbers:
            text = self._NUMBER_PATTERN.sub(' NUMBER ', text)
        if self.remove_punctuation:
            # Drop apostrophes outright so contractions stay a single token.
            text = text.replace("\'", "")
            text = text.replace("’", "")
            text = re.sub(r'\W+', ' ', text, flags=re.M)
        if stemmer is not None:
            text = ' '.join(stemmer.stem(word) for word in text.split())
        return text

In [77]:
# Spot-check: clean the first three articles with stemming off and show the first result.
preview_cleaner = Cleaner(stemming=False)
preview_cleaner.fit_transform(X[:3])[0]

'business insider a tesla analyst says he thinks model NUMBER us deliveries doubled in q NUMBER and the stock is surging tsla tesla shares rose NUMBER early monday as investors awaited second quarter delivery figures expected in the coming days and a bullish tesla analyst forecasted a sizeable jump in model NUMBER us deliveries the jmp securities analyst joseph osha said in a note to client ap photo rich pedroncelli tesla shares rose NUMBER early monday as investors awaited second quarter delivery figures expected in the coming days and a bullish tesla analyst forecasted a sizeable jump in model NUMBER us deliveries the jmp securities '

In [78]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
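'''
tf-idf weighting:

tf (term frequency): number of occurrences of each word / total length of the document
keeps longer documents from being unjustly weighted

idf (inverse document frequency): 1 / document frequency
downscales words that occur in many documents, as they are most likely uninformative

tf-idf: tf * idf

Note: scikit-learn's TfidfTransformer implements a variant of the above: by default it
uses raw counts for tf, a smoothed logarithmic idf, and L2-normalises each row.
'''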
# Text-preprocessing pipeline; despite its name it stops at tf-idf and has no classifier step.
preprocessing_steps = [
    ('clean', Cleaner(stemming=False)),  # raw articles -> cleaned strings
    ('vect', CountVectorizer()),         # cleaned strings -> token counts
    ('tfidf', TfidfTransformer()),       # token counts -> tf-idf weights
]
text_clf = Pipeline(steps=preprocessing_steps)
X_prepared = text_clf.fit_transform(X)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [79]:
# Display the repr of the matrix returned by the preprocessing pipeline's fit_transform.
X_prepared

<1233x9578 sparse matrix of type '<class 'numpy.float64'>'
	with 64365 stored elements in Compressed Sparse Row format>

In [74]:
# Binary target: True wherever the label differs from 1.
y = np.array(y)
y_choice = (y != 1)
# Print both shapes side by side to confirm the row counts line up.
feature_shape, target_shape = X_prepared.shape, y_choice.shape
print(feature_shape, target_shape)

(1233, 6795) (1233,)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(2257, 35788)

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# The liblinear solver ignores LogisticRegression's n_jobs, so the parallelism is
# requested from cross_val_score instead, which can evaluate the folds concurrently.
# NOTE(review): X_prepared was vectorised on the full dataset before splitting, so every
# validation fold has already influenced the vocabulary and idf weights. Cross-validating
# a Pipeline that ends in the classifier, on the raw X, would avoid that leakage.
log_reg = LogisticRegression(solver="liblinear", random_state=42)
score = cross_val_score(log_reg, X_prepared, y_choice, cv=10, scoring="accuracy", n_jobs=-1, verbose=0)
score.mean()

0.808416418378885