In [1]:

# General stuff
import tqdm 
import pathlib
import re
import importlib
import sys
import pprint

# DS stuff
import math
import numpy.linalg as linalg
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neural_network import MLPClassifier

# NLP stuff
import nltk
from   nltk import word_tokenize
from   nltk.probability import FreqDist
from   nltk.stem import WordNetLemmatizer
import gensim
import gensim.downloader
import spacy

# Finanace stuff
import yfinance as yf

# Download NLP models
spacy_model = spacy.load('en_core_web_sm')
nltk.download('wordnet')
nltk.download('omw-1.4')
w2v_pre = gensim.downloader.load('word2vec-google-news-300')

# My codes
import data_read
import preprocessing

[nltk_data] Downloading package wordnet to /Users/ozilman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ozilman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [14]:
# Reload My codes
importlib.reload(data_read)
importlib.reload(preprocessing)

<module 'preprocessing' from '/Users/ozilman/NLP/finance_sentiment_proj/git_repo/nlp_financial_sentiment/preprocessing.py'>

In [None]:
''' Load Reuters data corpus. Parse headers and dates from the financial news articles '''
headers = data_read.parse_news_data()
#relevant_news = get_relevant_news(traded_symbols, tickers,  headers)
print(len(headers))

In [8]:
''' Create text corpus, of processed, tokenized headers '''

def generate_tokenized_corpus(headers):
    WNLemmatizer = WordNetLemmatizer()
    ticker_re = preprocessing.get_ticker_re()
    remove_list = preprocessing.get_stop_words()

    '''Process and tokenize.'''
    tokenized_headers = []
    for header in tqdm.tqdm(headers):
        header = preprocessing.remove_tickers(ticker_re, header)
        header = preprocessing.NER_processing(header)
        #print(f"({ticker}) {header}")
        header_tokens = [word.lower() for word in word_tokenize(header)
                if word not in remove_list  
                ]
        header_tokens =  preprocessing.lemmatize(header_tokens, WNLemmatizer)
        tokenized_headers.append(header_tokens)
        
    corpus = [token for header in tokenized_headers for token in header]
    freqdist = FreqDist(corpus)
    return tokenized_headers, corpus, freqdist

In [35]:
''' Train Word2Vec embeddings '''
tokenized_headers, corpus, freqdist = generate_tokenized_corpus(list(zip(*headers))[1])
vector_size = 300
w2v = gensim.models.Word2Vec(sentences=tokenized_headers, vector_size=vector_size, min_count=5, epochs=300, workers=8)


100%|██████████| 104406/104406 [17:08<00:00, 101.49it/s]


In [6]:
''' W2V Generate Paragraph embedding '''
####
# Functions to get a document (phrase/header) embedding by averaging the individual 
# word embeddings in the document. 
####
def get_doc_embedding(doc, weighting, wv_model, word_freq_dict, a = 0.001):
    ''' Convert a document to an embedding by averaging its word embeddings.
        Three possible averaging schemes, SIF, logaritmic, and regular unweighted average.
    '''
    size = wv_model.vector_size
    q_vec = np.zeros(size)
    total_weight = 0
    q_size = 0
    weight = 1
    for word in doc:
        if word in wv_model:
            if weighting == "SIF":
                word_freq = word_freq_dict[word]
                weight = a / (a + word_freq)
                q_vec += weight * wv_model[word]
                total_weight += weight
                q_size += 1
            elif weighting == "LOG":
                word_freq = word_freq_dict[word]
                weight = math.log(1 / word_freq)
                q_vec += weight * wv_model[word]
                total_weight += weight
                q_size += 1
            elif weighting == "AVG":
                q_vec += weight * wv_model[word]
                total_weight += 1
                q_size += 1

    # If there are two many words not in wocab that got removed,
    # then a short sentence will not have a lot of signal. 
    if q_size >= 3:
        q_vec = q_vec / total_weight
        return q_vec
    else:
        return None
        
def get_dataset_embeddings(docs, weighting, wv_model, word_freq_dict):
    ''' Get embbedding for each document, then normalize all 
        of them and return a 2 dim np array with doc embeddings as rows.
    '''
    doc_vecs = []
    valid_indices_list = []
    for i, doc in enumerate(docs):
        embedding = get_doc_embedding(doc, weighting, wv_model, word_freq_dict)
        if embedding is not None:
            doc_vecs.append(embedding)
            valid_indices_list.append(i) # Mask to algin y with the X
                                        # that had docs removed from it
                                        # because they are too short.
            
        
    doc_vecs = np.vstack(doc_vecs)
    #row_norm = linalg.norm(doc_vecs, axis=1)
    #row_norm = row_norm[:, np.newaxis]
    #doc_vecs = doc_vecs / row_norm
    return doc_vecs, valid_indices_list
    #return doc_vecs



In [36]:
''' 
Evaluate Model on PhraseBank Dataset.
-- Word2Vec - Reuters -- 
'''
# Split to train and test. 
WNLemmatizer = WordNetLemmatizer()
remove_list = preprocessing.get_stop_words()
# Split to train and test data. For the FinancialPhraseBank dataset
phrases, labels = data_read.load_fin_pharsebank()
phrases, _, _ = generate_tokenized_corpus(phrases)
docs_train, docs_test, y_train, y_test = train_test_split(phrases, labels, test_size = 0.2, random_state = 3)
                                             
# Vectorize :
# 1. Word2Vec

freqdist = None
X_train, mask_train = get_dataset_embeddings(docs_train, "AVG", w2v.wv, freqdist)
X_test_PhraseBank, mask_test = get_dataset_embeddings(docs_test, "AVG", w2v.wv, freqdist)
y_train = np.array(y_train)[mask_train]
y_test_PhraseBank = np.array(y_test)[mask_test]
docs_test_PhraseBank = np.array(docs_test)[mask_test]

mlp_model = MLPClassifier(learning_rate_init=0.001, random_state=3, max_iter=400, activation='relu')
mlp_model.fit(X_train,y_train)
y_hat = mlp_model.predict(X_test_PhraseBank)
print("classification report:\n", metrics.classification_report(y_test_PhraseBank, y_hat))


100%|██████████| 1682/1682 [00:15<00:00, 108.83it/s]
  docs_test_PhraseBank = np.array(docs_test)[mask_test]


classification report:
               precision    recall  f1-score   support

           0       0.88      0.76      0.82       110
           1       0.89      0.95      0.92       226

    accuracy                           0.89       336
   macro avg       0.88      0.86      0.87       336
weighted avg       0.89      0.89      0.88       336



In [38]:
''' 
Evaluate Model on PhraseBank Dataset.
 -- CountVectorizer -- 
'''
# Split to train and test data. For the Financial PhraseBank dataset
phrases, labels = data_read.load_fin_pharsebank()
#phrases = tokenize_docs(phrases, WNLemmatizer, ticker_re)
docs_train, docs_test, y_train, y_test = train_test_split(phrases, labels, test_size = 0.2, random_state = 3)
                                             
# 2. BOW CountVectorizer  
vectorizer = CountVectorizer(preprocessor=preprocessing.preprocess_doc,
                             min_df = 4)

X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)

mlp_model = MLPClassifier(learning_rate_init=0.0001, random_state=3, max_iter=1000, activation='relu')
mlp_model.fit(X_train,y_train)
y_hat = mlp_model.predict(X_test)
print("classification report:\n", metrics.classification_report(y_test,y_hat))

classification report:
               precision    recall  f1-score   support

           0       0.85      0.73      0.78       110
           1       0.88      0.94      0.91       227

    accuracy                           0.87       337
   macro avg       0.86      0.83      0.85       337
weighted avg       0.87      0.87      0.87       337



In [21]:
''' 
-- Generate the proposed Autolabel Dataset --

This will download all SP500 stock prices as a dataframe, and process the dataframe to 
have a daily change % for each stock (EOD price)/(Start of day price). 

'''
# Create my dataset: get tickers and price history from YahooFinance
tickers = data_read.get_sp500_ticker_names()
symbols = yf.Tickers(" ".join(tickers.keys()))
market_data = symbols.history(interval = "1d", start="2006-10-20", end="2013-11-20", actions= False)

market_data_cleaned = market_data.dropna(axis=0, how = "all")
market_data_cleaned = market_data_cleaned.dropna(axis=1, how = "all")

cols, symbols = zip(*market_data_cleaned.columns)

traded_symbols = set(symbols)
for sym in traded_symbols:
    price_ratio  = market_data_cleaned[('Close',sym)] / market_data_cleaned[('Open',sym)]
    price_ratio = 100 * (price_ratio -1)
    market_data_cleaned[("Change", sym)] = price_ratio 

market_data_cleaned.index = pd.to_datetime(market_data_cleaned.index)

[*********************100%***********************]  767 of 767 completed

187 Failed downloads:
- TEG: Data doesn't exist for startDate = 1161320400, endDate = 1384927200
- LEH: No data found for this date range, symbol may be delisted
- VNT: Data doesn't exist for startDate = 1161320400, endDate = 1384927200
- TWC: No data found for this date range, symbol may be delisted
- MON: Data doesn't exist for startDate = 1161320400, endDate = 1384927200
- HSP: No data found for this date range, symbol may be delisted
- ACAS: No data found for this date range, symbol may be delisted
- FDC: No data found, symbol may be delisted
- DOW: Data doesn't exist for startDate = 1161320400, endDate = 1384927200
- CMCSK: No data found for this date range, symbol may be delisted
- QRVO: Data doesn't exist for startDate = 1161320400, endDate = 1384927200
- JDSU: No data found for this date range, symbol may be delisted
- RHT: No data found, symbol may be delisted
- PAYC: Data doesn't exist for startDate = 1

  market_data_cleaned[("Change", sym)] = price_ratio


In [None]:
''' -- Generate the proposed Autolabel Dataset -- Part 2

Build labeled news dataset:
Reterieve only the news that mention companies from S&P500. For each news header keep it only
if the price of the company it talks about changes by more then +-2%. Label the obes than rose
more then two percent with 1 (positive), and the others with 0 (negative)

'''
relevant_news = data_read.get_relevant_news(traded_symbols, tickers, headers)
out_of_trade_days_cnt = 0
nan_ticker_change_cnt = 0
for news_item in relevant_news:
    date, symbol, _, _ = news_item
    if pd.to_datetime(date) not in market_data_cleaned.index:
        news_item[2] = None 
        out_of_trade_days_cnt += 1
    else:
        print("date {} sym {} ".format(date, symbol))
        ticker_day_change = market_data_cleaned.loc[pd.to_datetime(date) ,("Change",symbol)]
        if not isinstance(ticker_day_change, float):
            nan_ticker_change_cnt += 1
        #print(ticker_day_change)
        if  pd.notna(ticker_day_change) and (ticker_day_change > 2 or ticker_day_change < -2):
            news_item[2] = ticker_day_change

labeled_dataset = pd.DataFrame(relevant_news, columns = ["date", "symbol", "day_change", "header"])
labeled_dataset.set_index("date", inplace=True)
''' Final step: label changes > +2% as 1 and changes  < -2% as 0. '''
reuters_labeled = labeled_dataset.dropna(axis=0, how = "any")
reuters_labeled = reuters_labeled[ (reuters_labeled["day_change"] > 0) | (reuters_labeled["day_change"] < 0)] 
reuters_labeled.loc[reuters_labeled["day_change"] > 0, 'day_change'] = 1
reuters_labeled.loc[reuters_labeled["day_change"] < 0, 'day_change'] = 0


In [None]:

''' 
-- Generate the proposed Autolabel Dataset only assignind a pos/neg label if there is
   no significant market movement that day ( -1.2% < x < 1.2%) --

* Run this cell or the previous cell to generate the automatically-labelled dataset
  depends if you want to account for market ups/ and down or not.

This will download all SP500 stock prices as a dataframe, and process the dataframe to 
have a daily change % for each stock (EOD price)/(Start of day price). 

'''

# Get S&P500 tickers and price history from YahooFinance
tickers = data_read.get_sp500_ticker_names()
symbols = yf.Tickers(" ".join(tickers.keys()))
market_data = symbols.history(interval = "1d", start="2006-10-20", end="2013-11-20", actions= False)
market_data.index = pd.to_datetime(market_data.index)

# Drop all days without trading, and all NaN columns from dataframe retrieved by yahoofinance.
# Get the 
market_data = market_data.dropna(axis=0, how = "all")
market_data= market_data.dropna(axis=1, how = "all")
_, symbols = zip(*market_data.columns)
traded_symbols = set(symbols)

# Add daily change % for each stock (EOD price)/(Start of day price) column to the dataframe. 
for sym in traded_symbols:
    price_ratio  = market_data[('Close',sym)] / market_data[('Open',sym)]
    price_ratio = 100 * (price_ratio -1)
    market_data[("Change", sym)] = price_ratio 


# Calculate SP500 unweighted index (it is the average price of all the stocks in the S&P500) '''
snp_index = market_data["Change"].sum(axis=1) / market_data["Change"].notna().sum(axis=1)
market_data[("Change","SNP_INDX")] = snp_index

'''
Build labeled news dataset:
Reterieve only the reuters news that mention companies from S&P500. For each news header keep it only
if the price of the company it talks about changes by more then +-2%. Label the obes than rose
more then two percent with 1 (positive), and the others with 0 (negative)
'''
relevant_news = data_read.get_relevant_news(traded_symbols, tickers, headers)
for news_item in relevant_news:
    date, symbol, _, _ = news_item
    if pd.to_datetime(date) not in market_data.index:
        news_item[2] = None 
    else:
        #print("date {} sym {} ".format(date, symbol))
        ticker_day_change = market_data.loc[pd.to_datetime(date) ,("Change",symbol)]
        sp_index_change = market_data.loc[pd.to_datetime(date) ,("Change","SNP_INDX")]
        if  pd.notna(ticker_day_change) and (
                (ticker_day_change > 2 and sp_index_change < 1.2 ) 
                or 
                (ticker_day_change < -2 and sp_index_change > -1.2) 
            ) :
            news_item[2] = ticker_day_change

reuters_labeled = pd.DataFrame(relevant_news, columns = ["date", "symbol", "day_change", "header"])
reuters_labeled.set_index("date", inplace=True)
reuters_labeled = reuters_labeled.dropna(axis=0, how = "any")
reuters_labeled = reuters_labeled[ (reuters_labeled["day_change"] > 0) | (reuters_labeled["day_change"] < 0)] 
reuters_labeled.loc[reuters_labeled["day_change"] > 0, 'day_change'] = 1
reuters_labeled.loc[reuters_labeled["day_change"] < 0, 'day_change'] = 0




In [39]:
''' 
 -- Evaluate W2V Model on ReutersAuto Dataset --
'''
# Split to train and test data. For the our automatically-labelled Reuters dataset.
labeled_headers = reuters_labeled['header']
labels = reuters_labeled['day_change']

labeled_headers_tokens, _, _ = generate_tokenized_corpus(labeled_headers)
docs_train, docs_test, y_train, y_test = train_test_split(labeled_headers_tokens, labels, test_size = 0.2, random_state = 3)

#1. Word2Vec embeddings
X_train, mask_train = get_dataset_embeddings(docs_train, "AVG", w2v.wv, freqdist)
X_test, mask_test = get_dataset_embeddings(docs_test, "AVG", w2v.wv, freqdist)
y_train = np.array(y_train)[mask_train]
y_test = np.array(y_test)[mask_test]

#print(X_train[:20])
#print(y_train[:20])
#print(f"train {X_train.shape} test {X_test.shape} train y {y_train.shape} test {y_test.shape}")

# 2. BOW CountVectorizer  
'''vectorizer = CountVectorizer(preprocessor=preprocess_header,
                             min_df = 4)

X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)

print(f"train {X_train.shape} test {X_test.shape} train y {y_train.shape} test {y_test.shape}")'''

'''lr_model = LogisticRegression(solver="liblinear", random_state=3)
lr_model.fit(X_train, y_train)
y_hat = lr_model.predict(X_test)
'''

mlp_model = MLPClassifier(learning_rate_init=0.00005, hidden_layer_sizes=(100,), random_state=3, max_iter=3000, activation='relu')
mlp_model.fit(X_train, y_train)
y_hat = mlp_model.predict(X_test)
print("classification report:\n", metrics.classification_report(y_test, y_hat))
                                             

100%|██████████| 4880/4880 [00:45<00:00, 106.10it/s]


classification report:
               precision    recall  f1-score   support

         0.0       0.57      0.58      0.58       499
         1.0       0.55      0.54      0.55       473

    accuracy                           0.56       972
   macro avg       0.56      0.56      0.56       972
weighted avg       0.56      0.56      0.56       972



In [None]:
''' -- RQ1 -- 
    Evaluate sentiment trained model on the PhraseBank Dataset 
''' 

from sklearn.datasets import load_files

dataset = load_files("./movie-reviews/")
data = [data.decode('UTF-8') for data in dataset.data]
#data = tokenize_docs(data, WNLemmatizer, ticker_re)

docs_train, docs_test, y_train, _ = train_test_split(dataset.data, dataset.target, test_size = 0.001, random_state = 3)
#docs_train, docs_test, y_train, y_test = train_test_split(data, dataset.target, test_size = 0.2, random_state = 3)

X_train, mask_train = get_dataset_embeddings(docs_train, "AVG", w2v_pre, None)
#X_test, mask_test = get_dataset_embeddings(docs_test, "LOG", w2v_pre, freqdist)
y_train = np.array(y_train)[mask_train]
#y_test = np.array(y_test)[mask_test]

mlp_model = MLPClassifier(learning_rate_init=0.0001, random_state=3, max_iter=1000, activation='relu')
mlp_model.fit(X_train, y_train)


y_hat = mlp_model.predict(X_test_PhraseBank)
print("classification report:\n", metrics.classification_report(y_test_PhraseBank, y_hat))


In [None]:
''' -- RQ1 --
    Sentiment trained model, evaluate on sentiment data  
    -- Countvectorizer -- 
'''
dataset = load_files("./movie-reviews/")
data = [data.decode('UTF-8') for data in dataset.data]

docs_train, docs_test , y_train, y_test = train_test_split(data, dataset.target, test_size = 0.2, random_state = 3)

# 2. BOW CountVectorizer  
vectorizer = CountVectorizer(preprocessor=preprocessing.preprocess_doc,
                             min_df = 4)

X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)

mlp_model = MLPClassifier(learning_rate_init=0.00005, random_state=3, max_iter=400, activation='relu')
mlp_model.fit(X_train,y_train)


y_hat = mlp_model.predict(X_test)
print("classification report:\n", metrics.classification_report(y_test,y_hat))


In [34]:

''' -- RQ1 -- 
    Evaluate sentiment trained model on the PhraseBank Dataset 
    * CountVectorizer *
'''

dataset = load_files("./movie-reviews/")
data = [data.decode('UTF-8') for data in dataset.data]

docs_train, _ , y_train, _ = train_test_split(data, dataset.target, test_size = 0.2, random_state = 3)


# 2. BOW CountVectorizer  
vectorizer = CountVectorizer(preprocessor=preprocessing.preprocess_doc,
                             min_df = 4)

X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform([" ".join(doc) for doc in docs_test_PhraseBank])

mlp_model = MLPClassifier(learning_rate_init=0.0001, random_state=3, max_iter=1000, activation='relu')
mlp_model.fit(X_train, y_train)

y_hat = mlp_model.predict(X_test)
print("classification report:\n", metrics.classification_report(y_test_PhraseBank, y_hat))


classification report:
               precision    recall  f1-score   support

           0       0.34      0.37      0.35       110
           1       0.68      0.64      0.66       226

    accuracy                           0.55       336
   macro avg       0.51      0.51      0.51       336
weighted avg       0.57      0.55      0.56       336



In [None]:
''' -- Sentiment140 Data Trial (not used for PhraseBank) -- '''

sentiment_data = pd.read_csv("./sentiment140_sample1.csv", header = None, encoding = "ISO-8859-1")
#sentiment_data.head()
sentiment_data_pos_neg = sentiment_data.loc[(sentiment_data[0] == 0) | (sentiment_data[0] == 4)]
len(sentiment_data)

sentiment_data_pos_neg = sentiment_data_pos_neg[[0,5]]
sentiment_data_pos_neg.iloc[sentiment_data_pos_neg[0] == 4, 0] = 1

''' 
Evaluate Model on PhraseBank Dataset.
'''
# Split to train and test. 
# Split to train and test data. For the FiQa dataset
labeled_headers = sentiment_data_pos_neg.iloc[:,1]
labels = sentiment_data_pos_neg.iloc[:,0]

labeled_headers_tokens, _, _ = generate_tokenized_corpus(labeled_headers)
docs_train, docs_test, y_train, y_test = train_test_split(
                                                labeled_headers_tokens, 
                                                labels, 
                                                test_size = 0.2, 
                                                random_state = 3
                                             )

X_train, mask_train = get_dataset_embeddings(docs_train, "AVG", w2v_pre, None)
X_test, mask_test = get_dataset_embeddings(docs_test, "AVG", w2v_pre, None)
y_train = np.array(y_train)[mask_train]
y_test = np.array(y_test)[mask_test]

'''_, _, y_train, y_test = train_test_split(
            labeled_headers_tokens, 
            labels, 
            test_size = 0.2, 
            random_state = 3
)'''

mlp_model = MLPClassifier(learning_rate_init=0.0001, random_state=3, max_iter=1000, activation='relu')
mlp_model.fit(X_train,y_train)
y_hat = mlp_model.predict(X_test)
print("classification report:\n", metrics.classification_report(y_test,y_hat))



100%|██████████| 10000/10000 [01:12<00:00, 137.64it/s]


classification report:
               precision    recall  f1-score   support

           0       0.71      0.73      0.72       908
           1       0.73      0.71      0.72       920

    accuracy                           0.72      1828
   macro avg       0.72      0.72      0.72      1828
weighted avg       0.72      0.72      0.72      1828



