In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkFiles
import pandas as pd
import numpy as np
import requests
from IPython.display import HTML


# Problem:
We need to train an NLP model on financial datasets so that it can predict the sentiment of financial data.


For this task, we want to use the BERT model; its flexibility and popularity make it an obvious choice for sentiment analysis.
Because BERT is so widely used, it is well supported in case we run into any problems training the model.


## Data:
Our data will be sourced from Hugging Face and Kaggle. The goal of the data is to provide BERT with the following qualities:

- An understanding of key words used in human emotional intelligence
- Market volatility
- Financial vocabulary
- An understanding of the impact financial news has on financial markets

Basically, if BERT were a person it would be an English major looking to become a stock trader.

Our data should reflect our goals; therefore, the datasets we will use to match these goals are:
## Financial Vocabulary:
### [Investopedia embeddings](https://huggingface.co/datasets/FinLang/investopedia-embedding-dataset)

## Human Emotional Intelligence/Market Volatility:
### [Stock Market Tweets](https://huggingface.co/datasets/mjw/stock_market_tweets)

## Financial News Impact:
### [Reuters Financial News](https://huggingface.co/datasets/danidanou/Reuters_Financial_News)


## Emotional Intelligence:
### [Emotions Dataset](https://huggingface.co/datasets/boltuix/emotions-dataset)



In [2]:
# Smoke test against the Reuters dataset: start Spark and preview the raw data.

dataset = "danidanou/Reuters_Financial_News"
spark = SparkSession.builder.appName("ReutersNews").getOrCreate()

# Embed the Hugging Face dataset viewer so the rows can be browsed inline.
iframe_html = f"""
<iframe src="https://huggingface.co/datasets/{dataset}/embed/viewer" width="80%" height="560px"></iframe>
"""
display(HTML(iframe_html))

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/05/29 15:50:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the Reuters parquet files into a Spark DataFrame.

# Hugging Face endpoint that lists the parquet files available for a dataset.
HUGGING_FACE_PARQUET_API = "https://huggingface.co/api/datasets/{dataset}/parquet"

# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# KeyError / JSON decoding error on the lookup below.
r = requests.get(HUGGING_FACE_PARQUET_API.format(dataset=dataset), timeout=30)
r.raise_for_status()

# The response is indexed by config name ("default"), then split ("train"),
# and yields the list of parquet file URLs.
train_parquet_files = r.json()['default']['train']

# Register every parquet URL with Spark; the files are then read back from the
# SparkFiles root directory below.
for url in train_parquet_files:
    spark.sparkContext.addFile(url)

# NOTE(review): this glob matches every *.parquet in the SparkFiles root, so
# files registered for another dataset in the same session would be read too —
# confirm that is acceptable before loading a second dataset.
df = spark.read.parquet(SparkFiles.getRootDirectory() + "/*.parquet")

                                                                                

In [4]:
# Report the dataset's dimensions as "rows, columns".
row_count = df.count()
column_count = len(df.columns)
print(f"Shape of the dataset: {row_count}, {column_count}")

Shape of the dataset: 105359, 7


In [5]:
# Preview the first ten rows of the dataset.
df.show(10)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+
|            Headline|         Journalists|                Date|                Link|             Summary|             Article|__index_level_0__|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+
|Hitachi, GE boost...|                  []|Mon Nov 13, 2006 ...|http://www.reuter...| TOKYO - Hitachi ...| The move comes a...|                0|
|Volvo to cut 1,00...|                  []|Mon Nov 13, 2006 ...|http://www.reuter...| STOCKHOLM - Truc...| After years of s...|                1|
|European banks hi...|      [Andrew Hurst]|Mon Nov 13, 2006 ...|http://www.reuter...| ZURICH, Nov 13 (...| Since adopting I...|                2|
|Hitachi, GE to fo...|    [Mayumi Negishi]|Mon Nov 13, 2006 ...|http://www.reuter...| TOKYO - Japan's ...| The partnership .

In [6]:
# Summary statistics (count, mean, stddev, min, max) for the dataset's columns.
summary_stats = df.describe()
summary_stats.show()



+-------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|summary|            Headline|                Date|                Link|             Summary|             Article| __index_level_0__|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|  count|              105359|              105359|              105359|              105359|              105359|            105359|
|   mean|                null|                null|                null|                null|                null| 53243.31900454636|
| stddev|                null|                null|                null|                null|                null|30733.590297559498|
|    min|"A lot hangs" on ...|Fri Apr 1, 2011 1...|http://www.reuter...|                    |                    |                 0|
|    max|iTunes-like video...|Wed Sep 9, 2009 9...|http://www.

                                                                                

In [7]:
# Peek at the headline text on its own.
headlines = df.select("Headline")
headlines.show()

+--------------------+
|            Headline|
+--------------------+
|Hitachi, GE boost...|
|Volvo to cut 1,00...|
|European banks hi...|
|Hitachi, GE to fo...|
|Eddie Bauer agree...|
|IBM to join Citig...|
|Yum takes slower ...|
|Illumina to buy g...|
|American Express ...|
|GE, Hitachi form ...|
|Clear Channel bid...|
|U.S. investor Bra...|
|Motorola wins $1....|
|Foundation blocks...|
|Starbucks sees 2,...|
|Stocks rise on oi...|
|Eddie Bauer accep...|
|DRAM demand for Q...|
|Ex-KB Home chief ...|
|Holiday sales see...|
+--------------------+
only showing top 20 rows



In [None]:
from transformers import BertTokenizer
from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField
from pyspark.sql.functions import pandas_udf
import pandas as pd

# Name of the pretrained BERT checkpoint whose vocabulary the tokenizer uses.
# (Variable name kept as-is because other cells may reference it.)
configuartion="bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(configuartion)

# Every text is padded or truncated to exactly this many tokens.
MAX_TOKEN_LENGTH = 128

# Struct returned by bert_tokenize. The field names must match the DataFrame
# columns built inside the UDF, and every field holds one integer per token,
# hence ArrayType(IntegerType()) for all three.
tokenized_schema = StructType([
    StructField("input_ids", ArrayType(IntegerType())),
    StructField("attention_mask", ArrayType(IntegerType())),
    StructField("token_id", ArrayType(IntegerType()))
])

@pandas_udf(tokenized_schema)
def bert_tokenize(text_series: pd.Series) -> pd.DataFrame:
    """Tokenize a batch of text values with the BERT tokenizer.

    Args:
        text_series: Batch of raw text values, one per row.

    Returns:
        A DataFrame with one row per input value and three columns matching
        ``tokenized_schema``: ``input_ids``, ``attention_mask`` and
        ``token_id`` (the tokenizer's ``token_type_ids``). Each cell is a list
        of ``MAX_TOKEN_LENGTH`` integers.
    """
    tokenized_data = {
        "input_ids": [],
        "attention_mask": [],
        "token_id": []
    }
    for text in text_series:
        # Null / non-string cells are tokenized as empty text so that a single
        # missing value does not fail the whole batch.
        safe_text = text if isinstance(text, str) else ""
        encoding = tokenizer.encode_plus(
            safe_text,
            max_length=MAX_TOKEN_LENGTH,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True)

        tokenized_data["input_ids"].append(encoding["input_ids"])
        tokenized_data["attention_mask"].append(encoding["attention_mask"])
        # token_type_ids belong in their own column; appending them to
        # "input_ids" would leave the three lists with different lengths.
        tokenized_data["token_id"].append(encoding["token_type_ids"])
    return pd.DataFrame(tokenized_data)

#Headline tokens
tokenized_df = df.withColumn("tokens", bert_tokenize(df["Headline"]))

In [12]:
# Dimensions of the dataset after adding the "tokens" column.
# NOTE(review): count() may not force the tokenizer UDF to run for every row —
# confirm by materializing the "tokens" column before trusting this as a check.
tokenized_row_count = tokenized_df.count()
tokenized_column_count = len(tokenized_df.columns)
print(f"Shape of the dataset: {tokenized_row_count}, {tokenized_column_count}")

Shape of the dataset: 105359, 8
