In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, AutoTokenizer
import seaborn as sn
import matplotlib.pyplot as plt
from datasets import Dataset
from datasets import load_dataset
from dateutil import parser



In [2]:
import os
import sys
from dotenv import load_dotenv

#For Desktop 
#s.environ['PYSPARK_PYTHON'] = sys.executable
#os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

from huggingface_hub import login

#For Desktop
#login(token=os.getenv("HF_MASTER_KEY"))

#For Macbook
load_dotenv()
login(token=os.getenv("MACBOOK_HF_KEY"))

# Clear conflicting environment variables
#if 'PYTHONPATH' in os.environ:
    #del os.environ['PYTHONPATH']



In [3]:
from pyspark.sql import SparkSession
from datasets import load_dataset
import pandas as pd

# Test run with Reuters and Ashraq financial news dataset
reuters_spark = SparkSession.builder.appName("ReutersNews").getOrCreate()
finnews_spark = SparkSession.builder.appName('FinNews').getOrCreate()

# Dictionary mapping spark sessions to dataset names
spark_dataset_mapping = {
    reuters_spark: "danidanou/Reuters_Financial_News",
    finnews_spark: "ashraq/financial-news-articles"
}

def ds_to_spark(spark_session, dataset_name, dataset_size=10000):
    """
    Convert a HuggingFace dataset to a Spark DataFrame
    
    Args:
        spark_session: Active Spark session
        dataset_name: HuggingFace dataset name/path
        dataset_size: Number of rows to load (default: 10000)
    
    Returns:
        Spark DataFrame or None if failed
    """
    from pyspark.sql.types import StructType, StructField, StringType
    
    print(f"Loading dataset: {dataset_name}")
    
    try:
        # Load HuggingFace dataset with size limit
        hf_df = load_dataset(dataset_name, split=f'train[:{dataset_size}]')
        
        # Convert to pandas
        pd_df = hf_df.to_pandas()
        
        print(f"DF shape: {pd_df.shape}")
        
        # Check and clean columns
        for col in pd_df.columns:
            if pd_df[col].dtype == 'object':
                # Get a sample of non-null values
                non_null_values = pd_df[col].dropna()
                
                if len(non_null_values) > 0:
                    # Check the first few values to determine type
                    sample_value = non_null_values.iloc[0]
                    
                    if isinstance(sample_value, (list, dict)):
                        # Convert complex types to string
                        pd_df[col] = pd_df[col].apply(lambda x: str(x) if x is not None else "")
                        print(f"Converted col '{col}' to string")
        
        # Fill null values
        pd_df = pd_df.fillna("")
        
        try:
            # Convert pandas DataFrame to list of Row objects for compatibility
            records = pd_df.to_dict('records')
            spark_df = spark_session.createDataFrame(records)
            print(f"Successfully created Spark DataFrame for {dataset_name}")
            print("Schema:")
            spark_df.printSchema()
            return spark_df
            
        except Exception as e:
            print(f"Error creating DataFrame for {dataset_name}: {e}")
            
            # Alternative approach: Define explicit schema and use records
            print("Trying with explicit string schema and records conversion...")
            
            try:
                # Create schema with all string types
                string_schema = StructType([
                    StructField(col, StringType(), True) for col in pd_df.columns
                ])
                
                # Convert all columns to string
                pd_df_str = pd_df.astype(str)
                
                # Convert to records format
                records = pd_df_str.to_dict('records')
                
                # Create Spark DataFrame with explicit schema
                spark_df = spark_session.createDataFrame(records, schema=string_schema)
                print(f"Successfully created Spark DataFrame with string schema for {dataset_name}")
                print("Schema:")
                spark_df.printSchema()
                return spark_df
                
            except Exception as e2:
                print(f"Failed even with string schema and records: {e2}")
                
                # Last resort: Manual row creation
                print("Trying manual row creation...")
                try:
                    from pyspark.sql import Row
                    
                    # Create Row objects manually
                    Row = Row(*pd_df.columns)
                    rows = [Row(*[str(val) for val in row]) for row in pd_df.values]
                    
                    spark_df = spark_session.createDataFrame(rows)
                    print(f"Successfully created Spark DataFrame with manual row creation for {dataset_name}")
                    print("Schema:")
                    spark_df.printSchema()
                    return spark_df
                    
                except Exception as e3:
                    print(f"All methods failed for {dataset_name}: {e3}")
                    return None
    
    except Exception as e:
        print(f"Error loading dataset {dataset_name}: {e}")
        return None



25/07/02 15:06:47 WARN Utils: Your hostname, MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 10.42.73.3 instead (on interface en0)
25/07/02 15:06:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/07/02 15:06:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/02 15:06:49 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
from datasets import load_dataset
reuters_df = load_dataset("danidanou/Reuters_Financial_News")['train'].to_pandas()
finnews_df = load_dataset("ashraq/financial-news-articles")['train'].to_pandas()

reuters_df = reuters_df.rename(columns={
    'Article': 'text',
    'Link': 'url',
    'Headline': 'title'
    })

reuters_df = reuters_df.drop(columns=['Journalists', '__index_level_0__', 'Summary'])

#Merging the df
merged_df = pd.concat([reuters_df, finnews_df])

#Shuffling the rows
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

#Removing duplicates based on url
merged_df = merged_df.drop_duplicates(subset=['url'], keep='first')

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 411547 entries, 0 to 411600
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   411547 non-null  object
 1   Date    105359 non-null  object
 2   url     411547 non-null  object
 3   text    411547 non-null  object
dtypes: object(4)
memory usage: 15.7+ MB


In [5]:
#DATA CLEANING 


merged_df['text_length'] = merged_df['text'].apply(lambda text: len(text))

merged_df['news_medium'] = merged_df['url'].str.extract(r'https?://([^/]+)')[0].str.replace('www.', '', regex=False).str[:-4]

#Standardizing dates and getting more date information from url

#Turning string dates to YYYY-MM-DD HH:MM:SS Format
merged_df['Date'] = merged_df['Date'].apply(lambda date: parser.parse(date) if pd.notna(date) else np.NaN)


#Getting dates from URL


url_dates = merged_df['url'].str.extract(r'/(\d{4})[/\-](\d{2})[/\-](\d{2})/').dropna().T.apply('-'.join)

merged_df['Date'] = merged_df['Date'].fillna(url_dates)


merged_df['Date'] = pd.to_datetime(merged_df['Date'], errors='coerce')

merged_df['Date'] = merged_df['Date'].dt.strftime("%Y-%m-%d")


#News article types
merged_df['news_type'] = merged_df['url'].str.extract(r'/([a-zA-Z]+)/')[0]

merged_df




Unnamed: 0,title,Date,url,text,text_length,news_medium,news_type
0,"Relief, anger, fear greet autos bailout in Det...",2008-12-19,http://www.reuters.com/article/2008/12/20/us-a...,But mixed with relief was fear about what fre...,3935,reuters,article
1,PIMCO's Gross sees fed funds at 3 percent by m...,2008-01-08,http://www.reuters.com/article/2008/01/08/us-u...,"The federal funds rate, the Fed's key policy ...",1324,reuters,article
2,"BRIEF-Indra Buys Paradigma, Consulting Firm Of...",,https://www.reuters.com/article/brief-indra-bu...,"Jan 17 (Reuters) - INDRA:\n* BUYS PARADIGMA, C...",131,reuters,article
3,Joost names former Cisco executive new CEO,2007-06-05,http://www.reuters.com/article/2007/06/05/us-j...,"Joost, launched late last year by the founder...",1351,reuters,article
4,Wall Street Week Ahead: It's earnings versus E...,2012-01-13,http://www.reuters.com/article/2012/01/13/us-u...,Bank stocks will probably once again be a pri...,3359,reuters,article
...,...,...,...,...,...,...,...
411596,BRIEF-Medicalsystem Biotechnology announces ch...,,https://www.reuters.com/article/brief-medicals...,April 20(Reuters) - Medicalsystem Biotechnolog...,235,reuters,article
411597,Short sellers target Apple supplier IQE,,https://www.reuters.com/article/iqe-shortselli...,"January 30, 2018 / 3:09 PM / Updated an hour a...",3601,reuters,article
411598,BRIEF-Taiwan Leader Biotech sets subscription ...,,https://www.reuters.com/article/brief-taiwan-l...,March 7 (Reuters) - Taiwan Leader Biotech Corp...,422,reuters,article
411599,UK's Cameron calls for new press regulation sy...,2011-07-08,http://www.reuters.com/article/2011/07/08/us-n...,"""I believe we need a new system entirely, it ...",831,reuters,article


In [6]:
#reuters_df = ds_to_spark(reuters_spark, "danidanou/Reuters_Financial_News", 30000)
#finnews_df = ds_to_spark(finnews_spark, "ashraq/financial-news-articles", 30000)

# Check if successful
#if reuters_df:
    #print("Reuters dataset loaded successfully")
    #reuters_df.show(5)

#if finnews_df:
    #print("Financial News dataset loaded successfully") 
    #finnews_df.show(5)

Below are methods to push the data to Supabase DB

In [13]:
import socket
hostname = "db.cxxzifmsmlxqllnfcurt.supabase.co"
try:
    socket.gethostbyname(hostname)
    print(f"Hostname {hostname} resolved successfully")
except socket.gaierror:
    print(f"Could not resolve hostname: {hostname}")

Could not resolve hostname: db.cxxzifmsmlxqllnfcurt.supabase.co


In [10]:
os.getenv("SB_PASS")

'Mediciroasting19'

In [15]:
from sqlalchemy import create_engine



db_pass = os.getenv("SB_PASS")
connection = f"postgresql://postgres.cxxzifmsmlxqllnfcurt:{db_pass}@aws-0-us-east-2.pooler.supabase.com:5432/postgres"

conn = create_engine(connection)

merged_df.drop(columns=['text']).to_sql('pretrain_data', con=conn, if_exists='replace')

547

25/07/02 17:34:41 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 666992 ms exceeds timeout 120000 ms
25/07/02 17:34:41 WARN SparkContext: Killing executors is not supported by current scheduler.


Below are methods to tokenize the merged dataset and push the data to huggingface

In [20]:
merged_df.drop(columns=['text']).to_csv('pretrain_data.csv', sep="|")

In [21]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 411547 entries, 0 to 411600
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        411547 non-null  object
 1   Date         212531 non-null  object
 2   url          411547 non-null  object
 3   text         411547 non-null  object
 4   text_length  411547 non-null  int64 
 5   news_medium  411547 non-null  object
 6   news_type    335698 non-null  object
dtypes: int64(1), object(6)
memory usage: 25.1+ MB


In [22]:
merged_df.head()

Unnamed: 0,title,Date,url,text,text_length,news_medium,news_type
0,"Relief, anger, fear greet autos bailout in Det...",2008-12-19,http://www.reuters.com/article/2008/12/20/us-a...,But mixed with relief was fear about what fre...,3935,reuters,article
1,PIMCO's Gross sees fed funds at 3 percent by m...,2008-01-08,http://www.reuters.com/article/2008/01/08/us-u...,"The federal funds rate, the Fed's key policy ...",1324,reuters,article
2,"BRIEF-Indra Buys Paradigma, Consulting Firm Of...",,https://www.reuters.com/article/brief-indra-bu...,"Jan 17 (Reuters) - INDRA:\n* BUYS PARADIGMA, C...",131,reuters,article
3,Joost names former Cisco executive new CEO,2007-06-05,http://www.reuters.com/article/2007/06/05/us-j...,"Joost, launched late last year by the founder...",1351,reuters,article
4,Wall Street Week Ahead: It's earnings versus E...,2012-01-13,http://www.reuters.com/article/2012/01/13/us-u...,Bank stocks will probably once again be a pri...,3359,reuters,article


In [24]:
#Investigating some possible merging
for entry in merged_df.news_medium.unique():
    print(entry)
print()
for entry in merged_df.news_type.unique():
    print(entry)

reuters
cnbc
wsj
in.reuters
uk.reuters
fortune
jp.wsj
live.wsj
blogs.wsj
it.reuters
graphics.wsj
cn.reuters
quotes.wsj
chinese.wsj

article
video
nan
articles
moneybeat
puzzle
cio
news
riskandcompliance
economics
podcasts
washwire
cfo
longform
glider
briefly
id
livecoverage
graphics
experts
frontiers
advertorial
investigates
amp
dailyshot
index
ALLY
KSS


In [51]:

def find_subsection(entry):
    entry_list = entry.split('.')
    if len(entry_list) > 1:
        return entry_list[0]
    else:
        return np.nan


merged_df['news_source'] = merged_df['news_medium'].str.split('.').str[-1]
merged_df['news_subsection'] = merged_df['news_medium'].apply(find_subsection)
merged_df['news_type'] = merged_df['news_type'].replace({'articles': 'article'})

In [54]:
merged_df.drop(columns=['text']).to_csv('pretrain_data.csv', sep='|')

In [16]:
#We'll do 3 methods the first will be to take the first 512 words of an article (if it has it), combine into a column for a dataframe
#The second method will be to take the middle 512 words (if available) of an article and combine into a column
#The third will be to take the first 2000 words of an article, combine into a column 

from pyspark.sql.functions import substring, col, expr
from pyspark.sql.types import IntegerType



#Method one
first_512_df = merged_df[merged_df['text_length'] >= 512]

#Method two

#Making a function to define middle of column
middle_512_df = merged_df
middle_512_df['text'] = merged_df['text'].apply(
    lambda text: text[int((len(text) - 512) / 2):int(len(text) - (len(text) - 512) / 2)]
)

#Third_method
first_4096_df = merged_df[merged_df["text_length"] >= 4096]





In [24]:
#The third method will feed into a longform tranfsformer for pretraining
#The second method and first method will feed into a DistilBERT for pretraining
#We'll use a one lstm that will train on the first 512 words (benchmark, to be added later)


#Bert tokenizer
bert_tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
lf_tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

datasets_pd = {
    'first_512': first_512_df,
    'middle_512': middle_512_df, 
    'first_4096': first_4096_df
}

    
# Tokenize
datasets_hf = {
    name: Dataset.from_pandas(df) 
    for name, df in datasets_pd.items()
}


configs = [
    {'name': 'first_512', 'tokenizer': bert_tokenizer, 'max_length': 512},
    {'name': 'middle_512', 'tokenizer': bert_tokenizer, 'max_length': 512},
    {'name': 'first_4096', 'tokenizer': lf_tokenizer, 'max_length': 4096}
]


tokenized_datasets = {}


for config in configs:
    tokenized_datasets[config['name']] = datasets_hf[config['name']].map(
        lambda examples: config['tokenizer'](
            examples['text'],  # adjust column name as needed
            truncation=True,
            padding='max_length',
            max_length=config['max_length'],
            return_tensors='pt'
        ),
        batched=True
    )

Map:   0%|          | 0/314651 [00:00<?, ? examples/s]

Map:   0%|          | 0/411601 [00:00<?, ? examples/s]

Map:   0%|          | 0/67030 [00:00<?, ? examples/s]

In [None]:
#Pushing tokenized datasets to huggingface

for dataset in tokenized_datasets.keys():
    tokenized_datasets[dataset].push_to_hub(f"Czunzun/Financial_news_{dataset}")

Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/79 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/79 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/79 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/79 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/369 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/138 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/138 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/138 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/370 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/369 [00:00<?, ?B/s]