In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, AutoTokenizer
import seaborn as sn
import matplotlib.pyplot as plt
from datasets import Dataset
from datasets import load_dataset
from dateutil import parser



In [2]:
import os
import sys
from dotenv import load_dotenv

#For Desktop 
#s.environ['PYSPARK_PYTHON'] = sys.executable
#os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

from huggingface_hub import login

#For Desktop
#login(token=os.getenv("HF_MASTER_KEY"))

#For Macbook
load_dotenv()
login(token=os.getenv("MACBOOK_HF_KEY"))

# Clear conflicting environment variables
#if 'PYTHONPATH' in os.environ:
    #del os.environ['PYTHONPATH']



In [3]:
from pyspark.sql import SparkSession
from datasets import load_dataset
import pandas as pd

# Test run with Reuters and Ashraq financial news dataset
reuters_spark = SparkSession.builder.appName("ReutersNews").getOrCreate()
finnews_spark = SparkSession.builder.appName('FinNews').getOrCreate()

# Dictionary mapping spark sessions to dataset names
spark_dataset_mapping = {
    reuters_spark: "danidanou/Reuters_Financial_News",
    finnews_spark: "ashraq/financial-news-articles"
}

def ds_to_spark(spark_session, dataset_name, dataset_size=10000):
    """
    Convert a HuggingFace dataset to a Spark DataFrame
    
    Args:
        spark_session: Active Spark session
        dataset_name: HuggingFace dataset name/path
        dataset_size: Number of rows to load (default: 10000)
    
    Returns:
        Spark DataFrame or None if failed
    """
    from pyspark.sql.types import StructType, StructField, StringType
    
    print(f"Loading dataset: {dataset_name}")
    
    try:
        # Load HuggingFace dataset with size limit
        hf_df = load_dataset(dataset_name, split=f'train[:{dataset_size}]')
        
        # Convert to pandas
        pd_df = hf_df.to_pandas()
        
        print(f"DF shape: {pd_df.shape}")
        
        # Check and clean columns
        for col in pd_df.columns:
            if pd_df[col].dtype == 'object':
                # Get a sample of non-null values
                non_null_values = pd_df[col].dropna()
                
                if len(non_null_values) > 0:
                    # Check the first few values to determine type
                    sample_value = non_null_values.iloc[0]
                    
                    if isinstance(sample_value, (list, dict)):
                        # Convert complex types to string
                        pd_df[col] = pd_df[col].apply(lambda x: str(x) if x is not None else "")
                        print(f"Converted col '{col}' to string")
        
        # Fill null values
        pd_df = pd_df.fillna("")
        
        try:
            # Convert pandas DataFrame to list of Row objects for compatibility
            records = pd_df.to_dict('records')
            spark_df = spark_session.createDataFrame(records)
            print(f"Successfully created Spark DataFrame for {dataset_name}")
            print("Schema:")
            spark_df.printSchema()
            return spark_df
            
        except Exception as e:
            print(f"Error creating DataFrame for {dataset_name}: {e}")
            
            # Alternative approach: Define explicit schema and use records
            print("Trying with explicit string schema and records conversion...")
            
            try:
                # Create schema with all string types
                string_schema = StructType([
                    StructField(col, StringType(), True) for col in pd_df.columns
                ])
                
                # Convert all columns to string
                pd_df_str = pd_df.astype(str)
                
                # Convert to records format
                records = pd_df_str.to_dict('records')
                
                # Create Spark DataFrame with explicit schema
                spark_df = spark_session.createDataFrame(records, schema=string_schema)
                print(f"Successfully created Spark DataFrame with string schema for {dataset_name}")
                print("Schema:")
                spark_df.printSchema()
                return spark_df
                
            except Exception as e2:
                print(f"Failed even with string schema and records: {e2}")
                
                # Last resort: Manual row creation
                print("Trying manual row creation...")
                try:
                    from pyspark.sql import Row
                    
                    # Create Row objects manually
                    Row = Row(*pd_df.columns)
                    rows = [Row(*[str(val) for val in row]) for row in pd_df.values]
                    
                    spark_df = spark_session.createDataFrame(rows)
                    print(f"Successfully created Spark DataFrame with manual row creation for {dataset_name}")
                    print("Schema:")
                    spark_df.printSchema()
                    return spark_df
                    
                except Exception as e3:
                    print(f"All methods failed for {dataset_name}: {e3}")
                    return None
    
    except Exception as e:
        print(f"Error loading dataset {dataset_name}: {e}")
        return None



25/07/02 14:39:23 WARN Utils: Your hostname, MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 10.42.73.3 instead (on interface en0)
25/07/02 14:39:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/07/02 14:39:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/02 14:39:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
from datasets import load_dataset
reuters_df = load_dataset("danidanou/Reuters_Financial_News")['train'].to_pandas()
finnews_df = load_dataset("ashraq/financial-news-articles")['train'].to_pandas()

reuters_df = reuters_df.rename(columns={
    'Article': 'text',
    'Link': 'url',
    'Headline': 'title'
    })

reuters_df = reuters_df.drop(columns=['Journalists', '__index_level_0__', 'Summary'])

#Merging the df
merged_df = pd.concat([reuters_df, finnews_df])

#Shuffling the rows
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

#Removing duplicates based on url
merged_df = merged_df.drop_duplicates(subset=['url'], keep='first')

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 411547 entries, 0 to 411600
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   411547 non-null  object
 1   Date    105359 non-null  object
 2   url     411547 non-null  object
 3   text    411547 non-null  object
dtypes: object(4)
memory usage: 15.7+ MB


In [5]:
#DATA CLEANING 


merged_df['text_length'] = merged_df['text'].apply(lambda text: len(text))

merged_df['news_medium'] = merged_df['url'].str.extract(r'https?://([^/]+)')[0].str.replace('www.', '', regex=False).str[:-4]

#Standardizing dates and getting more date information from url

#Turning string dates to YYYY-MM-DD HH:MM:SS Format
merged_df['Date'] = merged_df['Date'].apply(lambda date: parser.parse(date) if pd.notna(date) else np.NaN)


#Getting dates from URL


url_dates = merged_df['url'].str.extract(r'/(\d{4})[/\-](\d{2})[/\-](\d{2})/').dropna().T.apply('-'.join)

merged_df['Date'] = merged_df['Date'].fillna(url_dates)


merged_df['Date'] = pd.to_datetime(merged_df['Date'], errors='coerce')

merged_df['Date'] = merged_df['Date'].dt.strftime("%Y-%m-%d")


#News article types
merged_df['news_type'] = merged_df['url'].str.extract(r'/([a-zA-Z]+)/')[0]

merged_df




Unnamed: 0,title,Date,url,text,text_length,news_medium,news_type
0,"Commentary: To contain Iran, look first to Yem...",,https://www.reuters.com/article/us-pascual-yem...,"In the Middle East, all eyes are fixed on Syri...",6646,reuters,article
1,HYPR Expands to Europe as Growing Demand for P...,2018-05-21,http://www.cnbc.com/2018/05/21/pr-newswire-hyp...,"NEW YORK, May 21, 2018 /PRNewswire/ -- HYPR Co...",5496,cnbc,
2,Don’t Get Distracted by the Trade Deficit With...,,https://www.wsj.com/articles/dont-get-distract...,Trade negotiations with China are grinding for...,582,wsj,articles
3,GSK buys out Novartis in $13bln shake-up,2018-03-27,https://www.reuters.com/video/2018/03/27/gsk-b...,GSK buys out Novartis in $13bln shake-up 8:48a...,534,reuters,video
4,CORRECTED-US STOCKS SNAPSHOT-Wall St briefly l...,,https://www.reuters.com/article/usa-stocks/us-...,"April 18, 2018 / 6:20 PM / Updated 18 minutes ...",849,reuters,article
...,...,...,...,...,...,...,...
411596,Fed's Bullard: May see asset sales late 2010,2010-02-08,http://www.reuters.com/article/2010/02/08/us-u...,The Fed's purchases last year of longer-term ...,4535,reuters,article
411597,SocGen suicides put stress at work under spotl...,2008-01-29,http://www.reuters.com/article/2008/01/29/us-s...,Societe Generale last week revealed that a ro...,3151,reuters,article
411598,Royal Mail nine-month revenue rises on higher ...,,https://uk.reuters.com/article/uk-royal-mail-o...,"January 18, 2018 / 7:30 AM / Updated 12 hours ...",1401,uk.reuters,article
411599,China is a target market for Scotch Whisky,2018-01-31,https://www.cnbc.com/video/2018/01/31/china-is...,China is a target market for Scotch Whisky 20 ...,193,cnbc,video


In [6]:
#reuters_df = ds_to_spark(reuters_spark, "danidanou/Reuters_Financial_News", 30000)
#finnews_df = ds_to_spark(finnews_spark, "ashraq/financial-news-articles", 30000)

# Check if successful
#if reuters_df:
    #print("Reuters dataset loaded successfully")
    #reuters_df.show(5)

#if finnews_df:
    #print("Financial News dataset loaded successfully") 
    #finnews_df.show(5)

Below are methods to push the data to Supabase DB

In [10]:
import socket
hostname = "aws-0-us-east-2.pooler.supabase.com"
try:
    socket.gethostbyname(hostname)
    print(f"Hostname {hostname} resolved successfully")
except socket.gaierror:
    print(f"Could not resolve hostname: {hostname}")

Hostname aws-0-us-east-2.pooler.supabase.com resolved successfully


In [12]:
sb_key

'Medici_roasting@19'

In [14]:
os.getenv("DB_URL")

'https://cxxzifmsmlxqllnfcurt.supabase.co'

In [15]:
os.getenv("DB_KEY")

In [16]:
from supabase import create_client, Client
url: str = os.getenv("DB_URL")
key: str = os.getenv("DB_TOKEN")
supabase: Client = create_client(url, key)

In [None]:
try:
    response = (
            supabase.table("pretrain_data")
            .insert([
                merged_df.to_json()
                ])
            .execute()
        )
    return response
except Execption as exception:
    return exception

In [None]:
from sqlalchemy import create_engine
sb_key = os.getenv("SB_PASS")
connection = f"postgresql://postgres.cxxzifmsmlxqllnfcurt:{sb_key}@aws-0-us-east-2.pooler.supabase.com:5432/postgres"

conn = create_engine(connection,)

merged_df.to_sql('pretrain_data', con=conn, rewrite=True,index=False)



OperationalError: (psycopg2.OperationalError) could not translate host name "19@aws-0-us-east-2.pooler.supabase.com" to address: nodename nor servname provided, or not known

(Background on this error at: https://sqlalche.me/e/20/e3q8)

Below are methods to tokenize the merged dataset and push the data to huggingface

In [16]:
#We'll do 3 methods the first will be to take the first 512 words of an article (if it has it), combine into a column for a dataframe
#The second method will be to take the middle 512 words (if available) of an article and combine into a column
#The third will be to take the first 2000 words of an article, combine into a column 

from pyspark.sql.functions import substring, col, expr
from pyspark.sql.types import IntegerType



#Method one
first_512_df = merged_df[merged_df['text_length'] >= 512]

#Method two

#Making a function to define middle of column
middle_512_df = merged_df
middle_512_df['text'] = merged_df['text'].apply(
    lambda text: text[int((len(text) - 512) / 2):int(len(text) - (len(text) - 512) / 2)]
)

#Third_method
first_4096_df = merged_df[merged_df["text_length"] >= 4096]





In [24]:
#The third method will feed into a longform tranfsformer for pretraining
#The second method and first method will feed into a DistilBERT for pretraining
#We'll use a one lstm that will train on the first 512 words (benchmark, to be added later)


#Bert tokenizer
bert_tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
lf_tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

datasets_pd = {
    'first_512': first_512_df,
    'middle_512': middle_512_df, 
    'first_4096': first_4096_df
}

    
# Tokenize
datasets_hf = {
    name: Dataset.from_pandas(df) 
    for name, df in datasets_pd.items()
}


configs = [
    {'name': 'first_512', 'tokenizer': bert_tokenizer, 'max_length': 512},
    {'name': 'middle_512', 'tokenizer': bert_tokenizer, 'max_length': 512},
    {'name': 'first_4096', 'tokenizer': lf_tokenizer, 'max_length': 4096}
]


tokenized_datasets = {}


for config in configs:
    tokenized_datasets[config['name']] = datasets_hf[config['name']].map(
        lambda examples: config['tokenizer'](
            examples['text'],  # adjust column name as needed
            truncation=True,
            padding='max_length',
            max_length=config['max_length'],
            return_tensors='pt'
        ),
        batched=True
    )

Map:   0%|          | 0/314651 [00:00<?, ? examples/s]

Map:   0%|          | 0/411601 [00:00<?, ? examples/s]

Map:   0%|          | 0/67030 [00:00<?, ? examples/s]

In [None]:
#Pushing tokenized datasets to huggingface

for dataset in tokenized_datasets.keys():
    tokenized_datasets[dataset].push_to_hub(f"Czunzun/Financial_news_{dataset}")

Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/79 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/79 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/79 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/79 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/369 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/138 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/138 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/138 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/370 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/369 [00:00<?, ?B/s]