In [1]:
import IPython
IPython.auto_scroll_threshold = 9999

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display, HTML

%load_ext autoreload
%autoreload 2
%load_ext autotime

In [5]:
from datetime import datetime

import pandas_market_calendars as mcal

from financials import group_by_period

# Trading calendar registered under the "BMF" key (Bovespa).
bmf_calendar = mcal.get_calendar("BMF")

# Session schedule from the start of 2000 up to the moment this cell runs (UTC clock).
early = bmf_calendar.schedule(
    start_date="2000-01-01",
    end_date=datetime.utcnow(),
)

# Expand the schedule into a daily ("1D") index of timestamps.
daily_dates = mcal.date_range(early, frequency="1D")

time: 3.81 s


In [9]:
# Preview the daily timestamps as a Series: move the index into an "index"
# column via reset_index() and select that column.
(
    daily_dates
    .to_frame()
    .reset_index()
    ["index"]
)

0      2000-01-03 18:00:00+00:00
1      2000-01-04 18:00:00+00:00
2      2000-01-05 18:00:00+00:00
3      2000-01-06 18:00:00+00:00
4      2000-01-07 18:00:00+00:00
                  ...           
4954   2020-01-03 19:00:00+00:00
4955   2020-01-06 19:00:00+00:00
4956   2020-01-07 19:00:00+00:00
4957   2020-01-08 19:00:00+00:00
4958   2020-01-09 19:00:00+00:00
Name: index, Length: 4959, dtype: datetime64[ns, UTC]

time: 15.8 ms


In [13]:
# Third-party libraries.
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
# NOTE(review): wildcard import kept as-is; the visible cells only use
# StructType, StructField and DateType from it - consider importing those explicitly.
from pyspark.sql.types import *

# Project-local helpers.
from db import sync_table
from spark import init_spark_context, load_and_get_table_df

# Create the Spark context / SQL context pair used by every cell below.
# The recorded output shows "Cannot run multiple SparkContexts at once" when this
# cell was executed while a context from a previous run already existed -
# presumably `sc.stop()` (or a kernel restart) is needed before re-running it.
sc, sql_context = init_spark_context("Trading Calendars Job")

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Trading Calendars Job, master=local) created by __init__ at /home/jovyan/work/notebooks/spark.py:22 

time: 72.6 ms


In [18]:
# One nullable DateType column named "date".
trading_dates_schema = StructType(
    [StructField("date", DateType(), nullable=True)]
)

# Build the Spark frame from the pandas daily index and tag every row with the
# calendar name so that several calendars could share the same tables.
trading_dates_spark_df = sql_context.createDataFrame(
    daily_dates.to_frame(), schema=trading_dates_schema
).withColumn("calendar", F.lit("BMF"))

# Presumably makes sure the target table exists with ("calendar", "date") as its
# key columns before writing - confirm against db.sync_table.
sync_table(
    trading_dates_spark_df,
    "tfm_uoc_dse",
    "tfm_uoc_analysis",
    "daily_trading_dates",
    ["calendar", "date"],
)

# Overwrite the Cassandra table with the fresh set of trading dates.
# NOTE(review): "astodate" is not a column created in this cell (only "date" and
# "calendar" are) - confirm that partitionBy is ignored by the Cassandra writer.
(
    trading_dates_spark_df.write
    .format("org.apache.spark.sql.cassandra")
    .options(table="daily_trading_dates", keyspace="tfm_uoc_analysis")
    .option("confirm.truncate", "true")
    .mode("overwrite")
    .partitionBy("astodate")
    .save()
)

time: 83.6 ms


In [28]:
# First and last trading date of every month, per calendar.
#
# F.min / F.max are used instead of F.first / F.last: the latter depend on the
# order in which rows reach the aggregation, which Spark does not guarantee after
# a shuffle, whereas the earliest/latest date of a group is order-independent.
# NOTE(review): assumes group_by_period applies each aggregate as `agg(column)`
# with no extra arguments - confirm in financials.group_by_period.
monthly_trading_dates_df = group_by_period(
    trading_dates_spark_df,
    columns=["date", "date"],
    columns_aliases=["first_date", "last_date"],
    columns_agg=[F.min, F.max],
    date_field="date",
    group_columns=["calendar"],
    frequency="monthly",
)

# Presumably makes sure the target table exists with ("calendar", "year", "month")
# as its key columns before writing - confirm against db.sync_table.
sync_table(
    monthly_trading_dates_df,
    "tfm_uoc_dse",
    "tfm_uoc_analysis",
    "monthly_trading_dates",
    ["calendar", "year", "month"],
)

# Overwrite the Cassandra table with the recomputed monthly boundaries.
# NOTE(review): "astodate" is not among the columns named in this cell - confirm
# that partitionBy is ignored by the Cassandra writer.
(
    monthly_trading_dates_df.write
    .format("org.apache.spark.sql.cassandra")
    .options(table="monthly_trading_dates", keyspace="tfm_uoc_analysis")
    .option("confirm.truncate", "true")
    .mode("overwrite")
    .partitionBy("astodate")
    .save()
)



Closing connections
time: 7.39 s


In [31]:
# First and last trading date of every quarter, per calendar.
#
# F.min / F.max are used instead of F.first / F.last: the latter depend on the
# order in which rows reach the aggregation, which Spark does not guarantee after
# a shuffle, whereas the earliest/latest date of a group is order-independent.
# NOTE(review): assumes group_by_period applies each aggregate as `agg(column)`
# with no extra arguments - confirm in financials.group_by_period.
quarterly_trading_dates_df = group_by_period(
    trading_dates_spark_df,
    columns=["date", "date"],
    columns_aliases=["first_date", "last_date"],
    columns_agg=[F.min, F.max],
    date_field="date",
    group_columns=["calendar"],
    frequency="quarterly",
)

# Presumably makes sure the target table exists with ("calendar", "year", "quarter")
# as its key columns before writing - confirm against db.sync_table.
sync_table(
    quarterly_trading_dates_df,
    "tfm_uoc_dse",
    "tfm_uoc_analysis",
    "quarterly_trading_dates",
    ["calendar", "year", "quarter"],
)

# Overwrite the Cassandra table with the recomputed quarterly boundaries.
# NOTE(review): "astodate" is not among the columns named in this cell - confirm
# that partitionBy is ignored by the Cassandra writer.
(
    quarterly_trading_dates_df.write
    .format("org.apache.spark.sql.cassandra")
    .options(table="quarterly_trading_dates", keyspace="tfm_uoc_analysis")
    .option("confirm.truncate", "true")
    .mode("overwrite")
    .partitionBy("astodate")
    .save()
)

Closing connections
time: 5.61 s


In [32]:
# First and last trading date of every year, per calendar.
#
# F.min / F.max are used instead of F.first / F.last: the latter depend on the
# order in which rows reach the aggregation, which Spark does not guarantee after
# a shuffle, whereas the earliest/latest date of a group is order-independent.
# NOTE(review): assumes group_by_period applies each aggregate as `agg(column)`
# with no extra arguments - confirm in financials.group_by_period.
yearly_trading_dates_df = group_by_period(
    trading_dates_spark_df,
    columns=["date", "date"],
    columns_aliases=["first_date", "last_date"],
    columns_agg=[F.min, F.max],
    date_field="date",
    group_columns=["calendar"],
    frequency="yearly",
)

# Presumably makes sure the target table exists with ("calendar", "year") as its
# key columns before writing - confirm against db.sync_table.
sync_table(
    yearly_trading_dates_df,
    "tfm_uoc_dse",
    "tfm_uoc_analysis",
    "yearly_trading_dates",
    ["calendar", "year"],
)

# Overwrite the Cassandra table with the recomputed yearly boundaries.
# NOTE(review): "astodate" is not among the columns named in this cell - confirm
# that partitionBy is ignored by the Cassandra writer.
(
    yearly_trading_dates_df.write
    .format("org.apache.spark.sql.cassandra")
    .options(table="yearly_trading_dates", keyspace="tfm_uoc_analysis")
    .option("confirm.truncate", "true")
    .mode("overwrite")
    .partitionBy("astodate")
    .save()
)

Closing connections
time: 7.7 s


In [33]:
# Shut down the Spark context held in `sc` now that all calendar tables are written.
sc.stop()

time: 827 ms
