In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf 
from pyspark.sql.functions import * 
from pyspark.sql import DataFrame
from logging import Logger
import os
import json

In [2]:
import import_ipynb
import db_utils as dbu

importing Jupyter notebook from db_utils.ipynb


In [3]:
# Spark runtime configuration consumed by init_spark_context():
#   MASTER   -> value handed to SparkConf.setMaster()
#   settings -> key/value pairs copied one by one onto the SparkConf
SPARK_CONFIG = dict(
    MASTER="local[*]",
    settings=dict([
        ("spark.executor.cores", "1"),
        ("spark.executor.memory", "1g"),
        ("spark.driver.cores", "1"),
        ("spark.driver.memory", "1g"),
        ("spark.cores.max", "1"),
    ]),
)

In [4]:
def init_spark_context() -> SparkContext:
    """Create the SparkContext described by ``SPARK_CONFIG``, or reuse the active one.

    The master URL is taken from ``SPARK_CONFIG['MASTER']``, the application
    name is fixed to ``'app'``, and every key/value pair found in
    ``SPARK_CONFIG['settings']`` is copied onto the ``SparkConf``.

    Returns:
        SparkContext: the context for this process. When a context is already
        active (for example because this cell was executed twice), that
        existing context is returned instead of raising; in that case the
        configuration built here is not applied to it.
    """

    # Optional JDBC driver injection, left disabled as in the original notebook:
    # os.environ['PYSPARK_SUBMIT_ARGS'] = f'--jars jars/postgresql-42.5.0 pyspark-shell'
    conf = SparkConf()
    conf.setMaster(SPARK_CONFIG['MASTER'])
    conf.setAppName('app')

    for setting, value in SPARK_CONFIG['settings'].items():
        conf.set(setting, value)

    # getOrCreate keeps the cell idempotent: constructing SparkContext(conf=conf)
    # a second time in the same kernel fails because only one context may be active.
    sc = SparkContext.getOrCreate(conf=conf)

    return sc

# Notebook-wide Spark handles: `sc` is the context built from SPARK_CONFIG,
# `spark` is the SQL session wrapped around that same context.
sc = init_spark_context()
spark = SparkSession(sparkContext=sc)

## CSV to PostgreSQL demo

In [15]:
def get_mappings(spark:SparkSession, mapping_file_path:str = 'mapping/mapping.csv') -> dict:
    """Load the source-to-target column mapping from a CSV file.

    The file is read with a header row and must expose the columns
    ``source`` and ``target``.

    Args:
        spark: session used to read the mapping file.
        mapping_file_path: location of the mapping CSV; defaults to the file
            this demo has always used.

    Returns:
        dict: ``{source_column: target_column}``, one entry per row. If a
        source name occurs more than once, the last row wins.
    """
    mapping_df = spark.read.option('header',True).csv(mapping_file_path)
    # Fetch both columns in a single job so every source value stays paired with
    # the target from its own row; two independent collections zipped together
    # would only line up if both jobs returned rows in the same order.
    mapping_rows = mapping_df.select('source', 'target').collect()
    source_target_dict = {row['source']: row['target'] for row in mapping_rows}
    return source_target_dict


In [6]:
# --- Source: CSV file -------------------------------------------------------
filename = 'sample.csv'
delimiter = ','
# reading from source
source_df = dbu.read_csv_file(spark, filename, delimiter)
print(source_df.dtypes)

# Source-column -> target-column mapping loaded from the mapping CSV
mappings = get_mappings(spark)
print(mappings)

# --- Target: PostgreSQL -----------------------------------------------------
# Credentials are taken from the environment (PG_USER / PG_PASSWORD) so real
# secrets need not be committed with the notebook; the previous literals remain
# as fallbacks so the local demo behaves as before when the variables are unset.
target_config = {
    "url": "jdbc:postgresql://localhost:5432/postgres",
    "driver": "org.postgresql.Driver",
    "user": os.environ.get("PG_USER", "postgres"),
    "password": os.environ.get("PG_PASSWORD", "admin"),
}
target_table = "public.target_emp_details"
sql = ''
# reading from target
target_df = dbu.read_from_pg(spark, target_config, sql, target_table)
# A bare `target_df.dtypes` in the middle of a cell displays nothing; print it
# the same way the source dtypes are printed above.
print(target_df.dtypes)

# Date patterns handed to the conversion helper for the source and target sides
source_date_format = 'dd-MM-yyyy'
target_date_format = 'yyyy-MM-dd'

final_df = dbu.convert_to_target_dtypes(
    source_df, target_df, mappings, source_date_format, target_date_format
)

reading from file
file read successfully, returning dataframe


[('F_Name', 'string'),
 ('L_Name', 'string'),
 ('age', 'string'),
 ('dob', 'string'),
 ('salary', 'string')]

In [None]:
# Write the converted frame to the target table.
# NOTE(review): 'append' is presumably the write mode, in which case re-running
# this cell would add the same rows again — confirm against db_utils.
dbu.write_to_pg(
    spark,
    final_df,
    target_config,
    target_table,
    'append',
)

## Oracle to PostgreSQL demo

In [None]:
def get_mappings(spark:SparkSession, mapping_file_path:str = 'mapping/mapping_tg.csv') -> dict:
    """Load the source-to-target column mapping from a CSV file.

    The file is read with a header row and must expose the columns
    ``source`` and ``target``.

    Args:
        spark: session used to read the mapping file.
        mapping_file_path: location of the mapping CSV; defaults to the file
            this demo section has always used.

    Returns:
        dict: ``{source_column: target_column}``, one entry per row. If a
        source name occurs more than once, the last row wins.
    """
    mapping_df = spark.read.option('header',True).csv(mapping_file_path)
    # Fetch both columns in a single job so every source value stays paired with
    # the target from its own row; two independent collections zipped together
    # would only line up if both jobs returned rows in the same order.
    mapping_rows = mapping_df.select('source', 'target').collect()
    source_target_dict = {row['source']: row['target'] for row in mapping_rows}
    return source_target_dict

In [None]:
# --- Source: Oracle ---------------------------------------------------------
# Credentials are taken from the environment (ORACLE_USER / ORACLE_PASSWORD);
# the previous literals remain as fallbacks so the demo behaves as before when
# the variables are unset.
config = {
    "url": "jdbc:oracle:thin:@192.168.2.70:1521/PDB",
    "driver": "oracle.jdbc.driver.OracleDriver",
    "user": os.environ.get("ORACLE_USER", "cmx_ors"),
    "password": os.environ.get("ORACLE_PASSWORD", "cmx_ors"),
}
table = "C_BO_ADDR"
sql = ''
# reading from source (the empty `sql` is passed where a literal '' was used before)
source_df = dbu.read_from_oracle(spark, config, sql, table)
# A bare `.dtypes` expression in the middle of a cell displays nothing; print it.
print(source_df.dtypes)

# --- Target: PostgreSQL -----------------------------------------------------
# Same environment-variable convention for the target (PG_USER / PG_PASSWORD).
target_config = {
    "url": "jdbc:postgresql://localhost:5432/postgres",
    "driver": "org.postgresql.Driver",
    "user": os.environ.get("PG_USER", "postgres"),
    "password": os.environ.get("PG_PASSWORD", "admin"),
}
target_table = "public.source_address"
# reading from target to get schema
target_df = dbu.read_from_pg(spark, target_config, sql, target_table)
print(target_df.dtypes)

# getting source to target col mapping
mappings = get_mappings(spark)
print(mappings)

# Named `source_date_format` rather than `date_format`: the latter would shadow
# pyspark.sql.functions.date_format brought in by the star import at the top.
source_date_format = 'yyyy-MM-dd'
target_date_format = 'yyyy-MM-dd'

final_df = dbu.convert_to_target_dtypes(
    source_df, target_df, mappings, source_date_format, target_date_format
)

In [None]:
# Write the converted frame to the target table.
# NOTE(review): 'append' is presumably the write mode, in which case re-running
# this cell would add the same rows again — confirm against db_utils.
dbu.write_to_pg(
    spark,
    final_df,
    target_config,
    target_table,
    'append',
)