In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf 
from pyspark.sql.functions import * 
from pyspark.sql import DataFrame
from logging import Logger
import os
import json

In [2]:
import import_ipynb
import db_utils as dbu

importing Jupyter notebook from db_utils.ipynb


In [3]:
# Spark runtime configuration: "MASTER" is the master URL handed to
# SparkConf.setMaster, and each entry under "settings" is applied with SparkConf.set.
SPARK_CONFIG = dict(
    MASTER="local[*]",
    settings={
        "spark.executor.cores": "1",
        "spark.executor.memory": "1g",
        "spark.driver.cores": "1",
        "spark.driver.memory": "1g",
        "spark.cores.max": "1",
    },
)

In [4]:
def init_spark_context() -> SparkContext:
    """Return a SparkContext configured from ``SPARK_CONFIG``.

    The master URL is taken from ``SPARK_CONFIG['MASTER']``, the application
    name is ``'app'`` and every entry of ``SPARK_CONFIG['settings']`` is set
    on the ``SparkConf``.

    Returns:
        The active SparkContext. If the kernel already has a running context
        (for example because this cell was executed twice), that context is
        returned instead of raising.
    """

    # os.environ['PYSPARK_SUBMIT_ARGS'] = f'--jars jars/postgresql-42.5.0 pyspark-shell'
    conf = SparkConf()
    conf.setMaster(SPARK_CONFIG['MASTER'])
    conf.setAppName('app')

    for setting, value in SPARK_CONFIG['settings'].items():
        conf.set(setting, value)

    # getOrCreate keeps the cell re-runnable: SparkContext(conf=conf) raises
    # when a context is already running in this kernel.
    return SparkContext.getOrCreate(conf=conf)

# Create the Spark entry points shared by the rest of the notebook.
sc = init_spark_context()
spark = SparkSession(sparkContext=sc)

## CSV to PostgreSQL demo

In [None]:
# NOTE(review): commented-out draft of a mapping loader that reads 'mapping/mapping.csv'.
# The following cell calls dbu.mapping_generation instead, so this looks like a
# candidate for removal — confirm before deleting.
# def get_mappings(spark:SparkSession):
#     mapping_file_path = 'mapping/mapping.csv'
#     mapping_df = spark.read.option('header',True).csv(mapping_file_path)
#     source_column_list = list(mapping_df.select('source').toPandas()['source'])
#     target_column_list = list(mapping_df.select('target').toPandas()['target'])
#     source_target_dict = dict(zip(source_column_list, target_column_list))
#     return source_target_dict

In [None]:
# --- source: read the delimited file into a DataFrame ---
filename = 'sample.csv'
delimiter = ','
source_df = dbu.read_csv_file(spark, filename, delimiter)
print(source_df.dtypes)

# --- mapping: date-conversion columns and source -> target column names ---
# NOTE(review): the pipeline cell further down calls
# dbu.mapping_generation(spark, mapping_filename); confirm that the filename
# argument is optional for this call form.
mappings = dbu.mapping_generation(spark)
columns_for_date_conversion = mappings['columns_for_date_conversion']
source_to_target_mapping = mappings['source_target_column_mapping']
# print(mappings)

# --- target: PostgreSQL connection settings ---
# Credentials are read from the environment (PGUSER / PGPASSWORD, the variable
# names libpq uses) so real secrets do not have to be stored in the notebook.
# The fallbacks are the local demo values this cell used previously.
target_config = {
    "url": "jdbc:postgresql://localhost:5432/postgres",
    "driver": "org.postgresql.Driver",
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "admin"),
}
target_table = "public.target_emp_details"
sql = ''
# reading from target
target_df = dbu.read_from_pg(spark, target_config, sql, target_table)
print(target_df.dtypes)

In [None]:
# Convert source_df using target_df and the source -> target column mapping.
type_converted_df = dbu.convert_to_target_dtypes(
    source_df,
    target_df,
    source_to_target_mapping,
)

In [None]:
# Run the date-format conversion on the columns listed in the mapping.
# The helper is called as (dataframe, columns), the same form the executed
# pipeline cell of this notebook uses; the extra leading `spark` argument
# previously passed here does not match that call form and was dropped.
final_df = dbu.date_column_format_converter(
    type_converted_df,
    columns_for_date_conversion,
)

In [None]:
# Preview the converted rows (20 is DataFrame.show's default row count).
final_df.show(n=20)

In [None]:
# Write final_df to target_table in 'append' mode.
dbu.write_to_pg(
    spark,
    final_df,
    target_config,
    target_table,
    'append',
)

## Oracle to PostgreSQL demo

In [None]:
# NOTE(review): commented-out draft of a mapping loader that reads 'mapping/mapping_tg.csv'.
# The following cell calls dbu.mapping_generation instead, so this looks like a
# candidate for removal — confirm before deleting.
# def get_mappings(spark:SparkSession):
#     mapping_file_path = 'mapping/mapping_tg.csv'
#     mapping_df = spark.read.option('header',True).csv(mapping_file_path)
#     source_column_list = list(mapping_df.select('source').toPandas()['source'])
#     target_column_list = list(mapping_df.select('target').toPandas()['target'])
#     source_target_dict = dict(zip(source_column_list, target_column_list))
#     return source_target_dict

In [None]:
# --- source: Oracle connection settings ---
# Credentials are read from the environment (ORACLE_USER / ORACLE_PASSWORD) so
# real secrets do not have to be stored in the notebook; the fallbacks are the
# demo values this cell used previously.
config = {
    "url": "jdbc:oracle:thin:@192.168.2.70:1521/PDB",
    "driver": "oracle.jdbc.driver.OracleDriver",
    "user": os.environ.get("ORACLE_USER", "cmx_ors"),
    "password": os.environ.get("ORACLE_PASSWORD", "cmx_ors"),
}
table = "C_BO_ADDR"
sql = ''
# reading from source
source_df = dbu.read_from_oracle(spark, config, sql, table)
# Only the last expression of a cell is displayed, so a bare `.dtypes` here
# would show nothing; print it explicitly.
print(source_df.dtypes)

# --- target: PostgreSQL connection settings (PGUSER / PGPASSWORD override the demo defaults) ---
target_config = {
    "url": "jdbc:postgresql://localhost:5432/postgres",
    "driver": "org.postgresql.Driver",
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "admin"),
}
target_table = "public.source_address"
# reading from target to get schema
target_df = dbu.read_from_pg(spark, target_config, sql, target_table)
print(target_df.dtypes)

# getting source to target col mapping
# NOTE(review): the pipeline cell further down calls
# dbu.mapping_generation(spark, mapping_filename); confirm which mapping file
# this call form picks up for the Oracle source.
mappings = dbu.mapping_generation(spark)
columns_for_date_conversion = mappings['columns_for_date_conversion']
source_to_target_mapping = mappings['source_target_column_mapping']

final_df = dbu.convert_to_target_dtypes(source_df, target_df, source_to_target_mapping)

In [None]:
# Preview the converted rows (20 is DataFrame.show's default row count).
final_df.show(n=20)

In [None]:
# Write final_df to target_table in 'append' mode.
dbu.write_to_pg(
    spark,
    final_df,
    target_config,
    target_table,
    'append',
)

In [5]:
# source information
source_name = 'manualfile'
source_config = {}
source_query = '' 
source_table = ''
source_filename = 'sample.csv'
source_delimiter = ','

In [6]:
# target information
target_name = 'postgres'
target_config = {
"url" : "jdbc:postgresql://localhost:5432/postgres",
"driver" : "org.postgresql.Driver",
"user" : "postgres",
"password" : "admin"}
target_table = 'public.target_emp_details'
target_filename = ''
target_delimiter = ''

In [7]:
# reading from source into source_df
# Load source_df with the reader that matches source_name.
if source_name == 'oracle':
    source_df = dbu.read_from_oracle(spark, source_config, source_query, source_table)

elif source_name == 'sqlserver':
    # NOTE(review): `read_from_msssql` is spelled with three s's, unlike the
    # `write_to_mssql` helper used when writing — confirm the name in db_utils.
    source_df = dbu.read_from_msssql(spark, source_config, source_query, source_table)

elif source_name == 'postgres':
    source_df = dbu.read_from_pg(spark, source_config, source_query, source_table)

elif source_name == 'manualfile':
    source_df = dbu.read_csv_file(spark, source_filename, source_delimiter)

else:
    # Fail loudly: otherwise source_df stays undefined, or silently keeps a
    # value left over from an earlier cell.
    raise ValueError(f"Unsupported source_name: {source_name!r}")

reading from file
file read successfully, returning dataframe


In [8]:
#reading from target for target_df creation
# Read the target table into target_df with the reader that matches target_name.
# An empty query string is passed together with the table name.
if target_name == 'oracle':
    target_df = dbu.read_from_oracle(spark, target_config, '', target_table)

elif target_name == 'sqlserver':
    # NOTE(review): `read_from_msssql` is spelled with three s's, unlike the
    # `write_to_mssql` helper used when writing — confirm the name in db_utils.
    target_df = dbu.read_from_msssql(spark, target_config, '', target_table)

elif target_name == 'postgres':
    target_df = dbu.read_from_pg(spark, target_config, '', target_table)

else:
    # Fail loudly: otherwise target_df stays undefined, or silently keeps a
    # value left over from an earlier cell, and the conversion step breaks later.
    raise ValueError(
        f"Cannot read a target table for target_name: {target_name!r}; "
        "expected 'oracle', 'sqlserver' or 'postgres'"
    )

reading from postgresql
reading directly from source table


In [None]:
# Mapping file passed to dbu.mapping_generation in the next cell.
# This must be an assignment: the previous `-` raised NameError on a fresh kernel.
mapping_filename = 'mapping.csv'

In [9]:
# Load the mapping definition and split it into the four pieces used by the
# following transformation cells.
mappings = dbu.mapping_generation(spark, mapping_filename)
(
    columns_for_date_conversion,
    source_to_target_mapping,
    static_target_columns,
    default_value_for_null_columns,
) = (
    mappings[section]
    for section in (
        'columns_for_date_conversion',
        'source_target_column_mapping',
        'static_target_columns',
        'default_value_for_null_columns',
    )
)

In [10]:
# Convert source_df using target_df and the source -> target column mapping.
type_converted_df = dbu.convert_to_target_dtypes(
    source_df,
    target_df,
    source_to_target_mapping,
)

fname
lname
age
salary


In [11]:
# Run the date-format conversion on the columns listed in the mapping.
date_converted_df = dbu.date_column_format_converter(
    type_converted_df,
    columns_for_date_conversion,
)

In [12]:
# Populate the static target columns defined in the mapping.
hard_coded_value_populated_df = dbu.populate_column_with_default_values(
    date_converted_df,
    static_target_columns,
)

In [13]:
# Build the fill-na dictionary from the mapping's null defaults, then apply it.
fill_na_dict = dbu.create_fill_na_dict(
    default_value_for_null_columns,
)
null_populated_df = dbu.populate_null_values(
    hard_coded_value_populated_df,
    fill_na_dict,
)

In [15]:
# Preview the final rows (20 is DataFrame.show's default row count).
null_populated_df.show(n=20)

+-------+----------+---+----------+---------+-----------+
|  fname|     lname|age|       dob|   salary|source_name|
+-------+----------+---+----------+---------+-----------+
|Charles|Richardson| 23|1999-03-12|  20000.0|   csv file|
|    Joy|   Gerrard| 19|2001-12-13|  30000.0|   csv file|
|   Rick|      Sera| 20|2000-07-08|   1200.0|   csv file|
|  Jason|       Roy| 24|1998-05-07| 149000.0|   csv file|
|    Jos|    Butler| 26|1996-12-09|  87000.0|   csv file|
|  Steve|     Smith| 20|2000-12-25|  20000.0|   csv file|
| Selena|  Williams| 23|1999-05-15|  29870.0|   csv file|
| Winnie|       Reg| 27|1995-04-12|  23769.0|   csv file|
| Junior|      Hila| 23|1999-03-18| 909876.0|   csv file|
|  David|     Bella| 22|2000-02-29|1100000.0|   csv file|
|   Vish|         D|  0|1998-02-21|      0.0|   csv file|
+-------+----------+---+----------+---------+-----------+



In [17]:
# Write mode handed to the dbu.write_to_* helpers in the next cell.
target_write_mode = "append"

In [23]:
# Write null_populated_df with the writer that matches target_name.
if target_name == 'oracle':
    dbu.write_to_oracle(spark, null_populated_df, target_config, target_table, target_write_mode)

elif target_name == 'sqlserver':
    dbu.write_to_mssql(spark, null_populated_df, target_config, target_table, target_write_mode)

elif target_name == 'postgres':
    dbu.write_to_pg(spark, null_populated_df, target_config, target_table, target_write_mode)

elif target_name == 'manualfile':
    dbu.write_to_csv(null_populated_df, target_filename, target_delimiter)

else:
    # Fail loudly: without this branch an unknown target would write nothing
    # and the cell would still look successful.
    raise ValueError(f"Unsupported target_name: {target_name!r}")

'Data written into postgresql successfully'