In [1]:
import getpass
import json
import os
from logging import Logger

import pyspark
import pyspark.sql.functions as F
from pyspark import SparkContext, SparkConf
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession

In [2]:
import import_ipynb
import db_utils as dbu

importing Jupyter notebook from db_utils.ipynb


In [3]:
# Spark configuration for this notebook.
#   "MASTER"   - master URL handed to SparkConf.setMaster in init_spark_context.
#   "settings" - key/value pairs applied one by one with SparkConf.set.
SPARK_CONFIG = dict(
    MASTER="local[*]",
    settings={
        "spark.executor.cores": "1",
        "spark.executor.memory": "1g",
        "spark.driver.cores": "1",
        "spark.driver.memory": "1g",
        "spark.cores.max": "1",
    },
)

In [4]:
def init_spark_context() -> SparkContext:
    """Create (or reuse) the SparkContext described by ``SPARK_CONFIG``.

    The master URL is taken from ``SPARK_CONFIG['MASTER']``, the application
    name is fixed to ``'app'``, and every entry of ``SPARK_CONFIG['settings']``
    is copied onto the ``SparkConf``.

    Returns:
        The active SparkContext. ``SparkContext.getOrCreate`` is used instead
        of the constructor so that re-running this cell in a live kernel
        returns the already-running context rather than raising because a
        second context cannot be started.
    """

    # os.environ['PYSPARK_SUBMIT_ARGS'] = f'--jars jars/postgresql-42.5.0 pyspark-shell'
    conf = SparkConf()
    conf.setMaster(SPARK_CONFIG['MASTER'])
    conf.setAppName('app')

    for setting, value in SPARK_CONFIG['settings'].items():
        conf.set(setting, value)

    # NOTE: when a context already exists, getOrCreate returns it unchanged;
    # the conf built above only takes effect when a new context is created.
    return SparkContext.getOrCreate(conf=conf)

# Build the notebook-wide SparkContext and wrap it in a SparkSession;
# `spark` is the handle the functions below receive for reading data.
sc = init_spark_context()
spark = SparkSession(sc)

In [6]:
# from pyspark.conf import SparkConf
# from pyspark.sql import SparkSession
# spark.sparkContext._conf.getAll()

In [None]:
# spark

In [None]:
def get_source_config(source_name: str, config_path: str = None) -> dict:
    """Load the configuration entry for one source from the source config JSON.

    Args:
        source_name: key of the source whose configuration details are wanted.
        config_path: optional path to the source configuration JSON file.
            When omitted, the development default defined below is used.

    Returns:
        The value stored under ``source_name`` in the JSON document, i.e. the
        configuration dictionary with the details required to read from that
        source.

    Raises:
        KeyError: if ``source_name`` is not a key of the JSON document.
    """
    # TODO: development-only default path; pass ``config_path`` to override it.
    default_source_file = r'C:\Users\vishwajeet.dabholkar\Documents\data-mapping\migration\config\source_config.json'
    source_file = default_source_file if config_path is None else config_path

    # JSON documents are UTF-8 by specification, so do not depend on the
    # platform's default text encoding.
    with open(source_file, encoding='utf-8') as json_file:
        data = json.load(json_file)

    try:
        return data[source_name]
    except KeyError:
        # Same exception type as before, but with a message that names the
        # file and the keys that are actually available.
        raise KeyError(
            f"source {source_name!r} not found in {source_file}; "
            f"available: {sorted(data)}"
        ) from None

In [None]:
def get_target_config(target_name: str, config_path: str = None) -> dict:
    """Load the configuration entry for one target from the target config JSON.

    Args:
        target_name: key of the target whose configuration details are wanted.
        config_path: optional path to the target configuration JSON file.
            When omitted, the development default defined below is used.

    Returns:
        The value stored under ``target_name`` in the JSON document, i.e. the
        configuration dictionary with the details required to write to that
        target.

    Raises:
        KeyError: if ``target_name`` is not a key of the JSON document.
    """
    # TODO: development-only default path; pass ``config_path`` to override it.
    default_target_file = r'C:\Users\vishwajeet.dabholkar\Documents\data-mapping\migration\config\target_config.json'
    target_file = default_target_file if config_path is None else config_path

    # JSON documents are UTF-8 by specification, so do not depend on the
    # platform's default text encoding.
    with open(target_file, encoding='utf-8') as json_file:
        data = json.load(json_file)

    try:
        return data[target_name]
    except KeyError:
        # Same exception type as before, but with a message that names the
        # file and the keys that are actually available.
        raise KeyError(
            f"target {target_name!r} not found in {target_file}; "
            f"available: {sorted(data)}"
        ) from None

In [None]:
def get_mappings(spark: SparkSession, mapping_file_path: str = 'mapping/mapping.csv') -> dict:
    """Build the source-column -> target-column mapping from the mapping CSV.

    Args:
        spark: session used to read the CSV file.
        mapping_file_path: path of a CSV file with a header row that contains
            ``source`` and ``target`` columns.

    Returns:
        A dict keyed by each row's ``source`` value with that row's ``target``
        value. If a ``source`` value appears more than once, the last row wins.
    """
    mapping_df = spark.read.option('header', True).csv(mapping_file_path)

    # Fetch both columns in a single collect so every pair is taken from the
    # same row, instead of converting each column to pandas separately.
    rows = mapping_df.select('source', 'target').collect()
    return {row['source']: row['target'] for row in rows}

In [None]:
def read_query(filename: str, query_dir: str = 'query') -> str:
    """Read a query file and return its contents as a string.

    Args:
        filename: name of the file in which the query is stored, resolved
            against ``query_dir`` with ``os.path.join`` (so an absolute
            ``filename`` is used as given). A '.sql' file is recommended.
        query_dir: directory that holds the query files.

    Returns:
        The full text of the file, or the literal string ``'File not present'``
        when no regular file exists at the resolved path. The sentinel is kept
        for backward compatibility, so callers must check for it before using
        the result as a query.
    """
    query_path = os.path.join(query_dir, filename)
    if not os.path.isfile(query_path):
        return 'File not present'

    # The context manager closes the handle even if reading raises.
    with open(query_path, "r") as text_file:
        return text_file.read()

In [None]:
def get_source_input_config() -> dict:
    """Interactively collect the details needed to read from a source.

    Prompts for a source type, loads that source's base configuration with
    ``get_source_config`` and fills it in from further prompts:

    * ``manualfile``: sets the ``'filepath'`` and ``'delimeter'`` keys.
    * any other source: sets ``'url'``, ``'user'`` and ``'password'`` under
      ``'config'``, then either ``'dbtable'`` or, when the table prompt is left
      empty, ``'query'`` with the text returned by ``read_query``.

    Returns:
        The populated source configuration dictionary.

    Raises:
        ValueError: if the selection is not one of the listed numbers.
        FileNotFoundError: if a query file was requested but ``read_query``
            reported it as missing.
    """
    sources = {
        1: "manualfile",
        2: "oracle",
        3: "sqlserver",
        4: "postgresql",
    }

    print("Select Source:")
    print("1.Manual File\n2.Oracle\n3.MSSQL\n4.PostgreSQL")
    raw_choice = input('source_name: ')
    try:
        source_name = sources[int(raw_choice)]
    except (ValueError, KeyError):
        # Fail here with a clear message instead of looking up an empty
        # source name in the configuration file.
        raise ValueError(
            f"invalid source selection {raw_choice!r}; expected one of {sorted(sources)}"
        ) from None

    source_config = get_source_config(source_name)

    if source_name == 'manualfile':
        source_config['filepath'] = input('file path: ')
        source_config['delimeter'] = input('delimeter: ')
        return source_config

    connection = source_config['config']
    connection['url'] = input('jdbc url : ')
    connection['user'] = input('user : ')
    # getpass keeps the password from being echoed into the notebook output.
    connection['password'] = getpass.getpass('password : ')
    dbtable = input('database to read from : ')

    if dbtable == '':
        query_path = input('query path: ')
        query = read_query(query_path)
        # read_query signals a missing file with this sentinel string; never
        # pass it on as if it were SQL.
        if query == 'File not present':
            raise FileNotFoundError(f"query file {query_path!r} was not found")
        connection['query'] = query
    else:
        connection['dbtable'] = dbtable

    return source_config

In [None]:
def get_target_input_config() -> dict:
    """Interactively collect the details needed to write to a target.

    Prompts for a target type, loads that target's base configuration with
    ``get_target_config`` and fills it in from further prompts:

    * ``manualfile``: sets the ``'filepath'`` and ``'delimeter'`` keys.
    * any other target: sets ``'url'``, ``'user'``, ``'password'`` and
      ``'dbtable'`` under ``'config'``.

    Returns:
        The populated target configuration dictionary.

    Raises:
        ValueError: if the selection is not one of the listed numbers.
    """
    targets = {
        1: "manualfile",
        2: "oracle",
        3: "sqlserver",
        4: "postgresql",
    }

    # The prompts previously said "Source" here, copied from the source variant.
    print("Select Target:")
    print("1.Manual File\n2.Oracle\n3.MSSQL\n4.PostgreSQL")
    raw_choice = input('target_name: ')
    try:
        target_name = targets[int(raw_choice)]
    except (ValueError, KeyError):
        # Fail here with a clear message instead of looking up an empty
        # target name in the configuration file.
        raise ValueError(
            f"invalid target selection {raw_choice!r}; expected one of {sorted(targets)}"
        ) from None

    target_config = get_target_config(target_name)

    if target_name == 'manualfile':
        target_config['filepath'] = input('file path: ')
        target_config['delimeter'] = input('delimeter: ')
        return target_config

    connection = target_config['config']
    connection['url'] = input('jdbc url : ')
    connection['user'] = input('user : ')
    # getpass keeps the password from being echoed into the notebook output.
    connection['password'] = getpass.getpass('password : ')
    connection['dbtable'] = input('database table to write into : ')

    return target_config

In [None]:
def source_to_tarrget_df_conversion(source_df: DataFrame, mapping_dict: dict) -> DataFrame:
    """
    Rename the columns of a source dataframe to their target names.

    Args:
        source_df: dataframe whose columns are renamed for the target
        mapping_dict: dictionary mapping each source column name to its
            target column name
    Returns:
        the dataframe obtained by applying ``withColumnRenamed`` once per
        mapping entry, in the dictionary's iteration order
    """
    renamed_df = source_df
    for source_col in mapping_dict:
        target_col = mapping_dict[source_col]
        renamed_df = renamed_df.withColumnRenamed(source_col, target_col)

    return renamed_df

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()

In [None]:
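# NOTE(security): plain-text credentials should not be kept in a notebook;
# move them to environment variables or a secrets store and rotate them.
# Database Logins for Informatica: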
# CMX_ORS:
# Username: cmx_ors
# Password: cmx_ors
# CMX_System(metadata):
# Username: cmx_system
# Password: cmx_system
# Assigned IP:- 192.168.2.70

In [None]:
# config = {
# "url" : "jdbc:oracle:thin:@192.168.2.70:1521/PDB",
# "driver" : "oracle.jdbc.driver.OracleDriver",
# "user" : "cmx_ors",
# "password" : "cmx_ors",
# "dbtable" : "C_BO_PTY"}

# df = spark.read.format("jdbc").options(**config).load()
# df.printSchema()