In [1]:
import logging
from logging import Logger

from pyspark import SparkContext
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, unix_timestamp, to_date, date_format, to_timestamp

In [2]:
def read_from_pg(spark:SparkSession, config: dict, sql: str, table: str) -> DataFrame:
    """ Read dataframe from postgres
    Args:
        spark: SparkSession used to build the JDBC reader
        config: settings for connect, forwarded as JDBC reader options
        sql: what to read; used instead of `table` when non-empty.
             It may be one of these format
             - 'table_name'
             - 'schema_name.table_name'
             - '(select a, b, c from t1 join t2 ...) as foo'
             - 'select a, b, c from t1 join t2 ...'
        table: table to read when `sql` is empty
    Returns:
        selected DF. On failure the exception is logged and its message is
        returned as a string instead (kept so existing callers keep working).
    """
    try:
        print("reading from postgresql")
        reader = spark.read.format("jdbc").options(**config)

        if sql:
            print("executing query to create df")
            text = sql.strip()
            # For '(subquery) as foo' the tokens after the last ')' are the alias.
            tail = text[text.rfind(")") + 1:].split() if text.startswith("(") else []
            aliased_subquery = len(tail) == 1 or (len(tail) == 2 and tail[0].lower() == "as")
            table_name = bool(text) and all(ch.isalnum() or ch in '_.$"[]' for ch in text)
            # Spark's `query` option wraps its value as a subquery with a generated
            # alias, so a bare table name or an already aliased subquery has to go
            # through `dbtable` (any FROM-clause fragment) to yield valid SQL.
            option_name = "dbtable" if (aliased_subquery or table_name) else "query"
            source_df = reader.option(option_name, sql).load()

        elif table:
            print("reading directly from source table")
            source_df = reader.option('dbtable', table).load()

        else:
            raise ValueError("either sql or table must be provided")

        return source_df

    except Exception as e:
        print("Failure occured check logs")
        # Record the traceback so the "check logs" hint points at something real.
        logging.getLogger(__name__).exception("reading from postgresql failed")
        return f"{e}"

In [3]:
def write_to_pg(spark:SparkSession, df: DataFrame, config: dict, table: str, mode: str='append' ) -> str:
    """ Write dataframe to postgres
    Args:
        spark: SparkSession; only used by mode 'fail' to check that the table exists
        df: DataFrame to write (all of its columns are written)
        config: config dict, forwarded as JDBC writer options
        table: table_name in which we write_data
        mode: mode, one of these:
            - append - append records to the table
            - overwrite - Spark 'overwrite'; whether the table is truncated or
                dropped and recreated follows the JDBC `truncate` option in config
            - overwrite_full - drop table and create new one with all columns of
                DataFrame (Spark 'overwrite' with `truncate` forced off)
            - fail - fail if table does not exist, otherwise append records to it
            - any other value is handed to Spark's writer unchanged
    Returns:
        a status message on success or when there is nothing to write; on
        failure the exception is logged and its message is returned
    """

    try:
        if len(df.columns) == 0:
            return "No columns to write into"

        # Write the frame as-is: re-selecting every column by name is redundant
        # and misreads names that contain dots as nested-field access.
        writer = df.write.format('jdbc').options(**config).option('dbtable', table)

        if mode == 'overwrite_full':
            # Not a Spark save mode: translate to overwrite with drop-and-recreate.
            writer = writer.option('truncate', 'false')
            save_mode = 'overwrite'
        elif mode == 'fail':
            # Not a Spark save mode: loading the table raises when it is missing,
            # which aborts the write through the handler below.
            spark.read.format('jdbc').options(**config).option('dbtable', table).load()
            save_mode = 'append'
        else:
            save_mode = mode

        writer.mode(save_mode).save()
        return "Data written into postgresql successfully"

    except Exception as e:
        print("Failure occured check logs")
        # Record the traceback so the "check logs" hint points at something real.
        logging.getLogger(__name__).exception("writing to postgresql failed")
        return f"{e}"

In [4]:
def read_csv_file(spark:SparkSession, filename:str, delimiter:str) -> DataFrame:
    """ Load a delimited file that has a header row into a dataframe
    Args:
        spark: session whose reader is used to load the file
        filename: name appended to 'sourcedata/' to form the path to read
        delimiter: field separator handed to the reader
    Returns:
        the loaded DF; if anything raises, the exception text is returned
        as a string instead
    """
    try:
        print("reading from file")
        # Configure the reader first, then resolve the path and load it.
        csv_reader = spark.read.option("delimiter", delimiter).option('header', True)
        loaded_df = csv_reader.csv('sourcedata/' + filename)
        print("file read successfully, returning dataframe")
        return loaded_df

    except Exception as error:
        print("Failure occured check logs")
        return str(error)

In [5]:
def write_to_csv(df:DataFrame, filename:str, delimiter:str = ',')->None:
    """Write dataframe to a csv file
    Args:
        df: data frame to write
        filename: name appended to 'target/' to form the output path
        delimiter: field delimiter; by default we write a comma delimited file
    Returns:
        None on success; on failure the exception is logged and its message
        is returned as a string
    """
    try:
        print("writing the dataframe")
        # Build the path once so the message below reports where the data went.
        target_path = 'target/'+filename
        # coalesce(1) collapses the frame to a single partition before writing.
        df.coalesce(1).write.option('delimiter', delimiter).option("header", True).csv(target_path)
        print("dataframe written at "+target_path)

    except Exception as e:
        print("Failure occured check logs")
        # Record the traceback so the "check logs" hint points at something real.
        logging.getLogger(__name__).exception("writing csv failed")
        return f"{e}"

In [6]:
def read_from_oracle(spark:SparkSession, config: dict, sql: str, table: str) -> DataFrame:
    """ Read dataframe from oracle
    Args:
        spark: SparkSession used to build the JDBC reader
        config: settings for connect, forwarded as JDBC reader options
        sql: what to read; used instead of `table` when non-empty.
             It may be one of these format
             - 'table_name'
             - 'schema_name.table_name'
             - '(select a, b, c from t1 join t2 ...) as foo'
             - 'select a, b, c from t1 join t2 ...'
        table: table to read when `sql` is empty
    Returns:
        selected DF. On failure the exception is logged and its message is
        returned as a string instead (kept so existing callers keep working).
    """
    try:
        print("reading from oracle")
        reader = spark.read.format("jdbc").options(**config)

        if sql:
            print("executing query to create df")
            text = sql.strip()
            # For '(subquery) as foo' the tokens after the last ')' are the alias.
            tail = text[text.rfind(")") + 1:].split() if text.startswith("(") else []
            aliased_subquery = len(tail) == 1 or (len(tail) == 2 and tail[0].lower() == "as")
            table_name = bool(text) and all(ch.isalnum() or ch in '_.$"[]' for ch in text)
            # Spark's `query` option wraps its value as a subquery with a generated
            # alias, so a bare table name or an already aliased subquery has to go
            # through `dbtable` (any FROM-clause fragment) to yield valid SQL.
            option_name = "dbtable" if (aliased_subquery or table_name) else "query"
            source_df = reader.option(option_name, sql).load()

        elif table:
            print("reading directly from source table")
            source_df = reader.option('dbtable', table).load()

        else:
            raise ValueError("either sql or table must be provided")

        return source_df

    except Exception as e:
        print("Failure occured check logs")
        # Record the traceback so the "check logs" hint points at something real.
        logging.getLogger(__name__).exception("reading from oracle failed")
        return f"{e}"


In [7]:
def write_to_oracle(spark:SparkSession, df: DataFrame, config: dict, table: str, mode: str='append' ) -> str:
    """ Write dataframe to oracle
    Args:
        spark: SparkSession; only used by mode 'fail' to check that the table exists
        df: DataFrame to write (all of its columns are written)
        config: config dict, forwarded as JDBC writer options
        table: table_name in which we write_data
        mode: mode, one of these:
            - append - append records to the table
            - overwrite - Spark 'overwrite'; whether the table is truncated or
                dropped and recreated follows the JDBC `truncate` option in config
            - overwrite_full - drop table and create new one with all columns of
                DataFrame (Spark 'overwrite' with `truncate` forced off)
            - fail - fail if table does not exist, otherwise append records to it
            - any other value is handed to Spark's writer unchanged
    Returns:
        a status message on success or when there is nothing to write; on
        failure the exception is logged and its message is returned
    """

    try:
        if len(df.columns) == 0:
            return "No columns to write into"

        # Write the frame as-is: re-selecting every column by name is redundant
        # and misreads names that contain dots as nested-field access.
        writer = df.write.format('jdbc').options(**config).option('dbtable', table)

        if mode == 'overwrite_full':
            # Not a Spark save mode: translate to overwrite with drop-and-recreate.
            writer = writer.option('truncate', 'false')
            save_mode = 'overwrite'
        elif mode == 'fail':
            # Not a Spark save mode: loading the table raises when it is missing,
            # which aborts the write through the handler below.
            spark.read.format('jdbc').options(**config).option('dbtable', table).load()
            save_mode = 'append'
        else:
            save_mode = mode

        writer.mode(save_mode).save()
        return "Data written into oracle successfully"

    except Exception as e:
        print("Failure occured check logs")
        # Record the traceback so the "check logs" hint points at something real.
        logging.getLogger(__name__).exception("writing to oracle failed")
        return f"{e}"

In [8]:
def read_from_msssql(spark:SparkSession, config: dict, sql: str, table: str) -> DataFrame:
    """ Read dataframe from mssql
    Args:
        spark: SparkSession used to build the JDBC reader
        config: settings for connect, forwarded as JDBC reader options
        sql: what to read; used instead of `table` when non-empty.
             It may be one of these format
             - 'table_name'
             - 'schema_name.table_name'
             - '(select a, b, c from t1 join t2 ...) as foo'
             - 'select a, b, c from t1 join t2 ...'
        table: table to read when `sql` is empty
    Returns:
        selected DF. On failure the exception is logged and its message is
        returned as a string instead (kept so existing callers keep working).
    """
    try:
        print("reading from msssql")
        reader = spark.read.format("jdbc").options(**config)

        if sql:
            print("executing query to create df")
            text = sql.strip()
            # For '(subquery) as foo' the tokens after the last ')' are the alias.
            tail = text[text.rfind(")") + 1:].split() if text.startswith("(") else []
            aliased_subquery = len(tail) == 1 or (len(tail) == 2 and tail[0].lower() == "as")
            table_name = bool(text) and all(ch.isalnum() or ch in '_.$"[]' for ch in text)
            # Spark's `query` option wraps its value as a subquery with a generated
            # alias, so a bare table name or an already aliased subquery has to go
            # through `dbtable` (any FROM-clause fragment) to yield valid SQL.
            option_name = "dbtable" if (aliased_subquery or table_name) else "query"
            source_df = reader.option(option_name, sql).load()

        elif table:
            print("reading directly from source table")
            source_df = reader.option('dbtable', table).load()

        else:
            raise ValueError("either sql or table must be provided")

        return source_df

    except Exception as e:
        print("Failure occured check logs")
        # Record the traceback so the "check logs" hint points at something real.
        logging.getLogger(__name__).exception("reading from mssql failed")
        return f"{e}"


In [9]:
def write_to_mssql(spark:SparkSession, df: DataFrame, config: dict, table: str, mode: str='append' ) -> str:
    """ Write dataframe to mssql
    Args:
        spark: SparkSession; only used by mode 'fail' to check that the table exists
        df: DataFrame to write (all of its columns are written)
        config: config dict, forwarded as JDBC writer options
        table: table_name in which we write_data
        mode: mode, one of these:
            - append - append records to the table
            - overwrite - Spark 'overwrite'; whether the table is truncated or
                dropped and recreated follows the JDBC `truncate` option in config
            - overwrite_full - drop table and create new one with all columns of
                DataFrame (Spark 'overwrite' with `truncate` forced off)
            - fail - fail if table does not exist, otherwise append records to it
            - any other value is handed to Spark's writer unchanged
    Returns:
        a status message on success or when there is nothing to write; on
        failure the exception is logged and its message is returned
    """

    try:
        if len(df.columns) == 0:
            return "No columns to write into"

        # Write the frame as-is: re-selecting every column by name is redundant
        # and misreads names that contain dots as nested-field access.
        writer = df.write.format('jdbc').options(**config).option('dbtable', table)

        if mode == 'overwrite_full':
            # Not a Spark save mode: translate to overwrite with drop-and-recreate.
            writer = writer.option('truncate', 'false')
            save_mode = 'overwrite'
        elif mode == 'fail':
            # Not a Spark save mode: loading the table raises when it is missing,
            # which aborts the write through the handler below.
            spark.read.format('jdbc').options(**config).option('dbtable', table).load()
            save_mode = 'append'
        else:
            save_mode = mode

        writer.mode(save_mode).save()
        return "Data written into msssql successfully"

    except Exception as e:
        print("Failure occured check logs")
        # Record the traceback so the "check logs" hint points at something real.
        logging.getLogger(__name__).exception("writing to mssql failed")
        return f"{e}"

In [10]:
def convert_to_target_dtypes(source_df:DataFrame, target_df:DataFrame, mappings:dict, source_date_format: str, target_date_format: str)-> DataFrame:
    """
    Function to convert source column names to target column names
    and convert source column data types to target column data types

    Args:
        source_df: source dataframe
        target_df: target dataframe; only its column names and types are read
        mappings: dictionary with source column name -> target column name
        source_date_format: pattern used to parse source values of columns
            whose target type is 'date'
        target_date_format: accepted for backward compatibility; no conversion
            uses it

    returns :
        returns a data frame which we can write in target; on failure the
        exception is logged and its message is returned as a string instead
    """
    try:

        print('source_types: ', source_df.dtypes)
        target_types = target_df.dtypes
        print('target_types: ',target_types)

        # mapping source columns to target
        renamed_df = source_df
        for source_name, target_name in (mappings or {}).items():
            renamed_df = renamed_df.withColumnRenamed(source_name, target_name)

        # One lookup per column instead of scanning every target column.
        target_type_by_name = dict(target_types)

        # type conversion of source columns to target column types.
        # Walk the columns under their names *after* renaming: matching on the
        # original source names would skip every renamed column and look up
        # names that no longer exist in the frame.
        final_df = renamed_df
        for name, _ in renamed_df.dtypes:
            if name not in target_type_by_name:
                continue
            target_type = target_type_by_name[name]
            print(name)

            if target_type == 'date':
                converted = to_timestamp(final_df[name], source_date_format).cast('date')
            else:
                # Every non-date target, 'timestamp' included, is a plain cast.
                converted = final_df[name].cast(target_type)

            final_df = final_df.withColumn(name, converted)

        return final_df
    except Exception as e:
        print("Error occured")
        # Record the traceback; the returned string alone is easy to overlook.
        logging.getLogger(__name__).exception("converting to target dtypes failed")
        return f"{e}"