In [3]:
import joblib as jb
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, struct, to_json
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import Imputer, StandardScaler, OneHotEncoder, StringIndexer, VectorAssembler
from kafka import KafkaProducer
import json
from pyspark.ml.feature import SQLTransformer

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

from pyspark.sql import SparkSession

import xgboost

import json

In [4]:
# Create (or reuse, if one already exists) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Kafka_streams_processing")
    .getOrCreate()
)
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/11 06:11:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Importing training feature stats (mode, mean, deviation) and categories

- **Credit data features stats**

- mean and deviation

In [100]:
# Per-feature statistics of the numeric credit features: one row labelled
# "mean" and one labelled "scale" in the `info` column.
df_credit_data_mean_deviation = spark.read.csv(
    "../csv/credit_data_mean_scale.csv",
    header=True,
    inferSchema=True,
)
df_credit_data_mean_deviation.show()
df_credit_data_mean_deviation.printSchema()

                                                                                

+-----+------------------+------------------+------------------+
| info|            amount|     oldbalanceOrg|    newbalanceOrig|
+-----+------------------+------------------+------------------+
| mean| 627408.4007645844|1097893.0927459989| 610851.3635214248|
|scale|1658502.6666142726| 3132228.697963737|2628933.6624411293|
+-----+------------------+------------------+------------------+

root
 |-- info: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)



- mode

In [41]:
# Most frequent value of each categorical credit feature (single row labelled
# "mode" in the `info` column).
df_credit_data_mode = spark.read.csv(
    "../csv/credit_data_mode.csv",
    header=True,
    inferSchema=True,
)
df_credit_data_mode.show()
df_credit_data_mode.printSchema()

+----+--------+
|info|    type|
+----+--------+
|mode|CASH_OUT|
+----+--------+

root
 |-- info: string (nullable = true)
 |-- type: string (nullable = true)



- Categories

In [42]:
# Mapping {categorical column: list of category values} read from JSON.
with open("../json/credit_data_categorical_values.json", "r") as categories_file:
    credit_data_categorical_encoder_dict = json.load(categories_file)
credit_data_categorical_encoder_dict

{'type': ['CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER']}

- **Insurance Data**

- mean and deviation

In [141]:
# Per-feature statistics of the numeric insurance features: one row labelled
# "mean" and one labelled "scale" in the `info` column.
df_insurance_data_mean_deviation = spark.read.csv(
    "../csv/insurance_data_mean_scale.csv",
    header=True,
    inferSchema=True,
)
df_insurance_data_mean_deviation.show()
df_insurance_data_mean_deviation.printSchema()

+-----+-------------------+-------------------+-------------------+------------------+-------------------+
| info|     marital_status|witness_present_ind| high_education_ind|past_num_of_claims| address_change_ind|
+-----+-------------------+-------------------+-------------------+------------------+-------------------+
| mean| 0.7152631578947368|0.23467105263157895| 0.6966447368421053|0.5023684210526316| 0.5731578947368421|
|scale|0.45128901255535303|0.42379305054279226|0.45970734981322536|0.9544189034241184|0.49461896692067236|
+-----+-------------------+-------------------+-------------------+------------------+-------------------+

root
 |-- info: string (nullable = true)
 |-- marital_status: double (nullable = true)
 |-- witness_present_ind: double (nullable = true)
 |-- high_education_ind: double (nullable = true)
 |-- past_num_of_claims: double (nullable = true)
 |-- address_change_ind: double (nullable = true)



In [44]:
# Most frequent value of each categorical insurance feature (single row
# labelled "mode" in the `info` column).
df_insurance_data_mode = spark.read.csv(
    "../csv/insurance_data_mode.csv",
    header=True,
    inferSchema=True,
)
df_insurance_data_mode.show()
df_insurance_data_mode.printSchema()

+----+------+-------+-------------+-------------+----------------+
|info|gender|channel|accident_site|living_status|vehicle_category|
+----+------+-------+-------------+-------------+----------------+
|mode|     M| Broker|        Local|          Own|         Compact|
+----+------+-------+-------------+-------------+----------------+

root
 |-- info: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- channel: string (nullable = true)
 |-- accident_site: string (nullable = true)
 |-- living_status: string (nullable = true)
 |-- vehicle_category: string (nullable = true)



- Categories

In [45]:
# Mapping {categorical column: list of category values} read from JSON.
with open("../json/insurance_data_categorical_values.json", "r") as categories_file:
    insurance_data_categorical_encoder_dict = json.load(categories_file)
insurance_data_categorical_encoder_dict

{'gender': ['M'],
 'channel': ['Phone'],
 'accident_site': ['Parking_Lot'],
 'living_status': ['Rent'],
 'vehicle_category': ['Medium']}

# Loading preprocessors

- credit data preprocessor

In [27]:
# NOTE(review): joblib files are pickle-based and can execute arbitrary code
# when loaded — only load artifacts from a trusted source.
with open("../preprocessors/credit_fraud_data_preprocessor.pkl", "rb") as preprocessor_file:
    sklearns_credit_data_preprocessor = jb.load(preprocessor_file)
sklearns_credit_data_preprocessor

- insurance data preprocessor

In [28]:
# NOTE(review): joblib files are pickle-based and can execute arbitrary code
# when loaded — only load artifacts from a trusted source.
with open("../preprocessors/insurance_fraud_data_preprocessor.pkl", "rb") as preprocessor_file:
    sklearns_insurance_data_preprocessor = jb.load(preprocessor_file)
sklearns_insurance_data_preprocessor

# Loading the Models

In [29]:
insurance_model_path="../models/credit_fraud_detection_model.pkl"
credit_model_path="../models/insuranceFraudeModel.pkl"

In [30]:
with open(insurance_model_path, "rb") as credit_model_file:
    credit_model = jb.load(credit_model_file)

credit_model

In [31]:
with open(credit_model_path,"rb") as insurance_mode_file:
    insurance_model=jb.load(insurance_mode_file)
insurance_model

# Kafka Configuration

In [32]:
# Kafka endpoints (both point at localhost).
kafka_broker = "localhost:9092"
# The variable name is misspelled ("shcema"); it is kept as-is so any existing
# reference to it keeps working.
shcema_registry = "http://localhost:8081"

# Input / output topic names for the insurance and credit streams.
insurance_input_topic = "raw_insurance_data"
insurance_output_topic = "processed_insurance_data"
credit_input_topic = "raw_credit_data"
credit_output_topic = "processed_credit_data"

# Defining schemas for Kafka topics

- Raw credit data schema

In [33]:
# Schema of the messages expected on the raw credit-data topic.
#
# The feature fields must carry the same names as the statistics files that
# drive the generated pipeline SQL (`amount`, `oldbalanceOrg`,
# `newbalanceOrig`). The field was previously declared as `newbalanceOrg`,
# a column the pipeline never reads, and `amount` was declared as a string
# while the pipeline does arithmetic on it.
# NOTE(review): confirm the producer emits the key `newbalanceOrig`.
raw_credit_data_schema=StructType(
    [
        StructField('client_id',StringType(),False),
        StructField('transaction_id',StringType(),False),
        StructField('type',StringType(),False),
        StructField('amount',DoubleType(),False),
        StructField('oldbalanceOrg',DoubleType(),True),
        StructField('newbalanceOrig',DoubleType(),True),
    ]
)

- Raw insurance data schema

In [34]:
# Schema of the messages expected on the raw insurance-data topic.
raw_insurance_data_schema = StructType([
    StructField('client_id', StringType(), False),
    StructField('insurance_id', StringType(), False),
    # TODO: remaining fields to be added later (placeholder schema)
])

# Creating data pipelines and preprocessors for each topic

- Creating custom functions to process and transform data

In [131]:
def features_infos_to_map(df_mean_deviation=None,df_mode=None):
    """Build a per-feature statistics lookup from the two statistics DataFrames.

    Parameters
    ----------
    df_mean_deviation : DataFrame
        Its first column is skipped (it is the ``info`` label column in the
        frames loaded above); every other column is treated as a numeric
        feature. The row whose ``info`` is ``'mean'`` supplies the mean and
        the row whose ``info`` is ``'scale'`` supplies the deviation.
    df_mode : DataFrame
        Same layout; the row whose ``info`` is ``'mode'`` supplies the mode of
        each remaining column.

    Returns
    -------
    dict
        ``{feature: {'mean': ..., 'deviation': ...}}`` for the numeric
        features merged with ``{feature: {'mode': ...}}`` for the categorical
        ones (a categorical entry wins if a name appears in both frames).

    Raises
    ------
    ValueError
        If either DataFrame is ``None``, or if a required ``info`` row is
        missing (this used to surface as a bare ``IndexError``).
    """
    # Identity comparison: `== None` combined with bitwise `|` depends on how
    # the object overloads equality.
    if df_mean_deviation is None or df_mode is None:
        raise ValueError("Null dataFrames cant be accepted !")

    def first_row_per_info(stats_df,required_infos):
        # Collect the frame once instead of running a separate
        # filter/select/collect per column and per statistic.
        rows_by_info={}
        for row in stats_df.collect():
            # Keep the first row per label, matching the former `collect()[0]`.
            rows_by_info.setdefault(row["info"],row)
        missing=[name for name in required_infos if name not in rows_by_info]
        if missing:
            raise ValueError(f"Statistics DataFrame has no row with info in {missing}")
        return rows_by_info

    numeric_features=df_mean_deviation.columns[1:]
    categorical_features=df_mode.columns[1:]

    # A label row is only required when there is a feature that needs it.
    mean_scale_rows=first_row_per_info(
        df_mean_deviation,
        ("mean","scale") if numeric_features else ()
    )
    mode_rows=first_row_per_info(
        df_mode,
        ("mode",) if categorical_features else ()
    )

    mean_scale_map={
        feature:{
            'mean': mean_scale_rows["mean"][feature],
            'deviation': mean_scale_rows["scale"][feature]
        } for feature in numeric_features
    }
    mode_map={
        feature:{
            "mode": mode_rows["mode"][feature],
        } for feature in categorical_features
    }
    return {**mean_scale_map,**mode_map}


def map_credit_features_infos(features_stats_dict=None,info=None,column=None):
    """Return one statistic of one feature after validating both names.

    Parameters
    ----------
    features_stats_dict : dict
        ``{column: {info_name: value}}`` lookup.
    info : str
        One of ``'mean'``, ``'deviation'`` or ``'mode'``; it must also exist
        for the requested column.
    column : str
        A key of ``features_stats_dict``.

    Raises
    ------
    ValueError
        If the column is unknown, the info name is not one of the three
        accepted names, or the column has no entry for that info name.
    """
    known_columns=list(features_stats_dict.keys())
    if column not in known_columns:
        raise ValueError(f"Invalid column name. Expected one of {known_columns}, got '{column}'")

    accepted_infos=['mean', 'deviation',"mode"]
    if info not in accepted_infos:
        raise ValueError(f"Invalid info name. Expected one of {accepted_infos}, got '{info}'")

    # A globally valid info name may still be absent for this column
    # (e.g. asking for the mode of a numeric feature).
    column_stats=features_stats_dict[column]
    if info not in column_stats.keys():
        raise ValueError(f"Invalid info name. Expected one of {list(column_stats.keys())}, got '{info}'")

    return column_stats[info]

# Custom imputer function
def custom_imputer(num_cols=[],cat_cols=[],features_stats_dict=None):
    """Build a SQLTransformer that fills missing values with stored statistics.

    Numeric columns are filled with their ``'mean'`` and categorical columns
    with their ``'mode'`` (rendered as a quoted SQL literal), both looked up
    through ``map_credit_features_infos``.

    Note: the generated ``SELECT`` lists only ``num_cols`` followed by
    ``cat_cols``, so any other input column is not carried through this stage.
    The statement is printed before the transformer is returned.
    """
    select_items=[]
    for numeric_col in num_cols:
        fill_value=map_credit_features_infos(column=numeric_col,info='mean',features_stats_dict=features_stats_dict)
        select_items.append(f"COALESCE({numeric_col},{fill_value}) as {numeric_col}")
    for categorical_col in cat_cols:
        fill_value=map_credit_features_infos(column=categorical_col,info='mode',features_stats_dict=features_stats_dict)
        select_items.append(f"COALESCE({categorical_col},'{fill_value}') as {categorical_col}")

    statement="SELECT "+",".join(select_items)+" FROM __THIS__"
    print(statement)
    return SQLTransformer(statement=statement)

# custom scaler
def custom_scaler(scale_cols=None,features_stats_dict=None):
    """Build a SQLTransformer that appends a standardized column per feature.

    For every column in ``scale_cols`` the statement adds
    ``(col - mean) / deviation`` as ``scaled_<col>``, with both statistics
    looked up through ``map_credit_features_infos``. All input columns are
    kept (``SELECT *``). The statement is printed before the transformer is
    returned.

    Parameters
    ----------
    scale_cols : list of str
        Columns to standardize. An empty list produces a pass-through
        ``SELECT *`` (it used to produce the invalid ``SELECT *, FROM``).
    features_stats_dict : dict
        ``{column: {'mean': ..., 'deviation': ...}}`` lookup.
    """
    def scaled_expression(feature):
        mean=map_credit_features_infos(column=feature,info='mean',features_stats_dict=features_stats_dict)
        deviation=map_credit_features_infos(column=feature,info='deviation',features_stats_dict=features_stats_dict)
        # Parenthesize the literals: a negative mean would otherwise render as
        # `col--5.0`, and `--` starts a comment in Spark SQL, truncating the
        # statement.
        return f"({feature}-({mean}))/({deviation}) as scaled_{feature}"

    select_items=["*"]+[scaled_expression(feature) for feature in scale_cols]
    scaler_sql_expression=f"SELECT {','.join(select_items)} FROM __THIS__"
    print(scaler_sql_expression)
    return SQLTransformer(statement=scaler_sql_expression)

# custom one-hot encoder
def custom_hot_encoder(encoder_categories_dict=None):
    """Build one SQLTransformer per categorical column that adds 0/1 indicators.

    For every ``column -> categories`` entry, the stage keeps all input
    columns and appends one ``<column>_<category>`` indicator per category
    (1 when the column equals that category, otherwise 0). Each statement is
    printed as it is built.

    Note: column names and category values are interpolated into the SQL text
    verbatim (no quoting or escaping).

    Returns
    -------
    list
        The transformers, in the iteration order of ``encoder_categories_dict``.
    """
    def indicator_stage(feature,values):
        indicator_columns=",".join(
            f"CASE WHEN {feature}='{value}' THEN 1 ELSE 0 END as {feature}_{value}"
            for value in values
        )
        statement=f"SELECT *,{indicator_columns} FROM __THIS__"
        print(statement)
        return SQLTransformer(statement=statement)

    return [
        indicator_stage(feature,values)
        for feature,values in encoder_categories_dict.items()
    ]

- Creating the data pipeline

In [149]:
def create_data_pipline(df=None,df_mean_deviation=None,df_mode=None,encoder_categories_dict=None,output_features=None):
    """Assemble the preprocessing Pipeline: impute -> scale -> one-hot -> vector.

    Parameters
    ----------
    df : DataFrame
        Not used by this function; the pipeline is built from the statistics
        frames only.
    df_mean_deviation, df_mode : DataFrame
        Statistics frames; every column after the first is treated as a
        numeric (respectively categorical) feature.
    encoder_categories_dict : dict
        ``{categorical column: categories}`` passed to ``custom_hot_encoder``.
    output_features : list of str
        Columns assembled, in order, into the ``features`` vector.

    Returns
    -------
    Pipeline
        The unfitted pipeline. Building it prints the generated SQL statements.
    """
    stats_lookup=features_infos_to_map(
        df_mode=df_mode,
        df_mean_deviation=df_mean_deviation
    )

    numeric_features=df_mean_deviation.columns[1:]
    categorical_features=df_mode.columns[1:]

    # Stages are created in execution order, so the SQL statements are printed
    # in that order too.
    stages=[
        # Fill missing values with the stored mean / mode.
        custom_imputer(
            num_cols=numeric_features,
            cat_cols=categorical_features,
            features_stats_dict=stats_lookup
        ),
        # Standardize the numeric features.
        custom_scaler(
            scale_cols=numeric_features,
            features_stats_dict=stats_lookup
        ),
        # One stage per categorical column.
        *custom_hot_encoder(
            encoder_categories_dict=encoder_categories_dict
        ),
        # Gather the selected columns into a single vector column.
        VectorAssembler(
            inputCols=output_features,
            outputCol="features"
        ),
    ]
    return Pipeline(stages=stages)

- Testing the pipeline on credit data

In [151]:
# Small hand-written sample: three complete rows, one row with a missing
# amount, and one row with a missing type and a missing old balance.
credit_data = [
    ("DEBIT", 100.0, 5000.0, 4000.0),
    ("TRANSFER", 2000.0, 6000.0, 4000.0),
    ("CASH_OUT", 70000.0, 2000.0, 1500.0),
    ("TRANSFER", None, 7000.0, 5000.0),
    (None, 1500.0, None, 3000.0),
]

credit_data_schema = StructType(
    [
        StructField("type", StringType(), True),
        StructField("amount", DoubleType(), True),
        StructField("oldbalanceOrg", DoubleType(), True),
        StructField("newbalanceOrig", DoubleType(), True),
    ]
)

df_test_credit_data = spark.createDataFrame(credit_data, schema=credit_data_schema)
print("--------------Raw Credit Data")
df_test_credit_data.show()

# Order of the columns inside the assembled `features` vector.
credit_data_output_features = [
    "scaled_amount",
    "scaled_oldbalanceOrg",
    "scaled_newbalanceOrig",
    "type_CASH_OUT",
    "type_DEBIT",
    "type_PAYMENT",
    "type_TRANSFER",
]

# Building the pipeline prints the generated SQL statements.
credit_data_pipline = create_data_pipline(
    df=df_test_credit_data,
    df_mean_deviation=df_credit_data_mean_deviation,
    df_mode=df_credit_data_mode,
    encoder_categories_dict=credit_data_categorical_encoder_dict,
    output_features=credit_data_output_features,
)

transformed_credit_data = (
    credit_data_pipline
    .fit(df_test_credit_data)
    .transform(df_test_credit_data)
)
print("--------------Transformed Credit Data")
transformed_credit_data.drop("amount", "oldbalanceOrg", "newbalanceOrig").show()

print("--------------Credit Data features'Vectors'")
transformed_credit_data.select("features").collect()

--------------Raw Credit Data
+--------+-------+-------------+--------------+
|    type| amount|oldbalanceOrg|newbalanceOrig|
+--------+-------+-------------+--------------+
|   DEBIT|  100.0|       5000.0|        4000.0|
|TRANSFER| 2000.0|       6000.0|        4000.0|
|CASH_OUT|70000.0|       2000.0|        1500.0|
|TRANSFER|   NULL|       7000.0|        5000.0|
|    NULL| 1500.0|         NULL|        3000.0|
+--------+-------+-------------+--------------+

SELECT COALESCE(amount,627408.4007645844) as amount,COALESCE(oldbalanceOrg,1097893.0927459989) as oldbalanceOrg,COALESCE(newbalanceOrig,610851.3635214248) as newbalanceOrig,COALESCE(type,'CASH_OUT') as type FROM __THIS__
SELECT *,(amount-627408.4007645844)/1658502.6666142726 as scaled_amount,(oldbalanceOrg-1097893.0927459989)/3132228.697963737 as scaled_oldbalanceOrg,(newbalanceOrig-610851.3635214248)/2628933.6624411293 as scaled_newbalanceOrig FROM __THIS__
SELECT *,CASE WHEN type='CASH_OUT' THEN 1 ELSE 0 END as type_CASH_OUT,CASE

[Row(features=DenseVector([-0.3782, -0.3489, -0.2308, 0.0, 1.0, 0.0, 0.0])),
 Row(features=DenseVector([-0.3771, -0.3486, -0.2308, 0.0, 0.0, 0.0, 1.0])),
 Row(features=DenseVector([-0.3361, -0.3499, -0.2318, 1.0, 0.0, 0.0, 0.0])),
 Row(features=SparseVector(7, {1: -0.3483, 2: -0.2305, 6: 1.0})),
 Row(features=SparseVector(7, {0: -0.3774, 2: -0.2312, 3: 1.0}))]

In [163]:
insurance_schema=StructType(
    [
        StructField('high_education_ind',IntegerType(),True),
        StructField('past_num_of_claims',IntegerType(),True),
        StructField('gender',StringType(),True),
        StructField('address_change_ind',IntegerType(),True),
        StructField('witness_present_ind',IntegerType(),True),
        StructField('marital_status',IntegerType(),True),
        StructField('channel',StringType(),True),
        StructField('accident_site',StringType(),True),
        StructField('living_status',StringType(),True),
        StructField('vehicle_category',StringType(),True),
    ]
)

# Hand-written sample with at least one missing value in most rows.
# NOTE(review): the sample uses "Parking Lot" (with a space) while the encoder
# category loaded from JSON is 'Parking_Lot', so `accident_site_Parking_Lot`
# is 0 for every row here — confirm which spelling the real data uses.
insurance_data = [
    (None, 2, "M", 0, 1, 1, "Broker", None, "Own", "Compact"),
    (1, 0, "F", 1, 0, 0, None, "Local", "Rent", None),
    ( 0, 1, "M", 0, 1, 1, "Online", "Parking Lot", "Own", "Large"),
    ( 1, 0, "F", 1, 0, 0, "Phone", "Local", "Rent", "Medium"),
    (0, None, "M", 1, 1, 1, "Broker", "Highway", None, "Compact"),
    ( 1, 1, "F", None, 0, 0, "Online", "Parking Lot", "Own", "Medium"),
    (0, 2, "M", 1, None, 1, None, "Local", "Rent", "Large"),
    ( 1, 0, "F", 0, 0, None, "Broker", "Highway", "Own", "Compact"),
]

print("--------------Raw insurance Data")
df_insurance_test = spark.createDataFrame(insurance_data, insurance_schema)
df_insurance_test.show()

# Order of the columns inside the assembled `features` vector.
# NOTE(review): the encoder also produces `living_status_Rent`, which is not
# listed here — confirm whether the model expects it.
output_features=["scaled_high_education_ind","scaled_past_num_of_claims","gender_M","scaled_address_change_ind",
                   "scaled_witness_present_ind","scaled_marital_status","channel_Phone","accident_site_Parking_Lot",
                   "vehicle_category_Medium"
                  ]

# Building the pipeline prints the generated SQL statements.
insurance_data_pipeline=create_data_pipline(
    df=df_insurance_test, 
    df_mean_deviation=df_insurance_data_mean_deviation,
    df_mode=df_insurance_data_mode,
    encoder_categories_dict=insurance_data_categorical_encoder_dict,
    output_features=output_features
)

insurance_pipeline_model=insurance_data_pipeline.fit(df_insurance_test)
transformed_insurance_df=insurance_pipeline_model.transform(df_insurance_test)

# Each banner now directly precedes the output it names (same layout as the
# credit test cell): first the table of derived columns, then the vectors.
print("--------------Transformed insurance Data")
transformed_insurance_df.drop(
    "high_education_ind",
    "past_num_of_claims",
    "gender",
    "address_change_ind",
    "witness_present_ind",
    "marital_status",
    "channel",
    "accident_site",
    "living_status",
    "vehicle_category",
    "features"
).show()

print("--------------Insurance Data features'Vectors'")
transformed_insurance_df.select("features").collect() 

--------------Raw insurance Data
+------------------+------------------+------+------------------+-------------------+--------------+-------+-------------+-------------+----------------+
|high_education_ind|past_num_of_claims|gender|address_change_ind|witness_present_ind|marital_status|channel|accident_site|living_status|vehicle_category|
+------------------+------------------+------+------------------+-------------------+--------------+-------+-------------+-------------+----------------+
|              NULL|                 2|     M|                 0|                  1|             1| Broker|         NULL|          Own|         Compact|
|                 1|                 0|     F|                 1|                  0|             0|   NULL|        Local|         Rent|            NULL|
|                 0|                 1|     M|                 0|                  1|             1| Online|  Parking Lot|          Own|           Large|
|                 1|                 0|    

[Row(features=DenseVector([0.0, 1.5692, 1.0, -1.1588, 1.8059, 0.6309, 0.0, 0.0, 0.0])),
 Row(features=DenseVector([0.6599, -0.5264, 0.0, 0.863, -0.5537, -1.5849, 0.0, 0.0, 0.0])),
 Row(features=DenseVector([-1.5154, 0.5214, 1.0, -1.1588, 1.8059, 0.6309, 0.0, 0.0, 0.0])),
 Row(features=DenseVector([0.6599, -0.5264, 0.0, 0.863, -0.5537, -1.5849, 1.0, 0.0, 1.0])),
 Row(features=DenseVector([-1.5154, 0.0, 1.0, 0.863, 1.8059, 0.6309, 0.0, 0.0, 0.0])),
 Row(features=DenseVector([0.6599, 0.5214, 0.0, 0.0, -0.5537, -1.5849, 0.0, 0.0, 1.0])),
 Row(features=DenseVector([-1.5154, 1.5692, 1.0, 0.863, 0.0, 0.6309, 0.0, 0.0, 0.0])),
 Row(features=SparseVector(9, {0: 0.6599, 1: -0.5264, 3: -1.1588, 4: -0.5537}))]