In [1]:
def create_feature_pipeline(schema, columns, handle_invalid="error"):
    """Build the preprocessing stages that turn raw columns into a feature vector.

    String columns are indexed and then one-hot encoded, numeric columns are
    used unchanged, and columns of any other type are skipped. The resulting
    columns are assembled, in schema order, into one ``features`` vector column.

    Args:
        schema: Schema object whose ``fields`` each expose ``name`` and
            ``dataType`` (the notebook passes ``DataFrame.schema``).
        columns: Names of the columns to use as features. Names that do not
            appear in ``schema`` are ignored.
        handle_invalid: Forwarded to ``StringIndexer.setHandleInvalid``. The
            default ``"error"`` matches Spark's own default, so existing
            callers see no change. See the StringIndexer documentation for the
            other accepted values before relying on them at prediction time.

    Returns:
        list: Stages in execution order; the final stage is always the
        ``VectorAssembler`` that writes the ``features`` column.
    """
    from pyspark.sql.types import StringType, NumericType
    from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer

    # A set gives O(1) membership tests and also tolerates one-shot iterables.
    selected_names = set(columns)

    feature_columns = []
    preprocessing_stages = []
    for field in schema.fields:
        if field.name not in selected_names:
            continue

        if isinstance(field.dataType, StringType):
            indexed_column = f"{field.name}_indexed"
            encoded_column = f"{field.name}_encoded"

            indexer = StringIndexer()
            indexer.setInputCol(field.name).setOutputCol(indexed_column)
            indexer.setHandleInvalid(handle_invalid)
            preprocessing_stages.append(indexer)

            encoder = OneHotEncoder()
            encoder.setInputCol(indexed_column).setOutputCol(encoded_column)
            preprocessing_stages.append(encoder)

            feature_columns.append(encoded_column)

        elif isinstance(field.dataType, NumericType):
            # Numeric columns go into the assembled vector without any stage.
            feature_columns.append(field.name)

        # Any other data type is deliberately left out of the feature vector.

    assembler = VectorAssembler()
    assembler.setInputCols(feature_columns).setOutputCol('features')
    preprocessing_stages.append(assembler)

    return preprocessing_stages


In [2]:
def create_balanced_dataframe(df, reduction_count, seed=None):
    """Reduce the majority class to about ``reduction_count`` representative rows.

    There are more non-fraud transactions than fraud transactions. To balance
    the classes, the non-fraud rows are clustered with K-Means and every
    cluster center becomes one synthetic non-fraud row labelled ``0.0``, so
    the result is sized to match the fraud transactions dataframe.

    Args:
        df: DataFrame of non-fraud transactions to cluster. K-Means is fitted
            with its default column settings, so ``df`` is expected to carry
            the assembled ``features`` vector column.
        reduction_count: Number of clusters to request, normally the number of
            fraud transactions. Must be at least 2.
        seed: Optional K-Means seed for reproducible cluster centers. ``None``
            (the default) leaves Spark's own seeding untouched.

    Returns:
        DataFrame with the columns ``features`` and ``label``, one row per
        cluster center. NOTE(review): Spark documents that K-Means may return
        fewer than k clusters, so the row count can fall below
        ``reduction_count``; confirm if an exact match is required.

    Raises:
        ValueError: If ``reduction_count`` is smaller than 2.
    """
    from pyspark.ml.clustering import KMeans
    from pyspark.ml.linalg import Vectors

    # Spark's KMeans rejects k <= 1; fail here with a readable message instead
    # of a JVM error raised in the middle of fit().
    if reduction_count < 2:
        raise ValueError(
            "reduction_count must be at least 2, got %r" % (reduction_count,)
        )

    k_means = KMeans().setK(reduction_count).setMaxIter(30)
    if seed is not None:
        k_means = k_means.setSeed(seed)

    # Fit on the DataFrame that was passed in. The previous version replaced
    # both arguments with notebook globals, so they were silently ignored.
    k_means_model = k_means.fit(df)

    balanced_rows = [
        (Vectors.dense(center), 0.0)
        for center in k_means_model.clusterCenters()
    ]

    # `spark` is the notebook-level session object imported from demolib.
    balanced_df = spark.createDataFrame(balanced_rows, schema=['features', 'label'])

    return balanced_df

In [3]:
def create_random_forest_model(df, seed=None, max_bins=700):
    """Train a random forest classifier and report its hold-out match counts.

    ``df`` is split 70/30 into a training and a test set. The classifier is
    fitted on the training set and applied to the test set. Two lines are
    written to stderr: the number of test rows, and the number of test rows
    whose prediction equals the label.

    Args:
        df: DataFrame with a ``features`` column and a ``label`` column.
        seed: Optional seed used for both the random split and the forest, so
            a run can be reproduced. ``None`` (the default) leaves both
            unseeded, as before.
        max_bins: Value passed to ``RandomForestClassifier.setMaxBins``. The
            default of 700 is the value this notebook previously hard-coded.

    Returns:
        The fitted random forest model.
    """
    import sys
    from pyspark.ml.classification import RandomForestClassifier

    training, test = df.randomSplit([0.7, 0.3], seed=seed)

    random_forest_estimator = (
        RandomForestClassifier()
        .setLabelCol("label")
        .setFeaturesCol("features")
        .setMaxBins(max_bins)
    )
    if seed is not None:
        random_forest_estimator = random_forest_estimator.setSeed(seed)
    model = random_forest_estimator.fit(training)

    # Both counts read the same predictions; cache them so the test set is
    # not scored twice, and release the cache once the counts are known.
    transaction_with_prediction = model.transform(test).cache()
    try:
        total_count = transaction_with_prediction.count()
        matching_count = transaction_with_prediction.where("prediction = label").count()
    finally:
        transaction_with_prediction.unpersist()

    sys.stderr.write("total data count is %d\n" % total_count)
    sys.stderr.write("count where label matches prediction %d\n" % matching_count)

    return model

In [4]:
# Initialize app config
import importlib
from demolib import schema, cfg, spark
from pyspark.ml import Pipeline

In [5]:
# Read the fraud and non-fraud collections from MongoDB
def _load_collection(collection):
    """Load one collection of the configured MongoDB database as a DataFrame."""
    reader = spark.read.format("mongo").options(
        uri=cfg.db.uri,
        database=cfg.db.name,
        collection=collection,
    )
    return reader.load()


fraud_df = _load_collection(cfg.db.fraud)
nonfraud_df = _load_collection(cfg.db.nonfraud)

# Stack both sets of transactions into a single cached DataFrame
transaction_df = fraud_df.union(nonfraud_df).cache()

In [6]:
# Fit the preprocessing pipeline on every transaction and persist the fitted model

column_names = ("cc_num", "category", "merchant", "distance", "amt", "age")

pipeline_stages = create_feature_pipeline(
    transaction_df.schema,
    column_names,
)
pipeline = Pipeline(stages=pipeline_stages)

preprocessing_transformer_model = pipeline.fit(transaction_df)
(
    preprocessing_transformer_model
    .write()
    .overwrite()
    .save(cfg.model.preprocessing.path)
)

# Apply the fitted stages so each transaction gains the assembled `features` column
feature_df = preprocessing_transformer_model.transform(transaction_df)

In [7]:
# Separate the two classes; the fraud rows keep only the model inputs

fraud_df = (
    feature_df
    .where("is_fraud = 1")
    .withColumnRenamed("is_fraud", "label")
    .select("features", "label")
)
nonfraud_df = feature_df.where("is_fraud = 0")

# The fraud row count is the target size for the balanced non-fraud set
fraud_count = fraud_df.count()

In [8]:
# Shrink the non-fraud set to the fraud set's size, then stack the two classes

balanced_df = create_balanced_dataframe(df=nonfraud_df, reduction_count=fraud_count)
final_feature_df = fraud_df.union(balanced_df)

In [9]:
# Train the prediction model on the balanced data, then persist it
random_forest_model = create_random_forest_model(df=final_feature_df)
model_writer = random_forest_model.write().overwrite()
model_writer.save(cfg.model.predict.path)

total data count is 311
count where label matches prediction 308
