In [1]:
# One-time setup: uncomment the next line if findspark is not installed in this kernel.
# !pip install findspark

In [2]:
import findspark
# Initialise findspark before any pyspark import is attempted in later cells.
findspark.init()
# Last expression of the cell, so its return value is shown as the cell output.
findspark.find()

'/usr/lib/spark'

# MODEL

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DoubleType

In [4]:
# Create (or reuse) the notebook's Spark session, named "hw7", requesting
# 10g of memory for both the executors and the driver.
_builder = SparkSession.builder.appName("hw7")
_builder = _builder.config("spark.executor.memory", "10g")
_builder = _builder.config("spark.driver.memory", "10g")
spark = _builder.getOrCreate()

In [5]:
# Explicit schema for the transactions dataset; every column is declared nullable.
# NOTE(review): "tranaction_id" (sic) is kept exactly as spelled -- the displayed
# data uses the same column name, so it presumably matches the stored file;
# confirm before renaming.
schema = (
    StructType()
    .add("tranaction_id", IntegerType(), True)
    .add("tx_datetime", StringType(), True)
    .add("customer_id", IntegerType(), True)
    .add("terminal_id", IntegerType(), True)
    .add("tx_amount", DoubleType(), True)
    .add("tx_time_seconds", IntegerType(), True)
    .add("tx_time_days", IntegerType(), True)
    .add("tx_fraud", IntegerType(), True)
    .add("tx_fraud_scenario", IntegerType(), True)
)

In [6]:
# Load the cleansed transactions from parquet using the explicit `schema`.
# The former `header=True` / `inferSchema=False` options were removed: they are
# CSV reader options and have no effect on a parquet read with an explicit schema.
df = spark.read.schema(schema).parquet(
    "/user/root/datasets/set02/data_cleansed.parquet"
)

In [7]:
# Total number of rows in the loaded dataset.
df.count()

45693570

In [8]:
# Draw a random sample with fraction 0.1 and a fixed seed so the draw is repeatable
# (presumably to keep the following experimentation steps lighter).
df_sample = df.sample(fraction=0.1, seed=3)

In [9]:
# Number of rows that ended up in the sample.
df_sample.count()

4568988

In [10]:
# Preview the first 10 sampled rows.
df_sample.show(10)

+-------------+-------------------+-----------+-----------+---------+---------------+------------+--------+-----------------+
|tranaction_id|        tx_datetime|customer_id|terminal_id|tx_amount|tx_time_seconds|tx_time_days|tx_fraud|tx_fraud_scenario|
+-------------+-------------------+-----------+-----------+---------+---------------+------------+--------+-----------------+
|   1836822545|2022-11-06 14:21:07|     573306|        844|    28.64|      101312467|        1172|       0|                0|
|   1836826849|2022-11-06 02:58:34|     576031|        328|    31.66|      101271514|        1172|       0|                0|
|   1836827722|2022-11-06 12:25:44|     576623|        688|     9.83|      101305544|        1172|       0|                0|
|   1836827855|2022-11-06 10:44:16|     576731|        877|    40.04|      101299456|        1172|       0|                0|
|   1836828249|2022-11-06 19:09:56|     576977|        191|    65.79|      101329796|        1172|       0|           

In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
# Combine the four numeric input columns into one vector column named "Features".
# With handleInvalid="skip", rows holding invalid values (e.g. nulls) are filtered
# out instead of raising an error (per the VectorAssembler documentation).
features_assembler = VectorAssembler(
    outputCol="Features",
    handleInvalid="skip",
    inputCols=["terminal_id", "tx_amount", "tx_time_seconds", "tx_time_days"],
)

In [13]:
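# Superseded: X_train is now produced by the fitted pipeline (feat_ext_pipe.transform) in a later cell.
# X_train = features_assembler.transform(df_sample)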

In [14]:
from pyspark.ml.pipeline import Pipeline

# Wrap the assembler in a single-stage Pipeline and fit it on the sample;
# feat_ext_pipe holds the fitted pipeline model.
feat_ext_pipe = Pipeline(stages=[features_assembler]).fit(df_sample)

In [15]:
# Persist the fitted feature pipeline, overwriting any previous save at this path.
# NOTE(review): despite the ".joblib" suffix this goes through the pipeline's own
# write()/save() API, not joblib; the suffix is just part of the path name.
feat_ext_pipe.write().overwrite().save("feat_ext_pipe.joblib")

In [16]:
from pyspark.ml.pipeline import PipelineModel

In [17]:
# Reload the pipeline from the path it was saved to above; feat_ext_pipe2 is the
# copy used for the inference example at the end of the notebook.
feat_ext_pipe2 = PipelineModel.load("feat_ext_pipe.joblib")

In [18]:
# Display the repr of the in-memory fitted pipeline (not the reloaded feat_ext_pipe2).
feat_ext_pipe

PipelineModel_5e5abb14a4e9

In [19]:
# Apply the fitted feature pipeline to the sample; the result gains a "Features" column.
X_train = feat_ext_pipe.transform(df_sample)

In [20]:
# Preview the first 10 rows of the training frame, including the assembled "Features" column.
X_train.show(10)

+-------------+-------------------+-----------+-----------+---------+---------------+------------+--------+-----------------+--------------------+
|tranaction_id|        tx_datetime|customer_id|terminal_id|tx_amount|tx_time_seconds|tx_time_days|tx_fraud|tx_fraud_scenario|            Features|
+-------------+-------------------+-----------+-----------+---------+---------------+------------+--------+-----------------+--------------------+
|   1836822545|2022-11-06 14:21:07|     573306|        844|    28.64|      101312467|        1172|       0|                0|[844.0,28.64,1.01...|
|   1836826849|2022-11-06 02:58:34|     576031|        328|    31.66|      101271514|        1172|       0|                0|[328.0,31.66,1.01...|
|   1836827722|2022-11-06 12:25:44|     576623|        688|     9.83|      101305544|        1172|       0|                0|[688.0,9.83,1.013...|
|   1836827855|2022-11-06 10:44:16|     576731|        877|    40.04|      101299456|        1172|       0|           

In [21]:
# Show a single sampled row; its values are reused below to build the inference example.
df_sample.show(1)

+-------------+-------------------+-----------+-----------+---------+---------------+------------+--------+-----------------+
|tranaction_id|        tx_datetime|customer_id|terminal_id|tx_amount|tx_time_seconds|tx_time_days|tx_fraud|tx_fraud_scenario|
+-------------+-------------------+-----------+-----------+---------+---------------+------------+--------+-----------------+
|   1836822545|2022-11-06 14:21:07|     573306|        844|    28.64|      101312467|        1172|       0|                0|
+-------------+-------------------+-----------+-----------+---------+---------------+------------+--------+-----------------+
only showing top 1 row



In [22]:
# List the column names of the sample.
df_sample.columns

['tranaction_id',
 'tx_datetime',
 'customer_id',
 'terminal_id',
 'tx_amount',
 'tx_time_seconds',
 'tx_time_days',
 'tx_fraud',
 'tx_fraud_scenario']

In [23]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
# Small random forest (3 trees, depth 2) predicting "tx_fraud" from the assembled
# "Features" vector; the per-tree leaf indices are written to "leafId".
rf = RandomForestClassifier(
    featuresCol="Features",
    labelCol="tx_fraud",
    numTrees=3,
    maxDepth=2,
    seed=42,
    leafCol="leafId",
)

In [24]:
# Train the random forest on the assembled training frame.
model = rf.fit(X_train)

In [25]:
# Persist the trained model, overwriting any previous save at this path.
# NOTE(review): the ".joblib" suffix is only a name; the model's own write()/save()
# API is used here, not joblib.
model.write().overwrite().save("spark_model.joblib")

In [26]:
# Reload the trained model from the path it was saved to above; model2 is the copy
# used for the inference example below.
model2 = RandomForestClassificationModel.load("spark_model.joblib")

In [27]:
# Display the training schema for reference before defining the inference schema.
schema

StructType(List(StructField(tranaction_id,IntegerType,true),StructField(tx_datetime,StringType,true),StructField(customer_id,IntegerType,true),StructField(terminal_id,IntegerType,true),StructField(tx_amount,DoubleType,true),StructField(tx_time_seconds,IntegerType,true),StructField(tx_time_days,IntegerType,true),StructField(tx_fraud,IntegerType,true),StructField(tx_fraud_scenario,IntegerType,true)))

In [28]:
# Inference-time schema: the same fields as the training `schema`, minus the two
# label columns (presumably because labels are unknown when scoring new rows).
# Derived from `schema` instead of being retyped by hand, so the two definitions
# cannot drift apart.
schema_inf = StructType(
    [
        field
        for field in schema.fields
        if field.name not in ("tx_fraud", "tx_fraud_scenario")
    ]
)

In [29]:
# One-row inference example: the values match the first row shown for df_sample,
# without the label columns.
test0 = spark.createDataFrame(
    data=[(1836822545, "2022-11-06 14:21:07", 573306, 844, 28.64, 101312467, 1172)],
    schema=schema_inf,
)

In [30]:
# Display the inference frame's repr (column names and types).
test0

DataFrame[tranaction_id: int, tx_datetime: string, customer_id: int, terminal_id: int, tx_amount: double, tx_time_seconds: int, tx_time_days: int]

In [31]:
# Score a single feature vector directly: run the reloaded feature pipeline on the
# example, take the first row, and pass its "Features" vector to the reloaded model.
model2.predict(
    feat_ext_pipe2.transform(test0).head()["Features"]
)

0.0

In [32]:
# Feature-engineered inference frame (the example row plus its "Features" column),
# produced with the reloaded pipeline.
inf = feat_ext_pipe2.transform(test0)

In [33]:
# DataFrame-based scoring with the reloaded model; the displayed result adds the
# rawPrediction, probability, prediction and leafId columns.
model2.transform(inf).show()

+-------------+-------------------+-----------+-----------+---------+---------------+------------+--------------------+--------------------+--------------------+----------+-------------+
|tranaction_id|        tx_datetime|customer_id|terminal_id|tx_amount|tx_time_seconds|tx_time_days|            Features|       rawPrediction|         probability|prediction|       leafId|
+-------------+-------------------+-----------+-----------+---------+---------------+------------+--------------------+--------------------+--------------------+----------+-------------+
|   1836822545|2022-11-06 14:21:07|     573306|        844|    28.64|      101312467|        1172|[844.0,28.64,1.01...|[2.91377163899818...|[0.97125721299939...|       0.0|[0.0,0.0,0.0]|
+-------------+-------------------+-----------+-----------+---------+---------------+------------+--------------------+--------------------+--------------------+----------+-------------+



In [34]:
# List the columns of the feature-engineered inference frame.
inf.columns

['tranaction_id',
 'tx_datetime',
 'customer_id',
 'terminal_id',
 'tx_amount',
 'tx_time_seconds',
 'tx_time_days',
 'Features']