In [15]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, GBTClassifier
from pyspark.sql.types import DoubleType

In [7]:
# Create (or reuse) a Spark session running on the 'local' master.
spark = (
    SparkSession.builder
    .master('local')
    .appName('MiNADZD')
    .getOrCreate()
)

# Read the CSV using its first row as the header and letting Spark infer the
# column types, then cast 'Class' to double (it is used as the label column
# by the classifier further down).
df = spark.read.csv(path='./creditcard.csv', header=True, inferSchema=True)
df = df.withColumn('Class', df['Class'].cast(DoubleType()))

In [16]:
# Row/column count summary ("Rozmiar danych" = "data size"); df.count() runs a Spark job.
f'Rozmiar danych: {df.count()} rows x {len(df.columns)} columns.'

'Rozmiar danych: 284807 rows x 32 columns.'

In [17]:
# Print the first five rows as a quick look at the loaded data.
df.show(n=5)

+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+--------------------+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|    

In [18]:
listColumns = df.columns

# Only assemble once: re-running this cell after 'features' already exists
# would make VectorAssembler fail on the duplicate output column.
if 'features' not in listColumns:
    # Input columns: 'Time', 'Amount', then V1 .. V28.
    inputColumns = ['Time', 'Amount'] + ['V' + str(idx) for idx in range(1, 29)]

    # Pack all input columns into a single vector column named 'features'.
    assembler = VectorAssembler(inputCols=inputColumns, outputCol='features')
    df = assembler.transform(df)

In [19]:
# Scale each row's 'features' vector to unit p-norm with p=1.0 and store the
# result in a new 'normFeatures' column.
normalizer = Normalizer(
    inputCol="features",
    outputCol="normFeatures",
    p=1.0,
)
liNormData = normalizer.transform(df)

# Preview the first five normalised vectors.
liNormData.select('normFeatures').show(n=5)

+--------------------+
|        normFeatures|
+--------------------+
|[0.0,0.9190068131...|
|[0.0,0.2234773306...|
|[0.00247767129574...|
|[0.00708831461517...|
|[0.02296611443072...|
+--------------------+
only showing top 5 rows



In [20]:
# 80/20 train/test split. A fixed seed is passed so that re-running the
# notebook yields the same partition instead of a fresh random split
# (and therefore different model metrics) on every execution.
trainingNormData, testNormData = liNormData.randomSplit([0.8, 0.2], seed=42)

In [21]:
# Random forest on the normalised feature vectors, with 'Class' as the label.
# The seed is set explicitly so the fitted model is reproducible across runs.
algo = RandomForestClassifier(
    featuresCol='normFeatures',
    labelCol='Class',
    seed=42,
)
model = algo.fit(trainingNormData)

In [22]:
# Apply the fitted model to the held-out split; the resulting frame carries
# the 'prediction' and 'probability' columns inspected in the next cell.
predictions = model.transform(dataset=testNormData)

In [14]:
# Compare the true label with the predicted label and class probabilities
# (show() prints the first 20 rows by default).
predictions.select('Class', 'prediction', 'probability').show()

+-----+----------+--------------------+
|Class|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|[0.99565752218895...|
|  0.0|       0.0|[0.99826323486015...|
|  0.0|       0.0|[0.98984277541625...|
|  0.0|       0.0|[0.98425867939652...|
|  0.0|       0.0|[0.95913818617477...|
|  0.0|       0.0|[0.99937818976364...|
|  0.0|       0.0|[0.97063808148246...|
|  0.0|       0.0|[0.98805884385839...|
|  0.0|       0.0|[0.89754361496148...|
|  0.0|       0.0|[0.98390885512830...|
|  0.0|       0.0|[0.99546774777332...|
|  0.0|       0.0|[0.97603780930019...|
|  0.0|       0.0|[0.98648079891563...|
|  0.0|       0.0|[0.99950579939522...|
|  0.0|       0.0|[0.99919507357452...|
|  0.0|       0.0|[0.99726515473129...|
|  0.0|       0.0|[0.99950579939522...|
|  0.0|       0.0|[0.94856509275325...|
|  0.0|       0.0|[0.99949060736185...|
|  0.0|       0.0|[0.94591477719349...|
+-----+----------+--------------------+
only showing top 20 rows

