### Import settings

In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = 'pyspark-shell'

In [2]:
import findspark
findspark.init('/opt/spark')
from pyspark import SparkContext,SparkConf
conf = (SparkConf()
         .setMaster("spark://10.200.5.39:7077")
         .set("spark.executor.memory","30g")
         .set("spark.sql.autoBroadcastJoinThreshold", "-1")
         .setAppName("Classification"))
sc = SparkContext()

In [11]:
import pyspark
import binascii
from pyspark.sql import SQLContext
from functools import reduce
from sklearn import tree
from pyspark.sql.types import *
import pygraphviz
import pyspark.sql.functions as f
import networkx as nx
from networkx.readwrite import json_graph
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn import datasets,metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
import numpy as np
import json
import tensorflow as tf
from IPython.display import Image
from networkx.drawing.nx_pydot import write_dot
import pickle

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
sqlContext = SQLContext(sc)

In [4]:
path = "hdfs://10.200.5.25:9001/user/titanium/"

### Functions space

In [5]:
def transform_label_to_id(df,new_col,old_col):
    df = df.withColumn(new_col,f.when(f.col(old_col).like("b%"),0).otherwise(f.col(old_col)))
    df = df.withColumn(new_col,f.when(f.col(old_col).like("e%"),1).otherwise(f.col(new_col)))
    df = df.withColumn(new_col,f.when(f.col(old_col).like("c%"),2).otherwise(f.col(new_col)))
    df = df.withColumn(new_col,f.when(f.col(old_col).like("o%"),3).otherwise(f.col(new_col)))
    df = df.withColumn(new_col,f.when(f.col(old_col).like("Coin%"),3).otherwise(f.col(new_col)))
    return df

def change_label_new_df(df,new_col,old_col,t):
    df = df.withColumn(new_col,f.when(f.col(old_col).like("b%"),f.concat(f.lit(t*"b"),f.col(old_col))).otherwise(f.col(old_col)))
    df = df.withColumn(new_col,f.when(f.col(old_col).like("e%"),f.concat(f.lit(t*"e"),f.col(old_col))).otherwise(f.col(new_col)))
    df = df.withColumn(new_col,f.when(f.col(old_col).like("c%"),f.concat(f.lit(t*"c"),f.col(old_col))).otherwise(f.col(new_col)))
    df = df.withColumn(new_col,f.when(f.col(old_col).like("o%"),f.concat(f.lit(t*"o"),f.col(old_col))).otherwise(f.col(new_col)))
    return df

schema = StructType([
    StructField("label", StringType(), True),
    StructField("user", StringType(), True),
])

def join_dataframe_intag(DF,pref):
    DF=DF.groupBy("user").agg(f.collect_list("label").alias("list"))
    DF = DF.select("user",f.explode("list").alias("value")).groupBy("user","value").agg(f.count("value").alias("cnt"))
    DF2 = DF.groupBy("user").agg(f.count("user").alias("cnt")).drop("cnt")
    DF2 = DF2.alias("a").join(DF.alias('b'),(f.col("a.user")==f.col("b.user")) & (f.col("b.value")==0),"leftouter").select("a.user","b.cnt").withColumnRenamed("cnt",pref+"_cnt0")
    DF2 = DF2.alias("a").join(DF.alias('b'),(f.col("a.user")==f.col("b.user")) & (f.col("b.value")==1),"leftouter").select("a.user",pref+"_cnt0","b.cnt").withColumnRenamed("cnt",pref+"_cnt1")
    DF2 = DF2.alias("a").join(DF.alias('b'),(f.col("a.user")==f.col("b.user")) & (f.col("b.value")==2),"leftouter").select("a.user",pref+"_cnt0",pref+"_cnt1","b.cnt").withColumnRenamed("cnt",pref+"_cnt2")
    DF2 = DF2.alias("a").join(DF.alias('b'),(f.col("a.user")==f.col("b.user")) & (f.col("b.value")==3),"leftouter").select("a.user",pref+"_cnt0",pref+"_cnt1",pref+"_cnt2","b.cnt").withColumnRenamed("cnt",pref+"_cnt3")

    DF2=DF2.fillna(0)
    return DF2

def link_outclass_intag(user,b):
    dd = [(user[i][0], int(b[i])) for i in range(len(user))]
    df = sqlContext.createDataFrame(sc.parallelize(dd),schema=["user", "label"])
    return df



In [6]:
# Class Definition
# 0: user
# 1: exchange
# 2: casino
# 3: coinbase

### Import dataframe

In [7]:
syntethic_directory="synthetic_data/dataframe_exported/data160119/"

entity_d1= sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true',inferSchema='true')\
.load(path+syntethic_directory+"entity_feature/")

address_d1 = sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true',inferSchema='true')\
.load(path+syntethic_directory+"address_feature/")

motif1_d1 = sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true',inferSchema='true')\
.load(path+syntethic_directory+"motifs_1/")

motif2_d1 = sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true',inferSchema='true')\
.load(path+syntethic_directory+"motifs_2/")

In [8]:
path_directory="synthetic_data/dataframe_exported/data290119/"

entity_d2= sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true',inferSchema='true')\
.load(path+syntethic_directory+"entity_feature/")

address_d2 = sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true',inferSchema='true')\
.load(path+syntethic_directory+"address_feature/")

motif1_d2 = sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true',inferSchema='true')\
.load(path+syntethic_directory+"motifs_1/")

motif2_d2 = sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true',inferSchema='true')\
.load(path+syntethic_directory+"motifs_2/")

In [9]:
path_directory="synthetic_data/dataframe_exported/data310119/"

entity_d3= sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true',inferSchema='true')\
.load(path+syntethic_directory+"entity_feature/")

address_d3 = sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true',inferSchema='true')\
.load(path+syntethic_directory+"address_feature/")

motif1_d3 = sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true',inferSchema='true')\
.load(path+syntethic_directory+"motifs_1/")

motif2_d3 = sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true',inferSchema='true')\
.load(path+syntethic_directory+"motifs_2/")

In [10]:
####################################################################
#               REDEFINITION SOME LABEL (AVOID OVERLAP TAG)
####################################################################

entity_d2 = change_label_new_df(entity_d2,"user","user",1)
address_d2 = change_label_new_df(address_d2,"user","user",1)
motif1_d2 = change_label_new_df(motif1_d2,"outuser","outuser",1)
motif1_d2 = change_label_new_df(motif1_d2,"inuser","inuser",1)
motif2_d2 = change_label_new_df(motif2_d2,"outuser","outuser",1)
motif2_d2 = change_label_new_df(motif2_d2,"miduser","miduser",1)
motif2_d2 = change_label_new_df(motif2_d2,"inuser","inuser",1)

entity_d3 = change_label_new_df(entity_d3,"user","user",2)
address_d3 = change_label_new_df(address_d3,"user","user",2)
motif1_d3 = change_label_new_df(motif1_d3,"outuser","outuser",2)
motif1_d3 = change_label_new_df(motif1_d3,"inuser","inuser",2)
motif2_d3 = change_label_new_df(motif2_d3,"outuser","outuser",2)
motif2_d3 = change_label_new_df(motif2_d3,"miduser","miduser",2)
motif2_d3 = change_label_new_df(motif2_d3,"inuser","inuser",2)

####################################################################
#               JOIN DATAFRAME
####################################################################
entity_data1 = entity_d1.union(entity_d2)
address_data1= address_d1.union(address_d2)
motif1_data1 = motif1_d1.union(motif1_d2)
motif2_data1 = motif2_d1.union(motif2_d2)

entity_data1 = entity_data1.union(entity_d3)
address_data1= address_data1.union(address_d3)
motif1_data1 = motif1_data1.union(motif1_d3)
motif2_data1 = motif2_data1.union(motif2_d3)

### Training

In [12]:
####################################################################
#               ENTITY ML
####################################################################

entity_feature = entity_data1.filter(f.col("user")!="Unknow").fillna(0)

#Transform label to class index 
entity_feature=transform_label_to_id(entity_feature,"label","user")

df_entity=entity_feature.randomSplit([0.7,0.3])

#Split the input and the output from dataframe 
ent_X=df_entity[0].select("balance_recv","balancein","balance","count_recv","count_sent","add_out","add_in")
ent_y=df_entity[0].select("label")
#Round amount field
ent_X = ent_X.withColumn("balance_recv", f.round(ent_X["balance_recv"], 6))
ent_X = ent_X.withColumn("balancein", f.round(ent_X["balancein"], 6))
ent_X = ent_X.withColumn("balance", f.round(ent_X["balance"], 6))

#Transform input/output dataframe in vector
X_train_ent= ent_X.collect()
y_train_ent = ent_y.collect()

#Reshape the output vector
y_train_ent=np.reshape(y_train_ent,(len(y_train_ent),))

In [13]:
#Split the input and the output from dataframe 
ent_X=df_entity[1].select("balance_recv","balancein","balance","count_recv","count_sent","add_out","add_in")
ent_y=df_entity[1].select("label")
#Round amount field
ent_X = ent_X.withColumn("balance_recv", f.round(ent_X["balance_recv"], 6))
ent_X = ent_X.withColumn("balancein", f.round(ent_X["balancein"], 6))
ent_X = ent_X.withColumn("balance", f.round(ent_X["balance"], 6))

#Transform input/output dataframe in vector
X_test_ent= ent_X.collect()
y_test_ent = ent_y.collect()

y_test_ent=np.reshape(y_test_ent,(len(y_test_ent),))

In [14]:
####################################################################
#               ADABOOST ML
####################################################################

#Create adaboost classifer object
abc_ent = AdaBoostClassifier(n_estimators=50,learning_rate=1)

#Train Adaboost Classifer
adaboost_ent = abc_ent.fit(X_train_ent, y_train_ent)

####################################################################
#               RANDOM FOREST ML
####################################################################

#Create random forest classifer object
rfc_ent= RandomForestClassifier(n_jobs=2, random_state=0)

#Train Randomforest Classifer
randomforest_ent=rfc_ent.fit(X_train_ent, y_train_ent)


#Create a Gaussian Classifier
model_nb = GaussianNB()
model_nb_ent=model_nb.fit(X_train_ent,y_train_ent)

model_knn = KNeighborsClassifier(n_neighbors=7)
modelknn_ent=model_knn.fit(X_train_ent,y_train_ent)

mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
mlp_ent=mlp.fit(X_train_ent,y_train_ent)



In [17]:
#Predict the response for test dataset
y_ent_prediction_ada = adaboost_ent.predict(X_test_ent)
y_ent_prediction_rf = randomforest_ent.predict(X_test_ent) 
y_ent_prediction_knn = modelknn_ent.predict(X_test_ent) 
y_ent_prediction_mlp = mlp_ent.predict(X_test_ent) 
y_ent_prediction_nb = model_nb_ent.predict(X_test_ent) 

#Accuracy with the same dataset
print("###################################")
print("Train=%s; Test=%s;  Accuracy Adaboost:%s" %(len(X_train_ent),len(X_test_ent),metrics.accuracy_score(y_test_ent, y_ent_prediction_ada)))
print(matthews_corrcoef(y_test_ent,y_ent_prediction_ada))
print(classification_report(y_test_ent,y_ent_prediction_ada))
print("Train=%s; Test=%s;  Accuracy RandomForest:%s" %(len(X_train_ent),len(X_test_ent),metrics.accuracy_score(y_test_ent, y_ent_prediction_rf)))
print(matthews_corrcoef(y_test_ent,y_ent_prediction_rf))
print(classification_report(y_test_ent,y_ent_prediction_rf))
print("Train=%s; Test=%s;  Accuracy KNN:%s" %(len(X_train_ent),len(X_test_ent),metrics.accuracy_score(y_test_ent, y_ent_prediction_knn)))
print(matthews_corrcoef(y_test_ent,y_ent_prediction_knn))
print(classification_report(y_test_ent,y_ent_prediction_knn))

###################################
Train=834; Test=366;  Accuracy Adaboost:1.0
1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       306
           1       1.00      1.00      1.00        36
           2       1.00      1.00      1.00        24

   micro avg       1.00      1.00      1.00       366
   macro avg       1.00      1.00      1.00       366
weighted avg       1.00      1.00      1.00       366

Train=834; Test=366;  Accuracy RandomForest:0.9918032786885246
0.9725002488924304
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       306
           1       1.00      1.00      1.00        36
           2       0.89      1.00      0.94        24

   micro avg       0.99      0.99      0.99       366
   macro avg       0.96      1.00      0.98       366
weighted avg       0.99      0.99      0.99       366

Train=834; Test=366;  Accuracy KNN:0.9890710382513661
0.9636880142409209

In [14]:
####################################################################
#               ADDRESS ML
####################################################################

address_data1 = address_data1.filter(f.col("user")!="Unknow")

#Transform label to class index 
address_feature=transform_label_to_id(address_data1,"label","user")

df_address=address_feature.randomSplit([0.7,0.3])

#Split the input and the output from dataframe 
add_X=df_address[0].select("count_rec","totamount_rec","count_sent","totamount_sent","balance","unique","sibling")
add_y=df_address[0].select("label")
#Round amount field
add_X = add_X.withColumn("totamount_rec", f.round(add_X["totamount_rec"], 6))
add_X = add_X.withColumn("totamount_sent", f.round(add_X["totamount_sent"], 6))
add_X = add_X.withColumn("balance", f.round(add_X["balance"], 6))

#Transform input/output dataframe in vector
X_train_add = add_X.collect()
y_train_add = add_y.collect()

#Reshape the output vector
y_train_add=np.reshape(y_train_add,(len(y_train_add),))

In [15]:
#Split the input and the output from dataframe 
add_X=df_address[1].select("count_rec","totamount_rec","count_sent","totamount_sent","balance","unique","sibling")
add_y=df_address[1].select("label")
#Round amount field
add_X = add_X.withColumn("totamount_rec", f.round(add_X["totamount_rec"], 6))
add_X = add_X.withColumn("totamount_sent", f.round(add_X["totamount_sent"], 6))
add_X = add_X.withColumn("balance", f.round(add_X["balance"], 6))

#Transform input/output dataframe in vector
X_test_add = add_X.collect()
y_test_add = add_y.collect()

#Reshape the output vector
y_test_add=np.reshape(y_test_add,(len(y_test_add),))

In [16]:
####################################################################
#               ADABOOST ML
####################################################################

#Create adaboost classifer object
abc_add = AdaBoostClassifier(n_estimators=50,learning_rate=1)

#Train Adaboost Classifer
adaboost_add = abc_add.fit(X_train_add, y_train_add)

####################################################################
#               RANDOM FOREST ML
####################################################################

#Create random forest classifer object
rfc_add= RandomForestClassifier(n_jobs=2, random_state=0)

#Train Randomforest Classifer
randomforest_add=rfc_add.fit(X_train_add, y_train_add)


#Create a Gaussian Classifier
model_nb = GaussianNB()
model_nb_add=model_nb.fit(X_train_add, y_train_add)

model_knn = KNeighborsClassifier(n_neighbors=7)
modelknn_add=model_knn.fit(X_train_add, y_train_add)

mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
mlp_add=mlp.fit(X_train_add, y_train_add)



In [18]:
#Predict the response for test dataset
y_add_prediction_ada = adaboost_add.predict(X_test_add)
y_add_prediction_rf = randomforest_add.predict(X_test_add) 
y_add_prediction_knn = modelknn_add.predict(X_test_add) 
y_add_prediction_mlp = mlp_add.predict(X_test_add) 
y_add_prediction_nb = model_nb_add.predict(X_test_add) 

#Accuracy with the same dataset
print("Train dataset = Test dataset")
print("Train=%s; Test=%s;  Accuracy Adaboost:%s" %(len(X_train_add),len(X_test_add),metrics.accuracy_score(y_test_add, y_add_prediction_ada)))
print(classification_report(y_test_add,y_add_prediction_ada))
print("Train=%s; Test=%s;  Accuracy RandomForest:%s" %(len(X_train_add),len(X_test_add),metrics.accuracy_score(y_test_add, y_add_prediction_rf)))
print(classification_report(y_test_add,y_add_prediction_rf))
print("Train=%s; Test=%s;  Accuracy KNN:%s" %(len(X_train_add),len(X_test_add),metrics.accuracy_score(y_test_add, y_add_prediction_knn)))
print(classification_report(y_test_add,y_add_prediction_knn))
print("Train=%s; Test=%s;  Accuracy MLP:%s" %(len(X_train_add),len(X_test_add),metrics.accuracy_score(y_test_add, y_add_prediction_mlp)))
print(classification_report(y_test_add,y_add_prediction_nb))
print("Train=%s; Test=%s;  Accuracy NB:%s" %(len(X_train_add),len(X_test_add),metrics.accuracy_score(y_test_add, y_add_prediction_nb)))
print(classification_report(y_test_add,y_add_prediction_nb))

Train dataset = Test dataset
Train=92695; Test=39788;  Accuracy Adaboost:0.8948175329244998
              precision    recall  f1-score   support

           0       0.90      0.99      0.94     35660
           1       0.37      0.07      0.12      2806
           2       0.94      0.07      0.12      1322

   micro avg       0.89      0.89      0.89     39788
   macro avg       0.74      0.38      0.39     39788
weighted avg       0.87      0.89      0.86     39788

Train=92695; Test=39788;  Accuracy RandomForest:0.9654167085553433
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     35660
           1       0.85      0.79      0.82      2806
           2       0.87      0.76      0.81      1322

   micro avg       0.97      0.97      0.97     39788
   macro avg       0.90      0.85      0.87     39788
weighted avg       0.96      0.97      0.96     39788

Train=92695; Test=39788;  Accuracy KNN:0.8992912435910325
              precisi

In [10]:
####################################################################
#               MOTIFS-1 ML
####################################################################

motif1_feature=motif1_data1.withColumnRenamed("outuser","user")


#Transform label to class index (input (?) and output)
motif1_feature=transform_label_to_id(motif1_feature,"label","user")
motif1_feature=transform_label_to_id(motif1_feature,"labelin","inuser")

motif1_feature =motif1_feature.withColumn("labelin", f.col("labelin").cast("integer"))

df_motifs1=motif1_feature.randomSplit([0.7,0.3])

#Split the input and the output from dataframe 
#mot1_X=df_motifs1[0].select("labelin","address_recv_dist","amount_recv","tx_sent","address_sent_dist","amount_sent","tx_recv_tot","fees","loop_in_out","direct_in_out")
mot1_X=df_motifs1[0].select("address_recv_dist","amount_recv","tx_sent","address_sent_dist","amount_sent","tx_recv_tot","fees","loop_in_out","direct_in_out")
mot1_y=df_motifs1[0].select("label")

#Round amount field
mot1_X = mot1_X.withColumn("amount_recv", f.round(mot1_X["amount_recv"], 6))
mot1_X = mot1_X.withColumn("amount_sent", f.round(mot1_X["amount_sent"], 6))
mot1_X = mot1_X.withColumn("fees", f.round(mot1_X["fees"], 6))

#Transform input/output dataframe in vector
X_train_mot1 = mot1_X.collect()
y_train_mot1 = mot1_y.collect()

#Reshape the output vector
y_train_mot1=np.reshape(y_train_mot1,(len(y_train_mot1),))

In [11]:
#Split the input and the output from dataframe 
#mot1_X=df_motifs1[1].select("labelin","address_recv_dist","amount_recv","tx_sent","address_sent_dist","amount_sent","tx_recv_tot","fees","loop_in_out","direct_in_out")
mot1_X=df_motifs1[1].select("address_recv_dist","amount_recv","tx_sent","address_sent_dist","amount_sent","tx_recv_tot","fees","loop_in_out","direct_in_out")
mot1_y=df_motifs1[1].select("label")

#Round amount field
mot1_X = mot1_X.withColumn("amount_recv", f.round(mot1_X["amount_recv"], 6))
mot1_X = mot1_X.withColumn("amount_sent", f.round(mot1_X["amount_sent"], 6))
mot1_X = mot1_X.withColumn("fees", f.round(mot1_X["fees"], 6))

#Transform input/output dataframe in vector
X_test_mot1 = mot1_X.collect()
y_test_mot1 = mot1_y.collect()

#Reshape the output vector
y_test_mot1=np.reshape(y_test_mot1,(len(y_test_mot1),))

In [15]:
####################################################################
#               ADABOOST ML
####################################################################

# Create adaboost classifer object
abc_mot1 = AdaBoostClassifier(n_estimators=50,learning_rate=1)

# Train Adaboost Classifer
adaboost_mot1 = abc_mot1.fit(X_train_mot1, y_train_mot1)

####################################################################
#               RANDOM FOREST ML
####################################################################

#Create random forest classifer object
rfc_mot1= RandomForestClassifier(n_jobs=2, random_state=0)

#Train Randomforest Classifer
randomforest_mot1=rfc_mot1.fit(X_train_mot1, y_train_mot1)


model_knn = KNeighborsClassifier(n_neighbors=7)
modelknn_mot1=model_knn.fit(X_train_mot1, y_train_mot1)





In [16]:
#Predict the response for test dataset
y_mot1_prediction_ada  = adaboost_mot1.predict(X_test_mot1)
y_mot1_prediction_rf  = randomforest_mot1.predict(X_test_mot1 ) 
y_mot1_prediction_knn = modelknn_mot1.predict(X_test_mot1) 

#Accuracy with the same dataset
print("Train dataset = Test dataset")
print("Train=%s; Test=%s; Accuracy Adaboost:%s" %(len(X_train_mot1 ),len(X_test_mot1 ),metrics.accuracy_score(y_test_mot1 , y_mot1_prediction_ada )))
print(classification_report(y_test_mot1,y_mot1_prediction_ada))
print("Train=%s; Test=%s; Accuracy RandomForest:%s" %(len(X_train_mot1 ),len(X_test_mot1 ),metrics.accuracy_score(y_test_mot1 , y_mot1_prediction_rf )))
print(classification_report(y_test_mot1,y_mot1_prediction_rf))
print("Train=%s; Test=%s;  Accuracy KNN:%s" %(len(X_train_mot1),len(X_test_mot1),metrics.accuracy_score(y_test_mot1, y_mot1_prediction_knn)))
print(classification_report(y_test_mot1,y_mot1_prediction_knn))

Train dataset = Test dataset
Train=92398; Test=39545; Accuracy Adaboost:0.8995827538247566
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     35408
           1       0.53      0.02      0.04      2829
           2       0.48      0.09      0.15      1308

   micro avg       0.90      0.90      0.90     39545
   macro avg       0.64      0.37      0.38     39545
weighted avg       0.86      0.90      0.86     39545

Train=92398; Test=39545; Accuracy RandomForest:0.9855860412188646
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     35408
           1       0.95      0.89      0.92      2829
           2       0.91      0.85      0.88      1308

   micro avg       0.99      0.99      0.99     39545
   macro avg       0.95      0.91      0.93     39545
weighted avg       0.99      0.99      0.99     39545

Train=92398; Test=39545;  Accuracy KNN:0.906713870274371
              precision 

In [17]:
####################################################################
#               MOTIFS-2 ML
####################################################################

motif2_data1=motif2_data1.fillna(0,subset=["amount_sent_from_in","fee1"])
motif2_feature=motif2_data1.withColumnRenamed("outuser","user")

#Transform label to class index (input (?) middle (?) and output)
motif2_feature=transform_label_to_id(motif2_feature,"label","user")
motif2_feature=transform_label_to_id(motif2_feature,"labelin","inuser")
motif2_feature=transform_label_to_id(motif2_feature,"labelmid","miduser")

motif2_feature =motif2_feature.withColumn("labelin", f.col("labelin").cast("integer"))
motif2_feature =motif2_feature.withColumn("labelmid", f.col("labelmid").cast("integer"))

df_motifs2=motif2_feature.randomSplit([0.7,0.3])

#Split the input and the output from dataframe 
#mot2_X=df_motifs2[0].select("labelin","labelmid","address_recv_dist_to_out","amount_recv_to_out","fee2",\
#                         "tx_sent_from_mid","address_sent_from_mid","amount_sent_from_mid",\
#                         "address_recv_to_mid","amount_recv_to_mid","tx_sent_from_in","address_sent_from_in",\
#                         "amount_sent_from_in","fee1","loop_mid_out","loop_in_mid","loop_in_out",\
#                         "direct_mid_out","direct_in_mid","direct_in_out")

mot2_X=df_motifs2[0].select("address_recv_dist_to_out","amount_recv_to_out","fee2",\
                         "tx_sent_from_mid","address_sent_from_mid","amount_sent_from_mid",\
                         "address_recv_to_mid","amount_recv_to_mid","tx_sent_from_in","address_sent_from_in",\
                         "amount_sent_from_in","fee1","loop_mid_out","loop_in_mid","loop_in_out",\
                         "direct_mid_out","direct_in_mid","direct_in_out")

mot2_y=df_motifs2[0].select("label")

#Round amount field
mot2_X = mot2_X.withColumn("amount_recv_to_out", f.round(mot2_X["amount_recv_to_out"], 6))
mot2_X = mot2_X.withColumn("amount_sent_from_mid", f.round(mot2_X["amount_sent_from_mid"], 6))
mot2_X = mot2_X.withColumn("amount_recv_to_mid", f.round(mot2_X["amount_recv_to_mid"], 6))
mot2_X = mot2_X.withColumn("amount_sent_from_in", f.round(mot2_X["amount_sent_from_in"], 6))
mot2_X = mot2_X.withColumn("fee2", f.round(mot2_X["fee2"], 6))
mot2_X = mot2_X.withColumn("fee1", f.round(mot2_X["fee1"], 6))

#Transform input/output dataframe in vector
X_train_mot2= mot2_X.collect()
y_train_mot2 = mot2_y.collect()

#Reshape the output vector
y_train_mot2=np.reshape(y_train_mot2,(len(y_train_mot2),))

In [24]:
#Split the input and the output from dataframe 
mot2_X=df_motifs2[1].select("labelin","labelmid","address_recv_dist_to_out","amount_recv_to_out","fee2",\
                         "tx_sent_from_mid","address_sent_from_mid","amount_sent_from_mid",\
                         "address_recv_to_mid","amount_recv_to_mid","tx_sent_from_in","address_sent_from_in",\
                         "amount_sent_from_in","fee1","loop_mid_out","loop_in_mid","loop_in_out",\
                         "direct_mid_out","direct_in_mid","direct_in_out")

mot2_y=df_motifs2[1].select("label")

#Round amount field
mot2_X = mot2_X.withColumn("amount_recv_to_out", f.round(mot2_X["amount_recv_to_out"], 6))
mot2_X = mot2_X.withColumn("amount_sent_from_mid", f.round(mot2_X["amount_sent_from_mid"], 6))
mot2_X = mot2_X.withColumn("amount_recv_to_mid", f.round(mot2_X["amount_recv_to_mid"], 6))
mot2_X = mot2_X.withColumn("amount_sent_from_in", f.round(mot2_X["amount_sent_from_in"], 6))
mot2_X = mot2_X.withColumn("fee2", f.round(mot2_X["fee2"], 6))
mot2_X = mot2_X.withColumn("fee1", f.round(mot2_X["fee1"], 6))

#Transform input/output dataframe in vector
X_test_mot2= mot2_X.collect()
y_test_mot2 = mot2_y.collect()

#Reshape the output vector
y_test_mot2=np.reshape(y_test_mot2,(len(y_test_mot2),))

In [18]:
####################################################################
#               ADABOOST ML
####################################################################

# Create adaboost classifer object
abc_mot2 = AdaBoostClassifier(n_estimators=50,learning_rate=1)

# Train Adaboost Classifer
adaboost_mot2 = abc_mot2.fit(X_train_mot2, y_train_mot2)

####################################################################
#               RANDOM FOREST ML
####################################################################

#Create random forest classifer object
rfc_mot2= RandomForestClassifier(n_jobs=2, random_state=0)

#Train Randomforest Classifer
randomforest_mot2=rfc_mot2.fit(X_train_mot2, y_train_mot2)

#Create a Gaussian Classifier
model_nb = GaussianNB()
model_nb_mot2=model_nb.fit(X_train_mot2, y_train_mot2)

model_knn = KNeighborsClassifier(n_neighbors=7)
modelknn_mot2=model_knn.fit(X_train_mot2, y_train_mot2)

mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
mlp_mot2=mlp.fit(X_train_mot2, y_train_mot2)



In [26]:
#Predict the response for test dataset
y_mot2_prediction_ada  = adaboost_mot2.predict(X_test_mot2)
y_mot2_prediction_rf  = randomforest_mot2.predict(X_test_mot2) 
y_mot2_prediction_knn = modelknn_mot2.predict(X_test_mot2) 
y_mot2_prediction_mlp = mlp_mot2.predict(X_test_mot2) 
y_mot2_prediction_nb = model_nb_mot2.predict(X_test_mot2)

#Accuracy with the same dataset
print("Train dataset = Test dataset")
print("Train=%s; Test=%s; Accuracy Adaboost:%s" %(len(X_train_mot2 ),len(X_test_mot2 ),metrics.accuracy_score(y_test_mot2 , y_mot2_prediction_ada )))
print(classification_report(y_test_mot2,y_mot2_prediction_ada))
print("Train=%s; Test=%s; Accuracy RandomForest:%s" %(len(X_train_mot2 ),len(X_test_mot2 ),metrics.accuracy_score(y_test_mot2 , y_mot2_prediction_rf )))
print(classification_report(y_test_mot2,y_mot2_prediction_rf))
print("Train=%s; Test=%s;  Accuracy KNN:%s" %(len(X_train_mot2),len(X_test_mot2),metrics.accuracy_score(y_test_mot2, y_mot2_prediction_knn)))
print(classification_report(y_test_mot2,y_mot2_prediction_knn))
print("Train=%s; Test=%s;  Accuracy MLP:%s" %(len(X_train_mot2),len(X_test_mot2),metrics.accuracy_score(y_test_mot2, y_mot2_prediction_mlp)))
print(classification_report(y_test_mot2,y_mot2_prediction_mlp))
print("Train=%s; Test=%s;  Accuracy NB:%s" %(len(X_train_mot2),len(X_test_mot2),metrics.accuracy_score(y_test_mot2, y_mot2_prediction_nb)))
print(classification_report(y_test_mot2,y_mot2_prediction_nb))

Train dataset = Test dataset
Train=217753; Test=93284; Accuracy Adaboost:0.9120642339522319
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     85932
           1       0.00      0.00      0.00      5134
           2       0.00      0.00      0.00      2218

   micro avg       0.91      0.91      0.91     93284
   macro avg       0.31      0.33      0.32     93284
weighted avg       0.85      0.91      0.88     93284

Train=217753; Test=93284; Accuracy RandomForest:0.9933750696796878
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     85932
           1       0.99      0.93      0.96      5134
           2       0.99      0.91      0.95      2218

   micro avg       0.99      0.99      0.99     93284
   macro avg       0.99      0.95      0.97     93284
weighted avg       0.99      0.99      0.99     93284

Train=217753; Test=93284;  Accuracy KNN:0.9295913554307277
              precis

  'precision', 'predicted', average, warn_for)


Train=217753; Test=93284;  Accuracy NB:0.8947515115132284
              precision    recall  f1-score   support

           0       0.92      0.97      0.94     85932
           1       0.11      0.07      0.08      5134
           2       0.00      0.00      0.00      2218

   micro avg       0.89      0.89      0.89     93284
   macro avg       0.34      0.35      0.34     93284
weighted avg       0.86      0.89      0.87     93284



In [19]:
filename_ada = '/home/titanium/spark_ml/nolabel/adaboost_mot2.pkl'
filename_rfc = '/home/titanium/spark_ml/nolabel/randomforest_mot2.pkl'
filename_knn = '/home/titanium/spark_ml/nolabel/knn_mot2.pkl'
pickle.dump(adaboost_mot2, open(filename_ada, 'wb'))
pickle.dump(randomforest_mot2, open(filename_rfc, 'wb'))
pickle.dump(modelknn_mot2, open(filename_knn, 'wb'))


filename_ada = '/home/titanium/spark_ml/nolabel/adaboost_mot1.pkl'
filename_rfc = '/home/titanium/spark_ml/nolabel/randomforest_mot1.pkl'
filename_knn = '/home/titanium/spark_ml/nolabel/knn_mot1.pkl'
pickle.dump(adaboost_mot1, open(filename_ada, 'wb'))
pickle.dump(randomforest_mot1, open(filename_rfc, 'wb'))
pickle.dump(modelknn_mot1, open(filename_knn, 'wb'))

In [28]:
#############################################################################
#              SAVE MODEL
#############################################################################

filename_ada = '/home/titanium/spark_ml/address/adaboost_add.pkl'
filename_rfc = '/home/titanium/spark_ml/address/randomforest_add.pkl'
filename_knn = '/home/titanium/spark_ml/address/knn_add.pkl'
pickle.dump(adaboost_add, open(filename_ada, 'wb'))
pickle.dump(randomforest_add, open(filename_rfc, 'wb'))
pickle.dump(modelknn_add, open(filename_knn, 'wb'))

filename_ada = '/home/titanium/spark_ml/entity/adaboost_ent.pkl'
filename_rfc = '/home/titanium/spark_ml/entity/randomforest_ent.pkl'
filename_knn = '/home/titanium/spark_ml/entity/knn_ent.pkl'
pickle.dump(adaboost_ent, open(filename_ada, 'wb'))
pickle.dump(randomforest_ent, open(filename_rfc, 'wb'))
pickle.dump(modelknn_ent, open(filename_knn, 'wb'))


filename_ada = '/home/titanium/spark_ml/motifs1/adaboost_mot1.pkl'
filename_rfc = '/home/titanium/spark_ml/motifs1/randomforest_mot1.pkl'
filename_knn = '/home/titanium/spark_ml/motifs1/knn_mot1.pkl'
pickle.dump(adaboost_mot1, open(filename_ada, 'wb'))
pickle.dump(randomforest_mot1, open(filename_rfc, 'wb'))
pickle.dump(modelknn_mot1, open(filename_knn, 'wb'))

filename_ada = '/home/titanium/spark_ml/motifs2/adaboost_mot2.pkl'
filename_rfc = '/home/titanium/spark_ml/motifs2/randomforest_mot2.pkl'
filename_knn = '/home/titanium/spark_ml/motifs2/knn_mot2.pkl'
pickle.dump(adaboost_mot2, open(filename_ada, 'wb'))
pickle.dump(randomforest_mot2, open(filename_rfc, 'wb'))
pickle.dump(modelknn_mot2, open(filename_knn, 'wb'))

In [29]:
address_feature2=df_address[1].select("user").collect()
motif1_feature2=df_motifs1[1].select("user").collect()
motif2_feature2=df_motifs2[1].select("user").collect()


In [30]:
#Create a dataframe with the result of previuos classifier

#############################################################################
#               ADDRESS DATAFRAME
#############################################################################

#Set ENTITY dataframe for the final classifier
entity_data1 = entity_data1.filter(f.col("user")!="Unknow")
entity_data1 = entity_data1.fillna(0)

entity_feature=entity_data1

#############################################################################
#               ADDRESS DATAFRAME
#############################################################################
#Link the ADABOOST prediction with the input tag/label
DF=link_outclass_intag(address_feature2,y_add_prediction_ada)
#Join the prediction about the same input tag/label
address_pred_feature_ada = join_dataframe_intag(DF,"add")

#Link the RANDOMFOREST prediction with the input tag/label
DF=link_outclass_intag(address_feature2,y_add_prediction_rf)
#Join the prediction about the same input tag/label
address_pred_feature_rf = join_dataframe_intag(DF,"add")

#Link the RANDOMFOREST prediction with the input tag/label
DF=link_outclass_intag(address_feature2,y_add_prediction_knn)
#Join the prediction about the same input tag/label
address_pred_feature_knn = join_dataframe_intag(DF,"add")
#############################################################################
#               MOTIFS-1 DATAFRAME
#############################################################################

#Link the ADABOOST prediction with the input tag/label
DF=link_outclass_intag(motif1_feature2,y_mot1_prediction_ada)
#Join the prediction about the same input tag/label
motif1_pred_feature_ada = join_dataframe_intag(DF,"mot1")

#Link the RANDOMFOREST prediction with the input tag/label
DF=link_outclass_intag(motif1_feature2,y_mot1_prediction_rf)
#Join the prediction about the same input tag/label
motif1_pred_feature_rf = join_dataframe_intag(DF,"mot1")


#Link the RANDOMFOREST prediction with the input tag/label
DF=link_outclass_intag(motif1_feature2,y_mot1_prediction_knn)
#Join the prediction about the same input tag/label
motif1_pred_feature_knn = join_dataframe_intag(DF,"mot1")
#############################################################################
#               MOTIFS-2 DATAFRAME
#############################################################################

#Link the ADABOOST prediction with the input tag/label
DF=link_outclass_intag(motif2_feature2,y_mot2_prediction_ada)
#Join the prediction about the same input tag/label
motif2_pred_feature_ada = join_dataframe_intag(DF,"mot2")

#Link the RANDOMFOREST prediction with the input tag/label
DF=link_outclass_intag(motif2_feature2,y_mot2_prediction_rf)
#Join the prediction about the same input tag/label
motif2_pred_feature_rf = join_dataframe_intag(DF,"mot2")

#Link the RANDOMFOREST prediction with the input tag/label
DF=link_outclass_intag(motif2_feature2,y_mot2_prediction_knn)
#Join the prediction about the same input tag/label
motif2_pred_feature_knn = join_dataframe_intag(DF,"mot2")

In [31]:
#Join all the dataframe information

#############################################################################
#               ADABOOST DATAFRAME CREATION
#############################################################################
df_final_ada = entity_feature.join(address_pred_feature_ada,['user'])
df_final_ada = df_final_ada.join(motif1_pred_feature_ada,['user'])
df_final_ada  = df_final_ada.join(motif2_pred_feature_ada,['user'])
df_final_ada  = transform_label_to_id(df_final_ada ,"label","user")

#############################################################################
#               RANDOM FOREST DATAFRAME CREATION
#############################################################################
df_final_rf = entity_feature.join(address_pred_feature_rf,['user'])
df_final_rf = df_final_rf.join(motif1_pred_feature_rf,['user'])
df_final_rf = df_final_rf.join(motif2_pred_feature_rf,['user'])
df_final_rf = transform_label_to_id(df_final_rf,"label","user")

#############################################################################
#               RANDOM FOREST DATAFRAME CREATION
#############################################################################
df_final_knn = entity_feature.join(address_pred_feature_knn,['user'])
df_final_knn = df_final_knn.join(motif1_pred_feature_knn,['user'])
df_final_knn = df_final_knn.join(motif2_pred_feature_knn,['user'])
df_final_knn = transform_label_to_id(df_final_knn,"label","user")

In [32]:
#############################################################################
#               ADABOOST DATAFRAME CREATION
#############################################################################

df_final_ada_split=df_final_ada.randomSplit([0.7,0.3])

#Split the input from the output data
ada_final_X=df_final_ada_split[0].select("balance_recv","balancein","balance","count_recv","count_sent",\
                        "add_cnt0","add_cnt1","add_cnt2","add_cnt3",\
                        "mot1_cnt0","mot1_cnt1","mot1_cnt2","mot1_cnt3",\
                        "mot2_cnt0","mot2_cnt1","mot2_cnt2","mot2_cnt3")

ada_final_y=df_final_ada_split[0].select("label")

#Round amount field
ada_final_X = ada_final_X.withColumn("balance_recv", f.round(ada_final_X["balance_recv"], 6))
ada_final_X = ada_final_X.withColumn("balancein", f.round(ada_final_X["balancein"], 6))
ada_final_X = ada_final_X.withColumn("balance", f.round(ada_final_X["balance"], 6))

#Transform input/output dataframe in vector
ada_final_X_train = ada_final_X.fillna(0).collect()
ada_final_y_train = ada_final_y.collect()

#Reshape the output vector
ada_final_y_train=np.reshape(ada_final_y_train,(len(ada_final_y_train),))

#Split the input from the output data
ada_final_X=df_final_ada_split[1].select("balance_recv","balancein","balance","count_recv","count_sent",\
                        "add_cnt0","add_cnt1","add_cnt2","add_cnt3",\
                        "mot1_cnt0","mot1_cnt1","mot1_cnt2","mot1_cnt3",\
                        "mot2_cnt0","mot2_cnt1","mot2_cnt2","mot2_cnt3")

ada_final_y=df_final_ada_split[1].select("label")

#Round amount field
ada_final_X = ada_final_X.withColumn("balance_recv", f.round(ada_final_X["balance_recv"], 6))
ada_final_X = ada_final_X.withColumn("balancein", f.round(ada_final_X["balancein"], 6))
ada_final_X = ada_final_X.withColumn("balance", f.round(ada_final_X["balance"], 6))

#Transform input/output dataframe in vector
ada_final_X_test = ada_final_X.fillna(0).collect()
ada_final_y_test = ada_final_y.collect()

#Reshape the output vector
ada_final_y_test=np.reshape(ada_final_y_test,(len(ada_final_y_test),))

In [33]:
#############################################################################
#               RANDOMFOREST DATAFRAME CREATION
#############################################################################
df_final_rf_split=df_final_rf.randomSplit([0.7,0.3])

rf_final_X=df_final_rf_split[0].select("balance_recv","balancein","balance","count_recv","count_sent",\
                        "add_cnt0","add_cnt1","add_cnt2","add_cnt3",\
                        "mot1_cnt0","mot1_cnt1","mot1_cnt2","mot1_cnt3",\
                        "mot2_cnt0","mot2_cnt1","mot2_cnt2","mot2_cnt3")

rf_final_y=df_final_rf_split[0].select("label")

#Round amount field
rf_final_X = rf_final_X.withColumn("balance_recv", f.round(rf_final_X["balance_recv"], 6))
rf_final_X = rf_final_X.withColumn("balancein", f.round(rf_final_X["balancein"], 6))
rf_final_X = rf_final_X.withColumn("balance", f.round(rf_final_X["balance"], 6))

#Transform input/output dataframe in vector
rf_final_X_train = rf_final_X.fillna(0).collect()
rf_final_y_train = rf_final_y.collect()

#Reshape the output vector
rf_final_y_train=np.reshape(rf_final_y_train,(len(rf_final_y_train),))

rf_final_X=df_final_rf_split[1].select("balance_recv","balancein","balance","count_recv","count_sent",\
                        "add_cnt0","add_cnt1","add_cnt2","add_cnt3",\
                        "mot1_cnt0","mot1_cnt1","mot1_cnt2","mot1_cnt3",\
                        "mot2_cnt0","mot2_cnt1","mot2_cnt2","mot2_cnt3")

rf_final_y=df_final_rf_split[1].select("label")

#Round amount field
rf_final_X = rf_final_X.withColumn("balance_recv", f.round(rf_final_X["balance_recv"], 6))
rf_final_X = rf_final_X.withColumn("balancein", f.round(rf_final_X["balancein"], 6))
rf_final_X = rf_final_X.withColumn("balance", f.round(rf_final_X["balance"], 6))

#Transform input/output dataframe in vector
rf_final_X_test = rf_final_X.fillna(0).collect()
rf_final_y_test = rf_final_y.collect()

#Reshape the output vector
rf_final_y_test=np.reshape(rf_final_y_test,(len(rf_final_y_test),))

In [34]:
#############################################################################
#               RANDOMFOREST DATAFRAME CREATION
#############################################################################
df_final_knn_split=df_final_knn.randomSplit([0.7,0.3])

knn_final_X=df_final_knn_split[0].select("balance_recv","balancein","balance","count_recv","count_sent",\
                        "add_cnt0","add_cnt1","add_cnt2","add_cnt3",\
                        "mot1_cnt0","mot1_cnt1","mot1_cnt2","mot1_cnt3",\
                        "mot2_cnt0","mot2_cnt1","mot2_cnt2","mot2_cnt3")

knn_final_y=df_final_knn_split[0].select("label")

#Round amount field
knn_final_X = knn_final_X.withColumn("balance_recv", f.round(knn_final_X["balance_recv"], 6))
knn_final_X = knn_final_X.withColumn("balancein", f.round(knn_final_X["balancein"], 6))
knn_final_X = knn_final_X.withColumn("balance", f.round(knn_final_X["balance"], 6))

#Transform input/output dataframe in vector
knn_final_X_train = knn_final_X.fillna(0).collect()
knn_final_y_train = knn_final_y.collect()

#Reshape the output vector
knn_final_y_train=np.reshape(knn_final_y_train,(len(knn_final_y_train),))

knn_final_X=df_final_knn_split[1].select("balance_recv","balancein","balance","count_recv","count_sent",\
                        "add_cnt0","add_cnt1","add_cnt2","add_cnt3",\
                        "mot1_cnt0","mot1_cnt1","mot1_cnt2","mot1_cnt3",\
                        "mot2_cnt0","mot2_cnt1","mot2_cnt2","mot2_cnt3")

knn_final_y=df_final_knn_split[1].select("label")

#Round amount field
knn_final_X = knn_final_X.withColumn("balance_recv", f.round(knn_final_X["balance_recv"], 6))
knn_final_X = knn_final_X.withColumn("balancein", f.round(knn_final_X["balancein"], 6))
knn_final_X = knn_final_X.withColumn("balance", f.round(knn_final_X["balance"], 6))

#Transform input/output dataframe in vector
knn_final_X_test = knn_final_X.fillna(0).collect()
knn_final_y_test = knn_final_y.collect()

#Reshape the output vector
knn_final_y_test=np.reshape(knn_final_y_test,(len(knn_final_y_test),))

In [35]:
#############################################################################
#               ADABOOST FINAL CLASSIFICATOR
#############################################################################

# Create adaboost classifer object
abc_final = AdaBoostClassifier(n_estimators=50,learning_rate=1)

# Train Adaboost Classifer
adaboost_final = abc_final.fit(ada_final_X_train, ada_final_y_train)

#############################################################################
#               RANDOMFOREST FINAL CLASSIFICATOR
#############################################################################

#Create random forest classifer object
rfc_final= RandomForestClassifier(n_jobs=2, random_state=0)  
#Train Randomforest Classifer
randomforest_final=rfc_final.fit(rf_final_X_train, rf_final_y_train)

#############################################################################
#               RANDOMFOREST FINAL CLASSIFICATOR
#############################################################################

#Create random forest classifer object
rfc_final_to_knn= RandomForestClassifier(n_jobs=2, random_state=0)  
#Train Randomforest Classifer
randomforest_final_to_knn=rfc_final_to_knn.fit(knn_final_X_train, knn_final_y_train)



In [36]:
#Predict the response for test dataset
y_prediction_ada = adaboost_final.predict(ada_final_X_test)
y_prediction_rf= randomforest_final.predict(rf_final_X_test) 
y_prediction_knn= randomforest_final_to_knn.predict(knn_final_X_test) 

#Accuracy with the same dataset
print("Train dataset = Test dataset")
print("Train=%s; Test=%s; Accuracy Adaboost:%s" %(len(ada_final_X_train ),len(ada_final_X_test ),metrics.accuracy_score(ada_final_y_test , y_prediction_ada )))
print(classification_report(ada_final_y_test,y_prediction_ada))
print("Train=%s; Test=%s; Accuracy RandomForest:%s" %(len(rf_final_X_train ),len(rf_final_X_test ),metrics.accuracy_score(rf_final_y_test , y_prediction_rf )))
print(classification_report(rf_final_y_test,y_prediction_rf))
print("Train=%s; Test=%s; Accuracy RandomForest from KNN:%s" %(len(knn_final_X_train ),len(knn_final_X_test),metrics.accuracy_score(knn_final_y_test , y_prediction_knn )))
print(classification_report(knn_final_y_test,y_prediction_knn))

Train dataset = Test dataset
Train=840; Test=348; Accuracy Adaboost:0.985632183908046
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       292
           1       1.00      0.92      0.96        26
           2       1.00      0.90      0.95        30

   micro avg       0.99      0.99      0.99       348
   macro avg       0.99      0.94      0.97       348
weighted avg       0.99      0.99      0.99       348

Train=834; Test=354; Accuracy RandomForest:1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       303
           1       1.00      1.00      1.00        26
           2       1.00      1.00      1.00        25

   micro avg       1.00      1.00      1.00       354
   macro avg       1.00      1.00      1.00       354
weighted avg       1.00      1.00      1.00       354

Train=840; Test=348; Accuracy RandomForest from KNN:1.0
              precision    recall  f1-score   sup

In [37]:
#############################################################################
#              SAVE MODEL
#############################################################################
filename_ada = '/home/titanium/spark_ml/final/adaboost_final.pkl'
filename_rfc = '/home/titanium/spark_ml/final/randomforest_final.pkl'
filename_knn = '/home/titanium/spark_ml/final/randomforest_final_toknn.pkl'
pickle.dump(adaboost_final, open(filename_ada, 'wb'))
pickle.dump(randomforest_final, open(filename_rfc, 'wb'))
pickle.dump(randomforest_final_to_knn, open(filename_knn, 'wb'))


### Testing

In [27]:
path_directory="dataframe_join/join123"

In [32]:
#############################################################################
#              LOAD MODEL
#############################################################################
filename_ada = '/home/titanium/spark_ml/adaboost_add.pkl'
filename_rfc = '/home/titanium/spark_ml/randomforest_add.pkl'
loaded_model_ada_add = pickle.load(open(filename_ada, 'rb'))
loaded_model_rfc_add = pickle.load(open(filename_rfc, 'rb'))

filename_ada = '/home/titanium/spark_ml/adaboost_mot1.pkl'
filename_rfc = '/home/titanium/spark_ml/randomforest_mot1.pkl'
loaded_model_ada_mot1 = pickle.load(open(filename_ada, 'rb'))
loaded_model_rfc_mot1 = pickle.load(open(filename_rfc, 'rb'))

filename_ada = '/home/titanium/spark_ml/adaboost_mot2.pkl'
filename_rfc = '/home/titanium/spark_ml/randomforest_mot2.pkl'
loaded_model_ada_mot2 = pickle.load(open(filename_ada, 'rb'))
loaded_model_rfc_mot2 = pickle.load(open(filename_rfc, 'rb'))

filename_ada = '/home/titanium/spark_ml/adaboost_final.pkl'
filename_rfc = '/home/titanium/spark_ml/randomforest_final.pkl'
loaded_model_ada_final = pickle.load(open(filename_ada, 'rb'))
loaded_model_rfc_final = pickle.load(open(filename_rfc, 'rb'))


In [28]:
df_gambling = sqlContext.read.format('com.databricks.spark.csv')\
.option('header','true')\
.option("inferSchema", "true")\
.load(path+"walletexp_data/gambling*")

df_exchange = sqlContext.read.format('com.databricks.spark.csv')\
.option('header','true')\
.option("inferSchema", "true")\
.load(path+"walletexp_data/exchange*")

df_exchange =df_exchange.withColumn("xxx",f.lit(1))
df_gambling =df_gambling.withColumn("xxx",f.lit(2))
df_label = df_exchange.union(df_gambling).groupby("label").agg(f.first("xxx").alias("class"))

In [29]:
#########################################################################
#               IMPORT DATAFRAMES
#########################################################################

entity_data2= sqlContext.read.format('com.databricks.spark.csv')\
.option('header','true')\
.option("inferSchema", "true")\
.load(path+path_directory+"/entity/")

address_data2 = sqlContext.read.format('com.databricks.spark.csv')\
.option('header','true')\
.option("inferSchema", "true")\
.load(path+path_directory+"/address/")

#motif1_data2 = sqlContext.read.format('com.databricks.spark.csv')\
#.option('header','true')\
#.option("inferSchema", "true")\
#.load(path+path_directory+"/motif1/")

#motif2_data2 = sqlContext.read.format('com.databricks.spark.csv')\
#.option('header','true')\
#.option("inferSchema", "true")\
#.load(path+path_directory+"/motif2/")

In [30]:
address_data2=address_data2.groupby("address").agg(f.first("label").alias("label"),f.first("user").alias("user"),\
                                     f.sum("count_rec").alias("count_rec"),f.sum("totamount_rec").alias("totamount_rec"),\
                                    f.sum("count_sent").alias("count_sent"),f.sum("totamount_sent").alias("totamount_sent"),\
                                    f.sum("balance").alias("balance"),f.min("unique").alias("unique"),f.sum("sibling").alias("sibling"))

In [33]:
####################################################################
#               ADDRESS ML
####################################################################

#address_data2 = address_data2.filter(f.col("user")!="Unknow")

#Transform label to class index 
#address_feature_test=transform_label_to_id(address_data2,"label","user")

#Split the input and the output from dataframe 
address_feature_test=address_data2
add_X_test=address_feature_test.select("count_rec","totamount_rec","count_sent","totamount_sent","balance","unique","sibling")
add_y_test=address_feature_test.select("label")
add_y_test = add_y_test.withColumn("label", f.col("label").cast("string"))
add_y_test_class = address_feature_test.select("user")

#Round amount field
add_X_test = add_X_test.withColumn("totamount_rec", f.round(add_X_test["totamount_rec"], 6))
add_X_test = add_X_test.withColumn("totamount_sent", f.round(add_X_test["totamount_sent"], 6))
add_X_test = add_X_test.withColumn("balance", f.round(add_X_test["balance"], 6))

#Transform input/output dataframe in vector
X_test_add = add_X_test.collect()
y_test_add = add_y_test.collect()

#Reshape the output vector
y_test_add=np.reshape(y_test_add,(len(y_test_add),))


####################################################################
#               EVALUATION CLASSIFIER
####################################################################

#Predict the response for test dataset
y_add_prediction_ada_test = loaded_model_ada_add.predict(X_test_add)
y_add_prediction_rf_test = loaded_model_rfc_add.predict(X_test_add) 

#Accuracy with the same dataset
print("Train=%s; Test=%s;  Accuracy Adaboost:%s" %(0,len(X_test_add),metrics.accuracy_score( y_test_add,y_add_prediction_ada_test)))
print("Train=%s; Test=%s;  Accuracy RandomForest:%s" %(0,len(X_test_add),metrics.accuracy_score( y_test_add,y_add_prediction_rf_test)))

Train=0; Test=425689;  Accuracy Adaboost:0.0008879722050605019
Train=0; Test=425689;  Accuracy RandomForest:0.0005356022824174456


In [13]:
motif1_feature_test=motif1_data2.withColumnRenamed("outuser","user")

motif1_feature_split = motif1_feature_test.randomSplit([.25,.25,.25,.25])

In [14]:
####################################################################
#               MOTIFS-1 ML
####################################################################

#Transform label to class index (input (?) and output)
#motif1_feature_test=transform_label_to_id(motif1_feature_test,"label","user")
#motif1_feature_test=transform_label_to_id(motif1_feature_test,"labelin","inuser")
for i in range(0,4):
    #Split the input and the output from dataframe 
    mot1_X_test0=motif1_feature_split[i].select("labelin","address_recv_dist","amount_recv","tx_sent","address_sent_dist","amount_sent","tx_recv_tot","fees","loop_in_out","direct_in_out")
    mot1_y_test0=motif1_feature_split[i].select("label")
    mot1_y_test0 = mot1_y_test0.withColumn("label", f.col("label").cast("string"))
    mot1_y_class0=motif1_feature_split[i].select("user").collect()

    #Round amount field
    mot1_X_test0 = mot1_X_test0.withColumn("amount_recv", f.round(mot1_X_test0["amount_recv"], 6))
    mot1_X_test0 = mot1_X_test0.withColumn("amount_sent", f.round(mot1_X_test0["amount_sent"], 6))
    mot1_X_test0 = mot1_X_test0.withColumn("fees", f.round(mot1_X_test0["fees"], 6))
    #Transform input/output dataframe in vector
    X_test_mot10 = mot1_X_test0.collect()
    y_test_mot10 = mot1_y_test0.collect()
    
    if(i==0):
        X_test_mot1=X_test_mot10
        y_test_mot1=y_test_mot10
        mot1_y_class=mot1_y_class0
    else:
        X_test_mot1=X_test_mot1+X_test_mot10
        y_test_mot1=y_test_mot1+y_test_mot10
        mot1_y_class=mot1_y_class+mot1_y_class0


In [15]:
####################################################################
#               EVALUATION CLASSIFIER
####################################################################
#Reshape the output vector
y_test_mot1=np.reshape(y_test_mot1,(len(y_test_mot1),))

#Predict the response for test dataset
y_mot1_prediction_ada_test  = loaded_model_ada_mot1.predict(X_test_mot1)
y_mot1_prediction_rf_test  = loaded_model_rfc_mot1.predict(X_test_mot1 ) 

#Accuracy with the same dataset
print("Train=%s; Test=%s; Accuracy Adaboost:%s" %(0,len(X_test_mot1 ),metrics.accuracy_score(y_test_mot1 , y_mot1_prediction_ada_test )))
print("Train=%s; Test=%s; Accuracy RandomForest:%s" %(0,len(X_test_mot1 ),metrics.accuracy_score(y_test_mot1 , y_mot1_prediction_rf_test )))

Train=0; Test=2148239; Accuracy Adaboost:0.046788090151980294
Train=0; Test=2148239; Accuracy RandomForest:0.017907690904038143


In [16]:
motif2_data2=motif2_data2.fillna(0,subset=["amount_sent_from_in","fee1"])
motif2_feature_test=motif2_data2.withColumnRenamed("outuser","user")

motif2_feature_split = motif2_feature_test.randomSplit([.1,.1,.1,.1,.1,.1,.1,.1,.1,.1])

In [17]:
####################################################################
#               MOTIFS-2 ML
####################################################################


#Transform label to class index (input (?) middle (?) and output)
#motif2_feature_test=transform_label_to_id(motif2_feature_test,"label","user")
#motif2_feature_test=transform_label_to_id(motif2_feature_test,"labelin","inuser")
#motif2_feature_test=transform_label_to_id(motif2_feature_test,"labelmid","miduser")
for i in range(0,10):
    #Split the input and the output from dataframe 
    mot2_X_test0=motif2_feature_split[i].select("labelin","labelmid","address_recv_dist_to_out","amount_recv_to_out","fee2",\
                             "tx_sent_from_mid","address_sent_from_mid","amount_sent_from_mid",\
                             "address_recv_to_mid","amount_recv_to_mid","tx_sent_from_in","address_sent_from_in",\
                             "amount_sent_from_in","fee1","loop_mid_out","loop_in_mid","loop_in_out",\
                             "direct_mid_out","direct_in_mid","direct_in_out")

    mot2_y_test0=motif2_feature_split[i].select("label")
    mot2_y_test0 =mot2_y_test0.withColumn("label", f.col("label").cast("string"))
    mot2_y_test0_class=motif2_feature_split[i].select("user").collect()

    #Round amount field
    mot2_X_test0 = mot2_X_test0.withColumn("amount_recv_to_out", f.round(mot2_X_test0["amount_recv_to_out"], 6))
    mot2_X_test0 = mot2_X_test0.withColumn("amount_sent_from_mid", f.round(mot2_X_test0["amount_sent_from_mid"], 6))
    mot2_X_test0 = mot2_X_test0.withColumn("amount_recv_to_mid", f.round(mot2_X_test0["amount_recv_to_mid"], 6))
    mot2_X_test0 = mot2_X_test0.withColumn("amount_sent_from_in", f.round(mot2_X_test0["amount_sent_from_in"], 6))
    mot2_X_test0 = mot2_X_test0.withColumn("fee2", f.round(mot2_X_test0["fee2"], 6))
    mot2_X_test0 = mot2_X_test0.withColumn("fee1", f.round(mot2_X_test0["fee1"], 6))
    
    mot2_X_test0_list = mot2_X_test0.collect()
    mot2_y_test0_list = mot2_y_test0.collect()
        
    if(i==0):
        X_test_mot2=mot2_X_test0_list
        y_test_mot2=mot2_y_test0_list
        mot2_y_test_class=mot2_y_test0_class
    else:
        X_test_mot2=X_test_mot2+mot2_X_test0_list
        y_test_mot2=y_test_mot2+mot2_y_test0_list
        mot2_y_test_class=mot2_y_test_class+mot2_y_test0_class

In [18]:

#Reshape the output vector
y_test_mot2=np.reshape(y_test_mot2,(len(y_test_mot2),))

####################################################################
#               EVALUATION CLASSIFIER
####################################################################

#Predict the response for test dataset
y_mot2_prediction_ada_test  = loaded_model_ada_mot2.predict(X_test_mot2)
y_mot2_prediction_rf_test  = loaded_model_rfc_mot2.predict(X_test_mot2) 

#Accuracy with the same dataset
print("Train=%s; Test=%s; Accuracy Adaboost:%s" %(0,len(X_test_mot2 ),metrics.accuracy_score(y_test_mot2 , y_mot2_prediction_ada_test )))
print("Train=%s; Test=%s; Accuracy RandomForest:%s" %(0,len(X_test_mot2 ),metrics.accuracy_score(y_test_mot2 , y_mot2_prediction_rf_test )))

Train=0; Test=2809380; Accuracy Adaboost:0.0006261167944528686
Train=0; Test=2809380; Accuracy RandomForest:0.4248165075568275


In [35]:
add_y_test_class=add_y_test_class.collect()

In [36]:
#Create a dataframe with the result of previuos classifier

#############################################################################
#               ENTITY DATAFRAME
#############################################################################

#Set ENTITY dataframe for the final classifier
#entity_data2 = entity_data2.filter(f.col("user")!="Unknow")
#entity_data2 = entity_data2.fillna(0,subset=['count_sent'])
#entity_data2 = entity_data2.fillna(0,subset=['count_recv'])

entity_feature_test=entity_data2
#############################################################################
#               ADDRESS DATAFRAME
#############################################################################

#Link the ADABOOST prediction with the input tag/label
DF=link_outclass_intag(add_y_test_class,y_add_prediction_ada_test)
#Join the prediction about the same input tag/label
address_pred_feature_ada_test = join_dataframe_intag(DF,"add")

#############################################################################
#               MOTIFS-1 DATAFRAME
#############################################################################

#Link the ADABOOST prediction with the input tag/label
DF=link_outclass_intag(mot1_y_class,y_mot1_prediction_ada_test)
#Join the prediction about the same input tag/label
motif1_pred_feature_ada_test = join_dataframe_intag(DF,"mot1")

#############################################################################
#               MOTIFS-2 DATAFRAME
#############################################################################
#Link the ADABOOST prediction with the input tag/label
DF=link_outclass_intag(mot2_y_test_class,y_mot2_prediction_ada_test)
#Join the prediction about the same input tag/label
motif2_pred_feature_ada_test = join_dataframe_intag(DF,"mot2")

#Join all the dataframe information

#############################################################################
#               ADABOOST DATAFRAME CREATION
#############################################################################
df_final_ada_test = entity_feature_test.join(address_pred_feature_ada_test,['user'])
df_final_ada_test = df_final_ada_test.join(motif1_pred_feature_ada_test,['user'])
df_final_ada_test  = df_final_ada_test.join(motif2_pred_feature_ada_test,['user'])
#df_final_ada_test  = transform_label_to_id(df_final_ada_test ,"label","user")

In [37]:
#############################################################
#           STORE ADABOOST FEATURE IN HDFS
#############################################################
path_directory="/dataframe_join"
df_final_ada_test.write\
.format("com.databricks.spark.csv")\
.option("header", "true")\
.save(path+path_directory+"/df_final_ada_test")

AnalysisException: 'path hdfs://10.200.5.25:9001/user/titanium/dataframe_join/df_final_ada_test already exists.;'

In [22]:
#############################################################################
#               ADDRESS DATAFRAME
#############################################################################

#Link the RANDOMFOREST prediction with the input tag/label
DF=link_outclass_intag(add_y_test_class,y_add_prediction_rf_test)
#Join the prediction about the same input tag/label
address_pred_feature_rf_test = join_dataframe_intag(DF,"add")

#############################################################################
#               MOTIFS-1 DATAFRAME
#############################################################################
#Link the RANDOMFOREST prediction with the input tag/label
DF=link_outclass_intag(mot1_y_class,y_mot1_prediction_rf_test)
#Join the prediction about the same input tag/label
motif1_pred_feature_rf_test = join_dataframe_intag(DF,"mot1")

#############################################################################
#               MOTIFS-2 DATAFRAME
#############################################################################
#Link the RANDOMFOREST prediction with the input tag/label
DF=link_outclass_intag(mot2_y_test_class,y_mot2_prediction_rf_test)
#Join the prediction about the same input tag/label
motif2_pred_feature_rf_test = join_dataframe_intag(DF,"mot2")

#############################################################################
#               RANDOM FOREST DATAFRAME CREATION
#############################################################################
df_final_rf_test = entity_feature_test.join(address_pred_feature_rf_test,['user'])
df_final_rf_test = df_final_rf_test.join(motif1_pred_feature_rf_test,['user'])
df_final_rf_test = df_final_rf_test.join(motif2_pred_feature_rf_test,['user'])
#df_final_rf_test = transform_label_to_id(df_final_rf_test,"label","user")

In [23]:
#############################################################
#           STORE RANDOMFOREST FEATURE IN HDFS
#############################################################
path_directory="/dataframe_join"

df_final_rf_test.write\
.format("com.databricks.spark.csv")\
.option("header", "true")\
.save(path+path_directory+"/df_final_rf_test")

In [None]:
df_final_ada_test= sqlContext.read.format('com.databricks.spark.csv')\
.option('header','true')\
.option("inferSchema", "true")\
.load(path+path_directory+"/df_final_ada_test/")

df_final_rf_test= sqlContext.read.format('com.databricks.spark.csv')\
.option('header','true')\
.option("inferSchema", "true")\
.load(path+path_directory+"/df_final_rf_test/")

In [38]:
df_final_ada_test=df_final_ada_test.fillna(0)
df_final_rf_test=df_final_rf_test.fillna(0)

In [35]:
entity_data2.printSchema()

root
 |-- label: integer (nullable = true)
 |-- user: string (nullable = true)
 |-- balance_recv: double (nullable = true)
 |-- balancein: double (nullable = true)
 |-- balance: double (nullable = true)
 |-- count_recv: integer (nullable = true)
 |-- count_sent: integer (nullable = true)



In [36]:
#############################################################################
#               ADABOOST DATAFRAME CREATION
#############################################################################

#Transform label to class index (output)
#df_final_ada_test=transform_label_to_id(df_final_ada_test,"label","user")
ada_final_X_test0=entity_data2.select("balance_recv","balancein","balance","count_recv","count_sent")

#ada_final_X_test0=df_final_ada_test.select("balance_recv","balancein","balance","count_recv","count_sent",\
#                        "add_cnt0","add_cnt1","add_cnt2","add_cnt3",\
#                        "mot1_cnt0","mot1_cnt1","mot1_cnt2","mot1_cnt3",\
#                        "mot2_cnt0","mot2_cnt1","mot2_cnt2","mot2_cnt3")
#Split the input from the output data


ada_final_y_test0=entity_data2.select("label")
ada_final_y_test0 = ada_final_y_test0.withColumn("label", f.col("label").cast("string"))
#Round amount field
ada_final_X_test0 = ada_final_X_test0.withColumn("balance_recv", f.round(ada_final_X_test0["balance_recv"], 6))
ada_final_X_test0 = ada_final_X_test0.withColumn("balancein", f.round(ada_final_X_test0["balancein"], 6))
ada_final_X_test0 = ada_final_X_test0.withColumn("balance", f.round(ada_final_X_test0["balance"], 6))

#Transform input/output dataframe in vector
ada_final_X_test0 = ada_final_X_test0.collect()
ada_final_y_test0 = ada_final_y_test0.collect()

#Reshape the output vector
ada_final_y_test0=np.reshape(ada_final_y_test0,(len(ada_final_y_test0),))

In [40]:
#############################################################################
#               RANDOMFOREST DATAFRAME CREATION
#############################################################################

#Transform label to class index (output)
#df_final_rf_test=transform_label_to_id(df_final_rf_test,"label","user")

#Split the input from the output data
rf_final_X_test0=df_final_rf_test.select("balance_recv","balancein","balance","count_recv","count_sent",\
                        "add_cnt0","add_cnt1","add_cnt2","add_cnt3",\
                        "mot1_cnt0","mot1_cnt1","mot1_cnt2","mot1_cnt3",\
                        "mot2_cnt0","mot2_cnt1","mot2_cnt2","mot2_cnt3")

rf_final_y_test0=df_final_rf_test.select("label")
rf_final_y_test0 = rf_final_y_test0.withColumn("label", f.col("label").cast("string"))

#Round amount field
rf_final_X_test0 = rf_final_X_test0.withColumn("balance_recv", f.round(rf_final_X_test0["balance_recv"], 6))
rf_final_X_test0 = rf_final_X_test0.withColumn("balancein", f.round(rf_final_X_test0["balancein"], 6))
rf_final_X_test0 = rf_final_X_test0.withColumn("balance", f.round(rf_final_X_test0["balance"], 6))

#Transform input/output dataframe in vector
rf_final_X_test0 = rf_final_X_test0.collect()
rf_final_y_test0 = rf_final_y_test0.collect()

#Reshape the output vector
rf_final_y_test0=np.reshape(rf_final_y_test0,(len(rf_final_y_test0),))

In [37]:
#############################################################################
#               EVALUATION
#############################################################################

#Predict the response for test dataset
y_prediction_ada_test = loaded_model_ada_final.predict(ada_final_X_test0)
#y_prediction_rf_test= loaded_model_rfc_final.predict(rf_final_X_test0) 

#Accuracy with the same dataset
print("Train=%s; Test=%s; Accuracy Adaboost:%s" %(0,len(ada_final_X_test0 ),metrics.accuracy_score(ada_final_y_test0 , y_prediction_ada_test )))
#print("Train=%s; Test=%s; Accuracy RandomForest:%s" %(0,len(rf_final_X_test0 ),metrics.accuracy_score(rf_final_y_test0 , y_prediction_rf_test )))

Train=0; Test=70; Accuracy Adaboost:0.0


In [29]:
entity_data2.show()

+-----+--------------------+------------------+------------------+--------------------+----------+----------+
|label|                user|      balance_recv|         balancein|             balance|count_recv|count_sent|
+-----+--------------------+------------------+------------------+--------------------+----------+----------+
|    1|MercadoBitcoin.co...| 41708.98170726001| 51531.75953852001|  -9822.777831259998|      1766|       722|
|    1|MercadoBitcoin.co...| 517.2303597899999| 544.5094150199999|  -27.27905523000004|       166|        57|
|    1|        Cavirtex.com| 96502.01483320999|112073.87241775998| -15571.857584549987|      8017|      2870|
|    1|          VirWoX.com|18595.922202779988|24146.342736819977|  -5550.420534039989|      3414|      1267|
|    2|         BitZino.com| 7208.211132199999| 9635.911612120006| -2427.7004799200067|      2669|       810|
|    1|          VirWoX.com| 90049.15396205007|115057.79711077994|  -25008.64314872987|      6210|      2490|
|    1|   

In [31]:
print(y_prediction_rf_test)

['1' '1' '0' '1' '1' '1' '1' '0' '0' '0']
