In [1]:
from pathlib import Path

import pandas as pd
from joblib import dump
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from static_data_handlers.train_data_loader import TrainLoader

In [2]:
# Location of the training CSVs and the names of the label columns.
data_folder = '../hai_dataset/hai/hai-21.03'
data_filenames = [f'train{idx}.csv' for idx in range(1, 4)]
label_columns = [
    'attack',
    'attack_P1',
    'attack_P2',
    'attack_P3',
]

In [3]:
# Create (or reuse) the Spark session for this notebook
spark_session = (
    SparkSession.builder
    .appName("ModerTraining")
    .getOrCreate()
)

# Build the project's TrainLoader from the configured paths/labels and
# fetch the raw training data through it
data_loader = TrainLoader(
    spark_session,
    data_folder=data_folder,
    data_filenames=data_filenames,
    label_columns=label_columns,
)
df = data_loader.get_data()

Row(time=datetime.datetime(2020, 7, 11, 0, 0), P1_B2004=0.10121, P1_B2016=1.29784, P1_B3004=397.63785, P1_B3005=1001.99799, P1_B4002=33.6555, P1_B4005=100.0, P1_B400B=2847.02539, P1_B4022=37.14706, P1_FCV01D=100.0, P1_FCV01Z=100.0, P1_FCV02D=0.0, P1_FCV02Z=-1.87531, P1_FCV03D=51.58201, P1_FCV03Z=52.80456, P1_FT01=166.74039, P1_FT01Z=808.2962, P1_FT02=1973.19031, P1_FT02Z=2847.02539, P1_FT03=246.43968, P1_FT03Z=1000.44769, P1_LCV01D=8.79882, P1_LCV01Z=8.46252, P1_LIT01=395.19528, P1_PCV01D=39.09198, P1_PCV01Z=40.49072, P1_PCV02D=12, P1_PCV02Z=12.01782, P1_PIT01=1.3681, P1_PIT02=0.27786, P1_PP01AD=540833, P1_PP01AR=540833, P1_PP01BD=0, P1_PP01BR=0, P1_PP02D=1, P1_PP02R=1, P1_STSP=1, P1_TIT01=35.437, P1_TIT02=35.74219, P2_24Vdc=28.02645, P2_ASD=0, P2_AutoGO=1, P2_CO_rpm=54074.0, P2_Emerg=0, P2_HILout=712.07275, P2_MSD=763.19324, P2_ManualGO=0, P2_OnOff=1, P2_RTR=2880, P2_SIT01=780.0, P2_SIT02=779.59595, P2_TripEx=1, P2_VT01=11.89504, P2_VTR01=10, P2_VTR02=10, P2_VTR03=10, P2_VTR04=10, P2_

In [4]:
# Run the loader's preprocessing on the raw frame, then let the loader
# split the result into train/test features (X_*) and labels (y_*).
df = data_loader.preprocess_data(df)
(X_train, X_test,
 y_train, y_test) = data_loader.split_data(data_frame=df)

In [8]:
# Convert the PySpark DataFrames to pandas and move the 'time' column into
# the index, so it is not used as a feature or label.
#
# The test frames get the same treatment as the training frames: if 'time'
# stayed a regular column in X_test / y_test, they would no longer match the
# columns the model is fitted on, and prediction / scoring would fail.
# NOTE(review): assumes split_data returns all four frames with a 'time'
# column, as it does for X_train / y_train — confirm.
X_train = X_train.toPandas().set_index('time')
y_train = y_train.toPandas().set_index('time')
X_test = X_test.toPandas().set_index('time')
y_test = y_test.toPandas().set_index('time')

In [19]:
# Preview the first rows of the training features (indexed by 'time')
X_train.head()

Unnamed: 0_level_0,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,P1_FCV01Z,...,P4_HT_PO,P4_HT_PS,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-11 00:00:00,0.10121,1.29784,397.63785,1001.99799,33.6555,100.0,2847.02539,37.14706,100.0,100.0,...,4.01474,0,301.01636,-0.00297,16495.0,301.35992,305.03113,0,10052.0,27610.0
2020-07-11 00:00:01,0.10121,1.29692,397.63785,1001.99799,33.6555,100.0,2839.5852,37.14477,100.0,100.0,...,3.74347,0,297.43567,0.00072,16402.0,297.43567,304.27161,0,10052.0,27610.0
2020-07-11 00:00:03,0.10121,1.28685,397.63785,1001.99799,33.6555,100.0,2834.95264,37.11959,100.0,100.0,...,3.1286,0,297.7431,-0.00318,16422.0,298.0686,303.67474,0,10052.0,27614.0
2020-07-11 00:00:04,0.10121,1.28807,397.63785,1001.99799,33.6555,100.0,2832.70654,37.12265,100.0,100.0,...,2.87546,0,297.01965,0.00015,16355.0,296.53137,303.22266,0,10052.0,27621.0
2020-07-11 00:00:05,0.10121,1.28838,397.63785,1001.99799,33.6555,100.0,2840.56763,37.12341,100.0,100.0,...,2.64032,0,296.98346,0.00094,16261.0,296.83881,302.04718,0,10051.0,27610.0


In [None]:
# Preview the first rows of the training labels
y_train.head()

In [20]:
# Define and train the model using sklearn.
# A fixed random_state makes the forest's randomness (bootstrap sampling and
# per-split feature selection) repeatable, so the trained model and any
# reported metrics are reproducible across kernel restarts.
RANDOM_SEED = 42
clf = RandomForestClassifier(random_state=RANDOM_SEED)
# fit() returns the fitted estimator itself, so rebinding clf is harmless.
clf = clf.fit(X_train, y_train)

In [None]:
# Score the fitted classifier on the held-out split.
# NOTE(review): X_test / y_test must have the same columns as the frames
# passed to clf.fit (i.e. 'time' not present as a regular column) — confirm
# against the conversion cell.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy}")

In [21]:
# Save the fitted model.
# Create the target directory first so the cell also works on a fresh
# checkout where 'models/' does not exist yet.
model_path = 'models/base_model.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
# Pass the plain string so the displayed return value (list of written
# filenames) stays the same as before.
dump(clf, model_path)

['models/base_model.joblib']

In [None]:
# Stop the Spark session created at the top of the notebook
spark_session.stop()