# Spark job to load and preprocess training data and train a model

In [1]:
from pathlib import Path

from joblib import dump
from pyspark.sql import SparkSession
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from static_data_handlers.train_data_loader import TrainLoaderSpark

## Initialize Spark session

In [None]:
# Initialize (or reuse) the Spark session used to load the training data.
# The application name is what shows up in the Spark UI and logs; it previously
# read "ModerTraining", a typo for "ModelTraining".
spark_session = SparkSession.builder.appName("ModelTraining").getOrCreate()

## Data loading and preprocessing
### Data paths and other parameters

In [2]:
# Location of the training CSV files
data_folder = '../hai_dataset/hai/hai-21.03'

# CSV files inside `data_folder` that together form the training data
data_filenames = [
    'train1.csv',
    'train2.csv',
    'train3.csv',
]

# Columns that hold the labels rather than features
label_columns = [
    'attack',
    'attack_P1',
    'attack_P2',
    'attack_P3',
]

### Load data

In [None]:
# Configure the loader with the paths and label columns defined above,
# then fetch the raw training data through it.
data_loader = TrainLoaderSpark(
    spark_session,
    data_folder=data_folder,
    data_filenames=data_filenames,
    label_columns=label_columns,
)
df = data_loader.get_data()

### Preprocess and split data
- preprocess the data (e.g., drop missing values)
- split the data into features and labels
- split the data into train and test sets
- convert the PySpark DataFrames to pandas DataFrames
- move the `time` column into the index, since it is not needed for prediction
- display the data

In [4]:
# Run the loader's preprocessing step on the loaded frame
df = data_loader.preprocess_data(df)

# Split the preprocessed frame into train/test features and labels
_splits = data_loader.split_data(data_frame=df)
X_train, X_test, y_train, y_test = _splits

In [8]:
# Convert each PySpark DataFrame to pandas and move `time` into the index.
#
# The timestamp is not needed for prediction, so it must not remain among the
# columns of ANY split. Previously only the training frames were re-indexed,
# which left `time` as a regular column in X_test / y_test: the model was then
# fitted without `time` but asked to predict on a frame that still had it, and
# the test labels had one more column than the predictions.
#
# NOTE(review): assumes the test splits carry the same `time` column as the
# training splits (they come from the same `split_data` call) - confirm.
X_train = X_train.toPandas().set_index('time')
y_train = y_train.toPandas().set_index('time')
X_test = X_test.toPandas().set_index('time')
y_test = y_test.toPandas().set_index('time')

In [19]:
# Preview the first rows of the training features; `time` was moved into the
# index by the conversion cell, so it appears as the row label, not a column.
X_train.head()

Unnamed: 0_level_0,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,P1_FCV01Z,...,P4_HT_PO,P4_HT_PS,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-11 00:00:00,0.10121,1.29784,397.63785,1001.99799,33.6555,100.0,2847.02539,37.14706,100.0,100.0,...,4.01474,0,301.01636,-0.00297,16495.0,301.35992,305.03113,0,10052.0,27610.0
2020-07-11 00:00:01,0.10121,1.29692,397.63785,1001.99799,33.6555,100.0,2839.5852,37.14477,100.0,100.0,...,3.74347,0,297.43567,0.00072,16402.0,297.43567,304.27161,0,10052.0,27610.0
2020-07-11 00:00:03,0.10121,1.28685,397.63785,1001.99799,33.6555,100.0,2834.95264,37.11959,100.0,100.0,...,3.1286,0,297.7431,-0.00318,16422.0,298.0686,303.67474,0,10052.0,27614.0
2020-07-11 00:00:04,0.10121,1.28807,397.63785,1001.99799,33.6555,100.0,2832.70654,37.12265,100.0,100.0,...,2.87546,0,297.01965,0.00015,16355.0,296.53137,303.22266,0,10052.0,27621.0
2020-07-11 00:00:05,0.10121,1.28838,397.63785,1001.99799,33.6555,100.0,2840.56763,37.12341,100.0,100.0,...,2.64032,0,296.98346,0.00094,16261.0,296.83881,302.04718,0,10051.0,27610.0


In [None]:
# Preview the first rows of the training labels (also indexed by `time`).
y_train.head()

## Model training and evaluation
### Train model

In [20]:
# Define and train the model using sklearn.
# A random forest is stochastic; without a fixed `random_state` every run
# produces a different model, so the saved artifact and the reported accuracy
# could not be regenerated. The seed value itself is arbitrary.
clf = RandomForestClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

### Evaluate model

In [None]:
# Predict on the held-out features and score the predictions against the
# held-out labels.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy}")

### Save model so that it can be used for prediction multiple times with different drift detectors

In [21]:
# Save the fitted model so it can be reloaded later for prediction.
# `dump` does not create missing directories, so make sure the target folder
# exists first; otherwise a fresh checkout without `models/` fails here after
# the (expensive) training has already run.
model_path = 'models/base_model.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
dump(clf, model_path)

['models/base_model.joblib']

## Stop Spark session

In [None]:
# Stop the Spark session created at the top of the notebook. The data was
# already converted to pandas via `toPandas()` before training.
spark_session.stop()