In [7]:
import os
from datetime import datetime
import tarfile
from six.moves import urllib
import numpy as np
import pandas as pd
from zlib import crc32
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
# Create the Spark session, or reuse the one that already exists; `train` below
# relies on this module-level `spark` to build its Spark DataFrame.
spark = SparkSession.builder.getOrCreate()
# Handle to the SparkContext (created or reused).
# NOTE(review): `sc` is not referenced in the visible cells - confirm it is still needed.
sc = SparkContext.getOrCreate()

In [8]:
# Sentinel stored where no past/future reading exists; rows that carry it are
# removed before training.
_MISSING = -1

# Row positions inside the filtered CSV, hard-coded for the repository dataset.
# NOTE(review): presumably rows [_FILL_FIRST_ROW, _FILL_END_ROW) are exactly the
# "fill" readings in time order - confirm against the CSV.
_FILL_FIRST_ROW = 45327
_FILL_END_ROW = 90653

# Row offsets behind the "30m" / "2h" / "4h" feature and label columns.
_LAG_30M = 131
_LAG_2H = 545
_LAG_4H = 1091

# Number of trailing rows that get no label. This is one more than the 4h lag,
# i.e. the last pair that could be labelled is skipped; kept as-is so the
# training set stays the same as before.
_FUTURE_PAD = _LAG_4H + 1

_LABEL_COL = 'fill_future_4h'
# Order matters: it fixes the layout of the assembled feature vector.
_FEATURE_COLS = ['year', 'month', 'day', 'hour', 'minute', 'second', 'fill', 'buzz', 'stol',
                 'fill_past_30m', 'fill_past_2h', 'fill_past_4h']


def _rows_for_field(rows, field):
    """Return a DataFrame (integer column labels) of the rows whose third column equals `field`."""
    return pd.DataFrame([row for row in rows if row[2] == field])


def _past_fill_values(values, lag):
    """Return the readings `lag` rows before each row of the fill range, front-padded with `lag` sentinels."""
    stop = min(_FILL_END_ROW, len(values)) - lag
    return [_MISSING] * lag + values[_FILL_FIRST_ROW:stop].tolist()


def _future_fill_values(values, fields):
    """Return the readings `_LAG_4H` rows after each "fill" row, back-padded with `_FUTURE_PAD` sentinels."""
    stop = min(_FILL_END_ROW - _FUTURE_PAD, len(values))
    future = [values[i + _LAG_4H] for i in range(_FILL_FIRST_ROW, stop) if fields[i] == "fill"]
    return future + [_MISSING] * _FUTURE_PAD


def train(csv_path='2023-01-27_17_10_influxdb_data.csv', seed=None):
    """Train a Spark linear regression on the exported CSV and save it to ``./models/``.

    The CSV is split by the value of its third remaining column ("buzz", "fill",
    "stol"), the three series are joined on their first column (the timestamp
    string), lagged fill readings are added as features and the fill reading
    ``_LAG_4H`` rows ahead is used as the label.

    Relies on the module-level ``spark`` session. The row positions used to
    build the lag columns are hard-coded for the dataset in the repository.

    Args:
        csv_path: Path of the CSV file to read.
        seed: Optional seed for the 70/30 random split; ``None`` (the default)
            keeps the split non-deterministic, as before.

    Returns:
        None. On success the fitted model is written to ``./models/``
        (overwriting an existing one) and a confirmation is printed. On any
        ``Exception`` a notice and the cause are printed instead of raising.
    """
    try:
        data = pd.read_csv(csv_path)
        filtered_data = data.drop(
            columns=["result", "table", "_start", "_stop", "_measurement", "key", "Unnamed: 0"]
        )

        # Materialise the arrays once instead of once per comprehension.
        rows = filtered_data.to_numpy()
        values = filtered_data["_value"].to_numpy()
        fields = rows[:, 2]

        # Parse the keys into separate dataframes.
        buzz_df = _rows_for_field(rows, "buzz")
        fill_df = _rows_for_field(rows, "fill")
        stol_df = _rows_for_field(rows, "stol")

        # Merge the series on the timestamp column (label 0) for use in Spark.
        # The lag/label lists must have exactly as many entries as this merge
        # has rows, otherwise pandas raises on assignment.
        temp_all_rows = pd.merge(fill_df, buzz_df, how='inner', on=0)
        temp_all_rows[_LABEL_COL] = _future_fill_values(values, fields)
        temp_all_rows["fill_past_30m"] = _past_fill_values(values, _LAG_30M)
        temp_all_rows["fill_past_2h"] = _past_fill_values(values, _LAG_2H)
        temp_all_rows["fill_past_4h"] = _past_fill_values(values, _LAG_4H)
        all_rows = pd.merge(temp_all_rows, stol_df, how='inner', on=0)
        renamed_rows = all_rows.rename(columns={'1_x': 'fill', '1_y': 'buzz', 1: 'stol', 0: 'tod'})
        pandas_df = renamed_rows.drop(columns=['2_x', '2_y', 2])

        # Parse the date into separate columns; anything after the first '.'
        # or 'Z' is cut off before parsing.
        timestamps = pd.to_datetime(
            pandas_df["tod"].str.split('.').str[0].str.split('Z').str[0],
            format='%Y-%m-%dT%H:%M:%S',
        )
        for part in ("year", "month", "day", "hour", "minute", "second"):
            pandas_df[part] = getattr(timestamps.dt, part)
        pandas_df = pandas_df.drop(columns=['tod'])

        # Clean data: drop rows without a 4h label or without a 4h history.
        # Written as ~(x < 0) so rows holding NaN are kept, exactly as before.
        has_sentinel = (pandas_df[_LABEL_COL] < 0) | (pandas_df["fill_past_4h"] < 0)
        pandas_df = pandas_df[~has_sentinel]

        sparkDF = spark.createDataFrame(pandas_df)
        vect_assembler = VectorAssembler(inputCols=_FEATURE_COLS, outputCol="features")
        data_w_features = vect_assembler.transform(sparkDF)
        # Specify the output column.
        data_for_training = data_w_features.select('features', _LABEL_COL)

        # Train-test split; only the 70% training part is used here.
        train_dataset, _ = data_for_training.randomSplit([0.7, 0.3], seed=seed)
        lin_reg = LinearRegression(featuresCol="features", labelCol=_LABEL_COL)
        model = lin_reg.fit(train_dataset)
        model.write().overwrite().save('./models/')
        print("trained and written model in the models folder!")
    except Exception as exc:
        # Stay best-effort (no re-raise), but report what actually went wrong
        # and let KeyboardInterrupt/SystemExit propagate.
        print("Error while training the model! Training is still in development and does not work for other datasets than in the Repository!")
        print(f"Cause: {type(exc).__name__}: {exc}")

trained and written model in the models folder!
