In [1]:
import pandas as pd;
import numpy as np;
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import datetime as dt

Import Data

In [2]:
# Load the hourly station export; fields are separated by ';'.
# The path uses a forward slash: in the previous "data\produkt..." literal the
# sequence "\p" is an invalid escape (warning on current Python versions) and a
# backslash is not a path separator outside Windows.

df = pd.read_csv("data/produkt_tu_stunde_19480101_20211231_05906.txt", delimiter=';')
df

Unnamed: 0,STATIONS_ID,MESS_DATUM,QN_9,TT_TU,RF_TU,eor
0,5906,1948010101,5,-0.3,90.0,eor
1,5906,1948010102,5,-0.3,89.0,eor
2,5906,1948010103,5,0.1,89.0,eor
3,5906,1948010104,5,0.5,91.0,eor
4,5906,1948010105,5,0.8,87.0,eor
...,...,...,...,...,...,...
648607,5906,2021123119,3,10.2,91.0,eor
648608,5906,2021123120,3,10.9,88.0,eor
648609,5906,2021123121,3,11.0,88.0,eor
648610,5906,2021123122,3,11.1,88.0,eor


Clean & harmonize data set

In [3]:
# Keep only the timestamp and the two measurement columns;
# station id, QN_9 and the 'eor' marker column are not used further on.

df = df.drop(['STATIONS_ID', 'QN_9', 'eor'], axis='columns')
df.head(n=2)

Unnamed: 0,MESS_DATUM,TT_TU,RF_TU
0,1948010101,-0.3,90.0
1,1948010102,-0.3,89.0


In [4]:
# Give the timestamp column (MESS_DATUM) the shorter name "Date"

df = df.rename({"MESS_DATUM": "Date"}, axis="columns")

In [5]:
# Parse the integer timestamps (YYYYMMDDHH) into real datetime values

df['Date'] = df['Date'].pipe(pd.to_datetime, format='%Y%m%d%H')
df.head(n=2)

Unnamed: 0,Date,TT_TU,RF_TU
0,1948-01-01 01:00:00,-0.3,90.0
1,1948-01-01 02:00:00,-0.3,89.0


In [6]:
# Restrict the data to measurements taken in 2018 or later

from_2018_on = df['Date'].dt.year.ge(2018)
df = df[from_2018_on]
df

Unnamed: 0,Date,TT_TU,RF_TU
613548,2018-01-01 00:00:00,11.5,66.0
613549,2018-01-01 01:00:00,10.9,70.0
613550,2018-01-01 02:00:00,10.5,71.0
613551,2018-01-01 03:00:00,10.3,65.0
613552,2018-01-01 04:00:00,10.3,60.0
...,...,...,...
648607,2021-12-31 19:00:00,10.2,91.0
648608,2021-12-31 20:00:00,10.9,88.0
648609,2021-12-31 21:00:00,11.0,88.0
648610,2021-12-31 22:00:00,11.1,88.0


In [7]:
# Drop every 29 February row (the leap day only, not the whole leap year)

not_leap_day = (df.Date.dt.month != 2) | (df.Date.dt.day != 29)
df = df[not_leap_day]

In [8]:
# Discard rows where either measurement equals -999
# (presumably the data set's missing-value marker -- confirm against the data description),
# then show the column minima as a sanity check.

temperature_ok = df.TT_TU != -999
humidity_ok = df.RF_TU != -999
df = df[temperature_ok & humidity_ok]
df.min()

Date     2018-01-01 00:00:00
TT_TU                    -13
RF_TU                     14
dtype: object

Data set label & split

In [9]:
# Create the labels: for every row, the value found `prediction_hours` rows later
# (365 * 24 rows, intended as "one year ahead").
#
# The columns are added with `assign`, which returns a new frame. Assigning
# columns directly onto `df` would write into the result of a boolean filter and
# raise pandas' SettingWithCopyWarning; `assign` also keeps this cell re-runnable.
#
# NOTE(review): `shift` moves by row position, not by timestamp. Any hour missing
# from the sequence makes the offset deviate from exactly one calendar year.
# The last `prediction_hours` rows have no label and become NaN.

prediction_hours = 365 * 24
df = df.assign(
    Prediction_TT=df["TT_TU"].shift(-prediction_hours),
    Prediction_RF=df["RF_TU"].shift(-prediction_hours),
)

In [10]:
# Feature matrix: the temperature column as a 2-D array, without the trailing
# `prediction_hours` rows

n_feature_rows = len(df) - prediction_hours
X = np.array(df[["TT_TU"]])[:n_feature_rows]

In [11]:
# Target vector: the shifted temperature column, trimmed by the same
# `prediction_hours` trailing rows

y = np.array(df["Prediction_TT"])[:-prediction_hours]

In [12]:
# Split the samples into training (80 %) and test (20 %) data.
# A fixed random_state pins the shuffle, so the split -- and therefore the model
# score reported further down -- is the same on every Restart & Run All.
# NOTE(review): the rows are shuffled before splitting, so neighbouring hours of
# the same series can end up in both sets; consider a chronological split
# (shuffle=False) if the score should reflect forecasting on unseen periods.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
# Keep the last 365*24 temperature values of df as the model input for the forecast

tt_values = np.array(df[['TT_TU']])
prediction_hours_array = tt_values[-prediction_hours:]

Support Vector Regression

In [14]:
# Support vector regression with a radial basis function (RBF) kernel,
# fitted on the training split.

svr_settings = {"kernel": "rbf", "C": 1e3, "gamma": 1}
svr_rbf = SVR(**svr_settings)
svr_rbf.fit(X_train, y_train)

SVR(C=1000.0, gamma=1)

In [15]:
# Evaluate the fitted model on the held-out test split and report its score

model_test = svr_rbf.score(X=X_test, y=y_test)
print("Model Score: ", model_test)

Model Score:  0.4884996369132426


In [16]:
# Disabled: print the values predicted for the test split
#svm_prediction = svr_rbf.predict(X_test)
#print(svm_prediction)

#print()

# Disabled: print the actual test values
#print(y_test)

In [17]:
# Run the model on the last `prediction_hours` temperature values and print the
# predictions, followed by a blank line.
svm_prediction = svr_rbf.predict(prediction_hours_array)
print(svm_prediction, end="\n\n")

# For reference: the measured temperatures that were used as model input
measured_tail = df["TT_TU"].tail(prediction_hours)
print(measured_tail)

[5.05862231 4.92645294 5.04753485 ... 9.56488566 9.54012247 9.52079109]

639800     2.9
639801     3.3
639802     3.7
639803     3.7
639804     3.0
          ... 
648607    10.2
648608    10.9
648609    11.0
648610    11.1
648611    11.5
Name: TT_TU, Length: 8760, dtype: float64
