In [71]:
import pandas as pd;
import numpy as np;
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import datetime as dt

Import Data

In [72]:
# Load the hourly station measurements from a semicolon-separated text file.
# The path uses a forward slash: "\p" is not a valid string escape sequence
# (newer Python versions warn about it) and a backslash separator only
# resolves on Windows, while "/" works on every platform.
df = pd.read_csv("data/produkt_tu_stunde_19480101_20211231_05906.txt", delimiter=";")
df

Unnamed: 0,STATIONS_ID,MESS_DATUM,QN_9,TT_TU,RF_TU,eor
0,5906,1948010101,5,-0.3,90.0,eor
1,5906,1948010102,5,-0.3,89.0,eor
2,5906,1948010103,5,0.1,89.0,eor
3,5906,1948010104,5,0.5,91.0,eor
4,5906,1948010105,5,0.8,87.0,eor
...,...,...,...,...,...,...
648607,5906,2021123119,3,10.2,91.0,eor
648608,5906,2021123120,3,10.9,88.0,eor
648609,5906,2021123121,3,11.0,88.0,eor
648610,5906,2021123122,3,11.1,88.0,eor


Clean & harmonize data set

In [73]:
# Drop the columns that no later cell of this notebook reads, leaving the
# timestamp and the two measurement columns.
unused_columns = ["STATIONS_ID", "QN_9", "eor"]
df = df.drop(unused_columns, axis="columns")
df.head(n=2)

Unnamed: 0,MESS_DATUM,TT_TU,RF_TU
0,1948010101,-0.3,90.0
1,1948010102,-0.3,89.0


In [74]:
# Rename the timestamp column MESS_DATUM to "Date".
df = df.rename({"MESS_DATUM": "Date"}, axis="columns")

In [75]:
# Parse the integer timestamps (YYYYMMDDHH, e.g. 1948010101) into datetimes.
timestamp_format = "%Y%m%d%H"
df["Date"] = pd.to_datetime(df["Date"], format=timestamp_format)
df.head(n=2)

Unnamed: 0,Date,TT_TU,RF_TU
0,1948-01-01 01:00:00,-0.3,90.0
1,1948-01-01 02:00:00,-0.3,89.0


In [76]:
# Keep only measurements from the year 2000 onwards
# (2000-2021 in this file, i.e. 22 calendar years).
from_2000_onwards = df["Date"].dt.year >= 2000
df = df.loc[from_2000_onwards]
df

Unnamed: 0,Date,TT_TU,RF_TU
455831,2000-01-01 00:00:00,2.4,91.0
455832,2000-01-01 01:00:00,2.4,91.0
455833,2000-01-01 02:00:00,2.5,91.0
455834,2000-01-01 03:00:00,2.6,91.0
455835,2000-01-01 04:00:00,2.8,91.0
...,...,...,...
648607,2021-12-31 19:00:00,10.2,91.0
648608,2021-12-31 20:00:00,10.9,88.0
648609,2021-12-31 21:00:00,11.0,88.0
648610,2021-12-31 22:00:00,11.1,88.0


In [77]:
# Drop every 29 February (only the extra day, not the whole leap year), so a
# leap year contributes no more days than a regular one. The label shift
# further down relies on a fixed offset of 365*24 rows per year.
is_feb_29 = (df["Date"].dt.month == 2) & (df["Date"].dt.day == 29)
df = df.loc[~is_feb_29]

In [78]:
# Column-wise minima expose the problem: -999 appears as a value in both
# measurement columns.
df.min(axis="index")

Date     2000-01-01 00:00:00
TT_TU                   -999
RF_TU                   -999
dtype: object

In [79]:
# Treat the -999 entries as missing: swap them for NaN, then re-check the
# column minima.
df = df.replace(to_replace=-999, value=np.nan)
df.min(axis="index")

Date     2000-01-01 00:00:00
TT_TU                  -17.9
RF_TU                     11
dtype: object

In [80]:
# Number of missing (NaN) values in each column.
df.isna().sum()

Date       0
TT_TU    151
RF_TU    164
dtype: int64

In [81]:
# Forward-fill: every NaN takes the last valid value above it in the same
# column. Afterwards, confirm that no missing values remain.
df = df.ffill()
df.isna().sum()


Date     0
TT_TU    0
RF_TU    0
dtype: int64

Data set label & split

In [82]:
# Build the labels: each row gets the temperature / humidity found 365*24 rows
# further down. The last `prediction_hours` rows therefore get NaN labels.
# NOTE(review): a shift by rows equals "one year ahead" only if the rows are
# consecutive hours without gaps — confirm for this data set.
prediction_hours = 365 * 24
df = df.assign(
    Prediction_TT=df["TT_TU"].shift(-prediction_hours),
    Prediction_RF=df["RF_TU"].shift(-prediction_hours),
)

In [83]:
# Feature matrix: the temperature column as a single-column 2-D array, without
# the last `prediction_hours` rows (their shifted labels are NaN).
X = np.array(df[["TT_TU"]].iloc[:-prediction_hours])

In [84]:
# Target vector: the shifted temperature labels, trimmed by the same
# `prediction_hours` trailing rows as X so both have equal length.
y = np.array(df["Prediction_TT"].iloc[:-prediction_hours])

In [85]:
# Split features and labels into training (80 %) and test (20 %) sets.
# A fixed random_state makes the shuffle — and therefore the model score
# reported further down — reproducible across runs.
# NOTE(review): this is still a random shuffle; with hourly time-series data,
# neighbouring hours end up in both sets, so the test score is likely
# optimistic. Consider a chronological split (shuffle=False).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [86]:
# Model input for the forecast: the last 365*24 temperature values of df,
# as a single-column 2-D array.
prediction_hours_array = np.array(df[["TT_TU"]].tail(prediction_hours))

Support Vector Regression

In [87]:
# Support vector regression with a radial-basis-function (RBF) kernel,
# fitted on the training split.
svr_settings = {"kernel": "rbf", "C": 1e3, "gamma": 0.1}
svr_rbf = SVR(**svr_settings)
svr_rbf.fit(X_train, y_train)

SVR(C=1000.0, gamma=0.1)

In [88]:
# Evaluate the fitted model on the held-out test split; for scikit-learn
# regressors, score() returns the coefficient of determination (R^2).
model_test = svr_rbf.score(X=X_test, y=y_test)
print("Model Score: ", model_test)

Model Score:  0.5838530722704075


In [89]:
# Feed the last 365*24 temperature values to the model; each output is the
# forecast for the row 365*24 positions after its input.
svm_prediction = svr_rbf.predict(prediction_hours_array)

# Print the forecasts, an empty separator line, then the input temperatures
# the forecasts were computed from.
print(svm_prediction, end="\n\n")
print(df["TT_TU"].tail(prediction_hours))

[ 4.35797211  4.35797211  4.25539187 ... 10.69891656 10.80237932
 11.23424549]

639852     3.3
639853     3.3
639854     3.0
639855     2.9
639856     3.0
          ... 
648607    10.2
648608    10.9
648609    11.0
648610    11.1
648611    11.5
Name: TT_TU, Length: 8760, dtype: float64


In [90]:
# Display the forecast array as the cell's output (the same values that the
# previous cell printed).
svm_prediction

array([ 4.35797211,  4.35797211,  4.25539187, ..., 10.69891656,
       10.80237932, 11.23424549])

In [None]:
# TODO: display the minimum and maximum temperature and use them to make a prediction