In [35]:
import pandas as pd
import numpy as np


# Load the raw weather measurements into the notebook-wide frame `df`.
df = pd.read_csv("wheather_data.csv")

# Preview the first rows (head() shows five rows by default).
df.head()

Unnamed: 0,id,timestamp,value,identifier,value_type_id,location_id,source_id
0,427436,2019-04-25 13:20:09,1021.8,PMSL,20,23.0,5
1,427439,2019-04-25 13:20:09,12.0,TDP,15,23.0,5
2,427434,2019-04-25 13:20:09,0.0,P6H,13,23.0,5
3,427435,2019-04-25 13:20:09,0.51,P24H,14,23.0,5
4,427440,2019-04-25 13:20:09,3.0,UVI,16,23.0,5


### Find the minimum, maximum and mean value of numerical variables

In [36]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,id,value,value_type_id,location_id,source_id
count,406719.0,406719.0,406719.0,406719.0,406719.0
mean,4678257.0,153.45302,35.833573,38.873038,5.744032
std,1880498.0,321.899409,15.033063,30.817378,1.662299
min,427433.0,-0.019,11.0,23.0,5.0
25%,3275172.0,0.0,21.0,23.0,5.0
50%,5746641.0,13.45,39.0,27.0,5.0
75%,6082960.0,63.0,43.0,30.0,5.0
max,6426617.0,1021.9,66.0,152.0,10.0


In [2]:
# Only "value" is a continuous measurement; the other numeric columns are ids,
# so there is no point in computing a minimum, maximum or mean for them.
value_column = df["value"]
print("Minimum value: ", value_column.min())
print("Maximum value: ", value_column.max())
print("Mean value: ", value_column.mean())


Minimum value:  -0.019
Maximum value:  1021.9
Mean value:  153.45302012447573


In [3]:
# Same summary statistics as above, restricted to the continuous "value" column.
df["value"].describe()

count    406719.000000
mean        153.453020
std         321.899409
min          -0.019000
25%           0.000000
50%          13.450000
75%          63.000000
max        1021.900000
Name: value, dtype: float64

### Find the unique values of categorical data and the frequency of appearance of each category

In [4]:
# Frequency of each id. Every id appears exactly once (as expected for a row
# identifier), so the column carries no information for prediction and is a
# candidate for dropping.
df["id"].value_counts()

427436     1
5971414    1
5971419    1
5971416    1
5971415    1
          ..
4311694    1
4311691    1
4311679    1
4311642    1
6426596    1
Name: id, Length: 406719, dtype: int64

In [5]:
# Number of rows that share each distinct timestamp string.
df["timestamp"].value_counts()

2019-06-25 09:11:32    81
2019-07-05 00:01:38    81
2019-07-03 00:01:39    80
2019-06-25 09:11:33    80
2019-07-03 00:01:24    80
                       ..
2019-05-10 16:20:11     1
2019-06-25 13:00:59     1
2019-06-26 02:44:04     1
2019-07-24 00:04:23     1
2019-07-23 20:40:05     1
Name: timestamp, Length: 25242, dtype: int64

In [6]:
# Number of rows per identifier code (the only string-valued categorical column).
df["identifier"].value_counts()

T                       47790
RH                      47789
PMSL                    47767
WS                      47742
WD                      47311
UVI                     40979
PC                      30540
P                       25308
TDP                     15676
P1H                     15675
CC                      11988
P6H                     10444
P24H                    10443
SM0                      1027
ST0                      1024
SM100                     815
SM30                      815
ETref                     813
ETmodel                   813
ST100                     812
ST30                      812
ST10                      209
P3H                       106
pressureMeanSeaLevel       21
Name: identifier, dtype: int64

In [7]:
# Number of rows per value_type_id category.
df["value_type_id"].value_counts()

37    25308
42    25308
40    25308
38    25308
41    25307
43    25307
39    25306
45    25306
13    10444
15    10444
11    10443
12    10443
14    10443
18    10442
19    10441
17    10439
16    10439
55     6830
21     6809
26     6809
27     6806
25     6760
23     6756
24     6333
59     5232
66     5232
65     5232
60     5232
61     5232
64     5232
63     5232
58     5232
62     5232
57     5232
20     3614
30      815
29      815
28      815
50      813
49      813
46      812
47      812
48      812
36      212
35      212
51      209
22      106
Name: value_type_id, dtype: int64

In [8]:
# Number of rows per location_id category.
df["location_id"].value_counts()

30.0     128465
23.0     126003
27.0      96668
116.0     55563
152.0        20
Name: location_id, dtype: int64

In [9]:
# Number of rows per source_id category.
df["source_id"].value_counts()

5     313387
10     52320
6      41012
Name: source_id, dtype: int64

### Convert datetime strings to numeric timestamps (seconds since the epoch)

In [37]:
from datetime import datetime


def _to_epoch_seconds(raw):
    """Convert one 'YYYY-MM-DD HH:MM:SS' value to seconds since the epoch.

    The date and time fields are cut out of the string by position. The
    resulting datetime is naive, so ``timestamp()`` interprets it in the
    local time zone of the machine running the notebook.
    """
    text = str(raw)
    moment = datetime(
        int(text[:4]), int(text[5:7]), int(text[8:10]),
        int(text[11:13]), int(text[14:16]), int(text[17:19]),
    )
    return moment.timestamp()


# Convert the date strings into numeric timestamps (seconds) so they can be
# used as a numerical feature.
# The guard makes the cell safe to re-run: once the column already holds
# floats, slicing their string form would yield invalid date parts and raise.
if not pd.api.types.is_numeric_dtype(df["timestamp"]):
    # Iterate over the values positionally instead of looking each row up by
    # label, which is slow and only correct for a default 0..n-1 index.
    timestamp = [_to_epoch_seconds(raw) for raw in df["timestamp"]]
    df["timestamp"] = timestamp

df["timestamp"]

0         1.556188e+09
1         1.556188e+09
2         1.556188e+09
3         1.556188e+09
4         1.556188e+09
              ...     
406714    1.564433e+09
406715    1.564433e+09
406716    1.564433e+09
406717    1.564433e+09
406718    1.564433e+09
Name: timestamp, Length: 406719, dtype: float64

### Encode categorical variables using one hot encoder

In [11]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical "identifier" column: one 0/1 column per
# category, named "identifier_<category>".
enc = OneHotEncoder(handle_unknown='ignore')
identifier_onehot = enc.fit_transform(df[['identifier']])

encoder_df = pd.DataFrame(
    identifier_onehot.toarray(),
    columns=enc.get_feature_names_out(["identifier"]),
)

# The encoded frame is only previewed here; it is not merged back into df.
encoder_df.head(10)

Unnamed: 0,identifier_CC,identifier_ETmodel,identifier_ETref,identifier_P,identifier_P1H,identifier_P24H,identifier_P3H,identifier_P6H,identifier_PC,identifier_PMSL,...,identifier_ST0,identifier_ST10,identifier_ST100,identifier_ST30,identifier_T,identifier_TDP,identifier_UVI,identifier_WD,identifier_WS,identifier_pressureMeanSeaLevel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Remove the columns that carry little information for the prediction.
# Note that this also discards the numeric "timestamp" column.
df = df.drop(columns=['id', 'timestamp', 'identifier', 'source_id'])

### Scale data

In [13]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the predictors from the regression target ("value") and hold out
# 20% of the rows for testing; the fixed random_state keeps the split stable.
features = df.drop(columns=["value"])
target = df["value"]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

# Scale the inputs fed to the neural network. The scaler is fitted on the
# training split only and then reused for the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Create model

In [20]:
from keras.models import Sequential
import shap
from keras.layers import Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

def neural_network(first_layer_neurons = 250, second_layer_neurons = 50, first_activation = 'relu', second_activation = 'relu', optimization_method = 'adam', error_function = 'mean_squared_error', epochs = 1, batch_size = 20):
    """Train a two-hidden-layer regression network and report its test error.

    The function reads the notebook-level ``X_train``, ``y_train``, ``X_test``
    and ``y_test`` produced by the scaling cell.

    Parameters
    ----------
    first_layer_neurons, second_layer_neurons : int
        Number of units in the first and second hidden Dense layers.
    first_activation, second_activation : str
        Activation names passed to the two hidden layers.
    optimization_method : str
        Optimizer name passed to ``model.compile``.
    error_function : str
        Loss name passed to ``model.compile``. ``"mse"`` and
        ``"mean_squared_error"`` report mean squared error on the test set;
        any other value reports mean absolute error.
    epochs, batch_size : int
        Training length and mini-batch size passed to ``model.fit``.

    Returns
    -------
    tuple
        ``(test_error, shap_first, shap_second)`` where the two SHAP values
        (for the first and second input feature of one sampled test row) are
        returned as strings.
    """
    model = Sequential()

    # Input layer and first hidden layer are declared together.
    model.add(Dense(units=first_layer_neurons, input_dim=X_train.shape[1], kernel_initializer='normal', activation=first_activation))
    model.add(Dropout(0.5))

    # Later layers need no input size; Keras derives it from the previous layer.
    model.add(Dense(units=second_layer_neurons, kernel_initializer='normal', activation=second_activation))
    model.add(Dropout(0.5))

    # A single linear output unit, because one number is predicted.
    model.add(Dense(1, kernel_initializer='normal'))

    model.compile(loss=error_function, optimizer=optimization_method)

    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)
    y_pred = model.predict(X_test)

    # Explain one sampled test row, using the whole test set as background.
    # NOTE(review): the full test set as background can be slow; confirm
    # whether a sampled background was intended instead.
    explained_row = shap.sample(X_test, 1)
    explainer = shap.KernelExplainer(model.predict, X_test)
    shap_values = explainer.shap_values(explained_row)

    # The container returned by shap_values differs between SHAP releases
    # (list of per-output arrays vs. a single array). With one explained row
    # and one model output, flattening yields one value per feature in
    # feature order for either layout.
    per_feature = np.asarray(shap_values).reshape(-1)

    # Report the metric that matches the training loss. The default
    # 'mean_squared_error' previously fell through to the MAE branch.
    if str(error_function).lower() in ("mse", "mean_squared_error"):
        test_error = mean_squared_error(y_test, y_pred)
    else:
        test_error = mean_absolute_error(y_test, y_pred)

    return test_error, str(per_feature[0]), str(per_feature[1])
    
    
    
   
    
    

In [33]:

def myClick():
    """Button callback: validate the form, run the experiment, show the results.

    Reads the notebook-level widgets/variables ``neurons``, ``neuronshid``,
    ``activation``, ``activation_hid``, ``error_func`` and ``learning_method``,
    trains a model through ``neural_network`` and displays the outcome in a
    child window of ``root``. Invalid input is reported with an error dialog
    and no experiment is run.
    """
    # Both neuron counts must parse as integers. Validating them together
    # keeps a bad second-layer entry from raising outside the error dialog.
    try:
        first_layer_neurons = int(neurons.get())
        second_layer_neurons = int(neuronshid.get())
    except ValueError:
        messagebox.showerror('Parameters Error', 'Error: Please enter integer values on number of neurons!')
        return

    if first_layer_neurons <= 0 or second_layer_neurons <= 0:
        messagebox.showerror('Parameters Error', 'Error: Please enter integer values > 0 on number of neurons!')
        return

    # Translate the menu captions into the short names the model code expects.
    error = "mse" if error_func.get() == "Mean square error" else "mae"
    learning = "adam" if learning_method.get() == "Adam" else "sgd"

    # Run the experiment
    result, shap_value1, shap_value2 = neural_network(first_layer_neurons, second_layer_neurons, activation.get(), activation_hid.get(), learning, error)

    # Show the results in a child window of the existing application. A second
    # Tk() root with its own nested mainloop() is not needed, because the
    # main window's event loop is already running while this callback executes.
    newWindow = Toplevel(root)
    newWindow.title("Results of experiment")
    newWindow.geometry("700x350")

    report_lines = [
        "Number of neurons on 1st and 2nd layer: " + neurons.get() + " and " + neuronshid.get(),
        "Activation functions on 1st and 2nd layer: " + activation.get() + " and " + activation_hid.get(),
        "Optimizer: " + learning_method.get(),
        "Error function: " + error_func.get(),
        "Error result: " + str(result),
        "Shap values of first and second feature: " + shap_value1 + " and " + shap_value2,
    ]
    # One label per report line, stacked top to bottom in column 0.
    for row, text in enumerate(report_lines):
        Label(newWindow, text = text, font = ("bold", 10), padx = 10).grid(column = 0, row = row)

In [34]:
from tkinter import *
from tkinter import messagebox

# Build the parameter form: two entry fields, four dropdown menus and a button
# that triggers myClick. The widgets/variables `neurons`, `neuronshid`,
# `activation`, `activation_hid`, `error_func` and `learning_method` are read
# by myClick when the button is pressed.

# Create window
root = Tk()

# Change size of the window
root.geometry("700x350")

# Add title to the window
root.title("Predict value program")

# Add a heading label
# NOTE(review): the font tuples below start with "bold"; Tk appears to treat
# the first item as the font family, so this may not produce bold text - confirm.
myLabel = Label(root, text = "Enter parameters", font = ("bold", 14), pady = 20, padx = 10)
myLabel.grid(column = 0, row = 0)

#-----------Number of Neurons on both layers ------------------

# First layer: caption plus free-text entry (validated in myClick)
myLabel = Label(root, text = "Number of neurons on 1st layer", font = ("bold", 10), padx = 10)
myLabel.grid(column = 0, row = 1)
neurons_txt = StringVar()
neurons = Entry(root, width = 50, textvariable = neurons_txt)
neurons.grid(column = 1, row = 1)


# Second layer: caption plus free-text entry (validated in myClick)
myLabel = Label(root, text = "Number of neurons on 2nd layer", font = ("bold", 10), padx = 10)
myLabel.grid(column = 0, row = 2)
neuronshid_txt = StringVar()
neuronshid = Entry(root, width = 50, textvariable = neuronshid_txt)
neuronshid.grid(column = 1, row = 2)


#-------------------------Activation Function-----------------------
myLabel = Label(root, text = "Chose activation function: ", font = ("bold", 10))
myLabel.grid(column = 0, row = 3)

# First layer menu, preselected to "relu"
activation = StringVar()
activation.set("relu")


activation_func_menu = OptionMenu(root, activation,"sigmoid", "relu")
activation_func_menu.grid(column = 1, row = 3)

# Second layer menu, preselected to "relu"
activation_hid = StringVar()
activation_hid.set("relu")


# The name activation_func_menu is reused; the first menu stays on screen
# because it has already been placed in the grid.
activation_func_menu = OptionMenu(root, activation_hid,"sigmoid", "relu")
activation_func_menu.grid(column = 2, row = 3)

#-------------------------Error Function-----------------------

myLabel = Label(root, text = "Chose error function: ", font = ("bold", 10))
myLabel.grid(column = 0, row = 4)

# Initial selection of the menu; myClick maps this caption to "mse"
error_func = StringVar()
error_func.set("Mean square error")

# Create a dropdown menu
error_func_menu= OptionMenu(root, error_func,"Mean square error", "Mean absolute error")
error_func_menu.grid(column = 1, row = 4)


#-------------------------Learning Method-----------------------

myLabel = Label(root, text = "Chose learning method", font = ("bold", 10))
myLabel.grid(column = 0, row = 5)


# Initial selection of the menu; myClick maps "Adam" to "adam", anything else to "sgd"
learning_method = StringVar()
learning_method.set("Adam")

learning_method_menu= OptionMenu(root, learning_method,"Stochastic gradient descent", "Adam")
learning_method_menu.grid(column = 1, row = 5)





# Add a button that runs the experiment with the entered parameters
myButton = Button(root, text = "Run experiment", command = myClick, bg = "blue", fg='white')
myButton.grid(column = 1, row = 8)


# Start the Tk event loop; this call blocks until the window is closed
root.mainloop()

hey
hey
hey
hey
hey
