In [2]:
import os
import sys

import json
import pickle

import scipy
import seaborn as sns

import numpy as np
import pandas as pd
from dtw import dtw
import torch
from torchsummary import summary
import matplotlib.pyplot as plt
import dcor

In [3]:
import matplotlib
matplotlib.use('Qt5Agg')

In [4]:
data_training = pd.read_csv("../data/data_training.csv")
data_validation = pd.read_csv('../data/data_validation.csv')

data = pd.concat([data_training, data_validation])
data["Date"] = pd.to_datetime(data["Date"])
data = data.set_index('Date')
meta = pd.read_csv("../data/meta.csv")
data.index

DatetimeIndex(['1950-12-31', '1951-01-01', '1951-01-02', '1951-01-03',
               '1951-01-04', '1951-01-05', '1951-01-06', '1951-01-07',
               '1951-01-08', '1951-01-09',
               ...
               '2020-12-21', '2020-12-22', '2020-12-23', '2020-12-24',
               '2020-12-25', '2020-12-26', '2020-12-27', '2020-12-28',
               '2020-12-29', '2020-12-30'],
              dtype='datetime64[ns]', name='Date', length=25568, freq=None)

In [56]:
def pearson(dx, dy):
    return scipy.stats.pearsonr(dx,dy)

def spearman(dx,dy):
    return scipy.stats.spearmanr(dx,dy)

def distance_correlation(dx,dy):
    return dcor.distance_correlation(dx, dy),0

def timewarping(dx,dy):
    distance, cost_matrix, acc_cost, path = dtw(np.array(dx).reshape(-1,1),np.array(dy).reshape(-1,1), dist = scipy.spatial.distance.euclidean)
    return distance



def corr(df,starting_point,interval,method,l,pv,p1):
    period2 = df[(df.index >= starting_point - interval) & (df.index <= starting_point + interval)]

    for col in df.columns.tolist():
        cor, p_value = method(p1['2275'],period2[col])
        l.append(cor)
        pv.append(p_value)



def calculate_correlations(df,starting_point,interval,method):

    day = pd.Timedelta(1,'d')
    sp = starting_point
    l = []
    pv = []

    period1 = df[(df.index >= starting_point - interval) & (df.index <= starting_point + interval)]
    for i in np.linspace(0,5,5):
        starting_point = starting_point + day
        corr(df,starting_point,interval,method,l,pv,period1)
    starting_point =  sp + day
    for i in np.linspace(0,50,50):
        starting_point = starting_point - day
        corr(df,starting_point,interval,method,l,pv,period1)

    cm = np.transpose(np.reshape(np.array(l), (-1, len(df.columns))))
    pv = np.transpose(np.reshape(np.array(pv), (-1, len(df.columns))))

    return cm, pv


correlation_matrix, p_values = calculate_correlations(data,pd.Timestamp('2006-01-01'),pd.Timedelta(52,'w'),distance_correlation)

cr = pd.DataFrame(data = np.transpose(correlation_matrix), columns=data.columns)
cr.idxmax() - 5



1515      4
1516      4
1518      3
1521      3
1719      5
1720      6
1722      2
1723      2
2040      1
2046      1
2048      0
2271      0
2272      0
2274      0
2275      0
2278      2
2543      2
2753      5
2756      4
2757      2
2760      1
1514      4
2041     -1
1523      5
2042      1
2736      7
1721      3
1724      3
1725      3
2744      6
2748      7
2759      2
1520      3
1527      4
1729      4
1726      5
2541      2
1530      6
1732      4
1734      4
2049      5
2741      4
2742      6
2751      5
2545      4
744624    2
210888    0
210900    0
dtype: int64

In [57]:
max_corr = pd.Series((cr.idxmax() - 5),name="maximum_correlation")
max_corr.index = max_corr.index.astype(int)

In [58]:
meta = pd.read_csv("../data/meta.csv")
meta = meta.set_index("reg_number")

meta_nans_removed = meta.loc[list(map(int,cr.columns.tolist()))]
meta_nans_removed = pd.concat([meta_nans_removed, max_corr], axis=1)
meta.river.unique()


array(['Tisza', 'Maros', 'Kettős-Körös', 'Hármas-Körös', 'Szamos',
       'Sebes-Körös', 'Bodrog', 'Fekete-Körös', 'Fehér-Körös', 'Túr',
       'Sajó', 'Kraszna', 'Hernád', 'Zagyva', 'Berettyó'], dtype=object)

In [24]:
marker_dict = {
    'Tisza': 'o',
    'Maros': 'v',
    'Kettős-Körös': '^',
    'Hármas-Körös': 'x',
    'Szamos': '+',
    'Sebes-Körös' : 'D',
    'Bodrog' : 'h',
    'Túr' : 'd',
    'Sajó' : 'X',
    'Kraszna' : '1',
    'Hernád' : '2',
    'Berettyó': '3',
    'Fekete-Körös' : '4',
    'Fehér-Körös': ',',
    'Zagyva': '<'
}

def rgb(val):
    return [[0.4 + min(val * 0.08,0.6),0.2, 0.00 + min(abs(val * 0.10),1.0)]]

rivers = []
actors = []

for index,row in meta_nans_removed.iterrows():
    a = plt.scatter(row['EOVx'],row['EOVy'],c=rgb(row['maximum_correlation']),marker=marker_dict[row['river']])
    if row['river'] not in rivers:
        rivers.append(row['river'])
        actors.append(a)

cbar = plt.colorbar()
plt.legend(actors,rivers)


plt.show()

qt.qpa.wayland: Wayland does not support QWindow::requestActivate()


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Your marker_dict and rgb function
plt.figure(figsize=(10, 7))
# Assuming you have 'meta_nans_removed', 'rivers', and 'actors' defined

# Create a ScalarMappable object
norm = plt.Normalize(vmin=min(meta_nans_removed['maximum_correlation']),
                     vmax=max(meta_nans_removed['maximum_correlation']))
cmap = cm.get_cmap('viridis')  # You can choose any colormap you like
sm = cm.ScalarMappable(cmap=cmap, norm=norm)

for index, row in meta_nans_removed.iterrows():
    color_val = row['maximum_correlation']
    color = sm.to_rgba(color_val)

    a = plt.scatter(row['EOVx'], row['EOVy'], c=[color], marker=marker_dict[row['river']])

    if row['river'] not in rivers:
        rivers.append(row['river'])
        actors.append(a)

# Add a colorbar
cbar = plt.colorbar(sm)
cbar.set_label('$t_x$')

plt.xlabel('x coordinates')
plt.ylabel('Y coordinates')
plt.legend(actors, rivers)
plt.show()

  cmap = cm.get_cmap('viridis')  # You can choose any colormap you like
  cbar = plt.colorbar(sm)
qt.qpa.wayland: Wayland does not support QWindow::requestActivate()


In [11]:
from statsmodels.tsa.stattools import grangercausalitytests
def causality_test(df, lag,l,pv):

    for col in df.columns.tolist():
        vals = grangercausalitytests(data_nans_removed[['2275',col]], maxlag = [lag],verbose = 0)
        l.append(vals[lag][0]['ssr_ftest'][0])
        pv.append(vals[lag][0]['ssr_ftest'][1])


causalities = []
p_values = []
for i in np.linspace(1,10,10):
    causality_test(data_nans_removed,i,causalities,p_values)

causalitiy_matrix = np.transpose(np.reshape(np.array(causalities), (-1, len(data_nans_removed.columns))))

NameError: name 'data_nans_removed' is not defined

In [None]:
maximum_correlation = (cr.idxmax() - 5)

In [None]:
pd.DataFrame(maximum_correlation.values, index=maximum_correlation.index).to_csv("../data/maximum_correlation.csv")

In [None]:
maximum_correlation.values

In [None]:
def corr(df,station,starting_point,interval,method,l,pv,p1):
    period2 = df[(df.index >= starting_point - interval) & (df.index <= starting_point + interval)]
    for col in df.columns.tolist():
        cor, p_value = method(p1[station],period2[col])
        l.append(cor)
        pv.append(p_value)


def calculate_correlations(df,starting_point,interval,method):
    cross_correlations = []
    for station in df.columns:
        day = pd.Timedelta(1,'d')
        sp = starting_point
        l = []
        pv = []

        period1 = df[(df.index >= starting_point - interval) & (df.index <= starting_point + interval)]
        for i in np.linspace(0,5,5):
            starting_point = starting_point + day
            corr(df,station,starting_point,interval,method,l,pv,period1)
        starting_point =  sp + day
        for i in np.linspace(0,50,50):
            starting_point = starting_point - day
            corr(df,station,starting_point,interval,method,l,pv,period1)
        cm = np.transpose(np.reshape(np.array(l), (-1, len(df.columns))))
        pv = np.transpose(np.reshape(np.array(pv), (-1, len(df.columns))))
        cross_correlations.append(cm)
    return cross_correlations


In [None]:
cr = calculate_correlations(data,pd.Timestamp('2005-01-01'),pd.Timedelta(52,'w'),pearson)

In [None]:
correlation_tensor = {}
for idx,station in enumerate(data.columns):
    correlation_tensor[station] = (pd.DataFrame(data=np.transpose(cr[idx]),columns=data.columns))

In [None]:
correlation_tensor['1515']

In [None]:
correlation_tensor_max_corr = {k : v.idxmax() -5 for k,v in correlation_tensor.items() }

In [None]:
correlation_tensor_max_corr['2275']

In [76]:
pd.Series(data = correlation_tensor_max_corr.values(),index=correlation_tensor_max_corr.keys()).to_csv("../data/maximum_correlations_every_station.csv")

In [2]:
df = pd.read_csv('../data/maximum_correlations_every_station.csv')

In [8]:
df = df.rename(columns={'Unnamed: 0': 'stations'})

In [23]:
df.loc[1515][0]

'1515      0\n1516      0\n1518     -5\n1521     -5\n1719     -4\n1720     -3\n1722     -4\n1723     -4\n2040     -4\n2046     -3\n2048     -2\n2271     -2\n2272     -2\n2274     -2\n2275     -2\n2278     -3\n2543     -4\n2753     -5\n2756     -3\n2757     -2\n2760     -2\n1514      0\n2041     -1\n1523      0\n2042     -4\n2736      1\n1721     -4\n1724     -5\n1725     -5\n2744      0\n2748      0\n2759     -3\n1520     -5\n1527      0\n1729      1\n1726      1\n2541     -4\n1530      0\n1732      1\n1734      0\n2049      1\n2741     -4\n2742     -4\n2751     -5\n2545      0\n744624   -3\n210888   -1\n210900   -1\ndtype: int64'

In [20]:
df = df.set_index('stations')

In [11]:
df.to_csv('../data/maximum_correlations_every_station.csv')

In [12]:
pd.read_csv('../data/maximum_correlations_every_station.csv')

Unnamed: 0.1,Unnamed: 0,stations,0
0,0,1515,1515 0\n1516 0\n1518 -5\n1521 ...
1,1,1516,1515 0\n1516 0\n1518 -5\n1521 ...
2,2,1518,1515 1\n1516 1\n1518 0\n1521 ...
3,3,1521,1515 1\n1516 1\n1518 1\n1521 ...
4,4,1719,1515 2\n1516 1\n1518 1\n1521 ...
5,5,1720,1515 3\n1516 2\n1518 2\n1521...
6,6,1722,1515 2\n1516 1\n1518 1\n1521 ...
7,7,1723,1515 2\n1516 2\n1518 1\n1521...
8,8,2040,1515 3\n1516 2\n1518 2\n1521...
9,9,2046,1515 3\n1516 3\n1518 2\n1521...
