In [1]:
import os
import sys

import json
import pickle

import scipy
import seaborn as sns

import numpy as np
import pandas as pd
from dtw import dtw
import torch
from torchsummary import summary
import matplotlib.pyplot as plt
import dcor

In [2]:
import matplotlib
matplotlib.use('Qt5Agg')

In [3]:
data_training = pd.read_csv("../data/data_training.csv")
data_validation = pd.read_csv('../data/data_validation.csv')

data = pd.concat([data_training, data_validation])
data["Date"] = pd.to_datetime(data["Date"])
data = data.set_index('Date')
meta = pd.read_csv("../data/meta.csv")
data.index

DatetimeIndex(['1950-12-31', '1951-01-01', '1951-01-02', '1951-01-03',
               '1951-01-04', '1951-01-05', '1951-01-06', '1951-01-07',
               '1951-01-08', '1951-01-09',
               ...
               '2020-12-21', '2020-12-22', '2020-12-23', '2020-12-24',
               '2020-12-25', '2020-12-26', '2020-12-27', '2020-12-28',
               '2020-12-29', '2020-12-30'],
              dtype='datetime64[ns]', name='Date', length=25568, freq=None)

In [4]:
def pearson(dx, dy):
    return scipy.stats.pearsonr(dx,dy)

def spearman(dx,dy):
    return scipy.stats.spearmanr(dx,dy)

def distance_correlation(dx,dy):
    return dcor.distance_correlation(dx, dy),0

def timewarping(dx,dy):
    distance, cost_matrix, acc_cost, path = dtw(np.array(dx).reshape(-1,1),np.array(dy).reshape(-1,1), dist = scipy.spatial.distance.euclidean)
    return distance



def corr(df,starting_point,interval,method,l,pv,p1):
    period2 = df[(df.index >= starting_point - interval) & (df.index <= starting_point + interval)]

    for col in df.columns.tolist():
        cor, p_value = method(p1['2275'],period2[col])
        l.append(cor)
        pv.append(p_value)



def calculate_correlations(df,starting_point,interval,method):

    day = pd.Timedelta(1,'d')
    sp = starting_point
    l = []
    pv = []

    period1 = df[(df.index >= starting_point - interval) & (df.index <= starting_point + interval)]
    for i in np.linspace(0,5,5):
        starting_point = starting_point + day
        corr(df,starting_point,interval,method,l,pv,period1)
    starting_point =  sp + day
    for i in np.linspace(0,50,50):
        starting_point = starting_point - day
        corr(df,starting_point,interval,method,l,pv,period1)

    cm = np.transpose(np.reshape(np.array(l), (-1, len(df.columns))))
    pv = np.transpose(np.reshape(np.array(pv), (-1, len(df.columns))))

    return cm, pv


correlation_matrix, p_values = calculate_correlations(data,pd.Timestamp('2006-01-01'),pd.Timedelta(52,'w'),pearson)

cr = pd.DataFrame(data = np.transpose(correlation_matrix), columns=data.columns)
cr.idxmax() - 5

1515       6
1516       5
1518       4
1521       5
1719       9
1720      14
1722       3
1723       3
2040       2
2046       1
2048       0
2271       0
2272       0
2274       0
2275       0
2278       2
2543       2
2753       5
2756       4
2757       2
2760       1
1514       5
2041      -1
1523       6
2042       2
2736       8
1721       3
1724       4
1725       4
2744       8
2748      10
2759       2
1520       4
1527       5
1729       4
1726       6
2541       3
1530       7
1732       5
1734       5
2049       5
2741       4
2742       6
2751       6
2545       5
744624     2
210888     0
210900     0
dtype: int64

In [5]:
max_corr = pd.Series((cr.idxmax() - 5),name="maximum_correlation")
max_corr.index = max_corr.index.astype(int)

In [None]:
meta = pd.read_csv("../data/meta.csv")
meta = meta.set_index("reg_number")

meta_nans_removed = meta.loc[list(map(int,cr.columns.tolist()))]
meta_nans_removed = pd.concat([meta_nans_removed, max_corr], axis=1)
meta.river.unique()


In [8]:
marker_dict = {
    'Tisza': 'o',
    'Maros': 'v',
    'Kettős-Körös': '^',
    'Hármas-Körös': 'x',
    'Szamos': '+',
    'Sebes-Körös' : 'D',
    'Bodrog' : 'h',
    'Túr' : 'd',
    'Sajó' : 'X',
    'Kraszna' : '1',
    'Hernád' : '2',
    'Berettyó': '3',
    'Fekete-Körös' : '4',
    'Fehér-Körös': ',',
    'Zagyva': '<'
}

def rgb(val):
    return [[0.4 + min(val * 0.08,0.6),0.2, 0.00 + min(abs(val * 0.10),1.0)]]

rivers = []
actors = []

for index,row in meta_nans_removed.iterrows():
    a = plt.scatter(row['EOVx'],row['EOVy'],c=rgb(row['maximum_correlation']),marker=marker_dict[row['river']])
    if row['river'] not in rivers:
        rivers.append(row['river'])
        actors.append(a)
plt.legend(actors,rivers)
plt.show()

qt.qpa.wayland: Wayland does not support QWindow::requestActivate()


KeyboardInterrupt: 

In [None]:
from statsmodels.tsa.stattools import grangercausalitytests
def causality_test(df, lag,l,pv):

    for col in df.columns.tolist():
        vals = grangercausalitytests(data_nans_removed[['2275',col]], maxlag = [lag],verbose = 0)
        l.append(vals[lag][0]['ssr_ftest'][0])
        pv.append(vals[lag][0]['ssr_ftest'][1])


causalities = []
p_values = []
for i in np.linspace(1,10,10):
    causality_test(data_nans_removed,i,causalities,p_values)

causalitiy_matrix = np.transpose(np.reshape(np.array(causalities), (-1, len(data_nans_removed.columns))))

In [None]:
maximum_correlation = (cr.idxmax() - 5)

In [None]:
pd.DataFrame(maximum_correlation.values, index=maximum_correlation.index).to_csv("../data/maximum_correlation.csv")

In [None]:
maximum_correlation.values

In [10]:
def corr(df,station,starting_point,interval,method,l,pv,p1):
    period2 = df[(df.index >= starting_point - interval) & (df.index <= starting_point + interval)]
    for col in df.columns.tolist():
        cor, p_value = method(p1[station],period2[col])
        l.append(cor)
        pv.append(p_value)


def calculate_correlations(df,starting_point,interval,method):
    cross_correlations = []
    for station in df.columns:
        day = pd.Timedelta(1,'d')
        sp = starting_point
        l = []
        pv = []

        period1 = df[(df.index >= starting_point - interval) & (df.index <= starting_point + interval)]
        for i in np.linspace(0,5,5):
            starting_point = starting_point + day
            corr(df,station,starting_point,interval,method,l,pv,period1)
        starting_point =  sp + day
        for i in np.linspace(0,50,50):
            starting_point = starting_point - day
            corr(df,station,starting_point,interval,method,l,pv,period1)
        cm = np.transpose(np.reshape(np.array(l), (-1, len(df.columns))))
        pv = np.transpose(np.reshape(np.array(pv), (-1, len(df.columns))))
        cross_correlations.append(cm)
    return cross_correlations


In [11]:
cr = calculate_correlations(data,pd.Timestamp('2005-01-01'),pd.Timedelta(52,'w'),pearson)

In [None]:
correlation_tensor = {}
for idx,station in enumerate(data.columns):
    correlation_tensor[station] = (pd.DataFrame(data=np.transpose(cr[idx]),columns=data.columns))

In [15]:
correlation_tensor['1515']

Unnamed: 0,1515,1516,1518,1521,1719,1720,1722,1723,2040,2046,...,1732,1734,2049,2741,2742,2751,2545,744624,210888,210900
0,0.948032,0.961216,0.971383,0.947838,0.881199,0.31541,0.862137,0.809591,0.554271,0.803721,...,0.325888,0.379134,0.603478,0.722922,0.653419,0.386093,0.563964,0.721131,0.720016,0.689409
1,0.853993,0.884141,0.926971,0.942456,0.886586,0.347622,0.872581,0.828029,0.574367,0.836251,...,0.276151,0.340917,0.544661,0.732812,0.668832,0.370262,0.537886,0.737705,0.753383,0.719141
2,0.765502,0.791829,0.851599,0.89524,0.863803,0.36464,0.852466,0.81591,0.570319,0.84726,...,0.236978,0.298636,0.488022,0.727511,0.666828,0.339246,0.507987,0.74089,0.777954,0.740427
3,0.69912,0.716504,0.775795,0.83235,0.825903,0.364079,0.818035,0.78558,0.553157,0.840157,...,0.213565,0.263593,0.445699,0.711337,0.648215,0.309149,0.481166,0.727239,0.790752,0.752099
4,0.652985,0.662954,0.714731,0.77275,0.783333,0.351647,0.779625,0.749544,0.532138,0.821545,...,0.196689,0.236873,0.414184,0.689756,0.624207,0.286454,0.458946,0.699868,0.792093,0.753936
5,1.0,0.974987,0.942949,0.895707,0.835029,0.274838,0.815754,0.761067,0.512767,0.758508,...,0.373756,0.389285,0.647061,0.693862,0.622013,0.374399,0.569859,0.691701,0.683313,0.655693
6,0.948083,0.90894,0.868806,0.821326,0.769664,0.23645,0.754448,0.701778,0.46477,0.711888,...,0.385502,0.36066,0.660684,0.651114,0.577977,0.336388,0.546823,0.657598,0.648533,0.623009
7,0.854223,0.820775,0.790447,0.75271,0.708198,0.205553,0.696896,0.645801,0.420226,0.669237,...,0.359611,0.315775,0.643151,0.608734,0.532949,0.290873,0.513206,0.625011,0.617064,0.593574
8,0.766183,0.742213,0.725623,0.697926,0.659242,0.186929,0.647751,0.598902,0.382203,0.630835,...,0.319968,0.273611,0.605991,0.571704,0.495588,0.250224,0.480555,0.593461,0.588435,0.565548
9,0.700814,0.680816,0.675401,0.655546,0.620841,0.175633,0.607141,0.560878,0.34937,0.5974,...,0.28017,0.238294,0.564085,0.540298,0.468996,0.216743,0.447741,0.560224,0.561849,0.53765


In [16]:
correlation_tensor_max_corr = {k : v.idxmax() -5 for k,v in correlation_tensor.items() }

In [17]:
correlation_tensor_max_corr['1515']

1515      0
1516      0
1518     -5
1521     -5
1719     -4
1720     -3
1722     -4
1723     -4
2040     -4
2046     -3
2048     -2
2271     -2
2272     -2
2274     -2
2275     -2
2278     -3
2543     -4
2753     -5
2756     -3
2757     -2
2760     -2
1514      0
2041     -1
1523      0
2042     -4
2736      1
1721     -4
1724     -5
1725     -5
2744      0
2748      0
2759     -3
1520     -5
1527      0
1729      1
1726      1
2541     -4
1530      0
1732      1
1734      0
2049      1
2741     -4
2742     -4
2751     -5
2545      0
744624   -3
210888   -1
210900   -1
dtype: int64

In [76]:
pd.Series(data = correlation_tensor_max_corr.values(),index=correlation_tensor_max_corr.keys()).to_csv("../data/maximum_correlations_every_station.csv")

In [2]:
df = pd.read_csv('../data/maximum_correlations_every_station.csv')

In [8]:
df = df.rename(columns={'Unnamed: 0': 'stations'})

In [23]:
df.loc[1515][0]

'1515      0\n1516      0\n1518     -5\n1521     -5\n1719     -4\n1720     -3\n1722     -4\n1723     -4\n2040     -4\n2046     -3\n2048     -2\n2271     -2\n2272     -2\n2274     -2\n2275     -2\n2278     -3\n2543     -4\n2753     -5\n2756     -3\n2757     -2\n2760     -2\n1514      0\n2041     -1\n1523      0\n2042     -4\n2736      1\n1721     -4\n1724     -5\n1725     -5\n2744      0\n2748      0\n2759     -3\n1520     -5\n1527      0\n1729      1\n1726      1\n2541     -4\n1530      0\n1732      1\n1734      0\n2049      1\n2741     -4\n2742     -4\n2751     -5\n2545      0\n744624   -3\n210888   -1\n210900   -1\ndtype: int64'

In [20]:
df = df.set_index('stations')

In [11]:
df.to_csv('../data/maximum_correlations_every_station.csv')

In [12]:
pd.read_csv('../data/maximum_correlations_every_station.csv')

Unnamed: 0.1,Unnamed: 0,stations,0
0,0,1515,1515 0\n1516 0\n1518 -5\n1521 ...
1,1,1516,1515 0\n1516 0\n1518 -5\n1521 ...
2,2,1518,1515 1\n1516 1\n1518 0\n1521 ...
3,3,1521,1515 1\n1516 1\n1518 1\n1521 ...
4,4,1719,1515 2\n1516 1\n1518 1\n1521 ...
5,5,1720,1515 3\n1516 2\n1518 2\n1521...
6,6,1722,1515 2\n1516 1\n1518 1\n1521 ...
7,7,1723,1515 2\n1516 2\n1518 1\n1521...
8,8,2040,1515 3\n1516 2\n1518 2\n1521...
9,9,2046,1515 3\n1516 3\n1518 2\n1521...
