In [61]:
import os
import random as rn

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Degree Correlated Data

In [45]:
def create_correlated_degree_data(length=100, corr=0.8, rng=None):
    """Draw a synthetic (degree, score) sample with target correlation ``corr``.

    Two standard-normal vectors are mixed as
    ``y = corr * x + sqrt(1 - corr**2) * noise`` so that the population
    Pearson correlation between ``x`` and ``y`` equals ``corr``; the sample
    correlation of a finite draw only approximates it.  Both vectors are then
    shifted so their minimum is exactly 0 and scaled (x100 for degree, x20
    for score).  The shift and scale are linear with positive factors, so
    they leave the correlation unchanged.

    Parameters
    ----------
    length : int
        Number of samples to draw; must be at least 1.
    corr : float
        Target correlation, in [-1, 1].
    rng : optional
        Object exposing ``standard_normal(size)`` (e.g. the result of
        ``np.random.default_rng(seed)``).  When omitted, the global
        ``np.random`` stream is used, so ``np.random.seed`` still controls
        the draw.

    Returns
    -------
    (degree, score) : tuple of np.ndarray
        Two float arrays of shape ``(length,)``, each with minimum 0.

    Raises
    ------
    ValueError
        If ``corr`` is outside [-1, 1] (including NaN) or ``length < 1``.

    Side effects
    ------------
    Prints the sample correlation coefficient of the draw.
    """
    # Outside [-1, 1] the square root below is NaN and would silently poison
    # every score value; fail loudly instead.
    if not -1 <= corr <= 1:
        raise ValueError(f"corr must lie in [-1, 1], got {corr!r}")
    if length < 1:
        raise ValueError(f"length must be at least 1, got {length!r}")

    draw = np.random.randn if rng is None else rng.standard_normal

    # First variable, then independent noise (drawn in this order so a seeded
    # global stream yields the same numbers as before).
    x = draw(length)
    noise = draw(length)
    # Second variable, correlated with the first at level `corr`.
    y = corr * x + np.sqrt(1 - corr**2) * noise

    # Shift each variable so its minimum is exactly zero, then rescale.
    # `v - v.min()` anchors the minimum at 0 whatever its sign, whereas
    # `v + abs(v.min())` only does so when the minimum is negative.
    degree = (x - x.min()) * 100
    score = (y - y.min()) * 20
    print(f"Correlation coefficient: {np.corrcoef(x, y)[0, 1]}")

    return degree, score

In [62]:
# Rejection-sample one dataset per target correlation and write it to CSV.
# A draw is accepted once its sample correlation is within 0.03 of the target.
# Note: create_correlated_degree_data prints the correlation of every attempt,
# so rejected draws show up in the cell output too.

# Seed the global NumPy stream so Restart & Run All regenerates the same files.
np.random.seed(0)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs('data', exist_ok=True)

# Upper bound on rejected draws per target, so the cell cannot spin forever.
MAX_ATTEMPTS = 10_000

for c in [0, 0.2, 0.4, 0.6, 0.8, 1]:
    for _ in range(MAX_ATTEMPTS):
        x, y = create_correlated_degree_data(corr=c)
        if np.abs(np.corrcoef(x, y)[0, 1] - c) < 0.03:
            break
    else:
        # Loop ran out without a single accepted draw.
        raise RuntimeError(
            f"no sample within 0.03 of corr={c} after {MAX_ATTEMPTS} attempts"
        )
    # write to file
    df = pd.DataFrame({'degree': x, 'score': y})
    df.to_csv(f'data/correlated_degree_data_{c}.csv', index=True)


Correlation coefficient: 0.1097163588897943
Correlation coefficient: 0.10971635888979431
Correlation coefficient: -0.07738078152388535
Correlation coefficient: -0.07738078152388543
Correlation coefficient: 0.09917822516243992
Correlation coefficient: 0.09917822516243995
Correlation coefficient: -0.23488442699830478
Correlation coefficient: -0.23488442699830475
Correlation coefficient: 0.042054788372495246
Correlation coefficient: 0.04205478837249532
Correlation coefficient: 0.06813382329442866
Correlation coefficient: 0.06813382329442871
Correlation coefficient: -0.025634124968620865
Correlation coefficient: -0.02563412496862089
Correlation coefficient: 0.09120179095290025
Correlation coefficient: 0.09120179095290022
Correlation coefficient: 0.034389108186850995
Correlation coefficient: 0.03438910818685097
Correlation coefficient: 0.18828871730835756
Correlation coefficient: 0.18828871730835758
Correlation coefficient: 0.5878477744789459
Correlation coefficient: 0.5878477744789461
Corr

### Real degree distribution

In [443]:
def create_correlated_data_from_degrees(degrees, corr=0.8, rng=None):
    """Generate a score per node that is correlated with the given degrees.

    The degree values are centred on their mean and divided by their range
    (``trans_x``).  The score is built as
    ``y = corr * trans_x + trans_x * sqrt(1 - corr**2) * noise`` with
    standard-normal ``noise``, then mapped back with ``y * range + mean``.

    NOTE(review): the noise term is multiplied element-wise by ``trans_x``,
    so the noise amplitude grows with a degree's distance from the mean and
    a node whose degree equals the mean always gets ``score == mean``.  This
    is kept exactly as written; confirm that it is intended.

    Parameters
    ----------
    degrees : dict
        Mapping whose values are the degrees (only ``degrees.values()`` is
        read; insertion order defines the output order).
    corr : float
        Target correlation, in [-1, 1].
    rng : optional
        Object exposing ``standard_normal(size)`` (e.g. the result of
        ``np.random.default_rng(seed)``).  When omitted, the global
        ``np.random`` stream is used, so ``np.random.seed`` still controls
        the draw.

    Returns
    -------
    (degree, score) : tuple of np.ndarray
        ``degree`` is the array of input degree values; ``score`` is the
        generated float array of the same length.

    Raises
    ------
    ValueError
        If ``corr`` is outside [-1, 1] (including NaN), ``degrees`` is empty,
        or all degrees are equal (the range normalisation would divide by
        zero).

    Side effects
    ------------
    Prints the sample correlation between the degrees and the
    (pre-rescaling) score.
    """
    # Outside [-1, 1] the square root below is NaN and every score would be NaN.
    if not -1 <= corr <= 1:
        raise ValueError(f"corr must lie in [-1, 1], got {corr!r}")

    # First variable: the observed degrees.
    x = np.array(list(degrees.values()))
    if x.size == 0:
        raise ValueError("degrees must not be empty")

    centre = np.mean(x)
    spread = x.max() - x.min()
    if spread == 0:
        # A constant degree sequence has no variance to correlate with.
        raise ValueError("degrees must contain at least two distinct values")
    trans_x = (x - centre) / spread

    draw = np.random.randn if rng is None else rng.standard_normal
    noise = draw(len(trans_x))

    # Second variable, built from the normalised degrees (see NOTE above).
    y = corr * trans_x + trans_x * np.sqrt(1 - corr**2) * noise

    # Undo the normalisation so the score lives on the degree scale.
    degree = x
    score = y * spread + centre
    print(f"Correlation coefficient: {np.corrcoef(x, y)[0, 1]}")
    return degree, score

# Synthetic degree sequence: 1000 draws from np.random.pareto with shape
# parameter 10, rescaled so the largest draw maps to ~1000 and truncated to
# Python ints.  Keys are the positional indices 0..999.
k = np.random.pareto(10, size=1000)
degrees = dict(enumerate(map(int, k * 1000 / k.max())))


Correlation coefficient: -0.06257518603809818


In [444]:
# Rejection-sample one score vector per target correlation for the
# pareto-based `degrees` and write each accepted (degree, score) table to CSV.
# A draw is accepted once its sample correlation is within 0.03 of the target.
# Note: create_correlated_data_from_degrees prints the correlation of every
# attempt, so rejected draws show up in the cell output too.

# DataFrame.to_csv does not create missing parent directories.
os.makedirs('data', exist_ok=True)

# Upper bound on rejected draws per target, so the cell cannot spin forever.
MAX_ATTEMPTS = 10_000

for c in [0, 0.2, 0.4, 0.6, 0.8, 1]:
    for _ in range(MAX_ATTEMPTS):
        x, y = create_correlated_data_from_degrees(degrees, corr=c)
        if np.abs(np.corrcoef(x, y)[0, 1] - c) < 0.03:
            break
    else:
        # Loop ran out without a single accepted draw.
        raise RuntimeError(
            f"no sample within 0.03 of corr={c} after {MAX_ATTEMPTS} attempts"
        )
    # write to file
    df = pd.DataFrame({'degree': x, 'score': y})
    df.to_csv(f'data/correlated_degree_data_pareto_{c}.csv', index=True)

Correlation coefficient: -0.1467016143099268
Correlation coefficient: -0.0548013962457262
Correlation coefficient: -0.023923695560674068
Correlation coefficient: 0.3227319911362293
Correlation coefficient: 0.2653211101974372
Correlation coefficient: 0.09922878665671132
Correlation coefficient: -0.03485485970541154
Correlation coefficient: 0.31009806791787625
Correlation coefficient: 0.167166968889108
Correlation coefficient: 0.3317977410699582
Correlation coefficient: 0.34478828773325065
Correlation coefficient: 0.1916761655308186
Correlation coefficient: 0.3015243197147956
Correlation coefficient: 0.40848715299754007
Correlation coefficient: 0.63799210081421
Correlation coefficient: 0.6699249153682392
Correlation coefficient: 0.5941433517442892
Correlation coefficient: 0.8391964380553875
Correlation coefficient: 0.8328377610462936
Correlation coefficient: 0.6885079569246193
Correlation coefficient: 0.7765060230964684
Correlation coefficient: 0.9999999999999993
