In [1]:
import dagsim.base as ds
import numpy as np
from sklearn.linear_model import LinearRegression as LinReg
import pandas as pd
from sklearn.metrics import r2_score


In [2]:
# define the function of the ground truth
def ground_truth(x, std_dev):
    """Evaluate the quadratic ground truth ``x**2 + 1`` with additive Gaussian noise.

    Args:
        x: Input value that is squared.
        std_dev: Standard deviation of the zero-mean normal noise term;
            with 0 the result is exactly ``x**2 + 1``.

    Returns:
        ``x**2 + 1`` plus one draw from ``np.random.normal(0, std_dev)``.
    """
    signal = x**2 + 1
    # One draw from NumPy's global random state per call.
    noise = np.random.normal(0, std_dev)
    return signal + noise


In [3]:
# Root node "x": the input feature. No kwargs are passed, so np.random.normal is
# presumably called with its defaults (a standard normal draw) -- confirm against dagsim.
Nodex = ds.Node(name="x", function=np.random.normal)

# Child node "y": the outcome, computed from x by the ground_truth function
# (not by a regression model). std_dev=0 makes the noise term zero.
Nodey = ds.Node(
    name="y",
    function=ground_truth,
    kwargs={"x": Nodex, "std_dev": 0},
)

# Gather all nodes in one list, then instantiate the graph from it.
listNodes = [Nodex, Nodey]
my_graph = ds.Graph("Linear Regression", listNodes)


In [4]:
# draw the graph so its structure can be inspected before any data is simulated
my_graph.draw()


In [5]:
# Seed NumPy's legacy global generator so both simulated datasets (and every
# number derived from them below) are identical on each Restart & Run All.
# The global seed is the relevant one because node "x" samples through
# np.random.normal, which draws from that global state.
np.random.seed(0)

# simulate data for training and testing, with different sample sizes and filenames;
# the two calls run back to back without re-seeding, so the sets are distinct draws
train = my_graph.simulate(num_samples=70, csv_name="train")
test = my_graph.simulate(num_samples=30, csv_name="test")


2022-02-08 19:04:34.104465: Simulation started.
2022-02-08 19:04:34.105992: Simulating node "x".
2022-02-08 19:04:34.107380: Simulating node "y".
2022-02-08 19:04:34.114438: Simulation finished in 0.0100 seconds.
2022-02-08 19:04:34.115037: Simulation started.
2022-02-08 19:04:34.115539: Simulating node "x".
2022-02-08 19:04:34.116490: Simulating node "y".
2022-02-08 19:04:34.119573: Simulation finished in 0.0045 seconds.


In [6]:
# Load the training set that the simulation step saved as a CSV file.
train_data = pd.read_csv("./train.csv")
print(train_data.head())

# Split by column position: column 0 is the feature, column 1 the outcome.
# Selecting with a one-element list keeps each result 2-D, i.e. (n_samples, 1).
x_train = train_data.iloc[:, [0]].to_numpy()
print("x_train", x_train.shape)
y_train = train_data.iloc[:, [1]].to_numpy()
print("y_train", y_train.shape)


          x         y
0 -0.366428  1.134270
1 -1.200749  2.441797
2 -0.499085  1.249086
3 -0.457521  1.209325
4 -0.626889  1.392990
x_train (70, 1)
y_train (70, 1)


In [7]:
# Create a linear regression model and fit it on the training data.
# fit() returns the fitted estimator, so LR and reg refer to the same model.
LR = LinReg()
reg = LR.fit(x_train, y_train)

# Report the learned slope and intercept.
print("Coefficient: ", reg.coef_)
print("Intercept: ", reg.intercept_)


Coefficient:  [[0.60225594]]
Intercept:  [1.86351053]


In [8]:
# Load the held-out test set that the simulation step saved as a CSV file.
test_data = pd.read_csv("./test.csv")

# Same positional split as for the training data; the one-element list
# selection keeps each array 2-D with shape (n_samples, 1).
x_test = test_data.iloc[:, [0]].to_numpy()
print("x_test", x_test.shape)
y_test = test_data.iloc[:, [1]].to_numpy()
print("y_test", y_test.shape)

# Evaluate the fitted model on the test data; score() reports R2 for regressors.
test_r2 = reg.score(x_test, y_test)
print("R2 score on test data: ", test_r2)

x_test (30, 1)
y_test (30, 1)
R2 score on test data:  0.19930024102485122
