In [1]:
import dagsim.base as ds
import numpy as np
from sklearn.linear_model import LinearRegression as LinReg
import pandas as pd
from sklearn.metrics import r2_score


In [2]:
# define the function of the ground truth
def ground_truth(x, std_dev):
    """Return the quadratic ground truth ``x**2 + 1`` plus Gaussian noise.

    Args:
        x: Input value the outcome is computed from.
        std_dev: Standard deviation of the zero-mean normal noise term;
            ``0`` makes the outcome deterministic.

    Returns:
        ``x**2 + 1`` with one noise sample added. The noise is drawn once per
        call from NumPy's global random state (no ``size`` argument is passed).
    """
    signal = x**2 + 1
    noise = np.random.normal(0, std_dev)
    return signal + noise


In [4]:
# define a node for the input feature x, and a second node y whose value is
# computed from x by ground_truth()
Nodex = ds.Node(name="x", function=np.random.normal)
Nodey = ds.Node(
    name="y",
    function=ground_truth,
    kwargs=dict(x=Nodex, std_dev=0),  # std_dev=0 -> no noise is added to y
)

# collect all nodes in a list, then build the graph from that list
listNodes = [Nodex, Nodey]
my_graph = ds.Graph(listNodes, "Linear Regression")


In [5]:
# draw the graph to visually check the x -> y dependency before simulating
my_graph.draw()

In [6]:
# simulate data for training and testing, with different sample sizes and filenames
# Seed NumPy's global RNG first: both node functions draw from np.random, so
# without a fixed seed every re-run writes different CSV files and yields a
# different fitted model and R2 score.
# NOTE(review): assumes dagsim does not reseed NumPy internally - confirm.
SEED = 42
np.random.seed(SEED)
train = my_graph.simulate(num_samples=70, csv_name="train")
test = my_graph.simulate(num_samples=30, csv_name="test")


2022-02-11 11:08:30.978147: Simulation started.
2022-02-11 11:08:30.979422: Simulating node "x".
2022-02-11 11:08:30.980526: Simulating node "y".
2022-02-11 11:08:30.984836: Simulation finished in 0.0067 seconds.
2022-02-11 11:08:30.985227: Simulation started.
2022-02-11 11:08:30.985523: Simulating node "x".
2022-02-11 11:08:30.986094: Simulating node "y".
2022-02-11 11:08:30.988415: Simulation finished in 0.0032 seconds.


In [7]:
# load the training set from the CSV file in the working directory
train_data = pd.read_csv("./train.csv")
print(train_data.head())

# take the first column as the feature and the second as the outcome;
# indexing with [:, None] adds a trailing axis, giving column vectors (n, 1)
x_train = train_data.iloc[:, 0].to_numpy()[:, None]
print("x_train", x_train.shape)
y_train = train_data.iloc[:, 1].to_numpy()[:, None]
print("y_train", y_train.shape)


          x         y
0 -0.219523  1.048190
1 -0.516195  1.266458
2 -1.954076  4.818412
3  0.148150  1.021948
4  0.951419  1.905198
x_train (70, 1)
y_train (70, 1)


In [8]:
# create an (unfitted) linear regression estimator
LR = LinReg()
# fit() trains the estimator in place and returns that same object, so `reg`
# is simply a second name for the fitted `LR`
LR.fit(x_train, y_train)
reg = LR
print("Coefficient: ", LR.coef_)
print("Intercept: ", LR.intercept_)


Coefficient:  [[0.22041046]]
Intercept:  [2.17501631]


In [9]:
# load the held-out test set from the CSV file in the working directory
test_data = pd.read_csv("./test.csv")
# first column is the feature, second the outcome; [:, None] adds a trailing
# axis so both become column vectors of shape (n, 1)
x_test = test_data.iloc[:, 0].to_numpy()[:, None]
print("x_test", x_test.shape)
y_test = test_data.iloc[:, 1].to_numpy()[:, None]
print("y_test", y_test.shape)

# evaluate the fitted model on the test data (score() reports R2)
test_r2 = reg.score(x_test, y_test)
print("R2 score on test data: ", test_r2)

x_test (30, 1)
y_test (30, 1)
R2 score on test data:  -0.3043924205574846
