In [1]:
from dagsim.base import Graph, Generic
import numpy as np
from sklearn.linear_model import LinearRegression as LinReg
import pandas as pd


In [2]:
# define the function of the ground truth
def ground_truth(x, add_param):
    y = 2 * x + 1 + np.random.normal(0, add_param)
    return y


In [3]:
# define a node for the input feature, and another node for the outcome of a linear regression model
Nodex = Generic(name="x", function=np.random.normal)
Nodey = Generic(name="y", function=ground_truth, arguments={"add_param":1, "x": Nodex})

# define a list of all nodes, then instantiate the graph
listNodes = [Nodex, Nodey]
my_graph = Graph("Linear Regression", listNodes)


In [4]:
my_graph.draw()


In [5]:
# simulate data for training and testing, with different sample sizes and filenames
train = my_graph.simulate(num_samples=70, csv_name="train")
test = my_graph.simulate(num_samples=30, csv_name="test")


Simulation started
Simulation finished in 0.0049 seconds
Simulation started
Simulation finished in 0.0015 seconds


In [6]:
# import the saved training data
train_data = pd.read_csv("../train.csv")
print(train_data.head())

x_train = train_data.iloc[:, 0].to_numpy().reshape([-1, 1])
print("x_train", x_train.shape)
y_train = train_data.iloc[:, 1].to_numpy().reshape([-1, 1])
print("y_train", y_train.shape)


          x         y
0 -1.153562 -1.307123
1 -3.622153 -6.244307
2  0.929049  2.858098
3  0.274025  1.548049
x_train (4, 1)
y_train (4, 1)


In [7]:
# define a linear regression model
LR = LinReg()
# fit the model on the training data
reg = LR.fit(x_train, y_train)
reg.score(x_train, y_train)
print("Coefficient: ", LR.coef_)
print("Intercept: ", LR.intercept_)


Coefficient:  [[2.]]
Intercept:  [1.]


In [8]:
# import the saved testing data
test_data = pd.read_csv("../test.csv")
x_test = test_data.iloc[:, 0].to_numpy().reshape([-1, 1])
print("x_test", x_test.shape)
y_test = test_data.iloc[:, 1].to_numpy().reshape([-1, 1])
print("y_test", y_test.shape)

# get the R2 score of the model on the testing data
print("R2 score on test data: ", LR.score(x_test, y_test))

x_test (2, 1)
y_test (2, 1)
R2 score on test data:  -415.7297833600145
