In [1]:
import numpy as np
import DTLearner as dt

if __name__ == "__main__":
    # Load the CSV (header row skipped), then drop the leading date-time column.
    data = np.genfromtxt('Data/Istanbul.csv', delimiter=',', skip_header=1)
    data = data[:, 1:]

    # The last column is the target; every column before it is a feature.
    X, Y = data[:, :-1], data[:, -1]

    # Split in file order: the first 60% of rows train, the remaining 40% test.
    train_size = int(0.6 * data.shape[0])
    train_X, test_X = X[:train_size], X[train_size:]
    train_Y, test_Y = Y[:train_size], Y[train_size:]

    # Fit a DTLearner (leaf_size=1) on the training rows.
    learner = dt.DTLearner(leaf_size=1, verbose=False)
    learner.add_evidence(train_X, train_Y)

    # Predict the held-out rows and score them.
    Y_pred = learner.query(test_X)
    rmse = np.sqrt(np.mean((test_Y - Y_pred) ** 2))
    corr = np.corrcoef(test_Y, Y_pred)[0, 1]

    print(
        "Out of sample results:",
        f"RMSE: {rmse}",
        f"Correlation: {corr}",
        sep="\n",
    )



Out of sample results:
RMSE: 0.007288142047537807
Correlation: 0.6972253041904284


  c /= stddev[:, None]
  c /= stddev[None, :]


In [None]:
import matplotlib.pyplot as plt

# Sweep DTLearner leaf sizes and compare in-sample vs out-of-sample RMSE.
#
# This cell depends on names created by the data-loading cell above:
#   np, dt (the DTLearner module), train_X, train_Y, test_X, test_Y.
# Run that cell first.

# Leaf sizes to evaluate (1 through 49 inclusive).
leaf_sizes = range(1, 50)

# RMSE for each leaf size, in the same order as leaf_sizes.
in_sample_rmses = []
out_sample_rmses = []

# The context manager closes results.txt even if training or querying raises.
with open("results.txt", "w") as results_file:
    for leaf_size in leaf_sizes:
        # Train a fresh learner for this leaf size.
        learner = dt.DTLearner(leaf_size=leaf_size, verbose=False)
        learner.add_evidence(train_X, train_Y)

        # In-sample error: predict the rows the learner was trained on.
        pred_y = learner.query(train_X)
        in_sample_rmse = float(np.sqrt(np.mean((train_Y - pred_y) ** 2)))
        in_sample_rmses.append(in_sample_rmse)

        # Out-of-sample error: predict the held-out rows.
        pred_y = learner.query(test_X)
        out_sample_rmse = float(np.sqrt(np.mean((test_Y - pred_y) ** 2)))
        out_sample_rmses.append(out_sample_rmse)

        # One record per leaf size, separated by a blank line for readability.
        results_file.write(f"Leaf size: {leaf_size}\n")
        results_file.write(f"In-sample RMSE: {in_sample_rmse}\n")
        results_file.write(f"Out-of-sample RMSE: {out_sample_rmse}\n")
        results_file.write("\n")

# Plot both error curves against leaf size.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(leaf_sizes, in_sample_rmses, label="In-sample RMSE")
ax.plot(leaf_sizes, out_sample_rmses, label="Out-of-sample RMSE")
ax.set_xlabel("Leaf Size")
ax.set_ylabel("RMSE")
ax.set_title("RMSE vs Leaf Size in DTLearner")
ax.legend()
ax.grid(True)

# Save the plot to a file.
fig.savefig("rmse_vs_leaf_size.png")


In [2]:
import numpy as np
import DTLearner as dt
import RTLearner as rt

if __name__ == "__main__":
    # Load the CSV (header row skipped), then drop the leading date-time column.
    data = np.genfromtxt('Data/Istanbul.csv', delimiter=',', skip_header=1)
    data = data[:, 1:]

    # The last column is the target; every column before it is a feature.
    X, Y = data[:, :-1], data[:, -1]

    # Split in file order: the first 60% of rows train, the remaining 40% test.
    train_size = int(0.6 * data.shape[0])
    train_X, test_X = X[:train_size], X[train_size:]
    train_Y, test_Y = Y[:train_size], Y[train_size:]

    # Build both learners with leaf_size=1, then train DT first and RT second.
    dt_learner = dt.DTLearner(leaf_size=1, verbose=False)
    rt_learner = rt.RTLearner(leaf_size=1, verbose=False)
    dt_learner.add_evidence(train_X, train_Y)
    rt_learner.add_evidence(train_X, train_Y)

    # Predict the held-out rows with each learner.
    Y_pred_dt = dt_learner.query(test_X)
    Y_pred_rt = rt_learner.query(test_X)

    # Score each learner: RMSE and Pearson correlation against the true targets.
    rmse_dt = np.sqrt(np.mean((test_Y - Y_pred_dt) ** 2))
    corr_dt = np.corrcoef(test_Y, Y_pred_dt)[0, 1]
    rmse_rt = np.sqrt(np.mean((test_Y - Y_pred_rt) ** 2))
    corr_rt = np.corrcoef(test_Y, Y_pred_rt)[0, 1]

    print(
        "DTLearner out of sample results:",
        f"RMSE: {rmse_dt}",
        f"Correlation: {corr_dt}",
        sep="\n",
    )
    print(
        "RTLearner out of sample results:",
        f"RMSE: {rmse_rt}",
        f"Correlation: {corr_rt}",
        sep="\n",
    )


DTLearner out of sample results:
RMSE: 0.007288142047537807
Correlation: 0.6972253041904284
RTLearner out of sample results:
RMSE: 0.007475376036754763
Correlation: 0.660586808465725


In [3]:
import numpy as np
import DTLearner as dt
import RTLearner as rt
import LinRegLearner as lrl
import BagLearner as bl

if __name__ == "__main__":
    # Load the CSV (header row skipped), then drop the leading date-time column.
    data = np.genfromtxt('Data/Istanbul.csv', delimiter=',', skip_header=1)
    data = data[:, 1:]

    # The last column is the target; every column before it is a feature.
    X, Y = data[:, :-1], data[:, -1]

    # Split in file order: the first 60% of rows train, the remaining 40% test.
    train_size = int(0.6 * data.shape[0])
    train_X, test_X = X[:train_size], X[train_size:]
    train_Y, test_Y = Y[:train_size], Y[train_size:]

    # Base learner classes to wrap in a BagLearner, keyed by display name.
    learners = {
        "DTLearner": dt.DTLearner,
        "RTLearner": rt.RTLearner,
        "LinRegLearner": lrl.LinRegLearner,
    }

    for name, learner in learners.items():
        # Wrap the base learner class in a 10-bag BagLearner (no extra kwargs).
        bag_learner = bl.BagLearner(
            learner=learner,
            kwargs={},
            bags=10,
            boost=False,
            verbose=False,
        )

        # Fit on the training rows, then predict the held-out rows.
        bag_learner.add_evidence(train_X, train_Y)
        Y_pred = bag_learner.query(test_X)

        # Score the ensemble's predictions.
        rmse = np.sqrt(np.mean((test_Y - Y_pred) ** 2))
        corr = np.corrcoef(test_Y, Y_pred)[0, 1]

        print(
            f"{name} BagLearner out of sample results:",
            f"RMSE: {rmse}",
            f"Correlation: {corr}",
            sep="\n",
        )
        print("----------------------------")


DTLearner BagLearner out of sample results:
RMSE: 0.00476754614526203
Correlation: 0.8253738083252917
----------------------------
RTLearner BagLearner out of sample results:
RMSE: 0.005095201394077022
Correlation: 0.8136482880352421
----------------------------
LinRegLearner BagLearner out of sample results:
RMSE: 0.0040602006992904085
Correlation: 0.8890426790818324
----------------------------


In [4]:
# Re-expose the train/test splits under the Xtrain/Ytrain/Xtest/Ytest names.
Xtrain, Ytrain = train_X, train_Y
Xtest, Ytest = test_X, test_Y

In [7]:
import InsaneLearner as it

# Build the InsaneLearner with verbose output turned off.
learner = it.InsaneLearner(verbose=False)

# Fit on the training split, then predict the held-out split.
learner.add_evidence(Xtrain, Ytrain)
Y_pred_insane = learner.query(Xtest)


In [8]:
import LinRegLearner as lrl
import BagLearner as bl

# Outer BagLearner with 20 bags whose base learner is itself a BagLearner,
# configured through kwargs with LinRegLearner and bags=20. This is intended
# to mirror the InsaneLearner's configuration.
learner = bl.BagLearner(
    learner=bl.BagLearner,
    kwargs={"learner": lrl.LinRegLearner, "kwargs": {}, "bags": 20},
    bags=20,
    boost=False,
    verbose=False,
)

# Fit on the training split, then predict the held-out split.
learner.add_evidence(Xtrain, Ytrain)
Y_pred_bag = learner.query(Xtest)


In [9]:
# Out-of-sample RMSE for each set of predictions against the true targets.
rmse_insane = np.sqrt(np.mean((Ytest - Y_pred_insane) ** 2))
rmse_bag = np.sqrt(np.mean((Ytest - Y_pred_bag) ** 2))

# Report the InsaneLearner first, then the BagLearner.
print(f"InsaneLearner RMSE: {rmse_insane}")
print(f"BagLearner RMSE: {rmse_bag}")

InsaneLearner RMSE: 0.004004174459194699
BagLearner RMSE: 0.0039952750656196785


In [10]:
from sklearn.metrics import r2_score

# Coefficient of determination for each set of predictions.
r2_insane = r2_score(y_true=Ytest, y_pred=Y_pred_insane)
r2_bag = r2_score(y_true=Ytest, y_pred=Y_pred_bag)

# Report the InsaneLearner first, then the BagLearner.
print(f"InsaneLearner R^2: {r2_insane}")
print(f"BagLearner R^2: {r2_bag}")


InsaneLearner R^2: 0.759316360675068
BagLearner R^2: 0.7603850244883746


In [11]:
# Pearson correlation between the true targets and each set of predictions:
# corr_coef1 is for the InsaneLearner, corr_coef2 for the nested BagLearner.
corr_coef1 = np.corrcoef(Ytest, Y_pred_insane)[0, 1]
corr_coef2 = np.corrcoef(Ytest, Y_pred_bag)[0, 1]

# Printed in the same order: InsaneLearner first, BagLearner second.
print(f"Correlation Coefficient: {corr_coef1}")
print(f"Correlation Coefficient: {corr_coef2}")


Correlation Coefficient: 0.8893749200090458
Correlation Coefficient: 0.8895580237835897
