In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Load the provided dataset file
plantdata = pd.read_csv("/home/jovyan/work/CCPP_data.csv")

# Split Dataset

We'll split our dataset into training and test dataset.


In [55]:
# Create dataframes that represent subsets of the data using train_test_split() from scikit-learn.
# We'll use 20% of data for test and 80% for training
# x_train and x_test contain the features for training and testing, respectively
# y_train and y_test contain the targets for training and testing, respectively

x_train, x_test, y_train, y_test = train_test_split(plantdata.iloc[:, :-1], plantdata.iloc[:, -1], test_size=0.2, random_state=42)

We'll further create validation subsets using k-folds validation with 5 folds. We use 5 folds as 10 folds might lead to our validation sets being too small.

In [56]:
# Create validation datasets using KFold() from scikit-learn

kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
kf_split = kf.split(x_train)


In [57]:
# Set the algorithm we want to use

#model = LinearRegression()
#model = Ridge(alpha = 1)
#model = Lasso(alpha = 0.1)
#model = DecisionTreeRegressor(max_depth = 5)
model = KNeighborsRegressor(n_neighbors = 5)

In [58]:
# Variables to store scores for each algorithm
scores = []

# Loop over the splits and train/validate the model
for train_index, val_index in kf_split:
    # Split the data into training and validation sets
    x_train_kf, x_val_kf = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Train the model on the training set
    model.fit(x_train_kf, y_train_kf)
    
    # Use the model to make predictions on the validation set
    y_pred_kf = model.predict(x_val_kf)
    
    # Calculate the MSE of the model on the validation set
    mse_kf = mean_squared_error(y_val_kf, y_pred_kf)
    
    scores.append(mse_kf)
    
mean_score = np.mean(scores)
std_score = np.std(scores)

print("Cross-validation scores:", scores)
print("Mean score:", mean_score)
print("Standard deviation:", std_score)


Cross-validation scores: [17.819981047681257, 17.150133123448718, 16.194120394513387, 16.710883639451332, 15.15479213071895]
Mean score: 16.60598206716273
Standard deviation: 0.9007197683696093


# Linear Regression
* Cross-validation scores: [22.33135825513286, 20.860235027955216, 20.40537789025917, 20.699403779044097, 20.256578600396626]
* Mean score: 20.910590710557592
* Standard deviation: 0.7414428992932569

# Ridge
* Cross-validation scores: [22.331341945676346, 20.860222907851586, 20.405359706193746, 20.699425056468446, 20.256602262761056]
* Mean score: 20.910590375790235
* Standard deviation: 0.7414339051768616

# Lasso
* Cross-validation scores: [22.32239067435148, 20.86379882951596, 20.40372051725932, 20.70803349443667, 20.257989751568772]
* Mean score: 20.911186653426437
* Standard deviation: 0.7374553101700931

# Decision Tree
* Cross-validation scores: [21.260240277154487, 19.95409736645763, 19.485747548126994, 20.784516374365207, 19.025468003972463]
* Mean score: 20.102013914015355
* Standard deviation: 0.8207207261731391

# K Nearest Neighbors
* Cross-validation scores: [17.819981047681257, 17.150133123448718, 16.194120394513387, 16.710883639451332, 15.15479213071895]
* Mean score: 16.60598206716273
* Standard deviation: 0.9007197683696093

# Evaluate Selected Model
Based on the validation data the K Nearest Neighbors algorithm had the best performance (lowest MSE)

Now we evaluate that model against the test data.

In [59]:
# Use the model to make predictions on the TEST set
y_pred_test = model.predict(x_test)

# Calculate the MSE of the model on the validation set
mse_test = mean_squared_error(y_test, y_pred_test)
r_squared = r2_score(y_test, y_pred_test)

print("MSE:", mse_test)
print("r-squared:", r_squared)

MSE: 15.878969475444094
r-squared: 0.9452558778755287
