In [31]:
# This serves as a template which will guide you through the implementation of this task.  It is advised
# to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps

In [63]:

# First, we import necessary libraries:
import numpy as np
import pandas as pd

from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


In [91]:

def transform_data(X):
    """
    This function transforms the 5 input features of matrix X (x_i denoting the i-th component of X)
    into 21 new features phi(X) in the following manner:
    5 linear features: phi_1(X) = x_1, phi_2(X) = x_2, phi_3(X) = x_3, phi_4(X) = x_4, phi_5(X) = x_5
    5 quadratic features: phi_6(X) = x_1^2, phi_7(X) = x_2^2, phi_8(X) = x_3^2, phi_9(X) = x_4^2, phi_10(X) = x_5^2
    5 exponential features: phi_11(X) = exp(x_1), phi_12(X) = exp(x_2), phi_13(X) = exp(x_3), phi_14(X) = exp(x_4), phi_15(X) = exp(x_5)
    5 cosine features: phi_16(X) = cos(x_1), phi_17(X) = cos(x_2), phi_18(X) = cos(x_3), phi_19(X) = cos(x_4), phi_20(X) = cos(x_5)
    1 constant features: phi_21(X)=1

    Parameters
    ----------
    X: matrix of floats, dim = (700,5), inputs with 5 features

    Returns
    ----------
    X_transformed: array of floats: dim = (700,21), transformed input with 21 features
    """
    #resize trasform depending on training data size (Kfolded)
    length = np.size(X,axis=0)
    X_transformed = np.zeros((length, 21))

    X_square = np.square(X)
    X_exp = np.exp(X)
    X_cos = np.cos(X)
    const = np.ones((length,1))

    X_transformed = np.concatenate((X, X_square, X_exp, X_cos, const), 1)
    assert X_transformed.shape == (length, 21)
    return X_transformed


In [76]:

def fit(X, y):
    """
    This function receives training data points, transform them, and then fits the linear regression on this
    transformed data. Finally, it outputs the weights of the fitted linear regression.

    Parameters
    ----------
    X: matrix of floats, dim = (700,5), inputs with 5 features
    y: array of floats, dim = (700,), input labels)

    Returns
    ----------
    w: array of floats: dim = (21,), optimal parameters of linear regression
    """
    w = np.zeros((21,))
    X_transformed = transform_data(X)

    regr = ElasticNetCV(cv=100, random_state=0, fit_intercept=False, max_iter = 10000)
    regr.fit(X_transformed, y)
    w = regr.coef_

    assert w.shape == (21,)
    return w

In [61]:
def calculate_RMSE(w, X, y):
    """This function takes test data points (X and y), and computes the empirical RMSE of
    predicting y from X using a linear model with weights w.

    Parameters
    ----------
    w: array of floats: dim = (13,), optimal parameters of ridge regression
    X: matrix of floats, dim = (15,13), inputs with 13 features
    y: array of floats, dim = (15,), input labels

    Returns
    ----------
    RMSE: float: dim = 1, RMSE value
    """
    td_y = y
    td_y_pred = X w

     #unsure if the **0.5 is needed or not
    RMSE = mean_squared_error(td_y, td_y_pred) **0.5
    assert np.isscalar(RMSE)
    return RMSE

In [35]:

# Main function. You don't have to change this
if __name__ == "__main__":
    # Data loading
    data = pd.read_csv("train.csv")
    y = data["y"].to_numpy()
    data = data.drop(columns=["Id", "y"])
    # print a few data samples
    print(data.head())

    X = data.to_numpy()
    # The function retrieving optimal LR parameters
    w = fit(X, y)
    # Save results in the required format
    np.savetxt("./results.csv", w, fmt="%.12f")


     x1    x2    x3    x4    x5
0  0.02  0.05 -0.09 -0.43 -0.08
1 -0.13  0.11 -0.08 -0.29 -0.03
2  0.08  0.06 -0.07 -0.41 -0.03
3  0.02 -0.12  0.01 -0.43 -0.02
4 -0.14 -0.12 -0.08 -0.02 -0.08


NameError: name 'ElasticNetCV' is not defined

In [73]:
    #prototyping starts here:

    # run setup code
    data = pd.read_csv("train.csv")
    y = data["y"].to_numpy()
    data = data.drop(columns=["Id", "y"])
    # print a few data samples
    print(data.head())

    X = data.to_numpy()


     x1    x2    x3    x4    x5
0  0.02  0.05 -0.09 -0.43 -0.08
1 -0.13  0.11 -0.08 -0.29 -0.03
2  0.08  0.06 -0.07 -0.41 -0.03
3  0.02 -0.12  0.01 -0.43 -0.02
4 -0.14 -0.12 -0.08 -0.02 -0.08


In [90]:
np.size(X,axis=0)


700

In [93]:
n_folds = 10

RMSE_mat = np.zeros(n_folds)
# and fill all entries in the matrix 'RMSE_mat'

Kf = KFold(n_splits=n_folds)


for i, (train_index, test_index) in enumerate(Kf.split(X)):

    #train values in a fold
    X_train = X[train_index]
    y_train = y[train_index]

    #test values in a fold
    X_test = X[test_index]
    y_test = y[test_index]

    #use fit function for w values
    w = fit(X_train,y_train)

    #calculate indiviudal RMSE from function put in matrix
    RMSE_mat[i] = calculate_RMSE(w, X_test, y_test)

RMSE_mat

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 21 is different from 5)

In [82]:
#testing for kfold optimizer

n_folds = 10

RMSE_mat = np.zeros(n_folds)
# and fill all entries in the matrix 'RMSE_mat'

Kf = KFold(n_splits=n_folds)

for i, (train_index, test_index) in enumerate(Kf.split(X)):
        print(train_index.shape)

(630,)
(630,)
(630,)
(630,)
(630,)
(630,)
(630,)
(630,)
(630,)
(630,)


In [47]:
# K-Fold Cross-Validation

def cross_validation(model, _X, _y, _cv=5):
      '''Function to perform 5 Folds Cross-Validation
       Parameters
       ----------
      model: Python Class, default=None
              This is the machine learning algorithm to be used for training.
      _X: array
           This is the matrix of features.
      _y: array
           This is the target variable.
      _cv: int, default=5
          Determines the number of folds for cross-validation.
       Returns
       -------
       The function returns a dictionary containing the metrics 'accuracy', 'precision',
       'recall', 'f1' for both training set and validation set.
      '''
      _scoring = ['accuracy', 'precision', 'recall', 'f1']
      results = cross_validate(estimator=model,
                               X=_X,
                               y=_y,
                               cv=_cv,
                               scoring=_scoring,
                               return_train_score=True)

      return {"Training Accuracy scores": results['train_accuracy'],
              "Mean Training Accuracy": results['train_accuracy'].mean()*100,
              "Training Precision scores": results['train_precision'],
              "Mean Training Precision": results['train_precision'].mean(),
              "Training Recall scores": results['train_recall'],
              "Mean Training Recall": results['train_recall'].mean(),
              "Training F1 scores": results['train_f1'],
              "Mean Training F1 Score": results['train_f1'].mean(),
              "Validation Accuracy scores": results['test_accuracy'],
              "Mean Validation Accuracy": results['test_accuracy'].mean()*100,
              "Validation Precision scores": results['test_precision'],
              "Mean Validation Precision": results['test_precision'].mean(),
              "Validation Recall scores": results['test_recall'],
              "Mean Validation Recall": results['test_recall'].mean(),
              "Validation F1 scores": results['test_f1'],
              "Mean Validation F1 Score": results['test_f1'].mean()
              }

In [50]:
cross_validation(fit, _X, _y, _cv=5)

NameError: name '_X' is not defined

TypeError: Cannot clone object. You should provide an instance of scikit-learn estimator instead of a class.

In [192]:
X_transformed = transform_data(X)
n = 10
start= 50
scores = np.zeros((n,2))
k=0
for i in range(start, start+n):
    regr = ElasticNetCV(cv=i, random_state=0, fit_intercept=False, max_iter = 10000)
    regr.fit(X_transformed, y)

    scores[k,1] = regr.score(X_transformed,y)
    scores[k,0] = i
    k += 1


In [197]:
regro = ElasticNetCV(cv = 56, random_state=0, fit_intercept=False, max_iter = 10000)
score = cross_val_score(regro, X, y, cv=100)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

27.26 accuracy with a standard deviation of 27.32


In [193]:
scores

array([[5.00000000e+01, 1.66910154e-02],
       [5.10000000e+01, 1.56751893e-02],
       [5.20000000e+01, 1.66910154e-02],
       [5.30000000e+01, 1.66910154e-02],
       [5.40000000e+01, 1.66910154e-02],
       [5.50000000e+01, 1.66910154e-02],
       [5.60000000e+01, 1.68780919e-02],
       [5.70000000e+01, 1.68780919e-02],
       [5.80000000e+01, 1.68780919e-02],
       [5.90000000e+01, 1.66910154e-02]])

In [194]:
ind = np.unravel_index(np.argmax(scores[:,1], axis=None), 100)
ind

(6,)

In [187]:
scores[:,1].shape


(100,)

In [184]:

solution_luki =
[ 0.         -0.         -0.          0.          0.         -0.
  0.          0.         -0.          0.         -0.4040242  -1.08890313
 -1.37524395 -0.11770438 -0.32826536 -0.36769822 -0.61963535 -0.63849721
 -0.41928566 -0.60026965 -0.56232935]

SyntaxError: invalid syntax (543409885.py, line 1)