# AI300
### Cross Validation

In [26]:
import numpy as np
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def my_cross_val(model,
                 X,
                 y,
                 num_split=5,
                 shuffle=True,
                 random_state=None,
                 scoring=None):
  if random_state is not None:
    np.random.seed(random_state)

  n = len(X)

  indices = np.arange(n);
  if shuffle:
    np.random.shuffle(indices)

  folds = np.array_split(indices, num_split)
  score_array = np.zeros(num_split)

  for k in range(num_split):
    test_indices = folds[k]
    train_indices = np.concatenate([folds[i] for i in range(num_split) if i != k])

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    if scoring is None or scoring == 'mse':
      score_array[k] = mean_squared_error(y_test, predictions)
    elif scoring == 'accuracy':
      score_array[k] = accuracy_score(y_test, predictions)

  return score_array

In [None]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from tqdm import tqdm

dataset = load_digits()
X, y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

y = y.astype(int)

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

accuracy = knn.score(X_test, y_test)
print(f"Accuracy with 10 neighbors: {accuracy} \n")

k_vals = [3, 7, 11, 15]
scores_arr = np.array([])

print("Optimizing with my model.")
for k in tqdm(k_vals, desc="Optimizing k"):
  knn = KNeighborsClassifier(n_neighbors=k)
  scores = my_cross_val(knn, X, y, num_split=5, shuffle=True, random_state=42, scoring='accuracy')
  score = np.mean(scores)
  scores_arr = np.append(scores_arr, score)

optimal_k = k_vals[np.argmax(scores_arr)]
print(f"Optimal number of neighbors: {optimal_k}")
print(f"Optimal accuracy: {np.max(scores_arr)}\n")

scores_arr_sklearn = np.array([])

print("Optimizing with Sklearn")
for k in tqdm(k_vals, desc="Optimizing k"):
  knn = KNeighborsClassifier(n_neighbors=k)
  scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
  score = np.mean(scores)
  scores_arr_sklearn = np.append(scores_arr_sklearn, score)

optimal_k_sklearn = k_vals[np.argmax(scores_arr_sklearn)]
print(f"\nOptimal number of neighbors (Sklearn): {optimal_k_sklearn}")
print(f"Optimal accuracy (Sklearn): {np.max(scores_arr_sklearn)}")

Accuracy with 10 neighbors: 0.9866666666666667 

Optimizing with my model.


Optimizing k: 100%|██████████| 4/4 [00:00<00:00,  8.07it/s]


Optimal number of neighbors: 7
Optimal accuracy: 0.9933333333333334

Optimizing with Sklearn


Optimizing k: 100%|██████████| 4/4 [00:00<00:00,  6.58it/s]


Optimal number of neighbors (Sklearn): 3
Optimal accuracy (Sklearn): 0.966621788919839





In [None]:
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from tqdm import tqdm

np.random.seed(42)

x = np.random.uniform(-10, 10, 10000)
y = 5 + (2 * np.cos(x)) - (3 * np.sin(x)) + (6 * np.cos(2 * x)) + (4 * np.sin(2 * x)) - (2 * np.cos(3 * x)) + (7 * np.sin(3 * x)) + np.random.normal(0, 5, 10000)
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.7)

k_vals = [1, 3, 5, 7, 9]

def features(x, K):
    features = []

    for k in range(1, K + 1):
        features.append(np.cos(k * x))
        features.append(np.sin(k * x))
    return np.array(features).T

mse_scores = []

print("Lasso K Optimization...")
for K in tqdm(k_vals, desc="Optimizing K"):
    x_feats = features(x, K)

    lasso = Lasso(alpha=0.1, fit_intercept=True, random_state=42, max_iter=10000)

    scores = my_cross_val(lasso, x_feats, y, scoring='mse')
    mean_mse = np.mean(scores)
    mse_scores.append(mean_mse)

optimal_K_idx = np.argmin(mse_scores)
optimal_K = k_vals[optimal_K_idx]
optimal_mse = mse_scores[optimal_K_idx]
X_features = features(X_train, optimal_K)
lasso.fit(X_features, y_train)
theta = lasso.coef_

print(f"\nOptimal K: {optimal_K}")
print(f"Optimal Learnable Parameters: {theta}")
print(f"Optimal MSE on Test Dataset: {optimal_mse:.4f}")

Lasso K Optimization...


Optimizing K: 100%|██████████| 5/5 [00:00<00:00, 29.23it/s]


Optimal K: 7
Optimal Learnable Parameters: [ 1.8412785  -2.86610879  5.89911438  3.65634903 -1.79230434  6.76312888
 -0.          0.         -0.          0.         -0.          0.
  0.         -0.00908047]
Optimal MSE on Test Dataset: 24.5540





In [33]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
from tqdm import tqdm

df = pd.read_csv("hour.csv")
df = df.dropna()

X, y = df.drop('cnt', axis=1), df['cnt']
X = X.drop(columns=['instant', 'dteday', 'casual', 'registered'])

categorical_cols = ['season', 'yr', 'mnth', 'hr', 'weekday', 'weathersit', 'holiday', 'workingday']
numerical_cols = ['temp', 'atemp', 'hum', 'windspeed']
X_categorical = X[categorical_cols]
X_numerical = X[numerical_cols]

one_hot_encoder = OneHotEncoder(sparse_output=False)
X_encoded = one_hot_encoder.fit_transform(X_categorical)

encoded_feature_names = one_hot_encoder.get_feature_names_out(categorical_cols)
X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_feature_names, index=X.index)
X = pd.concat([X_numerical, X_encoded_df], axis=1)

lambda_vals = [0.1, 0.5, 1.0, 5.0, 10.0]
mse_scores = []

print("Lasso Lambda Optimization...")
for l in tqdm(lambda_vals, desc="Optimizing lambda"):
  lasso = Lasso(alpha=l, random_state=42, max_iter=10000)

  scores = my_cross_val(lasso, X, y, scoring='mse')
  mean_mse = np.mean(scores)
  mse_scores.append(mean_mse)

optimal_lambda_idx = np.argmin(mse_scores)
optimal_lambda = lambda_vals[optimal_lambda_idx]
optimal_mse = mse_scores[optimal_lambda_idx]
print(f"Optimal Lambda: {optimal_lambda}")
print(f"Optimal MSE on Test Dataset: {optimal_mse:.4f}")

mse_scores = []
print("\nRidge Lambda Optimization...")
for l in tqdm(lambda_vals, desc="Optimizing lamda"):
  ridge = Ridge(alpha=l, random_state=42, max_iter=10000)

  scores = my_cross_val(ridge, X, y, scoring='mse')
  mean_mse = np.mean(scores)
  mse_scores.append(mean_mse)

optimal_lambda_idx = np.argmin(mse_scores)
optimal_lambda = lambda_vals[optimal_lambda_idx]
optimal_mse = mse_scores[optimal_lambda_idx]
print(f"\nOptimal Lambda: {optimal_lambda}")
print(f"Optimal MSE on Test Dataset: {optimal_mse:.4f}")

Lasso Lambda Optimization...


Optimizing lambda: 100%|██████████| 5/5 [00:05<00:00,  1.08s/it]


Optimal Lambda: 0.1
Optimal MSE on Test Dataset: 10231.1919

Ridge Lambda Optimization...


Optimizing lamda: 100%|██████████| 5/5 [00:00<00:00,  8.24it/s]


Optimal Lambda: 5.0
Optimal MSE on Test Dataset: 10377.0820



