# Scaler

In [None]:
# Scaler: the standardization statistics come from the training features
# only; the same fitted transform is then applied to both splits.
scaler = StandardScaler()
Xtr_std = scaler.fit_transform(Xtr)
Xts_std = scaler.transform(Xts)

# Count vectorizer: the vocabulary is built from the training text only and
# reused unchanged for the test text.
vec = CountVectorizer(stop_words='english')
Xtr_vec = vec.fit_transform(Xtr_str)
Xts_vec = vec.transform(Xts_str)

# Linear Regression

In [None]:
# Linear Regression: regress the 'Walkability_Score' column on the four
# '*_Nearby' columns and score the fit on a 20% hold-out split.
features = [
    'Parks_Nearby',
    'Grocery_Stores_Nearby',
    'Schools_Nearby',
    'Public_Transit_Nearby',
]
target = ['Walkability_Score']
X, y = df[features], df[target]
random_state = 23
Xtr, Xts, ytr, yts = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)
model = LinearRegression()
model.fit(Xtr, ytr)
yts_hat = model.predict(Xts)
# Hold-out metrics: coefficient of determination and mean squared error.
rsq = r2_score(yts, yts_hat)
mse = mean_squared_error(yts, yts_hat)

In [None]:
# Single split - random shuffle
# One seeded, shuffled hold-out split with 1/5 of the rows used for testing.
random_state = 8
Xtr_one_shuf, Xts_one_shuf, ytr_one_shuf, yts_one_shuf = train_test_split(
    X, y, test_size=1/5, random_state=random_state
)
model = LinearRegression().fit(Xtr_one_shuf, ytr_one_shuf)
yts_one_shuf_pred = model.predict(Xts_one_shuf)
r2_one_shuf = r2_score(yts_one_shuf, yts_one_shuf_pred)
# Single split - sorted data, no shuffle
# shuffle=False keeps the row order: the last 1/5 of the rows is the test set.
Xtr_one_order, Xts_one_order, ytr_one_order, yts_one_order = train_test_split(
    X, y, test_size=1/5, shuffle=False
)
model = LinearRegression().fit(Xtr_one_order, ytr_one_order)
yts_one_order_pred = model.predict(Xts_one_order)
r2_one_order = r2_score(yts_one_order, yts_one_order_pred)
# Multiple splits - random shuffle
# KFold yields *positional* row indices. `X[idx]` on a pandas DataFrame is
# interpreted as column selection, so index plain arrays instead
# (np.asarray is a no-op when X / y are already ndarrays).
X_arr, y_arr = np.asarray(X), np.asarray(y)
n_fold = 5
r2_kf_shuffle = np.zeros(shape=(n_fold,))
kf = KFold(n_splits=n_fold, shuffle=True, random_state=random_state)
for i, (idx_tr, idx_ts) in enumerate(kf.split(X_arr)):
    # Fit on the training rows of this fold, score on the held-out rows.
    model = LinearRegression()
    model.fit(X_arr[idx_tr], y_arr[idx_tr])
    y_pred_kfold = model.predict(X_arr[idx_ts])
    r2_kf_shuffle[i] = r2_score(y_arr[idx_ts], y_pred_kfold)
# Average R2 across the folds.
r2_kf_shuffle_mean = np.mean(r2_kf_shuffle)
# Multiple splits - time series
# TimeSeriesSplit yields *positional* row indices. `X[idx]` on a pandas
# DataFrame is interpreted as column selection, so index plain arrays instead
# (np.asarray is a no-op when X / y are already ndarrays).
X_arr, y_arr = np.asarray(X), np.asarray(y)
n_fold = 5
r2_ts = np.zeros(shape=(n_fold,))
ts = TimeSeriesSplit(n_splits=n_fold)
for i, (idx_tr, idx_ts) in enumerate(ts.split(X_arr)):
    # Fit on the training rows of this split, score on the held-out rows.
    model = LinearRegression()
    model.fit(X_arr[idx_tr], y_arr[idx_tr])
    y_pred = model.predict(X_arr[idx_ts])
    r2_ts[i] = r2_score(y_arr[idx_ts], y_pred)
# Average R2 across the splits.
r2_ts_mean = np.mean(r2_ts)

# Ridge Regression

In [None]:
x_names = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']
y_names = ['price']
random_state = 13
# Split features and target in ONE call so that rows stay paired by
# construction, instead of relying on two separate calls happening to use the
# same seed. With the same seed and test_size this is the same partition.
Xtr_df, Xts_df, ytr_df, yts_df = train_test_split(
    df[x_names], df[y_names],
    test_size=0.3, random_state=random_state, shuffle=True,
)
# Plain ndarrays, so that the fold indices used later are positional.
Xtr, Xts, ytr, yts = (
    np.array(part) for part in (Xtr_df, Xts_df, ytr_df, yts_df)
)
# Candidate regularization strengths. mse_val[i, k] stores the validation MSE
# of alpha_list[i] on fold k.
alpha_list = np.array([0, 10, 20, 50, 100, 200, 500])
nfold = 5
mse_val = np.zeros((len(alpha_list), nfold))
# k-fold split of the training set, in row order (no shuffling)
kf = KFold(n_splits=nfold, shuffle=False)
for ifold, (idx_tr, idx_val) in enumerate(kf.split(Xtr)):
    X_train_fold, y_train_fold = Xtr[idx_tr], ytr[idx_tr]
    X_val_fold, y_val_fold = Xtr[idx_val], ytr[idx_val]
    # Standardize with statistics from this fold's training part only, so the
    # validation rows do not influence the scaling.
    scaler = StandardScaler()
    X_train_fold_std = scaler.fit_transform(X_train_fold)
    X_val_fold_std = scaler.transform(X_val_fold)
    # Fit one Ridge model per alpha on the standardized fold data.
    for i, alpha in enumerate(alpha_list):
        model = Ridge(alpha=alpha)
        model.fit(X_train_fold_std, y_train_fold)
        y_pred = model.predict(X_val_fold_std)
        mse_val[i, ifold] = mean_squared_error(y_val_fold, y_pred)
# Average over folds and keep the alpha with the lowest mean validation MSE.
mse_mean = mse_val.mean(axis=1)
alpha_min_mse = alpha_list[mse_mean.argmin()]
# Refit on the entire training set with the selected alpha, then score on the
# test set. The scaler is fitted on the training rows only.
scaler = StandardScaler()
Xtr_std = scaler.fit_transform(Xtr)
Xts_std = scaler.transform(Xts)
model = Ridge(alpha=alpha_min_mse).fit(Xtr_std, ytr)
y_pred = model.predict(Xts_std)
mse_ridge = mean_squared_error(yts, y_pred)

# Logistic Regression

In [None]:

# Hold out 30% for testing, then fill NaNs. (Standardization happens later,
# separately inside each CV fold.)
random_state = 14
Xtr, Xts, ytr, yts = train_test_split(
    X, y, test_size=0.3, random_state=random_state
)
# Per-column medians of the TRAINING split (NaNs ignored); the same values
# are used to fill NaNs in both splits.
med_values = np.nanmedian(Xtr, axis=0)
Xtr_filled, Xts_filled = (
    np.nan_to_num(part, nan=med_values) for part in (Xtr, Xts)
)
# Grid of inverse regularization strengths. acc_val[iC, k] stores the
# validation accuracy of C_test[iC] on fold k.
C_test = np.logspace(-1, 3, 10)
nfold = 3
acc_val = np.zeros((len(C_test), nfold))
# The splitter, the fold slices and the fitted scaler do not depend on C, so
# folds form the outer loop and that work is done once per fold rather than
# once per (C, fold) pair. The entries of acc_val are unchanged.
kf = KFold(n_splits=nfold, shuffle=False)
for ifold, (Itr, Ival) in enumerate(kf.split(Xtr_filled)):
    Xtr_fold, Xval_fold = Xtr_filled[Itr], Xtr_filled[Ival]
    # NOTE(review): positional indexing - assumes ytr is an ndarray; confirm.
    ytr_fold, yval_fold = ytr[Itr], ytr[Ival]
    # Standardize with statistics from this fold's training part only.
    scaler = StandardScaler().fit(Xtr_fold)
    Xtr_std = scaler.transform(Xtr_fold)
    Xvl_std = scaler.transform(Xval_fold)
    for iC, C in enumerate(C_test):
        clf = LogisticRegression(random_state=random_state, solver='liblinear', penalty='l1', C=C)
        clf.fit(Xtr_std, ytr_fold)
        yhat = clf.predict(Xvl_std)
        acc_val[iC, ifold] = accuracy_score(yval_fold, yhat)
# Average over folds and keep the C with the highest mean validation accuracy.
acc_mean = np.mean(acc_val, axis=1)
C_best = C_test[np.argmax(acc_mean)]
# Refit on the entire (NaN-filled) training set with the selected C, then
# predict the test set. The scaler is fitted on the training rows only.
scaler = StandardScaler()
Xtr_std = scaler.fit_transform(Xtr_filled)
Xts_std = scaler.transform(Xts_filled)
clf_best = LogisticRegression(
    C=C_best, penalty='l1', solver='liblinear', random_state=random_state
)
clf_best.fit(Xtr_std, ytr)
y_hat = clf_best.predict(Xts_std)

In [None]:
# Shuffled 75/25 split of the raw statements and their binary labels.
Xtr_str, Xts_str, ytr, yts = train_test_split(
    df['statement'].values,
    df['label_binary'].values,
    test_size=0.25,
    shuffle=True,
    random_state=0,
)
# Count features; the vocabulary is built from the training statements only.
vec = CountVectorizer(stop_words='english')
Xtr_vec = vec.fit_transform(Xtr_str)
Xts_vec = vec.transform(Xts_str)
# Grid of inverse regularization strengths. acc_val[iC, k] stores the
# validation accuracy of C_test[iC] on fold k.
C_test = np.logspace(-3, 3, num=20)
nfold = 5
acc_val = np.zeros((len(C_test), nfold))
kf = KFold(n_splits=nfold)
for ifold, (Itr, Ival) in enumerate(kf.split(Xtr_vec)):
    for iC, C in enumerate(C_test):
        # L1-penalized logistic regression fitted on this fold's training rows.
        clf = LogisticRegression(
            C=C, penalty='l1', solver='liblinear', random_state=0
        )
        clf.fit(Xtr_vec[Itr], ytr[Itr])
        yhat = clf.predict(Xtr_vec[Ival])
        acc_val[iC, ifold] = accuracy_score(ytr[Ival], yhat)
# Average over folds and keep the C with the highest mean validation accuracy.
acc_mean = acc_val.mean(axis=1)
C_best = C_test[acc_mean.argmax()]
# Refit on the full training set with the best-accuracy C and evaluate on the
# test set.
model_best = LogisticRegression(
    C=C_best, penalty='l1', solver='liblinear', random_state=0
)
model_best.fit(Xtr_vec, ytr)
y_pred_best = model_best.predict(Xts_vec)
acc_best = accuracy_score(yts, y_pred_best)
# Number of non-zero coefficients in the fitted model.
count_best = np.count_nonzero(model_best.coef_)
# One-SE rule: pick the smallest C (C_test is ascending) whose mean CV
# accuracy is within one standard error of the best mean accuracy.
acc_std = acc_val.std(axis=1)
# Standard error of the fold mean = sample std / sqrt(number of folds).
acc_se = np.std(acc_val, axis=1, ddof=1) / np.sqrt(acc_val.shape[1])
# The threshold is taken at the BEST model: its mean accuracy minus its own
# standard error (not the maximum of mean - std over all candidates).
idx_best = np.argmax(acc_mean)
acc_target = acc_mean[idx_best] - acc_se[idx_best]
# argmax of a boolean array returns the first True, i.e. the smallest
# qualifying C.
C_one_se = C_test[np.argmax(acc_mean >= acc_target)]
model_one_se = LogisticRegression(penalty='l1', C=C_one_se, random_state=0, solver='liblinear')
model_one_se.fit(Xtr_vec, ytr)
y_pred_one_se = model_one_se.predict(Xts_vec)
# Test accuracy and number of non-zero coefficients of the one-SE model.
acc_one_se = accuracy_score(yts, y_pred_one_se)
count_one_se = np.count_nonzero(model_one_se.coef_)

# K-fold CV with Fourier basis expansion

You decide to use a linear regression model with Fourier basis transformation of the `hourofweek` feature:

$$\hat{y}=w_0 + w_1 x + \sum_{t \in \text{tlist}} \left[ w_{t,c} \cos(2\pi x/t)+w_{t,s} \sin(2\pi x/t) \right]$$

where each sine and cosine pair represents the periodic behavior over a particular time interval. 

For example, if `tlist = [0.5, 1]`, then your model would be:

$$\hat{y}=w_0 + w_1 x + w_{0.5,c} \cos(2\pi x/0.5)+w_{0.5,s} \sin(2\pi x/0.5) + w_{1,c} \cos(2\pi x/1)+w_{1,s} \sin(2\pi x/1)$$


Now you are ready to run a K-fold CV! In your CV, you will fit and evaluate a `LinearRegression` model (using the `sklearn` implementation) on an increasing number of columns of the data, as described above:

* In the first iteration of your K-fold CV, you will evaluate the regression for `tlist_eval = []`.
* In the second iteration of your K-fold CV, you will evaluate the regression for `tlist_eval = [1]`.
* In the third iteration, you will evaluate the regression for `tlist_eval = [1, 2]`

and so on, until, in the final iteration, you will evaluate the regression for *all* the values in `tlist`.

(Of course, you won't re-compute the Fourier basis transformation inside the loop - you'll just select the appropriate rows and columns from `Xtr_trans` in each iteration.)

Since you have already prepared a "ones column" in the data, you will pass `fit_intercept=False` as an argument to `LinearRegression`, so that it does not fit a separate intercept term in addition to the coefficient for the "ones column".


In [None]:
# Linear Regression baseline on the untransformed time columns.
features = ['hour', 'month', 'dayofweek', 'hourofweek']
target = 'ridership'
# shuffle=False keeps the row order: the first 10000 rows form the training
# set and the remaining rows the test set.
df_tr, df_ts = train_test_split(
    df, train_size=10000, shuffle=False, random_state=42
)
model = LinearRegression().fit(df_tr[features], df_tr[target])
y_pred = model.predict(df_ts[features])
r2_lr = r2_score(df_ts[target], y_pred)
# K-fold CV with fourier basis expansion
def _fourier_design(x, periods):
    """Return the design matrix [1, x, cos(2*pi*x/t), sin(2*pi*x/t), ...].

    One (cos, sin) column pair is appended per entry of ``periods``, in order,
    and the result is rounded to 10 decimals. All columns are collected first
    and stacked once, instead of re-copying a growing matrix on every period.
    """
    columns = [np.ones_like(x), x]
    for t in periods:
        angle = 2 * np.pi * x / t
        columns.extend((np.cos(angle), np.sin(angle)))
    return np.round(np.column_stack(columns), 10)


tlist = np.arange(1, 51)
Xtr = df_tr['hourofweek'].values
Xts = df_ts['hourofweek'].values
ytr = df_tr['ridership'].values
yts = df_ts['ridership'].values
# Column layout: 0 = ones, 1 = x, then (cos, sin) pairs for tlist[0], tlist[1], ...
Xtr_trans = _fourier_design(Xtr, tlist)
Xts_trans = _fourier_design(Xts, tlist)

nfold = 5
# Model i uses the first i periods of tlist, so there are len(tlist) + 1
# candidate models. r2_val[i, k] stores the validation R2 of model i on fold k.
n_models = len(tlist) + 1
r2_val = np.zeros((n_models, nfold))
kf = KFold(n_splits=nfold, shuffle=False)
# Without shuffling the fold indices depend only on the number of rows, so
# they are generated once and reused for every candidate model.
folds = list(kf.split(Xtr_trans))
for i in range(n_models):
    # The first model uses only the ones column and the original x (2 columns);
    # each subsequent model adds one (cos, sin) pair.
    num_columns = 2 + 2 * i
    X_subset = Xtr_trans[:, :num_columns]
    for fold, (train_idx, val_idx) in enumerate(folds):
        X_train, X_val = X_subset[train_idx], X_subset[val_idx]
        y_train, y_val = ytr[train_idx], ytr[val_idx]
        # The ones column already plays the role of the intercept.
        model = LinearRegression(fit_intercept=False)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        r2_val[i, fold] = r2_score(y_val, y_pred)

# Mean and standard error (sample std / sqrt(nfold)) of R2 across folds.
r2_mean = r2_val.mean(axis=1)
r2_se = np.std(r2_val, axis=1, ddof=1) / np.sqrt(nfold)

# One-SE rule. Start from the model with the highest mean CV R2 ...
idx_max = r2_mean.argmax()
# ... and take its mean R2 minus its own standard error as the threshold.
one_se = r2_mean[idx_max] - r2_se[idx_max]
# The simplest acceptable model is the first one (fewest Fourier periods)
# whose mean R2 reaches the threshold; model i uses the first i values of tlist.
within_one_se = np.flatnonzero(r2_mean >= one_se)
tlist_opt = tlist[:within_one_se[0]] if within_one_se.size else []
# Refit on the entire training set with the columns belonging to `tlist_opt`
# (ones, x, and one cos/sin pair per period), then score on the test set.
# The test R2 score is saved in r2_one_se.
num_columns = 2 + 2 * len(tlist_opt)
Xtr_opt, Xts_opt = Xtr_trans[:, :num_columns], Xts_trans[:, :num_columns]
model_opt = LinearRegression(fit_intercept=False).fit(Xtr_opt, ytr)
y_pred_opt = model_opt.predict(Xts_opt)
r2_one_se = r2_score(yts, y_pred_opt)