In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

### Preprocessing

Read the data.

In [2]:
# Column labels for the headerless train.csv, in file order (54 columns).
# The 25 derived features get distinct labels ("Derived", "Derived.1", ...,
# "Derived.24"): passing the same label 25 times in `names` makes pandas warn,
# and newer pandas releases refuse duplicate names outright.
derived_cols = ["Derived"] + ["Derived.{}".format(k) for k in range(1, 25)]
weekdays = ["Sunday", "Monday", "Tuesday", "Wednesday",
            "Thursday", "Friday", "Saturday"]

col_names = (
    ["Page Popularity/likes", "Page Checkins", "Page talking about",
     "Page Category"]
    + derived_cols
    + ["CC1", "CC2", "CC3", "CC4", "CC5", "Base time",
       "Post length", "Post Share Count", "Post Promotion Status", "H Local"]
    # One indicator column per weekday, first for the post, then for the base time.
    + ["Post " + day for day in weekdays]
    + ["Base " + day for day in weekdays]
    + ["Target Variable"]
)

In [3]:
# The file carries no header row, so the labels come from `col_names`.
df = pd.read_csv("train.csv", names=col_names, header=None)

  return _read(filepath_or_buffer, kwds)


In [4]:
# Show the first rows to sanity-check that the labels line up with the data.
df.head()

Unnamed: 0,Page Popularity/likes,Page Checkins,Page talking about,Page Category,Derived,Derived.1,Derived.2,Derived.3,Derived.4,Derived.5,...,Post Friday,Post Saturday,Base Sunday,Base Monday,Base Tuesday,Base Wednesday,Base Thursday,Base Friday,Base Saturday,Target Variable
0,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,0,0,0,0,0,0,0,0,1,0
1,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,0,0,0,0,0,0,0,1,0,0
2,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,1,0,0,0,0,0,0,0,1,0
3,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,1,0,0,1,0,0,0,0,0,0
4,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,0,0,0,0,0,1,0,0,0,0


In [5]:
# Feature matrix: every column except the target and "Post Promotion Status".
# `drop` returns a new frame, so `df` itself is left untouched.
X = df.drop(["Target Variable", "Post Promotion Status"], axis=1)

# Target as a plain NumPy array.
y = df["Target Variable"].values

### Linear regression: gradient-descent implementation

In [17]:
def cost_function(X, y, w):
    """Return half the mean squared error of the linear model ``X.dot(w)``.

    X is the design matrix, y the targets and w the weight vector; the
    result is ``sum((X.dot(w) - y) ** 2) / (2 * len(y))``.
    """
    residuals = X.dot(w) - y
    n_samples = len(y)
    return np.sum(np.square(residuals)) / (2 * n_samples)

In [6]:
def rmse(y, y_pred):
    """Root-mean-square error between observed and predicted values.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same length as ``y``.

    Returns
    -------
    float
        ``sqrt(sum((y - y_pred) ** 2) / len(y))``.
    """
    # Convert first so plain lists are accepted, and reduce with NumPy rather
    # than the built-in sum(), which walks the array one element at a time.
    residuals = np.asarray(y) - np.asarray(y_pred)
    return np.sqrt(np.sum(residuals ** 2) / len(residuals))

def r2_score(y, y_pred):
    """Coefficient of determination of predictions against observed values.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same length as ``y``.

    Returns
    -------
    float
        ``1 - rss / tss``, where ``rss`` is the residual sum of squares and
        ``tss`` the total sum of squares around the mean of ``y``.
        A constant ``y`` makes ``tss`` zero; that case is not guarded.
    """
    # Convert first so plain lists are accepted, and reduce with NumPy rather
    # than the built-in sum(), which walks the array one element at a time.
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    tss = np.sum((y - np.mean(y)) ** 2)
    rss = np.sum((y - y_pred) ** 2)
    return 1 - (rss / tss)

In [21]:
def gradient_descent(X, y, w, eta=1e-2, eps=1e-3, max_iters=10000):
    """Fit linear-regression weights by batch gradient descent.

    The step size decays as ``eta / sqrt(i)`` where ``i`` is the 1-based
    iteration number.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Design matrix.
    y : array of shape (n_samples,)
        Target values.
    w : array of shape (n_features,)
        Initial weights.
    eta : float
        Base learning rate.
    eps : float
        Tolerance used by both stopping tests.
    max_iters : int
        Upper bound on the number of iterations.

    Returns
    -------
    (w, n_iters)
        The final weights and the number of iterations performed.

    Notes
    -----
    Iteration stops as soon as the update moves the weights by less than
    ``eps`` (Euclidean norm), or the cost at the current weights is not at
    least ``eps`` lower than the cost at the previous weights. The second
    test also fires when the cost went up, since the difference is then
    negative. In either case the freshly updated weights are returned.
    """
    n_samples = len(y)
    prev_cost = np.inf

    for i in range(1, max_iters + 1):
        residuals = X.dot(w) - y
        gradient = X.T.dot(residuals) / n_samples
        w_new = w - eta / np.sqrt(i) * gradient

        # Half-MSE at the current `w`, taken from the residuals already in
        # hand instead of recomputing X.dot(w) a second time per iteration.
        cost_now = np.sum(residuals ** 2) / (2 * n_samples)

        if np.linalg.norm(w - w_new) < eps or prev_cost - cost_now < eps:
            return w_new, i
        w, prev_cost = w_new, cost_now

    return w, max_iters

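Cross-validation: shuffled 5-fold split. Each fold is standardized with the mean and standard deviation of its training part, and a leading column of ones is added for the intercept.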

In [8]:
def train_test_generator(X, y, n_splits=5, random_state=1):
    """Yield standardized train/test splits for shuffled K-fold cross-validation.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``iloc``.
    y : numpy.ndarray
        Targets, indexed with the integer fold indices.
    n_splits : int
        Number of folds.
    random_state : int
        Seed for the shuffled split.

    Yields
    ------
    (X_train, X_test, y_train, y_test)
        ``X_train`` and ``X_test`` are NumPy arrays whose first column is all
        ones, followed by the features standardized with the training fold's
        mean and standard deviation.
    """
    # Keyword arguments: recent scikit-learn accepts only `n_splits` positionally.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for train, test in kf.split(X):
        X_train, X_test = X.iloc[train], X.iloc[test]

        # Scaling statistics come from the training part only and are reused
        # unchanged for the test part.
        train_mean, train_std = X_train.mean(), X_train.std()
        # A feature that is constant within the training part has zero std;
        # dividing by it would fill the column with NaN/inf. Use 1 instead so
        # the centred training column simply stays at zero.
        train_std = train_std.replace(0, 1)
        X_train_scaled = (X_train - train_mean) / train_std
        X_test_scaled = (X_test - train_mean) / train_std

        # Prepend the column of ones that multiplies the intercept weight.
        m1, m2 = X_train_scaled.shape[0], X_test_scaled.shape[0]
        X_train = np.hstack((np.ones((m1, 1)), X_train_scaled))
        X_test = np.hstack((np.ones((m2, 1)), X_test_scaled))

        yield X_train, X_test, y[train], y[test]

In [22]:
# Per-fold accumulators: fitted weights and iteration counts, then the
# train and test metrics. Each one is its own empty list.
weights = []
n_iters = []
rmse_train = []
r2_train = []
rmse_test = []
r2_test = []

In [23]:
# Start from empty accumulators so that re-running this cell replaces the
# per-fold results instead of appending a second set to the same lists.
weights, n_iters = [], []
rmse_train, r2_train = [], []
rmse_test, r2_test = [], []

for X_train, X_test, y_train, y_test in train_test_generator(X, y):
    # Fit on the training part, starting from all-zero weights.
    w_init = np.zeros(X_train.shape[1])
    w, iters = gradient_descent(X_train, y_train, w_init)
    weights.append(w)
    n_iters.append(iters)

    # Metrics on the data the model was fitted on.
    y_pred_train = X_train.dot(w)
    rmse_train.append(rmse(y_train, y_pred_train))
    r2_train.append(r2_score(y_train, y_pred_train))

    # Metrics on the held-out part.
    y_pred_test = X_test.dot(w)
    rmse_test.append(rmse(y_test, y_pred_test))
    r2_test.append(r2_score(y_test, y_pred_test))

Make a report.

In [58]:
# Labels for the five folds: T1 ... T5.
fold_names = ["T{}".format(k) for k in range(1, 6)]

In [59]:
# One row per tracked quantity, one column per fold.
metric_names = ['n_iters', 'rmse_train', 'r2_train', 'rmse_test', 'r2_test']
values = [n_iters, rmse_train, r2_train, rmse_test, r2_test]
output = pd.DataFrame(values, index=metric_names, columns=fold_names)

In [60]:
# Mean and standard deviation across the fold columns only. Selecting those
# columns explicitly keeps 'E' and 'STD' out of their own computation, so
# running the cell again gives the same numbers.
fold_cols = [c for c in output.columns if c not in ('E', 'STD')]
output['E'] = output[fold_cols].mean(axis=1)
output['STD'] = output[fold_cols].std(axis=1)

In [61]:
# Weight labels: "Const" for the leading column of ones, then the feature names.
weights_cols = ["Const"] + list(X.columns)

# One row per fold, one column per weight.
df_weights = pd.DataFrame(weights, index=fold_names, columns=weights_cols)

In [62]:
# Flip to one row per weight and one column per fold.
# Note: the name is rebound to the transposed frame, so running this cell a
# second time would flip the frame back.
df_weights = df_weights.T

# Per-weight mean over the columns present at this point, then the standard
# deviation over everything except that freshly added mean column.
row_mean = df_weights.mean(axis=1)
df_weights['E'] = row_mean
df_weights['STD'] = df_weights.drop('E', axis=1).std(axis=1)

In [63]:
# Stack the per-weight rows under the metric rows. `DataFrame.append` was
# removed in pandas 2.0; `pd.concat` does the same row-wise stacking.
output = pd.concat([output, df_weights])

In [64]:
# Write the report with both the row labels and the column header.
report_path = "report.csv"
output.to_csv(report_path, header=True, index=True)