# Linear Regression

We compute a prediction baseline using linear regression for comparison.

## Download the Dataset

We first download and unzip the dataset.

In [1]:
import os
import tarfile
import urllib.request  # the submodule must be imported explicitly: `import urllib` alone does not load it

# remote archive, local copy of the archive, and archive member to extract
file_url = 'http://www.lpthe.jussieu.fr/~erbin/files/data/cicy3o_data.tar.gz'
file_out = './cicy3o.tar.gz'
file_dat = 'cicy3o.h5'

# download the archive only when no local copy exists yet
if not os.path.isfile(file_out):
    urllib.request.urlretrieve(file_url, file_out)

# extract the data file only when it is not already in the working directory
if not os.path.isfile(file_dat):
    with tarfile.open(file_out, 'r') as tar:
        tar.extract(file_dat)

## Load the Dataset

We then load the dataset:

In [2]:
import pandas as pd

# read the extracted HDF5 file from the working directory
path_dat = os.path.join('.', file_dat)
dat = pd.read_hdf(path_dat)

Remove the outliers (keep $h^{1,1} \in [1, 16]$ and $h^{2,1} \in [15, 86]$):

In [3]:
# columns kept for the rest of the analysis
cols_kept = ['h11', 'h21', 'matrix']

# rows with 0 < h11 < 17 and 14 < h21 < 87 (everything else counts as an outlier)
in_range = (dat['h11'].gt(0) & dat['h11'].lt(17) &
            dat['h21'].gt(14) & dat['h21'].lt(87))

# dat_out keeps every row, dat_noout only the rows inside the ranges above
dat_out   = dat[cols_kept]
dat_noout = dat.loc[in_range][cols_kept]

Then extract the `matrix` column into its dense format:

In [4]:
import numpy as np

def extract_series(series: pd.Series) -> pd.Series:
    '''
    Extract a Pandas series into its dense format.

    Every entry is zero-padded at the end of each axis so that all entries
    share the same shape, namely the largest size found along each axis.
    Series of scalars (or whose largest entry holds a single element) and
    empty series are returned as an unmodified copy.

    Required arguments:
        series: the pandas series.

    Returns:
        the pandas series in dense format.
    '''
    # avoid direct overwriting
    series = series.copy()

    # nothing to pad in an empty series
    if series.empty:
        return series

    # get the maximum size of each axis: the maximum is taken axis by axis,
    # since comparing whole shape tuples would order them lexicographically
    # and could select a shape that is too small along the later axes
    shapes = [np.shape(entry) for entry in series]
    n_axes = max(len(shape) for shape in shapes)
    max_shape = tuple(max(shape[axis] for shape in shapes if len(shape) > axis)
                      for axis in range(n_axes)
                     )

    # return the transformed series
    if np.prod(max_shape) > 1:
        # compute the necessary shift (trailing padding only) and apply it
        def offset(entry):
            return [(0, max_shape[axis] - np.shape(entry)[axis])
                    for axis in range(len(max_shape))
                   ]
        return series.apply(lambda entry: np.pad(entry, offset(entry), mode='constant'))
    else:
        return series
    
# pad the columns of both frames (extract_series is called once per column)
dat_out, dat_noout = (frame.apply(extract_series) for frame in (dat_out, dat_noout))

## Training and Validation Strategy

We then split the dataset into training and test sets for evaluation, using either 80% or 30% of the samples for training, both with and without outliers.

In [5]:
from sklearn.model_selection import train_test_split

# fix the seed so that the splits are reproducible
RAND = 42
np.random.seed(RAND)

# options shared by every split
split_opts = dict(shuffle=True, random_state=RAND)

# split the data with and without outliers, using 80% and 30% of the samples for training
dat_out_train_80,   dat_out_test_80   = train_test_split(dat_out, train_size=0.8, **split_opts)
dat_out_train_30,   dat_out_test_30   = train_test_split(dat_out, train_size=0.3, **split_opts)
dat_noout_train_80, dat_noout_test_80 = train_test_split(dat_noout, train_size=0.8, **split_opts)
dat_noout_train_30, dat_noout_test_30 = train_test_split(dat_noout, train_size=0.3, **split_opts)

# report the relative size of every subset, one group of lines per training ratio
size_report = [('80% training data:',
                dat_out_train_80, dat_out_test_80, dat_noout_train_80, dat_noout_test_80),
               ('30% training data:',
                dat_out_train_30, dat_out_test_30, dat_noout_train_30, dat_noout_test_30)
              ]
n_out, n_noout = len(dat_out), len(dat_noout)

for position, (title, out_train, out_test, noout_train, noout_test) in enumerate(size_report):
    # blank line between two consecutive groups
    if position > 0:
        print('')
    print(title)
    print('    Training set w/ outliers:   {:.2f}%'.format(100 * len(out_train) / n_out))
    print('    Test set w/ outliers:       {:.2f}%'.format(100 * len(out_test) / n_out))
    print('')
    print('    Training set w/o outliers:   {:.2f}%'.format(100 * len(noout_train) / n_noout))
    print('    Test set w/o outliers:       {:.2f}%'.format(100 * len(noout_test) / n_noout))

80% training data:
    Training set w/ outliers:   80.00%
    Test set w/ outliers:       20.00%

    Training set w/o outliers:   79.99%
    Test set w/o outliers:       20.01%

30% training data:
    Training set w/ outliers:   30.00%
    Test set w/ outliers:       70.00%

    Training set w/o outliers:   30.00%
    Test set w/o outliers:       70.00%


In [6]:
# number of columns of the flattened feature array
# NOTE(review): presumably the number of entries of one padded matrix - confirm against the data
N_FEATURES = 180


def _flatten_matrices(frame):
    '''
    Stack the `matrix` column of a frame into a 2D array with N_FEATURES columns.

    Required arguments:
        frame: the dataframe holding the `matrix` column.

    Returns:
        the 2D NumPy array of flattened matrices.
    '''
    stacked = np.array(frame['matrix'].tolist())
    return stacked.reshape(-1, N_FEATURES)


def _make_labels(frame):
    '''
    Collect the `h11` and `h21` columns of a frame as 1D arrays.

    Required arguments:
        frame: the dataframe holding the `h11` and `h21` columns.

    Returns:
        a dictionary with keys `h11_output` and `h21_output`.
    '''
    return {'h11_output': frame['h11'].values.reshape(-1),
            'h21_output': frame['h21'].values.reshape(-1)
           }


# matrix (training sets)
mat_out_train_80   = _flatten_matrices(dat_out_train_80)
mat_noout_train_80 = _flatten_matrices(dat_noout_train_80)
mat_out_train_30   = _flatten_matrices(dat_out_train_30)
mat_noout_train_30 = _flatten_matrices(dat_noout_train_30)

# matrix (test sets)
mat_out_test_80    = _flatten_matrices(dat_out_test_80)
mat_noout_test_80  = _flatten_matrices(dat_noout_test_80)
mat_out_test_30    = _flatten_matrices(dat_out_test_30)
mat_noout_test_30  = _flatten_matrices(dat_noout_test_30)

# labels (training sets)
lab_out_train_80   = _make_labels(dat_out_train_80)
lab_noout_train_80 = _make_labels(dat_noout_train_80)
lab_out_train_30   = _make_labels(dat_out_train_30)
lab_noout_train_30 = _make_labels(dat_noout_train_30)

# labels (test sets)
lab_out_test_80    = _make_labels(dat_out_test_80)
lab_noout_test_80  = _make_labels(dat_noout_test_80)
lab_out_test_30    = _make_labels(dat_out_test_30)
lab_noout_test_30  = _make_labels(dat_noout_test_30)

## Linear Regression ($\ell_1$ regularised)

In [17]:
from sklearn.linear_model import Lasso
from sklearn.multioutput import MultiOutputRegressor

# create the output directory if needed (to_csv does not create missing directories)
os.makedirs('./dat', exist_ok=True)


def _new_estimator():
    '''
    Build an untrained l1-regularised linear model wrapped for multiple outputs.

    max_iter is passed as an integer: scikit-learn documents the parameter as
    an int and recent releases reject a float such as 1e5.

    Returns:
        the MultiOutputRegressor wrapping a Lasso estimator.
    '''
    return MultiOutputRegressor(Lasso(alpha=2.0e-4,
                                      fit_intercept=False,
                                      max_iter=100000,
                                      random_state=RAND
                                     ),
                                n_jobs=-1
                               )


def _floor_predictions(estimator, matrix):
    '''
    Predict with a fitted estimator and round the result down to integers.

    Required arguments:
        estimator: the fitted estimator,
        matrix:    the 2D array of input features.

    Returns:
        the integer array of predictions (column 0: h11, column 1: h21).
    '''
    return np.floor(estimator.predict(matrix)).astype(int)


def _save_predictions(pred, labels, path):
    '''
    Write predictions and true labels side by side to a CSV file.

    Required arguments:
        pred:   the integer array of predictions (column 0: h11, column 1: h21),
        labels: the dictionary with keys `h11_output` and `h21_output`,
        path:   the path of the CSV file.
    '''
    pd.DataFrame({'h11_pred': pred[:,0],
                  'h11_true': labels['h11_output'],
                  'h21_pred': pred[:,1],
                  'h21_true': labels['h21_output']
                 }
                ).to_csv(path)


def _accuracy(labels, pred):
    '''
    Compute the fraction of exactly matching predictions.

    Required arguments:
        labels: the dictionary with keys `h11_output` and `h21_output`,
        pred:   the integer array of predictions (column 0: h11, column 1: h21).

    Returns:
        the tuple (accuracy on h11, accuracy on h21).
    '''
    h11_acc = np.mean((labels['h11_output'] == pred[:,0]).astype(int))
    h21_acc = np.mean((labels['h21_output'] == pred[:,1]).astype(int))
    return h11_acc, h21_acc


# define the estimators (one independent model per dataset and training ratio)
lr_out_80   = _new_estimator()
lr_noout_80 = _new_estimator()
lr_out_30   = _new_estimator()
lr_noout_30 = _new_estimator()

# train the estimators (targets are the two label arrays stacked as columns)
lr_out_80.fit(mat_out_train_80, pd.DataFrame(lab_out_train_80).values)
lr_noout_80.fit(mat_noout_train_80, pd.DataFrame(lab_noout_train_80).values)
lr_out_30.fit(mat_out_train_30, pd.DataFrame(lab_out_train_30).values)
lr_noout_30.fit(mat_noout_train_30, pd.DataFrame(lab_noout_train_30).values)

# compute the predictions
pred_out_80   = _floor_predictions(lr_out_80, mat_out_test_80)
pred_noout_80 = _floor_predictions(lr_noout_80, mat_noout_test_80)
pred_out_30   = _floor_predictions(lr_out_30, mat_out_test_30)
pred_noout_30 = _floor_predictions(lr_noout_30, mat_noout_test_30)

# save predictions to file
# NOTE(review): the files carry an `svm_` prefix although the models are Lasso
# regressors - confirm that no other results are meant to live under these names
_save_predictions(pred_out_80, lab_out_test_80, './dat/svm_out_80.csv')
_save_predictions(pred_noout_80, lab_noout_test_80, './dat/svm_noout_80.csv')
_save_predictions(pred_out_30, lab_out_test_30, './dat/svm_out_30.csv')
_save_predictions(pred_noout_30, lab_noout_test_30, './dat/svm_noout_30.csv')

# compute accuracy
h11_acc_out_80,   h21_acc_out_80   = _accuracy(lab_out_test_80, pred_out_80)
h11_acc_noout_80, h21_acc_noout_80 = _accuracy(lab_noout_test_80, pred_noout_80)
h11_acc_out_30,   h21_acc_out_30   = _accuracy(lab_out_test_30, pred_out_30)
h11_acc_noout_30, h21_acc_noout_30 = _accuracy(lab_noout_test_30, pred_noout_30)

# print accuracy
print('80% training data:')
print('    Accuracy on h_11 w/  outliers: {:.2f}%'.format(100 * h11_acc_out_80))
print('    Accuracy on h_21 w/  outliers: {:.2f}%'.format(100 * h21_acc_out_80))
print('    Accuracy on h_11 w/o outliers: {:.2f}%'.format(100 * h11_acc_noout_80))
print('    Accuracy on h_21 w/o outliers: {:.2f}%'.format(100 * h21_acc_noout_80))
print('')
print('30% training data:')
print('    Accuracy on h_11 w/  outliers: {:.2f}%'.format(100 * h11_acc_out_30))
print('    Accuracy on h_21 w/  outliers: {:.2f}%'.format(100 * h21_acc_out_30))
print('    Accuracy on h_11 w/o outliers: {:.2f}%'.format(100 * h11_acc_noout_30))
print('    Accuracy on h_21 w/o outliers: {:.2f}%'.format(100 * h21_acc_noout_30))

80% training data:
    Accuracy on h_11 w/  outliers: 48.10%
    Accuracy on h_21 w/  outliers: 10.71%
    Accuracy on h_11 w/o outliers: 50.60%
    Accuracy on h_21 w/o outliers: 9.17%

30% training data:
    Accuracy on h_11 w/  outliers: 47.67%
    Accuracy on h_21 w/  outliers: 9.92%
    Accuracy on h_11 w/o outliers: 49.05%
    Accuracy on h_21 w/o outliers: 9.52%
