In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from statsmodels.api import OLS
import statsmodels.api as sm

## Read data

In [3]:
# Load the sample dataset from the CSV file (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="test_sample.csv")

In [4]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(500, 491)

In [5]:
# Preview the first five rows to eyeball the column layout.
data.head()

Unnamed: 0,Y,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,X480,X481,X482,X483,X484,X485,X486,X487,X488,X489
0,62.021357,2.024706,1.027527,-1.326311,0.357027,0.51919,-0.257261,1.039989,-0.347428,-1.064346,...,-1.65128,4.136547,-1.865089,-0.722469,-0.606663,-1.036133,-1.651849,-1.709447,2.93277,-2.650658
1,-10.742381,-0.150313,1.222022,-0.66013,0.533368,-0.715817,1.009842,-4.690636,-2.756601,2.38581,...,0.11671,-0.908185,0.233237,1.986009,-0.015612,-0.022982,-0.655791,-0.132789,-1.472721,1.851572
2,36.153706,-0.108264,-0.561548,1.160181,-2.053494,-1.872296,0.343377,-0.245129,-0.667422,2.253708,...,0.477435,0.881063,-0.203446,-0.416133,-1.26392,3.109615,1.399428,-1.860654,-1.038935,-0.463618
3,-51.982139,-0.782493,-3.915502,3.091624,-1.402967,-1.692088,0.052822,-0.650964,-4.290895,-0.1742,...,0.042234,-3.874029,-2.378197,-3.307136,-3.061432,-2.071915,-1.672129,2.340858,-1.739158,-0.605747
4,-61.727506,-2.117168,-0.444977,-2.608002,-1.795239,1.603949,1.998514,1.408727,1.348166,2.688537,...,-2.46869,0.889213,-0.801028,-3.57659,1.338792,-1.069161,-0.355335,3.117281,-1.2378,-4.093385


In [6]:
# Split the frame into the response and the regressors
y = data['Y']               # dependent variable
X = data.drop(columns='Y')  # independent variables: every column other than Y

## Lasso Regression

In [9]:
# Fit a Lasso whose penalty strength is selected by 5-fold cross-validation
lasso = LassoCV(cv=5, random_state=1, n_jobs=-1)
lasso.fit(X, y)
# Keep the selected penalty and the fitted coefficient vector for later cells
optimal_alpha, lasso_coef = lasso.alpha_, lasso.coef_

In [12]:
# Positions of the regressors dropped by Lasso, i.e. whose fitted coefficient is exactly zero
eliminated_by_Lasso = np.flatnonzero(lasso_coef == 0).tolist()

In [13]:
# Fit an ordinary least squares regression of y on X plus an intercept column
model = sm.OLS(endog=y, exog=sm.add_constant(X)).fit()

In [14]:
# Collect the OLS p-values of the regressors only and flag the insignificant ones.
# The intercept is removed by its label ("const", the name sm.add_constant assigns)
# rather than by position: add_constant adds nothing when X already contains a
# constant column, and a positional [1:] would then silently discard the first
# real regressor. errors="ignore" keeps the cell working in that case.
p_values = model.pvalues.drop("const", errors="ignore")
# 0-based positions (in the order of the remaining p-values) whose p-value
# exceeds the 0.1 significance level; .tolist() yields plain Python ints.
eliminated_by_lm = np.flatnonzero(p_values.to_numpy() > 0.1).tolist()

In [26]:
# Boolean mask over every fitted term (intercept `const` included): True where the p-value exceeds 0.1.
model.pvalues > 0.1

const    False
X0       False
X1       False
X2        True
X3       False
         ...  
X485     False
X486     False
X487     False
X488     False
X489     False
Length: 491, dtype: bool

In [20]:
# Inspect the p-values kept for the regressors (the intercept entry is not part of this Series).
p_values

X0      2.113700e-14
X1      1.430216e-06
X2      3.128378e-01
X3      8.264249e-12
X4      1.155034e-06
            ...     
X485    3.137936e-09
X486    3.252316e-16
X487    1.423677e-06
X488    5.818670e-13
X489    4.691442e-08
Length: 490, dtype: float64

## Save the answer

In [15]:
# Render each list of indices as a single space-separated string
lasso_zeros = ' '.join(map(str, eliminated_by_Lasso))
lm_zeros = ' '.join(map(str, eliminated_by_lm))

In [25]:
# Write both index strings to answer.csv, one labelled row per elimination method
pd.DataFrame(
    data=[lasso_zeros, lm_zeros],
    index=['eliminated_by_Lasso', 'eliminated_by_lm'],
).to_csv(path_or_buf='answer.csv')