# Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from getData import getData

acme, acme_raw = getData()

In [2]:
from sklearn.linear_model import Lasso

In [3]:
acme.columns

Index(['impression_id', 'click', 'number_of_vehicles', 'number_of_drivers',
       'rank', 'policies_sold', 'married', 'insured'],
      dtype='object')

# Lasso

In [4]:
acme = acme[acme.click == 1]

y = acme['policies_sold']
X_temp = acme.copy().drop(['click', 'impression_id', 'policies_sold'], axis = 1)

X = X_temp.copy().drop(['number_of_vehicles', 'number_of_drivers'],axis = 1)

X['one_v'] = 2 - X_temp['number_of_vehicles']
X['two_v'] = X_temp['number_of_vehicles'] - 1

X['one_d'] = 2 - X_temp['number_of_drivers']
X['two_d'] = X_temp['number_of_drivers'] - 1

X['single'] = 1 - X_temp['married']
X['not_insured'] = 1 - X_temp['insured']

In [5]:
print(len(X.columns))

9


In [6]:
alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

n = 9

# These will hold our coefficient estimates
lasso_coefs = np.empty((len(alpha), n))

# for each alpha value
for i in range(len(alpha)):    
    # set up the lasso pipeline
    
    lasso = Lasso(alpha = alpha[i], normalize = True, max_iter = 1000000)
    # fit the lasso
    lasso.fit(X, y)
    
    # record the coefficients
    lasso_coefs[i,:] = lasso.coef_


In [7]:
pd.DataFrame(np.round(lasso_coefs,8),
            index = ["alpha=" + str(a) for a in alpha])


Unnamed: 0,0,1,2,3,4,5,6,7,8
alpha=1e-05,-0.008219,-0.009018,-0.175051,0.088518,-0.0,0.151755,-0.0,0.054234,0.002144
alpha=0.0001,-0.005656,-0.006705,-0.169433,0.083496,-0.0,0.135747,-0.0,0.043344,0.001893
alpha=0.001,-0.0,-0.0,-0.104837,0.002084,-0.0,0.019552,-0.0,0.0,0.001306
alpha=0.01,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0
alpha=0.1,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0
alpha=1,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0
alpha=10,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0
alpha=100,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0
alpha=1000,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0


In [9]:
print(X)

      rank  married  insured  one_v  two_v  one_d  two_d  single  not_insured
0        1        1        0      1      0      1      0       0            1
1        4        1        0      0      1      1      0       0            1
2        2        0        0      1      0      1      0       1            1
3        5        1        1      0      1      1      0       0            0
4        4        0        1      0      1      0      1       1            0
...    ...      ...      ...    ...    ...    ...    ...     ...          ...
9995     1        0        0      1      0      0      1       1            1
9996     1        1        0      1      0      0      1       0            1
9997     5        1        0      0      1      1      0       0            1
9998     4        0        1      1      0      0      1       1            0
9999     1        1        0      1      0      0      1       0            1

[10000 rows x 9 columns]


In [8]:
from sklearn.linear_model import Ridge

In [18]:
ridge = Ridge()

In [19]:
ridge.fit(X=X[['rank', 'insured', 'not_insured', 'one_v', 'one_d']], y=y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [62]:
X['prediction'] = ridge.predict(X[['rank', 'insured', 'not_insured', 'one_v', 'one_d']])

for p in np.array(range(330, 410,5)) / 1000:
    z = X.prediction.apply(lambda x: 1 if x > p else 0)
    a = len(y[(y == 1) & (z == 1)])
    b = len(y[(y == 1) & (z == 0)])
    c = len(y[(y == 0) & (z == 1)])
    print("p value: %f" % p)
    if a + c != 0:
        pre = a / (a + c)
        print("    precision: %f" % pre)
        
    if a + b != 0:
        rec = a / (a + b)
        print("    recall: %f" % rec)
        
    if a + c != 0 and a + b != 0:
        print("    f1: %f" % (pre * rec / (pre + rec)))

p value: 0.330000
    precision: 0.472707
    recall: 0.794495
    f1: 0.296372
p value: 0.335000
    precision: 0.472707
    recall: 0.794495
    f1: 0.296372
p value: 0.340000
    precision: 0.473165
    recall: 0.792661
    f1: 0.296296
p value: 0.345000
    precision: 0.473165
    recall: 0.792661
    f1: 0.296296
p value: 0.350000
    precision: 0.475664
    recall: 0.788991
    f1: 0.296756
p value: 0.355000
    precision: 0.475664
    recall: 0.788991
    f1: 0.296756
p value: 0.360000
    precision: 0.482719
    recall: 0.768807
    f1: 0.296532
p value: 0.365000
    precision: 0.486194
    recall: 0.743119
    f1: 0.293904
p value: 0.370000
    precision: 0.486194
    recall: 0.743119
    f1: 0.293904
p value: 0.375000
    precision: 0.486194
    recall: 0.743119
    f1: 0.293904
p value: 0.380000
    precision: 0.486194
    recall: 0.743119
    f1: 0.293904
p value: 0.385000
    precision: 0.486027
    recall: 0.733945
    f1: 0.292398
p value: 0.390000
    precision: 0.48602

In [15]:
X[['rank', 'insured', 'not_insured', 'one_v', 'one_d']]

Unnamed: 0,rank,insured,not_insured,one_v,one_d
0,1,0,1,1,1
8,3,0,1,0,1
16,2,0,1,1,0
19,5,1,0,0,1
27,1,0,1,1,0
...,...,...,...,...,...
9977,1,0,1,1,0
9986,1,1,0,1,0
9987,1,0,1,1,0
9992,1,1,0,1,0


In [32]:
y.apply(lambda x: 1 if x > 0.5 else 0)

0       1
8       1
16      1
19      0
27      1
       ..
9977    0
9986    1
9987    1
9992    0
9996    0
Name: policies_sold, Length: 1374, dtype: int64

In [31]:
y_pred.apply(lambda x: 1 if x > 0.5 else 0)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [35]:
type(y)

pandas.core.series.Series

In [38]:
y_pred.apply(lambda x: 1 if x > 0 else 0)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [46]:
y_pred.transpose().shape

(1,)

In [43]:
y.shape

(1374,)

In [48]:
y_pred = pd.DataFrame({'prediction':ridge.predict(X[['rank', 'insured', 'not_insured', 'one_v', 'one_d']])})

In [50]:
y_pred = y_pred['prediction']

In [51]:
y_pred.shape

(1374,)