In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from dask import compute, delayed
import dask.multiprocessing
import multiprocessing
import timeit
import warnings
warnings.filterwarnings("ignore")

In [17]:
# Load the Auto data set; "?" entries in the raw file are parsed as missing values
df = pd.read_csv('data/Auto.csv', na_values=["?"])
# Discard every row that contains at least one missing value
df.dropna(axis=0, how='any', inplace=True)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [18]:
#create variables
# Binary response: 1 when mpg is at or above the sample median, otherwise 0.
median = df['mpg'].median()
df['mpg_high'] = 0
# Assign through a single .loc[row_mask, column] call instead of chained
# indexing (df.col[mask] = 1): the chained form writes into an intermediate
# Series and is not guaranteed to update df (it does nothing under pandas
# copy-on-write).
df.loc[df['mpg'] >= median, 'mpg_high'] = 1

# Indicator columns for origin 1 and origin 2; rows with any other origin
# value keep 0 in both columns.
df['orgn1'] = 0
df.loc[df['origin'] == 1, 'orgn1'] = 1
df['orgn2'] = 0
df.loc[df['origin'] == 2, 'orgn2'] = 1

df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,mpg_high,orgn1,orgn2
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,0,1,0
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,0,1,0
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,0,1,0
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,0,1,0
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,0,1,0


## (a)

In [19]:
#X and y setting
# predictor columns: six car attributes plus the two origin indicator columns
feature_cols = [
    'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'year', 'orgn1', 'orgn2',
]
# response vector and design matrix as plain numpy arrays
y = df['mpg_high'].values
X = df[feature_cols].values

In [20]:
#serial computation: 100 random 65/35 train/test splits, fitted one after another
start_time1 = timeit.default_timer()
mse1 = []

for i in range(100):
    # seeds 100..199 -> each split is reproducible and distinct
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.35, random_state=100 + i
    )
    logit = LogisticRegression(solver='lbfgs', max_iter=2000, n_jobs=1)
    logit.fit(X_train, y_train)
    y_pred = logit.predict(X_test)
    # mean squared difference between the test labels and the predictions
    mse1.append(np.mean((y_test - y_pred) ** 2))

avg_mse1 = np.mean(mse1)
elapsed_time1 = timeit.default_timer() - start_time1
print('The average error rate is ', avg_mse1)
print('The computation takes ', elapsed_time1, 'seconds')

The average error rate is  0.10166666666666666
The computation takes  4.764709830007632 seconds


## (b)

In [21]:
# check how many cores are available
# The count is kept in num_cores so the parallel run can pass it as its
# num_workers argument.
num_cores = multiprocessing.cpu_count()
print('The number of available cores is', num_cores)

The number of available cores is 4


In [22]:
#error calculation function
def errorcal(bootstrap_num, seed, X, y):
    """Fit a logistic regression on one random split and return its test error.

    Parameters
    ----------
    bootstrap_num : int
        Index of the current repetition; it is added to ``seed`` so that
        every repetition draws a different split.
    seed : int
        Base value for the split's ``random_state``.
    X : array-like
        Predictor matrix handed to ``train_test_split``.
    y : array-like
        Response vector handed to ``train_test_split``.

    Returns
    -------
    float
        Mean of the squared differences between the held-out labels and the
        model's predictions (35% of the rows are held out).
    """
    split_seed = seed + bootstrap_num
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.35, random_state=split_seed
    )
    model = LogisticRegression(solver='lbfgs', max_iter=2000, n_jobs=1)
    model.fit(train_X, train_y)
    squared_diff = (test_y - model.predict(test_X)) ** 2
    return squared_diff.mean()

#parallel computation: the same 100 splits, run through dask's multiprocessing scheduler
start_time2 = timeit.default_timer()
# one delayed errorcal task per split (seeds 100..199, matching the serial run)
mse2 = [delayed(errorcal)(i, 100, X, y) for i in range(100)]
results_par = compute(*mse2, scheduler=dask.multiprocessing.get, num_workers=num_cores)

avg_mse2 = np.mean(results_par)
elapsed_time2 = timeit.default_timer() - start_time2
print('The average error rate is ', avg_mse2)
print('The computation takes ', elapsed_time2, 'seconds')

The average error rate is  0.10166666666666666
The computation takes  2.2747912220074795 seconds


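As expected, the average error rate is identical in both runs, because the serial and parallel versions use the same random seeds (100–199) for the train/test splits.<br>
The parallel computation is faster: about 2.3 seconds versus about 4.8 seconds for the serial version.<br>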