# Problem Set 2
## Yongfei Lu

## Problem 1 (a)

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the Auto data set; '?' marks a missing entry, and incomplete rows are discarded
df = pd.read_csv('data/Auto.csv', na_values=['?']).dropna()

# Rows whose mpg is at or above the sample median
above_median = df['mpg'] >= np.median(df['mpg'])

# Add the binary response mpg_high and the origin indicators orgn1, orgn2
df = df.assign(
    mpg_high=np.where(above_median, 1, 0),
    orgn1=df['origin'].eq(1).astype('int'),
    orgn2=df['origin'].eq(2).astype('int'),
)

df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,mpg_high,orgn1,orgn2
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,0,1,0
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,0,1,0
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,0,1,0
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,0,1,0
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,0,1,0


In [3]:
# Predictors (X) and binary response (y) used by the logistic regressions below
feature_cols = [
    'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'year', 'orgn1', 'orgn2',
]
X = df[feature_cols]
y = df['mpg_high']

In [4]:
# Serial computation
# The 100 distinct split seeds are drawn from a generator with a fixed seed,
# so a fresh Restart & Run All draws the same seeds and reproduces the same
# train/test splits (and therefore the same error rates).
seed_rng = np.random.RandomState(0)
seeds = seed_rng.choice(1000, 100, replace=False)
err_rate_seri = np.zeros(100)

start_time_ser = time.perf_counter()

for i, seed in enumerate(seeds):

    # 65% of the rows are used for training; the rest are held out for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.65, random_state=seed)

    results = LogisticRegression(n_jobs=None).fit(X_train, y_train)

    y_pred = results.predict(X_test)

    # with 0/1 labels the mean squared difference is the share of misclassified rows
    err_rate_seri[i] = ((y_test - y_pred)**2).mean()

    # print error rate for each bootstrap
    print('Error rate for bootstrap {} (seed: {}) is {}%, '.format(i+1, seed, round(err_rate_seri[i]*100,2)))

end_time_ser = time.perf_counter()
print("The average error rate from the serial computation is around {:.2f}%".format(100 * err_rate_seri.mean()))
print("Running time of the computation is around {:.2f} s".format(end_time_ser - start_time_ser))

Error rate for bootstrap 1 (seed: 796) is 13.04%, 
Error rate for bootstrap 2 (seed: 448) is 10.87%, 
Error rate for bootstrap 3 (seed: 219) is 9.42%, 
Error rate for bootstrap 4 (seed: 604) is 11.59%, 
Error rate for bootstrap 5 (seed: 203) is 12.32%, 
Error rate for bootstrap 6 (seed: 325) is 7.25%, 
Error rate for bootstrap 7 (seed: 870) is 7.25%, 
Error rate for bootstrap 8 (seed: 840) is 7.97%, 
Error rate for bootstrap 9 (seed: 377) is 8.7%, 
Error rate for bootstrap 10 (seed: 152) is 8.7%, 
Error rate for bootstrap 11 (seed: 80) is 9.42%, 
Error rate for bootstrap 12 (seed: 823) is 11.59%, 
Error rate for bootstrap 13 (seed: 562) is 10.14%, 
Error rate for bootstrap 14 (seed: 580) is 9.42%, 
Error rate for bootstrap 15 (seed: 883) is 12.32%, 
Error rate for bootstrap 16 (seed: 292) is 7.25%, 
Error rate for bootstrap 17 (seed: 905) is 11.59%, 
Error rate for bootstrap 18 (seed: 240) is 6.52%, 
Error rate for bootstrap 19 (seed: 57) is 13.77%, 
Error rate for bootstrap 20 (seed: 

## Problem 1 (b)

In [5]:
from dask.distributed import Client
from dask import compute, delayed
import dask.multiprocessing
import multiprocessing


In [11]:
# create the required function
def calculate_err_rate(boot_num, seed, X, y):
    """Fit a logistic regression on one random split and return its test error rate.

    Parameters
    ----------
    boot_num : label of the resample; accepted but not used in the computation.
    seed : value passed as ``random_state`` to ``train_test_split``.
    X : predictors.
    y : response.

    Returns
    -------
    The mean of the squared differences between the held-out responses and
    the model's predictions for them.
    """
    # hold out 35% of the rows for testing; the split is determined by `seed`
    split = train_test_split(X, y, test_size=0.35, random_state=seed)
    X_train, X_test, y_train, y_test = split

    model = LogisticRegression(n_jobs=1)
    fitted = model.fit(X_train, y_train)
    predictions = fitted.predict(X_test)

    squared_diff = (y_test - predictions)**2
    return squared_diff.mean()

In [12]:
# Use Dask to parallelize these bootstraps.
start_time_2 = time.perf_counter()

# one lazy task per split, reusing the seeds drawn for the serial run
lazy_vals = [delayed(calculate_err_rate)(i + 1, seeds[i], X, y) for i in range(100)]

# compute() hands back a tuple of results; keep them as an array so this
# variable has the same type as the serial cell's err_rate_seri
err_rate_seri_2 = np.array(compute(*lazy_vals, scheduler=dask.multiprocessing.get, num_workers=4))
end_time_2 = time.perf_counter()


for i in range(100):
    print('Error rate for bootstrap {} (seed: {}) is around {}%, '.format(i+1, seeds[i], round(err_rate_seri_2[i]*100,2)))

print("Average error rate: ", round(np.mean(err_rate_seri_2)*100,2),"%")
print("The running time for the computation is around {:.2f} s".format(end_time_2 - start_time_2))

Error rate for bootstrap 1 (seed: 796) is aound 13.04%, 
Error rate for bootstrap 2 (seed: 448) is aound 10.87%, 
Error rate for bootstrap 3 (seed: 219) is aound 9.42%, 
Error rate for bootstrap 4 (seed: 604) is aound 11.59%, 
Error rate for bootstrap 5 (seed: 203) is aound 12.32%, 
Error rate for bootstrap 6 (seed: 325) is aound 7.25%, 
Error rate for bootstrap 7 (seed: 870) is aound 7.25%, 
Error rate for bootstrap 8 (seed: 840) is aound 7.97%, 
Error rate for bootstrap 9 (seed: 377) is aound 8.7%, 
Error rate for bootstrap 10 (seed: 152) is aound 8.7%, 
Error rate for bootstrap 11 (seed: 80) is aound 9.42%, 
Error rate for bootstrap 12 (seed: 823) is aound 11.59%, 
Error rate for bootstrap 13 (seed: 562) is aound 10.14%, 
Error rate for bootstrap 14 (seed: 580) is aound 9.42%, 
Error rate for bootstrap 15 (seed: 883) is aound 12.32%, 
Error rate for bootstrap 16 (seed: 292) is aound 7.25%, 
Error rate for bootstrap 17 (seed: 905) is aound 11.59%, 
Error rate for bootstrap 18 (seed: 