## Assignment 2

### Qian ZHANG

In [1]:
# import some packages

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, mean_squared_error

import timeit
import pandas as pd
import numpy as np

from dask.distributed import Client
from dask import compute, delayed
import dask.multiprocessing
import multiprocessing

import warnings
warnings.filterwarnings("ignore")

In [2]:
# load the data

auto=pd.read_csv('data/Auto.csv',na_values="?")

# drop the missing values

auto.dropna(inplace=True)
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [3]:
# check null values
auto.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [4]:
# create mpg_high

auto['mpg_high']=(auto['mpg']>=auto['mpg'].median()).astype('int')
print('The median of mpg is {}.'.format(auto['mpg'].median()))
auto['mpg_high'].describe()

The median of mpg is 22.75.


count    392.000000
mean       0.500000
std        0.500639
min        0.000000
25%        0.000000
50%        0.500000
75%        1.000000
max        1.000000
Name: mpg_high, dtype: float64

In [5]:
# create orgn1, orgn2

auto['orgn1']=(auto['origin']==1).astype('int')
auto['orgn2']=(auto['origin']==2).astype('int')
print(auto['orgn1'].value_counts())
print(auto['orgn2'].value_counts())

1    245
0    147
Name: orgn1, dtype: int64
0    324
1     68
Name: orgn2, dtype: int64


### Question 1

### (a)

In [6]:
# set X and y

y = auto['mpg_high']
X = auto[['cylinders','displacement','horsepower','weight','acceleration','year','orgn1','orgn2']]

In [7]:
# estimate a logistic on 100 bootstrapped train sets and predict

auto_err = np.zeros(100)
random_seed=np.random.choice(range(10000),100,replace=False)

start_time = timeit.default_timer()

for i,seed in enumerate(random_seed):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.65, random_state=seed)

    logit_train = LogisticRegression(n_jobs=None).fit(X_train, y_train)
    
    y_pred = logit_train.predict(X_test)
    
    auto_err[i] = 1-precision_score(y_test, y_pred)
    
    # print error rate for each bootstrap
    print('Error rate for bootstrap {} (seed={}) is {}%, '.format(i+1, seed, round(auto_err[i]*100,2)))
          
running_time = timeit.default_timer() - start_time

print("\nAverage error rate: ", round(np.mean(auto_err)*100,2),"%")
print("Running Time is about ", running_time, "seconds")

Error rate for bootstrap 1 (seed=8886) is 4.05%, 
Error rate for bootstrap 2 (seed=4089) is 7.58%, 
Error rate for bootstrap 3 (seed=1332) is 13.7%, 
Error rate for bootstrap 4 (seed=4797) is 8.24%, 
Error rate for bootstrap 5 (seed=9252) is 17.72%, 
Error rate for bootstrap 6 (seed=6514) is 12.5%, 
Error rate for bootstrap 7 (seed=4209) is 12.33%, 
Error rate for bootstrap 8 (seed=3428) is 15.79%, 
Error rate for bootstrap 9 (seed=1652) is 12.7%, 
Error rate for bootstrap 10 (seed=4223) is 11.11%, 
Error rate for bootstrap 11 (seed=9547) is 8.45%, 
Error rate for bootstrap 12 (seed=5004) is 10.61%, 
Error rate for bootstrap 13 (seed=6145) is 8.47%, 
Error rate for bootstrap 14 (seed=1081) is 11.11%, 
Error rate for bootstrap 15 (seed=5224) is 12.33%, 
Error rate for bootstrap 16 (seed=1598) is 11.86%, 
Error rate for bootstrap 17 (seed=9273) is 7.69%, 
Error rate for bootstrap 18 (seed=853) is 5.0%, 
Error rate for bootstrap 19 (seed=5649) is 14.29%, 
Error rate for bootstrap 20 (seed

### (b)

In [8]:
# check the number of cores

num_cores = multiprocessing.cpu_count()
print('Number of available cores is', num_cores)

Number of available cores is 16


In [9]:
# define the function

start_time2 = timeit.default_timer()

def func(boot_num, ran_seed_set, x_data, y_data):
    
    X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, train_size = 0.65, random_state=ran_seed_set[boot_num])
    
    logit_train = LogisticRegression(n_jobs=1).fit(X_train, y_train)
    
    y_pred = logit_train.predict(X_test)
        
    return (1-precision_score(y_test, y_pred))

# do the paralleling

auto_err2 = np.zeros(100)

lazy_values = []

for i in range(100):
    lazy_values.append(delayed(func)(i,random_seed,X,y))

auto_err2 = compute(*lazy_values, scheduler=dask.multiprocessing.get, num_workers=num_cores)

running_time2 = timeit.default_timer() - start_time2

# print the error rate for each bootstraps in order

for i in range(100):
    print('Error rate for bootstrap {} (seed={}) is {}%, '.format(i+1, random_seed[i], round(auto_err2[i]*100,2)))

print("\nAverage error rate: ", round(np.mean(auto_err2)*100,2),"%")
print("Running Time is about ", running_time2, "seconds")

Error rate for bootstrap 1 (seed=8886) is 4.05%, 
Error rate for bootstrap 2 (seed=4089) is 7.58%, 
Error rate for bootstrap 3 (seed=1332) is 13.7%, 
Error rate for bootstrap 4 (seed=4797) is 8.24%, 
Error rate for bootstrap 5 (seed=9252) is 17.72%, 
Error rate for bootstrap 6 (seed=6514) is 12.5%, 
Error rate for bootstrap 7 (seed=4209) is 12.33%, 
Error rate for bootstrap 8 (seed=3428) is 15.79%, 
Error rate for bootstrap 9 (seed=1652) is 12.7%, 
Error rate for bootstrap 10 (seed=4223) is 11.11%, 
Error rate for bootstrap 11 (seed=9547) is 8.45%, 
Error rate for bootstrap 12 (seed=5004) is 10.61%, 
Error rate for bootstrap 13 (seed=6145) is 8.47%, 
Error rate for bootstrap 14 (seed=1081) is 11.11%, 
Error rate for bootstrap 15 (seed=5224) is 12.33%, 
Error rate for bootstrap 16 (seed=1598) is 11.86%, 
Error rate for bootstrap 17 (seed=9273) is 7.69%, 
Error rate for bootstrap 18 (seed=853) is 5.0%, 
Error rate for bootstrap 19 (seed=5649) is 14.29%, 
Error rate for bootstrap 20 (seed