In [1]:
!pip install dask[dataframe] --upgrade --quiet
!pip install dask-ml[complete] --quiet
!pip install aiohttp --quiet
!pip install joblib --quiet
!pip install dask distributed --upgrade --quiet
!pip install -U ipykernel --quiet
!pip install scikit-learn==0.23.2 --quiet

In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import dask.array as da
from dask.distributed import Client, progress
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import roc_auc_score
import joblib
from dask_ml.model_selection import train_test_split
import pandas as pd
import warnings
import dask
import distributed

warnings.filterwarnings("ignore")

In [3]:
client = Client(n_workers=4, threads_per_worker=2, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:39849  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 8.00 GB


In [4]:
dask.__version__

'2021.01.0'

In [5]:
distributed.__version__

'2021.01.0'

In [6]:
# This loads the data into a Dask DataFrame
df = dd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/creditcard.csv', dtype={'Time': 'float64'})

df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
df.describe().compute()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.175161e-15,3.384974e-16,-1.341216e-15,2.088465e-15,9.707851e-16,1.494498e-15,-5.652268e-16,1.143626e-16,-2.409599e-15,2.236957e-15,1.679714e-15,-1.245415e-15,8.206966e-16,1.200708e-15,4.885859e-15,1.437017e-15,-3.784146e-16,9.596083e-16,1.037048e-15,6.402711e-16,1.640595e-16,-3.544643e-16,2.610582e-16,4.473116e-15,5.205196e-16,1.687298e-15,-3.666889e-16,-1.219469e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,1.08885,1.020713,0.9992014,0.9952742,0.9585956,0.915316,0.8762529,0.8493371,0.8381762,0.8140405,0.770925,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,-24.58826,-4.797473,-18.68371,-5.791881,-19.21433,-4.498945,-14.12985,-25.1628,-9.498746,-7.213527,-54.49772,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,49346.0,-0.78686,-0.5536365,-0.7156553,-0.7067841,-0.4093779,-0.6546632,-0.4787135,-0.1339191,-0.5477089,-0.4837668,-0.5417076,-0.2351421,-0.6201901,-0.3439514,-0.3409833,-0.4198368,-0.4071982,-0.4129756,-0.3581993,-0.16865,-0.2256367,-0.525053,-0.1301482,-0.3241623,-0.2218416,-0.2821043,-0.06089022,-0.02762231,6.84,0.0
50%,76029.0,0.09301532,0.08999754,0.1743346,0.1809941,0.1480477,-0.1649056,0.1590763,0.07790153,0.02583745,-0.0915358,0.1428546,0.2220066,0.005421844,0.06496143,0.318552,0.089904,-0.01179887,0.08393824,0.03469481,-0.0251349,0.00663631,0.1241855,0.05160336,0.06794774,0.1674311,-0.001290092,0.0108823,0.0234504,24.99,0.0
75%,138472.0,1.912886,0.8916958,1.373682,1.012508,0.8613822,0.4775488,0.7334868,0.3706835,0.7092123,0.4814288,1.04184,0.6508526,0.6986651,0.5466114,0.8783822,0.5405436,0.4455337,0.5905041,0.4781289,0.1678982,0.2385322,0.7308589,0.2346538,0.5274699,0.4190202,0.293847,0.1060637,0.08061929,84.91,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,23.74514,12.01891,7.848392,7.126883,10.52677,8.877742,17.31511,9.253526,5.041069,5.591971,39.4209,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [8]:
# This is the feature set
X = df[["V1", "V2", "V3", "Amount"]]

# This is the target variable
Y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Because your data can fit into memory,
# persist it to the RAM
X_train.persist()
X_test.persist()
y_train.persist()
y_test.persist()

Dask Series Structure:
npartitions=3
    int64
      ...
      ...
      ...
Name: Class, dtype: int64
Dask Name: split, 3 tasks

In [9]:
rf_model = RandomForestClassifier()

with joblib.parallel_backend('dask'):
    scores = cross_validate(rf_model, X_train.compute(), y_train.compute(), cv=4)
    
scores

{'fit_time': array([21.85528779, 56.67017007, 57.06225014, 54.48151708]),
 'score_time': array([0.12155294, 0.21826673, 0.2184217 , 0.21860576]),
 'test_score': array([0.99756123, 0.99835073, 0.99875428, 0.99854373])}

In [10]:
# Random forest classifier
rf_params = {"max_depth": [2, 4, 8, 16]}

rf_model = RandomForestClassifier()

grid_search_rf = GridSearchCV(rf_model,
                           param_grid=rf_params,
                           return_train_score=True,
                           iid=True,
                           cv=4,
                           n_jobs=-1, 
                           scoring='roc_auc')

In [11]:
with joblib.parallel_backend('dask'):
    grid_search_rf.fit(X_train.compute(), y_train.compute())

In [12]:
print("The best value is: ", grid_search_rf.best_params_)
print("The test AUC score is: ", grid_search_rf.score(X_test.compute(), y_test.compute()))

The best value is:  {'max_depth': 8}
The test AUC score is:  0.919151789203928


In [13]:
from dask_ml.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(X_train.values.compute(), y_train.values.compute())

LogisticRegression()

In [14]:
preds_train = lr.predict(X_train.values.compute())
preds_test = lr.predict(X_test.values.compute())

print("Training score is: ", roc_auc_score(preds_train, y_train.values.compute()))
print("Test score is: ", roc_auc_score(preds_test, y_test.values.compute()))

Training score is:  0.8433109683308756
Test score is:  0.7108254333694475


## In this task, you'll train several machine-learning models from scikit-learn, using Dask as the backend of joblib. This time, you need to use all of the variables except Class as your feature set. The Class variable will be your target variable.

In [15]:
# use all features, but drop target
X = df.drop('Class',axis=1)

# This is the target variable
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Because your data can fit into memory,
# persist it to the RAM
X_train.persist()
X_test.persist()
y_train.persist()
y_test.persist()

Dask Series Structure:
npartitions=3
    int64
      ...
      ...
      ...
Name: Class, dtype: int64
Dask Name: split, 3 tasks

In [18]:
lr = LogisticRegression(fit_intercept=False)

with joblib.parallel_backend('dask'):
    lr.fit(X_train.compute(), y_train.compute())
    
preds_train = lr.predict(X_train.values.compute())
preds_test = lr.predict(X_test.values.compute())

print("Logistic regression training score is: ", roc_auc_score(preds_train, y_train.values.compute()))
print("Logistic regression test score is: ", roc_auc_score(preds_test, y_test.values.compute()))

Logistic regression training score is:  0.8338801703406193
Logistic regression test score is:  0.831027723834703


In [19]:
rfc = RandomForestClassifier()

with joblib.parallel_backend('dask'):
    rfc.fit(X_train.compute(), y_train.compute())
    
preds_train = rfc.predict(X_train.values.compute())
preds_test = rfc.predict(X_test.values.compute())

print("Random forest training score is: ", roc_auc_score(preds_train, y_train.values.compute()))
print("Random forest test score is: ", roc_auc_score(preds_test, y_test.values.compute()))

Random forest training score is:  1.0
Random forest test score is:  0.9850928681501826


In [20]:
gbc = GradientBoostingClassifier()

with joblib.parallel_backend('dask'):
    gbc.fit(X_train.compute(), y_train.compute())
    
preds_train = gbc.predict(X_train.values.compute())
preds_test = gbc.predict(X_test.values.compute())

print("Gradient boosting tree training score is: ", roc_auc_score(preds_train, y_train.values.compute()))
print("Gradient boosting tree test score is: ", roc_auc_score(preds_test, y_test.values.compute()))

Gradient boosting tree training score is:  0.9447045595837774
Gradient boosting tree test score is:  0.9239536850701433


## Compare the results of your models.

The random forest model performs best on the test data, even tough it is over-fitting. Logistic Regression performs the worst of the 3 models. 

It would be interesting to see how these models do with hyperparameter tuning. We used the default parameters for this project. In the future it would be nice to see which tuned model does best. 