<a href="https://colab.research.google.com/github/tiafrosty/Project-taya/blob/main/ml_models_CUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Check GPU
!nvidia-smi

Fri Mar 15 11:39:21 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 460, done.[K
remote: Counting objects: 100% (191/191), done.[K
remote: Compressing objects: 100% (100/100), done.[K
remote: Total 460 (delta 131), reused 124 (delta 91), pack-reused 269[K
Receiving objects: 100% (460/460), 126.19 KiB | 2.52 MiB/s, done.
Resolving deltas: 100% (233/233), done.
Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.1/53.1 kB 549.0 kB/s eta 0:00:00
Installing collected packages: pynvml
Successfully installed pynvml-11.5.0
***********************************************************************
Woo! Your instance has a Tesla T4 GPU!
We will install the latest stable RAPIDS via pip 24.2.*!  Please stand by, should be quick...
***********************************************************************

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cudf-cu12==24.2.*
  Downloading https://pypi.nvidi

## Critical Imports

In [3]:
# Critical imports
import cudf
import cuml
import os
import numpy as np
import pandas as pd

In [4]:
# Import
import datetime as dt

# Using a dictionary of key-value pairs
# Each key in the dictionary represents a category
# The key is the category's name
# The value is a list of the values in that category
gdf = cudf.DataFrame({
    # Create 10 busindates ess from 1st January 2019 via pandas
    'dates': pd.date_range('1/1/2019', periods=10, freq='B'),
    # Integers
    'integers': [i for i in range(10)],
    # Floats
    'floats': [float(i) for i in range(10)]
})

# Print dataframe
print(gdf)

       dates  integers  floats
0 2019-01-01         0     0.0
1 2019-01-02         1     1.0
2 2019-01-03         2     2.0
3 2019-01-04         3     3.0
4 2019-01-07         4     4.0
5 2019-01-08         5     5.0
6 2019-01-09         6     6.0
7 2019-01-10         7     7.0
8 2019-01-11         8     8.0
9 2019-01-14         9     9.0


### Create a 2 Column  Dataframe with Pandas Bridge
- Consisting of integers and string category
- For all string columns, you must convert them to type `category` for filtering functions to work intuitively (for now)

In [5]:
from cuml import preprocessing
#from cudf.DataFrame import Series, CategoricalColumn

lab_enc = preprocessing.LabelEncoder()
# Create pandas dataframe
my_data_heart = pd.DataFrame(pd.read_csv("heart_disease.csv").dropna().drop_duplicates()).rename(columns={'HeartDiseaseorAttack': 'target'})

# Bridge from pandas to cudf
heart_data_cuda = cudf.DataFrame.from_pandas(my_data_heart)
heart_data_cuda['target'] =  lab_enc.fit_transform(heart_data_cuda['target'].astype('category'))

# Print dataframe
print(heart_data_cuda.head())

   target  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  Diabetes  \
0       0     1.0       1.0        1.0  40.0     1.0     0.0       0.0   
1       0     0.0       0.0        0.0  25.0     1.0     0.0       0.0   
2       0     1.0       1.0        1.0  28.0     0.0     0.0       0.0   
3       0     1.0       0.0        1.0  27.0     0.0     0.0       0.0   
4       0     1.0       1.0        1.0  24.0     0.0     0.0       0.0   

   PhysActivity  Fruits  ...  AnyHealthcare  NoDocbcCost  GenHlth  MentHlth  \
0           0.0     0.0  ...            1.0          0.0      5.0      18.0   
1           1.0     0.0  ...            0.0          1.0      3.0       0.0   
2           0.0     1.0  ...            1.0          1.0      5.0      30.0   
3           1.0     1.0  ...            1.0          0.0      2.0       0.0   
4           1.0     1.0  ...            1.0          0.0      2.0       3.0   

   PhysHlth  DiffWalk  Sex   Age  Education  Income  
0      15.0       1.0  0.0

In [6]:
# import libs
from cuml import LinearRegression
from cuml import LogisticRegression
from cuml.neighbors import KNeighborsClassifier
from cuml.model_selection import train_test_split
from cuml.ensemble import RandomForestClassifier
from cuml.svm import LinearSVC, SVC

my_models = [

{
        'label': 'Logistic Regression',
        'model': LogisticRegression(),
        'grid_params': None
 },
 {
        'label': 'Elastic net',
        'model': LogisticRegression(penalty= 'elasticnet', l1_ratio= 0.1), # , solver = "newton-cg" #SGDClassifier(loss='log_loss', penalty='elasticnet', max_iter=100000),
        'grid_params':  { 'l1_ratio': np.array([0.4, 0.5,  0.7, 0.9])}
},
{
        'label': 'KNN',
        'model': KNeighborsClassifier(),
        'grid_params':  {'n_neighbors' : [5,7,9,11,13]} #, 'weights' : ['uniform','distance']}
        #'metric' : ['minkowski','euclidean','manhattan']}
},
{
        'label': 'Random Forest',
        'model': RandomForestClassifier(max_features = 'sqrt'),
        'grid_params': {'n_estimators': [50, 100, 200]}
},
{
        # C is the penalty, gamma measures how far away can influencing points be
        'label': 'Linear SVM',
        'model': LinearSVC(C = 1, probability=True),
        'grid_params': None  # ?
},
{
        'label': 'Non-linear SVM',
        'model': SVC(kernel='rbf', probability=True), # can also try poly/sigmoid/etc, rbf is a default one
        # C penalty term, gamma is hwo far the points considered from the hyperplane
        # high gamma: only the close points are considered, low gamma: far away points are considered
        'grid_params':  None # {'C': np.logspace(-2, 5, 4), 'gamma':  np.logspace(-9, 7, 4)}
}
]


cv_score = 5



In [45]:
from time import time
from matplotlib import pyplot as plt
from sklearn import preprocessing
import seaborn as sns
# leave it here for now
from sklearn.pipeline import make_pipeline
# for CV
from cuml.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from cuml.metrics import roc_auc_score
from tqdm import tqdm
from sklearn.model_selection import KFold

def get_auc_for_every_model(N, iris, scale, dataset_name, task, cv_score, my_models):

    """
    Takes the dataset and performs the binary classification of the target column using each of the models in a given list
    for a given number of iterations. Returns the computed AUC scores as a data frame.

    Parameters
    ----------
    N : integer
        The number of iterations
    iris: data frame
        The dataset for which to perform binary classification
    scale: boolean
        Indicates wether or not features need to be scaled
    dataset_name: string
        Name of the dataset
    task: string
        Indicates which type of problem needs to be solved (classification or regression). Currently not used but is planned to use later.
    cv_score: integer
        The value of k in k-fold cross-validation performed for grid search
    my_models: dictionary
        The list of models used for implementing the binary classification

    Returns
    -------
    The table  of size Nxk of obtained AUC scores, where N is the number of iterations and k is the number of models

    """


    # Prepare the data
    y = iris['target']#.astype('category')
    #print(repr(y))
    #y = lab_enc.fit(y.category)
    #y =  lab_enc.fit_transform(y.astype('category'))

    #y = y.category
    X = iris.drop(['target'], axis=1)

    #if dataset_name == 'ptb':
    #    for cur_col in X.columns:
    #        X[cur_col] = lab_enc.fit_transform(X[cur_col].astype('category'))
    # Splitting into train and test
    # take 70% for training
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=1)

    kfold = KFold(cv_score, shuffle=True)#, random_state = i)

    # auc = np.mean(roc_scores)

    # metrics for all models:
    all_rocs = []
    aucs_best = []
    # to keep the  times
    all_times = []
    # best prameters for all models
    best_params_all = []
    for m in my_models:

        model = m['model']  # select the model

        print('\n', m['label'])
        #if m['label'] == 'KNN':
           # aa = 1

        # for models with parameters grid
        params = m['grid_params']

        scaler = preprocessing.MinMaxScaler()

        if params:
                gs = GridSearchCV(model, params, cv=kfold, refit=True, scoring='roc_auc', verbose=1)
                best_par = gs.fit(X_train, y_train).best_params_
                model.set_params(**best_par)

        # create new splits N times and fit the best model
        all_roc_scores = []

        # check time
        t = time()
        #for i in range(N):
        for i in tqdm(range(N)):
            # make a new split
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
            #print(repr(np.int32(y_test.to_numpy())))
            # fit the bext model
            model.fit(X_train, y_train)
            y_pred = model.predict_proba(X_test)
            all_roc_scores.append(roc_auc_score(np.int32(y_test.to_numpy()), y_pred.loc[:,1]))


        all_times.append(round(time() - t, 2))

        all_rocs.append(all_roc_scores)

        #print(f'\n Model {m["label"]} took {time() - t:.2f}s')

        print(f'\n Model {m["label"]} returned average AUC {np.mean(all_roc_scores)}')


    print(all_times)

    return all_rocs

In [None]:
roc_matrix_heart_py = pd.DataFrame(get_auc_for_every_model(1000, heart_data_cuda, False, 'heart','classification', cv_score, my_models)).T


 Logistic Regression


100%|██████████| 1000/1000 [10:47<00:00,  1.54it/s]



 Model Logistic Regression returned average AUC 0.8352643554210663

 Elastic net
Fitting 5 folds for each of 4 candidates, totalling 20 fits


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 365, in _score
    y_type = type_of_target(y)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/multiclass.py", line 309, in type_of_target
    if is_multilabel(y):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/multiclass.py", line 169, in is_multilabel
    y = check_array(y, dtype=None, **check_y_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_array_api.py", line 185, in _as

In [None]:
y_true = np.array([0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0
 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0
 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0])
y_scores = np.array(0.01969006 0.10525063 0.20032453 0.15456406 0.06185784 0.01579279
 0.02999059 0.04098829 0.04746175 0.00354398 0.78122472 0.00962155
 0.02456464 0.02084574 0.27244606 0.07712017 0.02359279 0.0345437
 0.12309532 0.00592701 0.00961389 0.0380129  0.06961793 0.27921737
 0.18281896 0.28277651 0.17133959 0.01117683 0.08931676 0.04848843
 0.14800372 0.06099183 0.173044   0.00682266 0.06297498 0.04969023
 0.52246995 0.19540607 0.02111638 0.01901777 0.09703812 0.3309116
 0.00691807 0.23344814 0.00517426 0.59921565 0.02719512 0.08631413
 0.03849443 0.45848329 0.00227406 0.02160293 0.00491479 0.09999827
 0.08130201 0.041786   0.18373323 0.04680499 0.02104984 0.1083287
 0.0811837  0.13545515 0.02045309 0.0134489  0.00456414 0.00298327
 0.24631903 0.00363187 0.22795318 0.47915525 0.00544977 0.10775565
 0.14714507 0.26219041 0.05862062 0.01453391 0.19939563 0.00330082
 0.21094146 0.6948207  0.41678219 0.14294897 0.06648045 0.01081401
 0.22270945 0.53521056 0.03123727 0.3653997  0.16471238 0.1996704
 0.14752281 0.00789325 0.13020258 0.06018429 0.00655358 0.01339269
 0.39548505 0.01690178 0.36925307 0.06569362 0.33335905 0.08471498
 0.15214815 0.04483338 0.09852019 0.14798682 0.10771556 0.0184673
 0.14529649 0.14093507 0.07962228 0.02626408 0.01707747 0.15703438
 0.01225238 0.00292004 0.32897166 0.14597855 0.24753964 0.07378081
 0.15754571 0.0105005  0.0369974  0.02289437 0.04802689 0.17054505
 0.00970819 0.08758725 0.19502376 0.03167129 0.00298499 0.01418953
 0.1040889  0.28879478 0.0330811  0.03621225 0.01104429 0.00344584
 0.07950025 0.14430524 0.10747202 0.14380328 0.42575422 0.20165155
 0.13087093 0.18221316 0.65649913 0.00174644 0.01033052 0.07245893
 0.00880416 0.15843374 0.02715337 0.27078647 0.00681496 0.0073283
 0.00457253 0.10301986 0.03961895 0.1234459  0.07418219 0.01264281
 0.14582072 0.22271671 0.00919427 0.03192634 0.03867414 0.67363969
 0.11034944 0.34090777 0.17184021 0.06295924 0.01669613 0.22419283
 0.15453999 0.06882599 0.0195292  0.11463929 0.08386991 0.12615673
 0.23651888 0.01740183 0.08399216 0.21741497 0.04615618 0.05473841
 0.15674883 0.35551869 0.06035604 0.08082709 0.01979394 0.45323655
 0.15936047 0.15541761 0.08406561 0.1151297  0.6507331  0.02428335
 0.04435991 0.00293048 0.05021398 0.06217559 0.18220397 0.03250975
 0.10178547 0.241764   0.2985423  0.51891318 0.10485746 0.01623359
 0.10759365 0.00559996 0.17949141 0.06051788 0.02477665 0.50114363
 0.06498163 0.03810284 0.00156348 0.41797527 0.06836412 0.2328272
 0.00783003 0.03566408 0.26289993 0.13644897 0.25659885 0.0342925
 0.08170691 0.40611046 0.00214753 0.00419031 0.07313087 0.02335399
 0.04744363 0.26241401 0.00565014 0.00413257 0.06355943 0.16545354
 0.03877726 0.12745228 0.09914192 0.12584038 0.09540765 0.00613432
 0.02181095 0.17276168 0.08652714 0.07493752 0.06332512 0.04883084
 0.04847295 0.20079582 0.01340627 0.01274294 0.01238442 0.01490144
 0.07003442 0.05561645 0.03539554 0.0405379  0.02899269 0.54356533
 0.04832925 0.02371979 0.01678192 0.02439242 0.05190202 0.50471661
 0.1141741  0.36610394 0.02446177 0.09182516 0.68501276 0.12832503
 0.19115387 0.0400555  0.37017487 0.46819508 0.18005322 0.09022919
 0.00488994 0.0996286  0.04642341 0.07655426 0.27102826 0.02000512
 0.09672521 0.00754883 0.0262928  0.3550636  0.02371733 0.16479679
 0.15333037 0.22734459 0.30203123 0.41306902 0.02391474 0.03273842)
print(type(y_true))
print(type(y_scores))
print(roc_auc_score(y_true, y_scores))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
0.75
