<a href="https://colab.research.google.com/github/tiafrosty/Exercises-all/blob/main/ml_models_CUDA_NEW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Check GPU
!nvidia-smi

Tue Mar 19 19:14:18 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# This gets the RAPIDS-Colab install files and checks your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 460, done.[K
remote: Counting objects: 100% (191/191), done.[K
remote: Compressing objects: 100% (100/100), done.[K
remote: Total 460 (delta 131), reused 124 (delta 91), pack-reused 269[K
Receiving objects: 100% (460/460), 126.19 KiB | 3.32 MiB/s, done.
Resolving deltas: 100% (233/233), done.
Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.1/53.1 kB 1.2 MB/s eta 0:00:00
Installing collected packages: pynvml
Successfully installed pynvml-11.5.0
***********************************************************************
Woo! Your instance has a Tesla T4 GPU!
We will install the latest stable RAPIDS via pip 24.2.*!  Please stand by, should be quick...
***********************************************************************

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cudf-cu12==24.2.*
  Downloading https://pypi.nvidia.

## Critical Imports

In [3]:
# Critical imports
import cudf
import cuml
import os
import numpy as np
import pandas as pd

In [4]:
# Import
import datetime as dt

# cudf.DataFrame is built here from a dictionary:
# every key names a column and every value supplies that column's entries
gdf = cudf.DataFrame({
    # 10 business dates starting on 1 January 2019, generated with pandas
    'dates': pd.date_range('1/1/2019', periods=10, freq='B'),
    # The integers 0..9
    'integers': list(range(10)),
    # The same values as floats
    'floats': list(map(float, range(10))),
})

# Show the resulting frame
print(gdf)

       dates  integers  floats
0 2019-01-01         0     0.0
1 2019-01-02         1     1.0
2 2019-01-03         2     2.0
3 2019-01-04         3     3.0
4 2019-01-07         4     4.0
5 2019-01-08         5     5.0
6 2019-01-09         6     6.0
7 2019-01-10         7     7.0
8 2019-01-11         8     8.0
9 2019-01-14         9     9.0


### Create a 2 Column  Dataframe with Pandas Bridge
- Consisting of integers and string category
- For all string columns, you must convert them to type `category` for filtering functions to work intuitively (for now)

In [10]:
from cuml import preprocessing
#from cudf.DataFrame import Series, CategoricalColumn

# Encoder used to turn the outcome labels into integer codes
lab_enc = preprocessing.LabelEncoder()

# Read the CSV with pandas, discard incomplete and repeated rows,
# and give the outcome column the generic name 'target'
my_data_heart = (
    pd.read_csv("heart_disease.csv")
    .dropna()
    .drop_duplicates()
    .rename(columns={'HeartDiseaseorAttack': 'target'})
)

# Hand the pandas frame over to cuDF, then encode the outcome column
heart_data_cuda = cudf.DataFrame.from_pandas(my_data_heart)
heart_data_cuda['target'] = lab_enc.fit_transform(
    heart_data_cuda['target'].astype('category')
)

# Preview the first rows
print(heart_data_cuda.head())

   target  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  Diabetes  \
0       0     1.0       1.0        1.0  40.0     1.0     0.0       0.0   
1       0     0.0       0.0        0.0  25.0     1.0     0.0       0.0   
2       0     1.0       1.0        1.0  28.0     0.0     0.0       0.0   
3       0     1.0       0.0        1.0  27.0     0.0     0.0       0.0   
4       0     1.0       1.0        1.0  24.0     0.0     0.0       0.0   

   PhysActivity  Fruits  ...  AnyHealthcare  NoDocbcCost  GenHlth  MentHlth  \
0           0.0     0.0  ...            1.0          0.0      5.0      18.0   
1           1.0     0.0  ...            0.0          1.0      3.0       0.0   
2           0.0     1.0  ...            1.0          1.0      5.0      30.0   
3           1.0     1.0  ...            1.0          0.0      2.0       0.0   
4           1.0     1.0  ...            1.0          0.0      2.0       3.0   

   PhysHlth  DiffWalk  Sex   Age  Education  Income  
0      15.0       1.0  0.0

In [6]:
# import libs
from cuml import LinearRegression
from cuml import LogisticRegression
from cuml.neighbors import KNeighborsClassifier
from cuml.model_selection import train_test_split
from cuml.ensemble import RandomForestClassifier
from cuml.svm import LinearSVC, SVC

# Candidate classifiers. Every entry carries
#   label       - name printed while the experiment runs
#   model       - the estimator instance
#   grid_params - hyper-parameter grid, or None when nothing is tuned
my_models = [
    dict(
        label='Logistic Regression',
        model=LogisticRegression(),
        grid_params=None,
    ),
    dict(
        label='Elastic net',
        model=LogisticRegression(penalty='elasticnet', l1_ratio=0.1),
        grid_params=dict(l1_ratio=np.array([0.4, 0.5, 0.7, 0.9])),
    ),
    dict(
        label='KNN',
        model=KNeighborsClassifier(),
        # only the neighbourhood size is searched; weights/metric keep their defaults
        grid_params=dict(n_neighbors=[5, 7, 9, 11, 13]),
    ),
    dict(
        label='Random Forest',
        model=RandomForestClassifier(max_features='sqrt'),
        grid_params=dict(n_estimators=[50, 100, 200]),
    ),
    dict(
        # C is the penalty term
        label='Linear SVM',
        model=LinearSVC(C=1, probability=True),
        grid_params=None,  # not tuned for now
    ),
    dict(
        label='Non-linear SVM',
        # rbf is the default kernel; poly/sigmoid/etc. could be tried as well
        model=SVC(kernel='rbf', probability=True),
        # C: penalty term. gamma: how far from the hyperplane points are still
        # taken into account (high gamma -> only close points, low gamma ->
        # far away points as well)
        grid_params=dict(C=np.logspace(-2, 5, 4), gamma=np.logspace(-9, 7, 4)),
    ),
]

# Number of folds for the cross-validated grid search
cv_score = 5



In [25]:
from time import time
from matplotlib import pyplot as plt
from sklearn import preprocessing
import seaborn as sns
# leave it here for now
from sklearn.pipeline import make_pipeline
# for CV
from cuml.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from cuml.metrics import roc_auc_score
from tqdm import tqdm
from sklearn.model_selection import KFold

def _as_host_arrays(features, labels, float_dtype, int_dtype):
    """Copy cuDF features/labels to NumPy arrays cast to the requested dtypes."""
    return (features.to_numpy().astype(float_dtype),
            labels.to_numpy().astype(int_dtype))


def _positive_class_scores(proba):
    """Return column 1 of a ``predict_proba`` result (frame-like or array-like)."""
    if hasattr(proba, 'iloc'):
        return proba.iloc[:, 1]
    return proba[:, 1]


def get_auc_for_every_model(N, iris, scale, dataset_name, task, cv_score, my_models):

    """
    Run binary classification of the ``target`` column with every model in
    ``my_models`` and collect the test AUC over ``N`` random train/test splits.

    For each model that has a parameter grid, a grid search (k-fold CV, scored
    by ROC AUC) is first run on the training part of one fixed 70/30 split and
    the best parameters are set on the model. The model is then re-fitted on
    ``N`` fresh 70/30 splits (``random_state = 0 .. N-1``) and scored on the
    held-out 30 % each time.

    Parameters
    ----------
    N : integer
        The number of repeated train/test splits per model
    iris: data frame
        The dataset (cuDF frame) containing the features and a ``target`` column
    scale: boolean
        If True, features are min-max scaled; the scaler is fitted on the
        training part of each split and applied to both parts
    dataset_name: string
        Name of the dataset. For ``'ptb'`` every feature column is label-encoded
        before modelling
    task: string
        Type of problem (classification or regression). Currently not used.
    cv_score: integer
        The value of k in the k-fold cross-validation performed for grid search
    my_models: list of dictionaries
        One dictionary per model with the keys ``'label'``, ``'model'`` and
        ``'grid_params'`` (a parameter grid or None)

    Returns
    -------
    A list with one entry per model (same order as ``my_models``); each entry is
    the list of the N AUC scores obtained for that model.

    Notes
    -----
    The estimators stored in ``my_models`` are modified in place: the best grid
    parameters are set on them and they are re-fitted on every split. The
    wall-clock time of the N fits per model is printed at the end.
    """

    # Local imports: different cells bind the bare name `preprocessing` to
    # cuml's and to sklearn's module, so pin the exact classes needed here
    # instead of depending on which cell ran last.
    from cuml.preprocessing import LabelEncoder
    from sklearn.preprocessing import MinMaxScaler

    test_fraction = 0.3
    tuning_seed = 1
    # These models are fitted on float64/int64 data, all others on float32/int32
    float64_models = ('Logistic Regression', 'Elastic net')

    # Prepare the data: integer-encode the outcome and separate the features
    y = LabelEncoder().fit_transform(iris['target'].astype('category'))
    X = iris.drop(['target'], axis=1)

    if dataset_name == 'ptb':
        # every feature of this dataset is encoded as integer codes
        for cur_col in X.columns:
            X[cur_col] = LabelEncoder().fit_transform(X[cur_col].astype('category'))

    # AUC scores and timings for all models
    all_rocs = []
    all_times = []

    for m in my_models:

        model = m['model']  # select the model
        print('\n', m['label'])

        if m['label'] in float64_models:
            float_dtype, int_dtype = np.float64, np.int64
        else:
            float_dtype, int_dtype = np.float32, np.int32

        # Hyper-parameter search, only for models that come with a grid
        params = m['grid_params']
        if params:
            X_train, _, y_train, _ = train_test_split(
                X, y, test_size=test_fraction, random_state=tuning_seed)
            X_fit, y_fit = _as_host_arrays(X_train, y_train, float_dtype, int_dtype)
            if scale:
                # NOTE: fitted on the whole tuning split, not separately per CV fold
                X_fit = MinMaxScaler().fit_transform(X_fit)

            # seeded so that repeated runs select the same folds
            kfold = KFold(cv_score, shuffle=True, random_state=tuning_seed)
            gs = GridSearchCV(model, params, cv=kfold, refit=True,
                              scoring='roc_auc', verbose=1)
            model.set_params(**gs.fit(X_fit, y_fit).best_params_)

        # Create new splits N times and fit the (tuned) model on each of them
        all_roc_scores = []
        t = time()
        for i in tqdm(range(N)):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_fraction, random_state=i)

            # Train and test features go through the same conversion, so the
            # model is scored on the dtype it was fitted with
            X_fit, y_fit = _as_host_arrays(X_train, y_train, float_dtype, int_dtype)
            X_eval, y_eval = _as_host_arrays(X_test, y_test, float_dtype, np.int32)

            if scale:
                scaler = MinMaxScaler().fit(X_fit)
                X_fit = scaler.transform(X_fit)
                X_eval = scaler.transform(X_eval)

            model.fit(X_fit, y_fit)
            proba = model.predict_proba(X_eval)
            all_roc_scores.append(roc_auc_score(y_eval, _positive_class_scores(proba)))

        all_times.append(round(time() - t, 2))
        all_rocs.append(all_roc_scores)

        print(f'\n Model {m["label"]} returned average AUC {np.mean(all_roc_scores)}')

    print(all_times)

    return all_rocs

In [None]:
# 100 repeated splits on the heart data; after the transpose each column holds one model's scores
roc_matrix_heart_py = pd.DataFrame(
    get_auc_for_every_model(
        N=100,
        iris=heart_data_cuda,
        scale=False,
        dataset_name='heart',
        task='classification',
        cv_score=cv_score,
        my_models=my_models,
    )
).T


 Logistic Regression


100%|██████████| 100/100 [01:11<00:00,  1.39it/s]



 Model Logistic Regression returned average AUC 0.8352989447116852

 Elastic net
Fitting 5 folds for each of 4 candidates, totalling 20 fits


100%|██████████| 100/100 [01:11<00:00,  1.39it/s]



 Model Elastic net returned average AUC 0.8353117674589157

 KNN
[I] [20:51:35.947328] Unused keyword parameter: n_jobs during cuML estimator initialization
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[I] [20:51:35.960440] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [20:51:36.911528] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [20:51:37.160865] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [20:51:37.408746] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [20:51:37.653716] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [20:51:37.908752] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [20:51:38.154577] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [20:51:38.406938] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [20:51:38.665777] Unused keyword parameter

100%|██████████| 100/100 [02:06<00:00,  1.27s/it]



 Model KNN returned average AUC 0.7546395087242126

 Random Forest
Fitting 5 folds for each of 3 candidates, totalling 15 fits


100%|██████████| 100/100 [04:59<00:00,  2.99s/it]



 Model Random Forest returned average AUC 0.8284091311693191

 Linear SVM


  8%|▊         | 8/100 [00:01<00:20,  4.54it/s]

[W] [20:59:16.571136] L-BFGS stopped, because the line search failed to advance (step delta = 0.000000)


100%|██████████| 100/100 [00:21<00:00,  4.58it/s]



 Model Linear SVM returned average AUC 0.833774374127388

 Non-linear SVM
Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [8]:
# Read the CSV with pandas, discard incomplete and repeated rows,
# and give the outcome column the generic name 'target'
pre_term_data = (
    pd.read_csv("whole_cleaned_data_RAB_2019DEC23.csv")
    .dropna()
    .drop_duplicates()
    .rename(columns={'out': 'target'})
)

# Keep only the rows whose parity category is 'Nuliparous'
pre_term_data_nuli = pre_term_data[pre_term_data['parity_cat'] == 'Nuliparous']

# Outcome plus the selected predictor columns
ptb_new_features = pre_term_data_nuli[[
    'target', 'matage_cat', 'pre.ext.h_cat', 'gender',
    'prev_abortion_cat',
    'infection_cat', 'height_cat',
    'alcohl_cat', 'diabetes_cat', 'drug_cat',
    'hyper_cat', 'breastfeed_cat', 'BMI.cat', 'health_con_cat', 'smok_ft',
    'folik_cat', 'incom_cat', 'educ_cat', 'minorit_cat',
    'imq_cat', 'resid_smoker_cat', 'conception_cat',
    'medical_exp_cat', 'PAPP_mom_cat',
    'WG_FT1',
]]

# Hand the pandas frame over to cuDF, then encode the outcome column
ptb_data_cuda = cudf.DataFrame.from_pandas(ptb_new_features)
ptb_data_cuda['target'] = lab_enc.fit_transform(
    ptb_data_cuda['target'].astype('category')
)
print(ptb_data_cuda.shape)

(115658, 25)


In [None]:
# 100 repeated splits on the ptb data; after the transpose each column holds one model's scores
roc_matrix_ptb_py = pd.DataFrame(
    get_auc_for_every_model(
        N=100,
        iris=ptb_data_cuda,
        scale=False,
        dataset_name='ptb',
        task='classification',
        cv_score=5,
        my_models=my_models,
    )
).T


 Logistic Regression

 Elastic net

 KNN

 Random Forest

 Linear SVM


100%|██████████| 100/100 [00:08<00:00, 12.24it/s]



 Model Linear SVM returned average AUC 0.59300113260746

 Non-linear SVM
Fitting 5 folds for each of 16 candidates, totalling 80 fits


ValueError: 
All the 80 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/cuml/internals/api_decorators.py", line 188, in wrapper
    ret = func(*args, **kwargs)
  File "svc.pyx", line 485, in cuml.svm.svc.SVC.fit
  File "/usr/local/lib/python3.10/dist-packages/cuml/internals/api_decorators.py", line 188, in wrapper
    ret = func(*args, **kwargs)
  File "svc.pyx", line 463, in cuml.svm.svc.SVC._fit_proba
  File "svc.pyx", line 464, in cuml.svm.svc.SVC._fit_proba
  File "/usr/local/lib/python3.10/dist-packages/sklearn/calibration.py", line 395, in fit
    self.calibrated_classifiers_ = parallel(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
  File "/usr/local/lib/python3.10/dist-packages/joblib/parallel.py", line 1863, in __call__
    return output if self.return_generator else list(output)
  File "/usr/local/lib/python3.10/dist-packages/joblib/parallel.py", line 1792, in _get_sequential_output
    res = func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py", line 123, in __call__
    return self.function(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/calibration.py", line 577, in _fit_classifier_calibrator_pair
    estimator.fit(X_train, y_train, **fit_params_train)
  File "/usr/local/lib/python3.10/dist-packages/cuml/internals/api_decorators.py", line 188, in wrapper
    ret = func(*args, **kwargs)
  File "svc.pyx", line 571, in cuml.svm.svc.SVC.fit
TypeError: Input data type should be float32 or float64
