In [1]:
%load_ext autoreload
%autoreload 2

In [14]:
from tqdm.notebook import tqdm

from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from concurrent.futures import ProcessPoolExecutor, as_completed

from Const import *
from helper_code import *
from extract_features import *

In [3]:
def extract_features_and_labels(data_folder, verbose):
    # Find data files.
    if verbose >= 1:
        tqdm.write('Finding data files...')

    # Find the patient data files.
    patient_files = find_patient_files(data_folder)
    num_patient_files = len(patient_files)

    if num_patient_files==0:
        raise Exception('No data was provided.')

    # Extract the features and labels.
    if verbose >= 1:
        tqdm.write('Extracting features and labels from the Challenge data...')

    murmur_classes = ['Present', 'Unknown', 'Absent']
    num_murmur_classes = len(murmur_classes)
    outcome_classes = ['Abnormal', 'Normal']
    num_outcome_classes = len(outcome_classes)

    features = list()
    murmurs = list()
    outcomes = list()

    # Create a executor with 4 workers
    executor = ProcessPoolExecutor(max_workers=4)
    inputs = []

    for i in tqdm(range(num_patient_files)):
        if verbose >= 2:
            tqdm.write('    {}/{}...'.format(i+1, num_patient_files))

        # Load the current patient data and recordings.
        current_patient_data = load_patient_data(patient_files[i])
        current_recordings = load_recordings(data_folder, current_patient_data)

        # Extract features.
        inputs.append((current_patient_data, current_recordings))
        # current_features = get_features(current_patient_data, current_recordings)
        # features.append(current_features)

        # Extract labels and use one-hot encoding.
        current_murmur = np.zeros(num_murmur_classes, dtype=int)
        murmur = get_murmur(current_patient_data)
        if murmur in murmur_classes:
            j = murmur_classes.index(murmur)
            current_murmur[j] = 1
        murmurs.append(current_murmur)

        current_outcome = np.zeros(num_outcome_classes, dtype=int)
        outcome = get_outcome(current_patient_data)
        if outcome in outcome_classes:
            j = outcome_classes.index(outcome)
            current_outcome[j] = 1
        outcomes.append(current_outcome)

    futures = [executor.submit(get_features, input[0], input[1]) for input in inputs]
    for future in as_completed(futures):
        current_features = future.result()
        features.append(current_features)
        
    features = np.vstack(features)
    murmurs = np.vstack(murmurs)
    outcomes = np.vstack(outcomes)
    
    return (features, murmurs, outcomes)

In [4]:
data_folder = "C:/Users/lumin/Desktop/Work/20212/Data/circor-heart-sound/final/train"
verbose = 4

features, murmurs, outcomes = extract_features_and_labels(data_folder, verbose)

Finding data files...
Extracting features and labels from the Challenge data...


  0%|          | 0/743 [00:00<?, ?it/s]

    1/743...
    2/743...
    3/743...
    4/743...
    5/743...
    6/743...
    7/743...
    8/743...
    9/743...
    10/743...
    11/743...
    12/743...
    13/743...
    14/743...
    15/743...
    16/743...
    17/743...
    18/743...
    19/743...
    20/743...
    21/743...
    22/743...
    23/743...
    24/743...
    25/743...
    26/743...
    27/743...
    28/743...
    29/743...
    30/743...
    31/743...
    32/743...
    33/743...
    34/743...
    35/743...
    36/743...
    37/743...
    38/743...
    39/743...
    40/743...
    41/743...
    42/743...
    43/743...
    44/743...
    45/743...
    46/743...
    47/743...
    48/743...
    49/743...
    50/743...
    51/743...
    52/743...
    53/743...
    54/743...
    55/743...
    56/743...
    57/743...
    58/743...
    59/743...
    60/743...
    61/743...
    62/743...
    63/743...
    64/743...
    65/743...
    66/743...
    67/743...
    68/743...
    69/743...
    70/743...
    71/743...
    72/743...
 

In [5]:
imputer = SimpleImputer().fit(features)
features = imputer.transform(features)
murmur_classifier = RandomForestClassifier(random_state=RANDOM_STATE)
outcome_classifier = RandomForestClassifier(random_state=RANDOM_STATE)

In [6]:
# Define parameters for tuning
random_grid = {
    'n_estimators' : [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_features' : ['auto', 'sqrt'],
    'max_depth' : [int(x) for x in np.linspace(10, 110, num = 11)],
    'min_samples_split' : [2,5,10], 
    'min_samples_leaf' : [1,2,4], 
    'bootstrap' : [True, False], 
}

### Randomized Search

In [7]:
murmur_random = RandomizedSearchCV(murmur_classifier, param_distributions=random_grid, n_iter=100, cv=5, verbose=verbose, random_state=42, n_jobs = -1)
outcome_random = RandomizedSearchCV(outcome_classifier, param_distributions=random_grid, n_iter=100, cv=5, verbose=verbose, random_state=42, n_jobs = -1)

In [8]:
murmur_random.fit(features, murmurs)
outcome_random.fit(features, outcomes)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


175 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dis

Fitting 5 folds for each of 100 candidates, totalling 500 fits


175 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dis

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=6789),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['gini', 'entropy',
                                                      'log_loss'],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=4)

In [9]:
print(murmur_random.best_estimator_)
print(murmur_random.best_score_)

RandomForestClassifier(criterion='entropy', max_depth=80, max_features='sqrt',
                       min_samples_leaf=2, min_samples_split=10,
                       n_estimators=1200, random_state=6789)
0.7133593324868492


In [10]:
print(outcome_random.best_estimator_)
print(outcome_random.best_score_)

RandomForestClassifier(criterion='entropy', max_depth=70, min_samples_split=5,
                       n_estimators=800, random_state=6789)
0.5343551605296571


## GridSearch 

In [12]:
# Define parameters for tuning
murmur_param_grid = {
    'n_estimators' : [800, 1000, 1200, 1400],
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_features' : ['auto', 'sqrt'],
    'max_depth' : [60,70,80,90],
    'min_samples_split' : [8,10,12], 
    'min_samples_leaf' : [1,2,3], 
    'bootstrap' : [True, False], 
}

outcome_param_grid = {
    'n_estimators' : [800, 900, 700],
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_features' : ['auto', 'sqrt'],
    'max_depth' : [70,80,90, 60],
    'min_samples_split' : [4,5,7], 
    'min_samples_leaf' : [4,5,6], 
    'bootstrap' : [True, False], 
}

In [15]:
murmur_grid = GridSearchCV(murmur_classifier, param_grid=murmur_param_grid, cv=5, verbose=verbose, n_jobs = -1)
outcome_grid = GridSearchCV(outcome_classifier, param_grid=outcome_param_grid, cv=5, verbose=verbose, n_jobs = -1)

In [16]:
murmur_grid.fit(features, murmurs)
outcome_grid.fit(features, outcomes)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


2880 fits failed out of a total of 8640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1440 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self.

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


2160 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1080 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "c:\Users\lumin\anaconda3\envs\py38\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self.

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=6789),
             n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [70, 80, 90, 60],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [4, 5, 6],
                         'min_samples_split': [4, 5, 7],
                         'n_estimators': [800, 900, 700]},
             verbose=4)

In [17]:
print(murmur_random.best_estimator_)
print(murmur_random.best_score_)

RandomForestClassifier(criterion='entropy', max_depth=80, max_features='sqrt',
                       min_samples_leaf=2, min_samples_split=10,
                       n_estimators=1200, random_state=6789)
0.7133593324868492


In [18]:
print(outcome_grid.best_estimator_)
print(outcome_grid.best_score_)

RandomForestClassifier(max_depth=70, min_samples_leaf=4, min_samples_split=4,
                       n_estimators=700, random_state=6789)
0.5343642300018139


In [None]:
kjj