In [1]:
import numpy as np

In [2]:
# Archive holding the full feature array under key 'X' and the labels under key 'y'.
np_array_store_file = '/home/valentin-rexer/uni/UofM/datascience/datasets/len_500_data.npz'

# np.load returns a lazily-read NpzFile for .npz archives; the context manager
# closes its file handle once both arrays have been pulled into memory.
with np.load(np_array_store_file) as data:
    X = data['X']
    y = data['y']

print(X.shape, y.shape)

(267286, 1000, 6) (267286,)


In [3]:
from sklearn.model_selection import train_test_split

def make_split_for_binary_class(X, y, kept_label, out_path, train_split_size=0.9, random_state=42, stratify=False):
    """Create a one-vs-rest train/test split for `kept_label` and save it as an .npz archive.

    Every entry of `y` that differs from `kept_label` is replaced by 0; entries
    equal to `kept_label` keep their original value. The relabelled data is then
    split with `train_test_split` and written to
    `{out_path}/dataset_label_{kept_label}.npz` under the keys
    'X_train', 'X_test', 'y_train' and 'y_test'.

    Parameters
    ----------
    X : array-like
        Samples, passed to `train_test_split` unchanged.
    y : array-like
        Labels aligned with `X`.
    kept_label
        The label that is preserved; all other labels collapse to 0.
    out_path : str
        Directory receiving the archive. It is created if it does not exist.
    train_split_size : float or int, default 0.9
        Forwarded as `train_size` to `train_test_split`.
    random_state : int, default 42
        Seed forwarded to `train_test_split`.
    stratify : bool, default False
        When True, the split is stratified on the relabelled targets so both
        partitions keep the same class ratio. The default keeps the original,
        non-stratified behaviour.

    Returns
    -------
    str
        Path of the archive that was written.
    """
    import os  # local import: the notebook's import cells do not bring `os` into scope

    y_modified = np.where(y != kept_label, 0, y)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y_modified,
        train_size=train_split_size,
        random_state=random_state,
        stratify=y_modified if stratify else None,
    )

    # Make sure the target directory exists so the save cannot fail after the
    # (expensive) split has already been computed.
    if out_path:
        os.makedirs(out_path, exist_ok=True)

    out_file = f'{out_path}/dataset_label_{kept_label}.npz'

    # np.savez creates (or overwrites) the file itself, so no empty placeholder
    # file has to be written beforehand.
    np.savez(out_file, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

    return out_file

In [4]:
# Directory passed as `out_path` to make_split_for_binary_class in the following cell;
# the per-label split archives are written there.
data_path = '/home/valentin-rexer/uni/UofM/datascience/data_RFs'

In [5]:
# Labels 1..7 each get their own one-vs-rest dataset.
class_labels = list(range(1, 8))
class_paths = []

for cl in class_labels:
    # Build and persist the split for this label, remembering where it was stored.
    split_file = make_split_for_binary_class(X, y, cl, data_path)
    class_paths.append(split_file)
    print(f'processed label {cl}')

processed label 1
processed label 2
processed label 3
processed label 4
processed label 5
processed label 6
processed label 7


In [16]:
# NOTE(review): `data_path` is bound to the output *directory* in an earlier cell and
# is rebound to a single archive here. Re-running the split-generation cell after
# this one would pass a file path as `out_path` — consider a distinct name.
data_path = '/home/valentin-rexer/uni/UofM/datascience/data_RFs/dataset_label_3.npz'

# The context manager closes the NpzFile handle once all four arrays are in memory.
with np.load(data_path) as data:
    X_train = data['X_train']
    y_train = data['y_train']

    X_test = data['X_test']
    y_test = data['y_test']

# Collapse all trailing axes so every sample becomes one flat feature vector.
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

# Sanity check: both partitions should contain only 0 and the kept label.
print(np.unique(y_train))
print(np.unique(y_test))

(240557, 6000)
(240557,)
(26729, 6000)
(26729,)
[0 3]
[0 3]


In [2]:
from sklearn.ensemble import RandomForestClassifier
import joblib
import numpy as np

def train_new_RFC(out_path, input_data, name,  n_estimators, random_state=42, n_jobs=None):
    """Fit a RandomForestClassifier on the training split of an .npz archive and save it.

    Reads 'X_train' and 'y_train' from `input_data`, flattens every sample to a
    single feature vector, prints both shapes, fits the forest and dumps it with
    joblib to `{out_path}/{name}.joblib`.

    Parameters
    ----------
    out_path : str
        Directory receiving the serialized model. It is created if it does not exist.
    input_data : str
        Path of an .npz archive containing the keys 'X_train' and 'y_train'.
    name : str
        File stem of the saved model (without the '.joblib' extension).
    n_estimators : int
        Number of trees, forwarded to RandomForestClassifier.
    random_state : int, default 42
        Seed forwarded to RandomForestClassifier.
    n_jobs : int or None, default None
        Forwarded to RandomForestClassifier to fit trees in parallel
        (-1 uses all cores). The default keeps the original behaviour.

    Returns
    -------
    str
        Path of the saved model file.
    """
    import os  # local import: the notebook's import cells do not bring `os` into scope

    model_path = f"{out_path}/{name}.joblib"

    # Create the output directory up front so a missing folder is not discovered
    # only after the long-running fit has finished.
    if out_path:
        os.makedirs(out_path, exist_ok=True)

    # Close the NpzFile handle as soon as the two arrays have been read.
    with np.load(input_data) as data:
        X_train = data['X_train']
        y_train = data['y_train']

    # Collapse all trailing axes: one flat feature vector per sample.
    X_train = X_train.reshape(X_train.shape[0], -1)

    print(X_train.shape)
    print(y_train.shape)

    rfc = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state, n_jobs=n_jobs)
    rfc.fit(X_train, y_train)

    joblib.dump(rfc, model_path)

    return model_path

In [3]:
import joblib
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, matthews_corrcoef
import numpy as np

def eval_RFC(model_path, data_path):
    """Load a saved classifier and print its metrics on the test split of an .npz archive.

    Prints, in order: the model's `classes_`, the shapes of the flattened test
    features and of the test labels, accuracy, the classification report, the
    Matthews correlation coefficient and the confusion matrix.

    A warning is issued when the test labels contain values the model was not
    trained on, which indicates that the model and the dataset belong to
    different one-vs-rest problems.

    Parameters
    ----------
    model_path : str
        Path of a joblib-serialized classifier.
    data_path : str
        Path of an .npz archive containing the keys 'X_test' and 'y_test'.

    Returns
    -------
    None
    """
    import warnings  # local import: not provided by the notebook's import cells

    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files from trusted sources.
    rfc_predictor = joblib.load(model_path)
    print(rfc_predictor.classes_)

    # Close the NpzFile handle as soon as the two arrays have been read.
    with np.load(data_path) as data:
        X_test = data['X_test']
        y_test = data['y_test']

    # Collapse all trailing axes: one flat feature vector per sample.
    X_test = X_test.reshape(X_test.shape[0], -1)

    print(X_test.shape)
    print(y_test.shape)

    # Pairing a model with the wrong dataset still "works" but yields
    # meaningless metrics, so surface the mismatch explicitly.
    unknown_labels = np.setdiff1d(np.unique(y_test), rfc_predictor.classes_)
    if unknown_labels.size:
        warnings.warn(
            f"y_test contains labels {unknown_labels.tolist()} that are not in the "
            f"model's classes {list(rfc_predictor.classes_)}; model and dataset "
            "probably do not belong together."
        )

    preds = rfc_predictor.predict(X_test)

    print(accuracy_score(y_test, preds), '\n')
    print(classification_report(y_test, preds), '\n')
    print(matthews_corrcoef(y_test, preds), '\n')
    print(confusion_matrix(y_test, preds))

In [4]:
# Folder that collects one serialized forest per label.
out_folder = '/home/valentin-rexer/uni/UofM/datascience/models_RF'

# Train one 300-tree forest per label (1..7), each on its own one-vs-rest archive.
for i in range(1, 8):
    print(f'Processing label {i}')
    data_path = f'/home/valentin-rexer/uni/UofM/datascience/data_RFs/dataset_label_{i}.npz'
    rf = train_new_RFC(
        out_path=out_folder,
        input_data=data_path,
        name=f'RF{i}',
        n_estimators=300,
    )

Processing label 1
(240557, 6000)
(240557,)
Processing label 2
(240557, 6000)
(240557,)
Processing label 3
(240557, 6000)
(240557,)
Processing label 4
(240557, 6000)
(240557,)
Processing label 5
(240557, 6000)
(240557,)
Processing label 6
(240557, 6000)
(240557,)
Processing label 7
(240557, 6000)
(240557,)


In [7]:
# Score every per-label forest against the test split of its matching archive.
for i in range(1, 8):
    path_to_model = f'/home/valentin-rexer/uni/UofM/datascience/models_RF/RF{i}.joblib'
    data_path = f'/home/valentin-rexer/uni/UofM/datascience/data_RFs/dataset_label_{i}.npz'
    eval_RFC(model_path=path_to_model, data_path=data_path)

[0 1]
(26729, 6000)
(26729,)
0.9462381682816416 

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     23357
           1       1.00      0.57      0.73      3372

    accuracy                           0.95     26729
   macro avg       0.97      0.79      0.85     26729
weighted avg       0.95      0.95      0.94     26729
 

0.7352447525679979 

[[23357     0]
 [ 1437  1935]]
[0 2]
(26729, 6000)
(26729,)
0.8919899734370909 

              precision    recall  f1-score   support

           0       0.86      1.00      0.92     17420
           2       1.00      0.69      0.82      9309

    accuracy                           0.89     26729
   macro avg       0.93      0.85      0.87     26729
weighted avg       0.91      0.89      0.89     26729
 

0.7687910074489007 

[[17401    19]
 [ 2868  6441]]
[0 3]
(26729, 6000)
(26729,)
0.916532605035729 

              precision    recall  f1-score   support

           0       0.90      1.00 

[0 1]
(26729, 6000)
(26729,)
0.8746679636350032 

              precision    recall  f1-score   support

           0       0.94      0.92      0.93     25314
           1       0.00      0.00      0.00         0
           7       0.00      0.00      0.00      1415

    accuracy                           0.87     26729
   macro avg       0.31      0.31      0.31     26729
weighted avg       0.89      0.87      0.88     26729
 

-0.033024406592619196 

[[23379  1935     0]
 [    0     0     0]
 [ 1415     0     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
