In [1]:
!pip install git+https://github.com/pycaret/pycaret.git@master --upgrade

Collecting git+https://github.com/pycaret/pycaret.git@master
  Cloning https://github.com/pycaret/pycaret.git (to revision master) to /tmp/pip-req-build-fgtufeur
  Running command git clone --filter=blob:none --quiet https://github.com/pycaret/pycaret.git /tmp/pip-req-build-fgtufeur
  Resolved https://github.com/pycaret/pycaret.git to commit 58ec3c282d58e94727f9d5b77b49f241e9103ab3
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


#Data Preparation

In [2]:
from pycaret.datasets import get_data
dataset = get_data('mice')

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class
0,309_1,0.503644,0.747193,0.430175,2.816329,5.990152,0.21883,0.177565,2.373744,0.232224,...,0.108336,0.427099,0.114783,0.13179,0.128186,1.675652,Control,Memantine,C/S,c-CS-m
1,309_2,0.514617,0.689064,0.41177,2.789514,5.685038,0.211636,0.172817,2.29215,0.226972,...,0.104315,0.441581,0.111974,0.135103,0.131119,1.74361,Control,Memantine,C/S,c-CS-m
2,309_3,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,...,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,Control,Memantine,C/S,c-CS-m
3,309_4,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,...,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,Control,Memantine,C/S,c-CS-m
4,309_5,0.43494,0.61743,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,...,0.110694,0.434154,0.118481,0.140314,0.14838,1.83973,Control,Memantine,C/S,c-CS-m


In [3]:
dataset.shape

(1080, 82)

In [4]:
data = dataset.sample(frac=0.95, random_state=786)
data_unseen = dataset.drop(data.index)

data.reset_index(drop=True, inplace=True)
data_unseen.reset_index(drop=True, inplace=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (1026, 82)
Unseen Data For Predictions: (54, 82)


#Model Training

In [5]:
from pycaret.anomaly import *

exp_ano101 = setup(data, normalize = True,
                   ignore_features = ['MouseID'],
                   session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Original data shape,"(1026, 82)"
2,Transformed data shape,"(1026, 88)"
3,Ignore features,1
4,Numeric features,77
5,Categorical features,4
6,Rows with missing values,48.5%
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


In [9]:
# List all available models
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
abod,Angle-base Outlier Detection,pyod.models.abod.ABOD
cluster,Clustering-Based Local Outlier,pycaret.internal.patches.pyod.CBLOFForceToDouble
cof,Connectivity-Based Local Outlier,pyod.models.cof.COF
iforest,Isolation Forest,pyod.models.iforest.IForest
histogram,Histogram-based Outlier Detection,pyod.models.hbos.HBOS
knn,K-Nearest Neighbors Detector,pyod.models.knn.KNN
lof,Local Outlier Factor,pyod.models.lof.LOF
svm,One-class SVM detector,pyod.models.ocsvm.OCSVM
pca,Principal Component Analysis,pyod.models.pca.PCA
mcd,Minimum Covariance Determinant,pyod.models.mcd.MCD


In [8]:
# `fraction` parameter determines the proportion of outliers in the dataset
iforest = create_model('iforest', fraction = 0.05)
print(iforest) # contamination should equal to fraction

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

IForest(behaviour='new', bootstrap=False, contamination=0.05,
    max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=-1,
    random_state=123, verbose=0)


In [10]:
# Use isolation forest to detect anomaly
iforest_results = assign_model(iforest)
iforest_results.head()

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class,Anomaly,Anomaly_Score
0,0.34493,0.626194,0.383583,2.534561,4.097317,0.303547,0.222829,4.59277,0.239427,1.360164,...,0.2527,0.218868,0.249187,1.139493,Ts65Dn,Memantine,S/C,t-SC-m,0,-0.035529
1,0.630001,0.839187,0.357777,2.651229,4.261675,0.253184,0.185257,3.816673,0.20494,1.716583,...,0.155008,0.153219,,1.642886,Control,Memantine,C/S,c-CS-m,0,-0.078359
2,0.555122,0.726229,0.278319,2.097249,2.897552,0.222222,0.174356,1.86788,0.203379,1.610137,...,0.136109,0.15553,0.185484,1.65767,Ts65Dn,Memantine,C/S,t-CS-m,0,-0.061515
3,0.275849,0.430764,0.285166,2.265254,3.250091,0.189258,0.157837,2.91761,0.202594,1.734746,...,0.127944,0.207671,0.175357,0.893598,Control,Saline,S/C,c-SC-s,0,-0.074478
4,0.304788,0.617299,0.335164,2.638236,4.876609,0.28059,0.199417,4.835421,0.236314,1.226532,...,0.245277,0.202171,0.240372,0.795637,Ts65Dn,Memantine,S/C,t-SC-m,0,-0.077693


In [11]:
# Plot the datapoints, T-distributed Stochastic Neighbor Embedding
plot_model(iforest, plot = 'tsne')

In [14]:
plot_model(iforest, plot = 'umap')

TypeError: check_array() got an unexpected keyword argument 'ensure_all_finite'

#Predict on unseen data

In [18]:
unseen_predictions = predict_model(iforest, data=data_unseen.drop('MouseID', axis=1))
unseen_predictions.head()

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,class_t-SC-m,class_c-CS-m,class_t-CS-m,class_c-SC-s,class_c-SC-m,class_t-SC-s,class_c-CS-s,class_t-CS-s,Anomaly,Anomaly_Score
0,0.093986,0.048182,0.972051,0.257353,1.045721,-0.341448,-0.197942,-1.076573,-0.537373,0.033333,...,-0.36907,2.495066,-0.380906,-0.380906,-0.400791,-0.380906,-0.379226,-0.32862,0,-0.078627
1,1.144847,0.746841,0.62303,0.494295,1.846065,-0.66067,-0.611462,-1.102764,-1.265247,1.104284,...,-0.36907,2.495066,-0.380906,-0.380906,-0.400791,-0.380906,-0.379226,-0.32862,0,-0.069881
2,0.329338,0.318143,1.146602,1.79164,2.245345,-0.14503,-0.270556,-0.674679,-0.69496,0.233266,...,-0.36907,2.495066,-0.380906,-0.380906,-0.400791,-0.380906,-0.379226,-0.32862,0,-0.039648
3,0.018893,-0.212273,-1.228987,-0.779094,-0.321112,-0.449881,-0.178477,-0.287926,-1.386747,-0.451378,...,-0.36907,2.495066,-0.380906,-0.380906,-0.400791,-0.380906,-0.379226,-0.32862,0,-0.088104
4,-0.207867,-0.580953,-1.239478,-1.266102,-0.973721,-0.349054,-1.154566,-0.715432,-2.14745,-0.877701,...,-0.36907,2.495066,-0.380906,-0.380906,-0.400791,-0.380906,-0.379226,-0.32862,1,0.015706


In [16]:
# Save model
save_model(iforest,'iforest_anomalyDetection')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['DYRK1A_N', 'ITSN1_N', 'BDNF_N',
                                              'NR1_N', 'NR2A_N', 'pAKT_N',
                                              'pBRAF_N', 'pCAMKII_N', 'pCREB_N',
                                              'pELK_N', 'pERK_N', 'pJNK_N',
                                              'PKCA_N', 'pMEK_N', 'pNR1_N',
                                              'pNR2A_N', 'pNR2B_N', 'pPKCAB_N',
                                              'pRSK_N', 'AKT_N', 'BRAF_N',
                                              'CAMKII_N', 'CREB_N', 'ELK_N',
                                              'ERK_N', 'GSK3B_N', 'JNK_N',
                                              'ME...
                  TransformerWrapper(include=['class'],
                                     transformer=OneHotEncoder(cols=['class'],
                                       