In [1]:
from collections import Counter

import pandas as pd
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
from imblearn.pipeline import make_pipeline

# Single seed value reused as `random_state` by the cells below.
seed = 66

In [2]:
df = pd.read_csv(
    'ionosphere.data',
    header=None,  # treat the first line as data; columns get integer labels
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,b
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g


In [3]:
# Features are every column except the last; the last column is the class
# label ('g' / 'b' in the preview above). Slicing relative to the end avoids
# hard-coding the column count, so the label can never leak into X if the
# file's width differs from the expected 35 columns.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [4]:
# Class distribution of the full dataset as (label, count) pairs, ordered by label.
label_counts = Counter(y)
print(sorted(label_counts.items()))

[('b', 126), ('g', 225)]


In [5]:
# Hold out 33% of the rows for evaluation; the shared seed fixes the shuffle.
# NOTE(review): the split is not stratified -- consider `stratify=y` given the
# uneven class counts (would change the recorded outputs below).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [6]:
# Class distribution of the training split as (label, count) pairs, ordered by label.
train_label_counts = Counter(y_train)
print(sorted(train_label_counts.items()))

[('b', 88), ('g', 147)]


In [7]:
# Baseline: random forest fitted on the original (non-resampled) training
# split, then scored on the held-out test split.
rf = RandomForestClassifier(random_state=seed)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9396551724137931

In [8]:
# Oversample with ADASYN using the training split only; X_test / y_test are
# not passed to the sampler, so the later score is measured on untouched data.
# ADASYN accepts a `random_state`; pass the notebook's shared seed (as every
# other stochastic step here does) so the resampled set, and the score in the
# next cell, are reproducible on a fresh kernel.
sampler = ADASYN(random_state=seed)
X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

# Class distribution after resampling, as (label, count) pairs ordered by label.
print(sorted(Counter(y_resampled).items()))

[('b', 146), ('g', 147)]


In [9]:
# Same classifier configuration as the baseline, but fitted on the
# ADASYN-resampled training data; scored on the same held-out test split.
rf = RandomForestClassifier(random_state=seed)
rf.fit(X_resampled, y_resampled)
rf.score(X_test, y_test)

0.9568965517241379

In [10]:
# Pipeline: SMOTEENN resampling followed by a random forest, both seeded.
model = make_pipeline(
    SMOTEENN(random_state=seed), RandomForestClassifier(random_state=seed)
)

# 10-fold cross-validation of the whole pipeline on the full dataset, using
# all available cores; train scores and fitted estimators are kept as well.
cv_results = cross_validate(
    model,
    X,
    y,
    cv=10,
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

# Summarise the per-fold test scores.
test_scores = cv_results['test_score']
print(f"Accuracy mean +/- std. dev.: {test_scores.mean():.2f} +/- {test_scores.std():.2f}")

Accuracy mean +/- std. dev.: 0.93 +/- 0.05
