In [1]:
!pip install imbalanced-learn

Collecting imbalanced-learn
[?25l  Downloading https://files.pythonhosted.org/packages/e6/62/08c14224a7e242df2cef7b312d2ef821c3931ec9b015ff93bb52ec8a10a3/imbalanced_learn-0.5.0-py3-none-any.whl (173kB)
[K    100% |████████████████████████████████| 174kB 1.8MB/s ta 0:00:01
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.5.0


In [2]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import re
from imblearn.over_sampling import SMOTE

In [3]:
# Directory (relative to the notebook's working directory) that holds both the
# downloaded raw file and the generated CSV.
dataset_dir = '../datasets/breast-cancer'

# exist_ok=True creates the directory tree if needed and is a no-op when it is
# already there, without the check-then-create race of a separate exists() test.
os.makedirs(dataset_dir, exist_ok=True)

In [4]:
%%bash
# Stop at the first failing command: without this, a failed `cd` would let the
# rm/wget below run in whatever directory the notebook was started from.
set -e
cd ../datasets/breast-cancer
# Remove any earlier download first so wget writes to the expected file name
# instead of keeping the stale copy alongside a numbered duplicate.
rm -f breast-cancer-wisconsin.data
wget https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data

--2019-08-10 01:29:30--  https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19889 (19K) [application/x-httpd-php]
Saving to: ‘breast-cancer-wisconsin.data’

     0K .......... .........                                  100%  128K=0.2s

2019-08-10 01:29:31 (128 KB/s) - ‘breast-cancer-wisconsin.data’ saved [19889/19889]



In [5]:
# The raw file has no header row, so the column names are supplied here;
# '?' entries are read as missing values (NaN).
column_names = [
    'id',
    'clump_thickness',
    'uniformity_cell_size',
    'uniformity_cell_shape',
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'class',
]
raw_data_path = os.path.join(dataset_dir, 'breast-cancer-wisconsin.data')
df = pd.read_csv(raw_data_path, names=column_names, header=None, na_values='?')

In [6]:
# Remove the 'id' column. Rebinding the result of drop() (instead of `del`)
# leaves the cell safe to re-run: errors='ignore' makes it a no-op once the
# column is already gone rather than raising KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [7]:
# This SMOTE implementation doesn't work with missing values, so rows with
# any NaN are dropped before resampling.
df = df.dropna()
X = df.drop(columns=['class'])
y = df['class']
# Resample so that classes 2 and 4 end up with 5000 rows each; the fixed
# random_state keeps the synthetic samples reproducible.
sm = SMOTE(sampling_strategy={2: 5000, 4: 5000}, random_state=1)
X_smoted, y_smoted = sm.fit_resample(X, y)
# Rebuild a single frame with the label as the last column. np.asarray plus
# reshape(-1, 1) works whether fit_resample hands back NumPy arrays or pandas
# objects, and avoids hard-coding the 10000-row total.
Xy = np.concatenate([np.asarray(X_smoted), np.asarray(y_smoted).reshape(-1, 1)], axis=1)
# Name the columns from the feature frame itself rather than assuming that
# 'class' happens to be the last column of the input.
df = pd.DataFrame(Xy, columns=list(X.columns) + ['class'])

  n_samples_majority))
  n_samples_majority))


In [8]:
# Shuffle all rows (frac=1 samples the whole frame without replacement).
# A fixed random_state makes the row order, and therefore the written CSV,
# reproducible across runs, matching the seeded SMOTE step.
df = df.sample(frac=1, random_state=1)

In [9]:
# Write the final frame next to the raw download, omitting the row index.
output_csv = os.path.join(dataset_dir, 'breast-cancer.csv')
df.to_csv(output_csv, index=False)