In [1]:
from setup_general import *

# Rebalancing

In this notebook we focus on rebalancing our data, as it is originally very imbalanced. First, we used RandomOverSampler, which randomly duplicates existing instances. Then we also used the SMOTE algorithm, which synthesizes new instances that are close to existing instances in the dataset. A final section applies RandomUnderSampler to the majority class.
In the end, new .csv files were created containing the rebalanced data, to be used in other notebooks.

In [2]:
# Work on a copy so the prepared training frame itself is not modified here.
data = train_est_prepared.copy()

# Split into the feature frame and the categorical target column.
x = data.drop(columns=['type'])
y = data['type'].astype("category")

# Encode the string labels as integer codes (original note: ROS cannot deal
# with string labels). The fitted encoder stays available as `label_encoder`.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

In [3]:
# Build a per-class target-count dict for a resampler: every class gets at
# least `min_samples` samples.
def by_num(data, min_samples):
    """Return target sample counts that raise small classes to a minimum size.

    Parameters
    ----------
    data : iterable of hashable
        Class labels, one entry per sample (e.g. the encoded target vector).
    min_samples : int
        Desired minimum number of samples per class. It is capped at the size
        of the largest class, so no class is asked to exceed the current
        maximum.

    Returns
    -------
    dict
        Maps each class label to its target count, in first-seen order.
        Classes already at or above the (capped) minimum keep their count.
        Empty input yields an empty dict.
    """
    # Count the labels that were actually passed in, not a notebook global.
    class_counts = Counter(data)
    if not class_counts:
        return {}

    # Never request more samples than the largest class currently has.
    floor = min(min_samples, max(class_counts.values()))

    return {label: max(count, floor) for label, count in class_counts.items()}

# Build a per-class target-count dict for a resampler: every class grows by a
# given percentage, capped at the size of the largest class.
def by_perc(data, increase_perc):
    """Return target sample counts that grow each class by a percentage.

    Parameters
    ----------
    data : iterable of hashable
        Class labels, one entry per sample (e.g. the encoded target vector).
    increase_perc : int or float
        Percentage by which every class count is increased; 100 doubles it.

    Returns
    -------
    dict
        Maps each class label to ``int(count * (1 + increase_perc / 100))``,
        limited to the size of the largest class, in first-seen order.
        Empty input yields an empty dict.
    """
    # Count the labels that were actually passed in, not a notebook global.
    class_counts = Counter(data)
    if not class_counts:
        return {}

    # The largest class is the ceiling for every target count.
    max_samples = max(class_counts.values())

    return {
        label: min(int(count * (1 + increase_perc / 100)), max_samples)
        for label, count in class_counts.items()
    }

# SMOTE

In [4]:
from imblearn.over_sampling import SMOTE

# `sampling_strategy` is passed by keyword: imbalanced-learn deprecated
# positional constructor arguments and later releases reject them.
# NOTE(review): 'minority' resamples only the minority class; confirm that is
# the intent for this multi-class target.
balancer = SMOTE(sampling_strategy='minority', k_neighbors=3, random_state=1)
X_resSMOTE, y_resSMOTE = balancer.fit_resample(x, y)

# Re-attach the resampled (still integer-encoded) labels to the features.
dataSMOTE = X_resSMOTE.copy()
dataSMOTE['type'] = y_resSMOTE



# save

In [13]:
# Replace the values in 'type' that match type_lookup.id with the corresponding
# type_lookup.english entries.
# NOTE(review): this assumes type_lookup.id lines up with the integer codes
# produced by label_encoder — confirm against where type_lookup is built.
dataSMOTE['type'] = dataSMOTE['type'].replace(type_lookup.id.to_list(), type_lookup.english.to_list())
# Copy the row index into an 'id' column and make it the index, so the CSV
# gets an 'id' index column.
dataSMOTE['id'] = dataSMOTE.index
dataSMOTE.set_index('id', inplace=True)
# Write the SMOTE-rebalanced training data, including the 'id' index.
dataSMOTE.to_csv('data/prepared_ready/train_est_smote_03.csv', index=True)

In [9]:
# Class distribution of the original (un-resampled) data.
data['type'].value_counts()

photo                                           2680
photo negative                                  1516
design/drawing/sketch                            567
plan                                             558
archaeological find                              549
poster                                           537
letter                                           434
book                                             366
document                                         325
manuscript                                       316
graphics                                         163
music sheet                                      149
script, song/vocal music                         141
coin                                             140
seal/imprint                                     133
digital image                                    130
postcard                                         108
small print                                       80
magazines                                     

# ROS (RandomOverSampler)


In [15]:
from imblearn.over_sampling import RandomOverSampler

# Per-class targets come from by_perc(y, 100): each class count is doubled,
# capped at the size of the largest class.
balancer = RandomOverSampler(random_state=0, sampling_strategy=by_perc(y, 100))
X_resSMOTE, y_resSMOTE = balancer.fit_resample(x, y)

# Build a fresh frame of the resampled features plus their encoded labels.
dataSMOTE = X_resSMOTE.assign(type=y_resSMOTE)

[2680, 1516, 537, 149, 61, 316, 558, 434, 549, 71, 80, 29, 163, 41, 140, 15, 133, 366, 325, 130, 567, 63, 141, 58, 13, 18, 42, 54, 108, 46, 17, 21, 22, 33, 4, 33, 17, 31, 15, 14, 29, 10, 9, 9, 10, 12, 14, 20, 12, 11, 16, 8, 5, 13, 12] 2680


In [16]:
# Per-class sample counts after resampling (labels are still integer codes).
dataSMOTE.value_counts(subset='type')

type
37    2680
36    2680
11    1134
41    1116
1     1098
43    1074
22     868
4      732
14     650
26     632
18     326
30     298
45     282
8      280
48     266
12     260
42     216
51     160
25     142
19     126
32     122
29     116
2      108
54      92
50      84
27      82
21      66
5       66
39      62
16      58
38      58
23      44
31      42
6       40
35      36
47      34
40      34
9       32
44      30
52      30
13      28
10      28
49      26
17      26
0       24
20      24
28      24
33      22
3       20
53      20
46      18
34      18
7       16
24      10
15       8
dtype: int64

In [5]:
# Replace the values in 'type' that match type_lookup.id with the corresponding
# type_lookup.english entries.
# NOTE(review): this assumes type_lookup.id lines up with the integer codes
# produced by label_encoder — confirm against where type_lookup is built.
dataSMOTE['type'] = dataSMOTE['type'].replace(type_lookup.id.to_list(), type_lookup.english.to_list())
# Copy the row index into an 'id' column and make it the index, so the CSV
# gets an 'id' index column.
dataSMOTE['id'] = dataSMOTE.index
dataSMOTE.set_index('id', inplace=True)
# The random-oversampling result gets its own file. This cell previously wrote
# to 'train_est_smote_03.csv' and silently overwrote the SMOTE export.
dataSMOTE.to_csv('data/prepared_ready/train_est_ros_03.csv', index=True)

# RUS (RandomUnderSampler)

In [3]:
from imblearn.under_sampling import RandomUnderSampler

# `sampling_strategy` is passed by keyword: imbalanced-learn deprecated
# positional constructor arguments and later releases reject them.
# 'majority' under-samples only the majority class.
balancer = RandomUnderSampler(sampling_strategy='majority', random_state=0)
X_resSMOTE, y_resSMOTE = balancer.fit_resample(x, y)

# Re-attach the resampled (still integer-encoded) labels to the features.
dataSMOTE = X_resSMOTE.copy()
dataSMOTE['type'] = y_resSMOTE



In [None]:
# Replace the values in 'type' that match type_lookup.id with the corresponding
# type_lookup.english entries.
# NOTE(review): this assumes type_lookup.id lines up with the integer codes
# produced by label_encoder — confirm against where type_lookup is built.
dataSMOTE['type'] = dataSMOTE['type'].replace(type_lookup.id.to_list(), type_lookup.english.to_list())
# Copy the row index into an 'id' column and make it the index, so the CSV
# gets an 'id' index column.
dataSMOTE['id'] = dataSMOTE.index
dataSMOTE.set_index('id', inplace=True)
# The random-undersampling result gets its own file. This cell previously wrote
# to 'train_est_smote_03.csv' and silently overwrote the SMOTE export.
dataSMOTE.to_csv('data/prepared_ready/train_est_rus_03.csv', index=True)