In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [20]:
# loading data
train_data = np.load('train_data.npz')

# train data
train_features = train_data['features']
train_domains = train_data['domains']
train_digits = train_data['digits']

test_data = np.load('test_data.npz')

# test data
test_features = test_data['features']
test_domains = test_data['domains']
test_digits = test_data['digits']

domain_tags = np.unique(train_domains)

# specific label train data
tag_train_features_dic = {}
tag_train_domains_dic = {}
tag_train_digits_dic = {}

for tag in domain_tags:
    tag_mask = train_domains == tag
    
    tag_train_features = train_features[tag_mask]
    tag_train_domains = train_domains[tag_mask]
    tag_train_digits = train_digits[tag_mask]
    
    tag_train_features_dic[tag] = tag_train_features
    tag_train_domains_dic[tag] = tag_train_domains
    tag_train_digits_dic[tag] = tag_train_digits

# specific label test data
tag_test_features_dic = {}
tag_test_domains_dic = {}
tag_test_digits_dic = {}

    
for tag in domain_tags:
    tag_mask = test_domains == tag
        
    tag_test_features = test_features[tag_mask]
    tag_test_domains = test_domains[tag_mask]
    tag_test_digits = test_digits[tag_mask]
    
    tag_test_features_dic[tag] = tag_test_features
    tag_test_domains_dic[tag] = tag_test_domains
    tag_test_digits_dic[tag] = tag_test_digits
    
domain_clf = model = pickle.load(open('trained_models/domain_clf_200_estimators_30_min_samples_split.pickle', 'rb'))


In [27]:
# testing accuracy with different mixes of domains
mix_ratios = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

for tag in domain_tags:
    print(f'class {tag}:')
    print('---------')
    for mix_ratio in mix_ratios:
        # mixing data
        print(f'mix ratio = {mix_ratio}')
        
        enhanced_tag_train_features = tag_train_features_dic[tag]
        enhanced_tag_train_digits = tag_train_digits_dic[tag]

        other_tags = domain_tags
        other_tags = np.delete(domain_tags, domain_tags == tag)

        for other_tag in other_tags:
            mix_from_features = tag_train_features_dic[other_tag]
            mix_from_digits = tag_train_digits_dic[other_tag]

            mix_indices = np.random.choice(mix_from_features.shape[0], int(mix_ratio * len(mix_from_features)))
            mix_features = mix_from_features[mix_indices]
            mix_digits = mix_from_digits[mix_indices]

            enhanced_tag_train_features = \
                np.concatenate([enhanced_tag_train_features, mix_features])
            enhanced_tag_train_digits = \
                np.concatenate([enhanced_tag_train_digits, mix_digits])
            
        # training digits classifier on each enhanced domain
        n_estimators = 200
        min_samples_split = 20

        enhanced_domain_digit_clf = RandomForestClassifier(n_estimators=n_estimators, min_samples_split=min_samples_split, random_state=123)
        enhanced_domain_digit_clf.fit(enhanced_tag_train_features, enhanced_tag_train_digits)

        ### testing accuracy ###

        # accuracy on train data (split based on true domain)
        tag_train_features = tag_train_features_dic[tag]
        tag_train_digits = tag_train_digits_dic[tag]

        tag_train_digits_predict = enhanced_domain_digit_clf.predict(tag_train_features)
        score = f1_score(tag_train_digits, tag_train_digits_predict, average='weighted')
        print(f'class {tag}: f1_score on train data (split based on true domain) = {score}')

        # accuracy on train data (split based on predicted domain)
        train_domains_predict = domain_clf.predict(train_features)
        tag_train_features = train_features[train_domains_predict == tag]
        tag_train_digits = train_digits[train_domains_predict == tag]

        tag_train_digits_predict = enhanced_domain_digit_clf.predict(tag_train_features)
        score = f1_score(tag_train_digits, tag_train_digits_predict, average='weighted')
        print(f'class {tag}: f1_score on train data (split based on predicted domain) = {score}')


        # accuracy on test data (split based on true domain)
        tag_test_features = tag_test_features_dic[tag]
        tag_test_digits = tag_test_digits_dic[tag]

        tag_test_digits_predict = enhanced_domain_digit_clf.predict(tag_test_features)
        score = f1_score(tag_test_digits, tag_test_digits_predict, average='weighted')
        print(f'class {tag}: f1_score on test data (split based on true domain) = {score}')

        # accuracy on test data (split based on predicted domain)
        test_domains_predict = domain_clf.predict(test_features)
        tag_test_features = test_features[test_domains_predict == tag]
        tag_test_digits = test_digits[test_domains_predict == tag]

        tag_test_digits_predict = enhanced_domain_digit_clf.predict(tag_test_features)
        score = f1_score(tag_test_digits, tag_test_digits_predict, average='weighted')
        print(f'class {tag}: f1_score on test data (split based on predicted domain) = {score}')
    print()

class 0:
---------
mix ratio = 0
class 0: f1_score on train data (split based on true domain) = 0.9987995651372393
class 0: f1_score on train data (split based on predicted domain) = 0.9967633711534225
class 0: f1_score on test data (split based on true domain) = 0.968148350995694
class 0: f1_score on test data (split based on predicted domain) = 0.9660559432944722
mix ratio = 0.1
class 0: f1_score on train data (split based on true domain) = 0.9989997971526636
class 0: f1_score on train data (split based on predicted domain) = 0.997484018275675
class 0: f1_score on test data (split based on true domain) = 0.9683540864283469
class 0: f1_score on test data (split based on predicted domain) = 0.9668542025996821
mix ratio = 0.2
class 0: f1_score on train data (split based on true domain) = 0.9992001841402176
class 0: f1_score on train data (split based on predicted domain) = 0.9978442509056451
class 0: f1_score on test data (split based on true domain) = 0.9699661732342378
class 0: f1_sco

class 2: f1_score on train data (split based on predicted domain) = 0.9717394625906721
class 2: f1_score on test data (split based on true domain) = 0.7383806170768076
class 2: f1_score on test data (split based on predicted domain) = 0.7333293741341248
mix ratio = 0.2
class 2: f1_score on train data (split based on true domain) = 0.9950082662082892
class 2: f1_score on train data (split based on predicted domain) = 0.9747117827340481
class 2: f1_score on test data (split based on true domain) = 0.738120107421548
class 2: f1_score on test data (split based on predicted domain) = 0.7330202290064212
mix ratio = 0.3
class 2: f1_score on train data (split based on true domain) = 0.9952022555685962
class 2: f1_score on train data (split based on predicted domain) = 0.9765193512616812
class 2: f1_score on test data (split based on true domain) = 0.7374750460925089
class 2: f1_score on test data (split based on predicted domain) = 0.7329767228414001
mix ratio = 0.4
class 2: f1_score on train 

class 4: f1_score on test data (split based on predicted domain) = 0.9849971827427915
mix ratio = 0.3
class 4: f1_score on train data (split based on true domain) = 0.9994000127333508
class 4: f1_score on train data (split based on predicted domain) = 0.9989333573421638
class 4: f1_score on test data (split based on true domain) = 0.9851769456564058
class 4: f1_score on test data (split based on predicted domain) = 0.9835633271268545
mix ratio = 0.4
class 4: f1_score on train data (split based on true domain) = 0.9992000006168151
class 4: f1_score on train data (split based on predicted domain) = 0.9983996756174012
class 4: f1_score on test data (split based on true domain) = 0.9861831227761914
class 4: f1_score on test data (split based on predicted domain) = 0.9847874609088914
mix ratio = 0.5
class 4: f1_score on train data (split based on true domain) = 0.9997999901147093
class 4: f1_score on train data (split based on predicted domain) = 0.9992888381132548
class 4: f1_score on test

In [28]:
# mixing train data ratios
test_ratios = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
mix_ratios = {
    0: 0.7,
    1: 0.2,
    2: 0.1,
    3: 0,
    4: 0,
}

# mixing train data
enhanced_tag_train_features_dic = {}
enhanced_tag_train_digits_dic = {}

for tag in domain_tags:
    enhanced_tag_train_features_dic[tag] = tag_train_features_dic[tag]
    enhanced_tag_train_digits_dic[tag] = tag_train_digits_dic[tag]

    other_tags = domain_tags
    other_tags = np.delete(domain_tags, domain_tags == tag)
    
    for other_tag in other_tags:
        mix_ratio = mix_ratios[tag]
    
        mix_from_features = tag_train_features_dic[other_tag]
        mix_from_digits = tag_train_digits_dic[other_tag]
        
        mix_indices = np.random.choice(mix_from_features.shape[0], int(mix_ratio * len(mix_from_features)))
        mix_features = mix_from_features[mix_indices]
        mix_digits = mix_from_digits[mix_indices]
    
        enhanced_tag_train_features_dic[tag] = \
            np.concatenate([enhanced_tag_train_features_dic[tag], mix_features])
        enhanced_tag_train_digits_dic[tag] = \
            np.concatenate([enhanced_tag_train_digits_dic[tag], mix_digits])

In [29]:
# training digits classifier on each enhanced domain
enhanced_domain_digit_clfs = {}
n_estimators = 200
min_samples_split = 20

for tag in domain_tags:   
    enhanced_domain_digit_clf = RandomForestClassifier(n_estimators=n_estimators, min_samples_split=min_samples_split, random_state=123)
    enhanced_domain_digit_clf.fit(enhanced_tag_train_features_dic[tag], enhanced_tag_train_digits_dic[tag])
    
    enhanced_domain_digit_clfs[tag] = enhanced_domain_digit_clf

In [30]:
### accuracy of final models (using f1_score) ###

# accuracy on train data (split based on true domain)
weighted_score_sum = 0
weight_sum = 0
for tag in domain_tags:
    digit_clf = enhanced_domain_digit_clfs[tag]
    tag_train_features = tag_train_features_dic[tag]
    tag_train_digits = tag_train_digits_dic[tag]
    
    tag_train_digits_predict = digit_clf.predict(tag_train_features)
    score = f1_score(tag_train_digits, tag_train_digits_predict, average='weighted')
    print(f'class {tag}: f1_score on train data (split based on true domain) = {score}')
    
    weighted_score_sum += (score * len(tag_train_features))
    weight_sum += len(tag_train_features)
    
weighted_score = (weighted_score_sum / weight_sum)
print(f'f1_score on train data (split based on true domain) = {weighted_score}')

# accuracy on train data (split based on predicted domain)
print()
train_domains_predict = domain_clf.predict(train_features)
weighted_score_sum = 0
weight_sum = 0

for tag in domain_tags:
    digit_clf = enhanced_domain_digit_clfs[tag]
    tag_train_features = train_features[train_domains_predict == tag]
    tag_train_digits = train_digits[train_domains_predict == tag]
    
    tag_train_digits_predict = digit_clf.predict(tag_train_features)
    score = f1_score(tag_train_digits, tag_train_digits_predict, average='weighted')
    print(f'class {tag}: f1_score on train data (split based on predicted domain) = {score}')
    
    weighted_score_sum += (score * len(tag_train_features))
    weight_sum += len(tag_train_features)
    
weighted_score = (weighted_score_sum / weight_sum)
print(f'f1_score on train data (split based on predicted domain) = {weighted_score}')

# accuracy on test data (split based on true domain)
print()
weighted_score_sum = 0
weight_sum = 0
for tag in domain_tags:
    digit_clf = enhanced_domain_digit_clfs[tag]
    tag_test_features = tag_test_features_dic[tag]
    tag_test_digits = tag_test_digits_dic[tag]
    
    tag_test_digits_predict = digit_clf.predict(tag_test_features)
    score = f1_score(tag_test_digits, tag_test_digits_predict, average='weighted')
    print(f'class {tag}: f1_score on test data (split based on true domain) = {score}')
    
    weighted_score_sum += (score * len(tag_test_features))
    weight_sum += len(tag_test_features)
    
weighted_score = (weighted_score_sum / weight_sum)
print(f'f1_score on test data (split based on true domain) = {weighted_score}')


# accuracy on test data (split based on predicted domain)
print()
test_domains_predict = domain_clf.predict(test_features)
weighted_score_sum = 0
weight_sum = 0
for tag in domain_tags:
    digit_clf = enhanced_domain_digit_clfs[tag]
    tag_test_features = test_features[test_domains_predict == tag]
    tag_test_digits = test_digits[test_domains_predict == tag]
    
    tag_test_digits_predict = digit_clf.predict(tag_test_features)
    score = f1_score(tag_test_digits, tag_test_digits_predict, average='weighted')
    print(f'class {tag}: f1_score on test data (split based on predicted domain) = {score}')
    
    weighted_score_sum += (score * len(tag_test_features))
    weight_sum += len(tag_test_features)
    
weighted_score = (weighted_score_sum / weight_sum)
print(f'f1_score on test data (split based on predicted domain) = {weighted_score}')

class 0: f1_score on train data (split based on true domain) = 0.9994000033049222
class 1: f1_score on train data (split based on true domain) = 0.9989999985177732
class 2: f1_score on train data (split based on true domain) = 0.9942073426050163
class 3: f1_score on train data (split based on true domain) = 0.9989997880088513
class 4: f1_score on train data (split based on true domain) = 0.9996000273116841
f1_score on train data (split based on true domain) = 0.9982414319496494

class 0: f1_score on train data (split based on predicted domain) = 0.9987427716247168
class 1: f1_score on train data (split based on predicted domain) = 0.9931988236408626
class 2: f1_score on train data (split based on predicted domain) = 0.9720890476175972
class 3: f1_score on train data (split based on predicted domain) = 0.9983767696734978
class 4: f1_score on train data (split based on predicted domain) = 0.9989333012693208
f1_score on train data (split based on predicted domain) = 0.9917733165969552

cl