In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

In [None]:
# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

# Apply seaborn's 'whitegrid' style to subsequent plots.
sns.set_style('whitegrid')

In [None]:
from classifiers import *

In [None]:
import re
import glob

def _read_feature_row(path, part):
    """Read one single-row feature CSV and tag it with its metadata.

    Returns the first row of the CSV as a Series extended with the keys
    'name' (numeric id from the file name), 'text_type' (name of the
    directory that contains the file) and 'part'.
    """
    import os

    # os.path understands both '/' and '\\', so the label no longer depends
    # on Windows-style separators being present in the globbed path.
    text_type = os.path.basename(os.path.dirname(path))
    # Look for the id in the file name only, so digits that happen to appear
    # in a parent directory cannot be picked up by mistake.
    name = int(re.findall(r'\d+', os.path.basename(path))[0])

    row = pd.read_csv(path, header=None).iloc[0].copy()
    row['name'] = name
    row['text_type'] = text_type
    row['part'] = part
    return row


def _load_part(paths, part, error_titles):
    """Load every file of one split; unreadable paths go to `error_titles`."""
    rows = []
    for path in tqdm(paths, desc=part):
        try:
            rows.append(_read_feature_row(path, part))
        except Exception:
            # Best effort: remember the offending file and keep going, but
            # let KeyboardInterrupt / SystemExit propagate.
            error_titles.append(path)
    return rows


def load_datasets(lang, folder):
    """Collect the per-text feature files of one language into a DataFrame.

    Files are searched under ``./{folder}/{lang}/test/*/*.csv`` and
    ``./{folder}/{lang}/train/*/*.csv``. Only the first row of each CSV
    (read without a header) is used.

    Parameters
    ----------
    lang : str
        Language sub-directory, e.g. 'rus' or 'eng'.
    folder : str
        Root directory that holds the feature files.

    Returns
    -------
    pandas.DataFrame
        One row per successfully read file (test rows first, then train
        rows, index reset) with the feature columns plus 'name',
        'text_type' and 'part'.

    Side effects
    ------------
    Paths that could not be processed are saved to
    ``{folder}/{lang}_bad_files.npy``.
    """
    test = glob.glob(f'./{folder}/{lang}/test/*/*.csv')
    train = glob.glob(f'./{folder}/{lang}/train/*/*.csv')

    error_titles = []
    features_test = _load_part(test, 'test', error_titles)
    features_train = _load_part(train, 'train', error_titles)

    features = pd.concat([
        pd.DataFrame(features_test), pd.DataFrame(features_train)
    ]).reset_index(drop=True)

    np.save(f'{folder}/{lang}_bad_files.npy', error_titles)

    return features

## SVD results

In [None]:
# Build one feature table per language from the 'svd_results' folder.
svd_datasets = dict()
for lang in tqdm(('rus', 'eng')):
    svd_datasets.update({lang: load_datasets(lang, 'svd_results')})

In [None]:
# Cache the tables on disk; the dict is stored as a pickled object array.
np.save(file='svd_datasets.npy', arr=svd_datasets, allow_pickle=True)

In [None]:
# Restore the cached dict; .item() unwraps it from the 0-d object array.
# NOTE: allow_pickle=True unpickles the file - only load files you created.
svd_datasets = np.load(file='svd_datasets.npy', allow_pickle=True).item()

In [None]:
# Run full_pipeline on each language's SVD feature table.
svd_res = dict()
for lang in tqdm(('rus', 'eng')):
    svd_res.update({lang: full_pipeline(svd_datasets[lang])})

In [None]:
# Persist the pipeline results; the dict is stored as a pickled object array.
np.save(file='svd_results.npy', arr=svd_res, allow_pickle=True)

## CBOW results

In [None]:
# Build one feature table per language from the 'cbow_results' folder.
cbow_datasets = dict()
for lang in tqdm(('rus', 'eng')):
    cbow_datasets.update({lang: load_datasets(lang, 'cbow_results')})

In [None]:
# Cache the tables on disk; the dict is stored as a pickled object array.
np.save(file='cbow_datasets.npy', arr=cbow_datasets, allow_pickle=True)

In [None]:
# Restore the cached dict; .item() unwraps it from the 0-d object array.
# NOTE: allow_pickle=True unpickles the file - only load files you created.
cbow_datasets = np.load(file='cbow_datasets.npy', allow_pickle=True).item()

In [None]:
# Run full_pipeline on each language's CBOW feature table.
cbow_res = dict()
for lang in tqdm(('rus', 'eng')):
    cbow_res.update({lang: full_pipeline(cbow_datasets[lang])})

In [None]:
# Persist the pipeline results; the dict is stored as a pickled object array.
np.save(file='cbow_results.npy', arr=cbow_res, allow_pickle=True)

## Best models: test-set metrics

In [1]:
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score 

In [2]:
# Saved pipeline results, keyed by embedding type.
# NOTE: allow_pickle=True unpickles the files - only load files you created.
res = dict(
    svd=np.load('svd_results.npy', allow_pickle=True).item(),
    cbow=np.load('cbow_results.npy', allow_pickle=True).item(),
)

In [3]:
# Saved feature tables, keyed by embedding type.
# NOTE: allow_pickle=True unpickles the files - only load files you created.
datasets = dict(
    svd=np.load('svd_datasets.npy', allow_pickle=True).item(),
    cbow=np.load('cbow_datasets.npy', allow_pickle=True).item(),
)

In [16]:
# Test-split accuracy of the stored random-forest models:
# one printed row per embedding type, one column per language.
print('Random Forest, accuracy')
print('emb', *res['svd'], sep='\t')

for emb_type in ('svd', 'cbow'):
    print(emb_type, end='\t')
    for lang in res[emb_type]:
        # Select the held-out rows once; the target is True where text_type is 'lit'.
        test_rows = datasets[emb_type][lang].query("part == 'test'")
        test_label = test_rows.text_type == 'lit'
        test_ft = test_rows.drop(columns=['name', 'text_type', 'part'])

        rf_model = res[emb_type][lang]['random_forest']['model']
        test_predict = rf_model.predict(test_ft)

        s = accuracy_score(test_label, test_predict)
        print(f'{s:.2f}', end='\t')
    print('\n')

Random Forest, accuracy

emb	rus	eng

svd	0.64	0.79	
cbow	0.51	0.61	

In [15]:
# Test-split F1 score of the stored random-forest models:
# one printed row per embedding type, one column per language.
print('Random Forest, f1 score')
print('emb', *res['svd'], sep='\t')

for emb_type in ('svd', 'cbow'):
    print(emb_type, end='\t')
    for lang in res[emb_type]:
        # Select the held-out rows once; the target is True where text_type is 'lit'.
        test_rows = datasets[emb_type][lang].query("part == 'test'")
        test_label = test_rows.text_type == 'lit'
        test_ft = test_rows.drop(columns=['name', 'text_type', 'part'])

        rf_model = res[emb_type][lang]['random_forest']['model']
        test_predict = rf_model.predict(test_ft)

        s = f1_score(test_label, test_predict)
        print(f'{s:.2f}', end='\t')
    print('\n')

Random Forest, f1 score

emb	rus	eng

svd	0.68	0.79	
cbow	0.05	0.56	

In [14]:
# Test-split accuracy of the stored decision-tree models:
# one printed row per embedding type, one column per language.
print('Decision tree, accuracy')
print('emb', *res['svd'], sep='\t')

for emb_type in ('svd', 'cbow'):
    print(emb_type, end='\t')
    for lang in res[emb_type]:
        # Select the held-out rows once; the target is True where text_type is 'lit'.
        test_rows = datasets[emb_type][lang].query("part == 'test'")
        test_label = test_rows.text_type == 'lit'
        test_ft = test_rows.drop(columns=['name', 'text_type', 'part'])

        # The variable keeps its original name although it holds the decision tree.
        rf_model = res[emb_type][lang]['decision_tree']['model']
        test_predict = rf_model.predict(test_ft)

        s = accuracy_score(test_label, test_predict)
        print(f'{s:.2f}', end='\t')
    print('\n')

Decision tree, accuracy

emb	rus	eng

svd	0.61	0.78	
cbow	0.50	0.58	

In [13]:
# Test-split F1 score of the stored decision-tree models:
# one printed row per embedding type, one column per language.
print('Decision tree, f1 score')
print('emb', *res['svd'], sep='\t')

for emb_type in ('svd', 'cbow'):
    print(emb_type, end='\t')
    for lang in res[emb_type]:
        # Select the held-out rows once; the target is True where text_type is 'lit'.
        test_rows = datasets[emb_type][lang].query("part == 'test'")
        test_label = test_rows.text_type == 'lit'
        test_ft = test_rows.drop(columns=['name', 'text_type', 'part'])

        # The variable keeps its original name although it holds the decision tree.
        rf_model = res[emb_type][lang]['decision_tree']['model']
        test_predict = rf_model.predict(test_ft)

        s = f1_score(test_label, test_predict)
        print(f'{s:.2f}', end='\t')
    print('\n')

Decision tree, f1 score

emb	rus	eng

svd	0.66	0.78	
cbow	0.02	0.58	

In [12]:
# Test-split accuracy of the stored 'lsvc' models:
# one printed row per embedding type, one column per language.
print('LSVC tree, accuracy')
print('emb', *res['svd'], sep='\t')

for emb_type in ('svd', 'cbow'):
    print(emb_type, end='\t')
    for lang in res[emb_type]:
        # Select the held-out rows once; the target is True where text_type is 'lit'.
        test_rows = datasets[emb_type][lang].query("part == 'test'")
        test_label = test_rows.text_type == 'lit'
        test_ft = test_rows.drop(columns=['name', 'text_type', 'part'])

        # The variable keeps its original name although it holds the 'lsvc' model.
        rf_model = res[emb_type][lang]['lsvc']['model']
        test_predict = rf_model.predict(test_ft)

        s = accuracy_score(test_label, test_predict)
        print(f'{s:.2f}', end='\t')
    print('\n')

LSVC tree, accuracy

emb	rus	eng

svd	0.54	0.42	
cbow	0.50	0.63	

In [11]:
# Test-split F1 score of the stored 'lsvc' models:
# one printed row per embedding type, one column per language.
print('LSVC tree, f1 score')
print('emb', *res['svd'], sep='\t')

for emb_type in ('svd', 'cbow'):
    print(emb_type, end='\t')
    for lang in res[emb_type]:
        # Select the held-out rows once; the target is True where text_type is 'lit'.
        test_rows = datasets[emb_type][lang].query("part == 'test'")
        test_label = test_rows.text_type == 'lit'
        test_ft = test_rows.drop(columns=['name', 'text_type', 'part'])

        # The variable keeps its original name although it holds the 'lsvc' model.
        rf_model = res[emb_type][lang]['lsvc']['model']
        test_predict = rf_model.predict(test_ft)

        s = f1_score(test_label, test_predict)
        print(f'{s:.2f}', end='\t')
    print('\n')

LSVC tree, f1 score

emb	rus	eng

svd	0.52	0.50	
cbow	0.00	0.59	