In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [None]:
# Move up to the parent directory so the relative paths used below resolve.
# Guarded so that re-running this cell (or starting the kernel from the parent
# directory already) does not keep climbing: once the training data is
# reachable from the working directory, stay put.
if not os.path.exists('data/processed/training_data.csv'):
    os.chdir('../')

In [None]:
%%capture
from nutrition_labels.grant_tagger import GrantTagger

In [None]:
# Training data shared by the cells below; run_experiment reads this
# module-level `data` name directly rather than taking it as an argument.
data = pd.read_csv('data/processed/training_data.csv')

In [None]:
def run_experiment(sample_not_relevant_range, num_repeats, vectorizer_type, model_type='naive_bayes'):
    """Sweep `sample_not_relevant` and average train/test metrics over repeated runs.

    For every value in `sample_not_relevant_range` a fresh GrantTagger is
    built, fitted and evaluated `num_repeats` times. Repeat number `i` uses
    `i` as both `irrelevant_sample_seed` and `split_seed`. The per-metric
    results of the repeats are averaged.

    Reads the module-level `data` object as the input to
    `GrantTagger.transform`.

    Args:
        sample_not_relevant_range: iterable of values passed to GrantTagger
            as `sample_not_relevant`.
        num_repeats: number of repeats per value; must be at least 1.
        vectorizer_type: forwarded to GrantTagger as `vectorizer_type`.
        model_type: forwarded to GrantTagger as `model_type`.

    Returns:
        A list with one dict per swept value. Each dict holds
        'sample_not_relevant' plus '<metric>_train' and '<metric>_test'
        entries containing the averaged metric values.

    Raises:
        ValueError: if `num_repeats` is smaller than 1 (there would be
            nothing to average).
    """
    if num_repeats < 1:
        raise ValueError(f'num_repeats must be at least 1, got {num_repeats!r}')

    experiments_results = []
    for n in sample_not_relevant_range:
        # Running sums per metric. Kept in fresh dicts so the dicts returned
        # by evaluate() are never modified.
        train_totals = {}
        test_totals = {}
        for i in range(num_repeats):
            grant_tagger = GrantTagger(
                sample_not_relevant=n,
                ngram_range=(1, 2),
                test_size=0.25,
                irrelevant_sample_seed=i,
                split_seed=i,
                vectorizer_type=vectorizer_type,
                model_type=model_type
            )
            X_train, X_test, y_train, y_test = grant_tagger.transform(data)
            grant_tagger.fit(X_train, y_train)
            # Assumes evaluate() returns a dict of metric name -> numeric value.
            results_train = grant_tagger.evaluate(X_train, y_train, print_results=False, average='weighted')
            results_test = grant_tagger.evaluate(X_test, y_test, print_results=False, average='weighted')
            for totals, results in ((train_totals, results_train), (test_totals, results_test)):
                for key, value in results.items():
                    # `a + b` (not `+=`) so array-like values are not updated in place.
                    totals[key] = totals[key] + value if key in totals else value

        results_dict = {'sample_not_relevant': n}
        for key, total in train_totals.items():
            results_dict[key + '_train'] = total / num_repeats
        for key, total in test_totals.items():
            results_dict[key + '_test'] = total / num_repeats
        experiments_results.append(results_dict)

    return experiments_results

In [None]:
def plot_results(experiments_results_df):
    """Plot train vs. test curves for each metric against `sample_not_relevant`.

    Draws one row of four panels (accuracy, f1, precision_score,
    recall_score). In each panel the '<metric>_train' column is drawn in the
    default colour and the '<metric>_test' column in red, both against the
    'sample_not_relevant' column.

    Args:
        experiments_results_df: DataFrame with a 'sample_not_relevant' column
            and '<metric>_train' / '<metric>_test' columns for the four
            metrics listed above (the shape produced by wrapping the output
            of run_experiment in pd.DataFrame).

    Returns:
        None. The figure is shown with plt.show().
    """
    metrics = ('accuracy', 'f1', 'precision_score', 'recall_score')
    fig, axes = plt.subplots(nrows=1, ncols=len(metrics), figsize=(15, 4))
    for ax, metric in zip(axes, metrics):
        experiments_results_df.plot(
            kind='line', x='sample_not_relevant', y=metric + '_train', ax=ax, marker='.'
        )
        experiments_results_df.plot(
            kind='line', x='sample_not_relevant', y=metric + '_test', color='red', ax=ax, marker='.'
        )
        # Label each panel so the figure can be read without the code.
        ax.set_title(metric)
        ax.set_ylabel('score')
    fig.tight_layout()
    plt.show()

In [None]:
# Sweep with vectorizer_type='count' and run_experiment's default model_type:
# sample_not_relevant takes the values 40, 80, ..., 960, each averaged over 10 repeats.
count_experiments_results = run_experiment(
    range(40, 1000, 40), num_repeats=10, vectorizer_type='count'
)

In [None]:
# One row per swept sample_not_relevant value, with the averaged
# '<metric>_train' / '<metric>_test' columns that plot_results reads.
# NOTE(review): `experiments_results_df` is reassigned by later cells, so it
# only holds the count results until the next experiment is plotted.
experiments_results_df = pd.DataFrame(count_experiments_results)
plot_results(experiments_results_df)

In [None]:
# Sweep with vectorizer_type='tfidf' and run_experiment's default model_type:
# sample_not_relevant takes the values 40, 80, ..., 560, each averaged over 10 repeats.
tfidf_experiments_results = run_experiment(
    range(40, 600, 40), num_repeats=10, vectorizer_type='tfidf'
)

In [None]:
# Tabulate the tfidf sweep (one row per sample_not_relevant value) and plot it.
# This overwrites the `experiments_results_df` built from the count sweep.
experiments_results_df = pd.DataFrame(tfidf_experiments_results)
plot_results(experiments_results_df)

In [None]:
# Class balance of the training data: number of rows whose 'Relevance code'
# equals 1, then the number whose 'Relevance code' equals 0.
print((data['Relevance code'] == 1).sum())
print((data['Relevance code'] == 0).sum())

## Explore the full evaluation results for settings that performed well

In [None]:
def test_setting(data, vectorizer_type, n):
    i = 4
    grant_tagger = GrantTagger(
                sample_not_relevant=n,
                ngram_range=(1,2),
                test_size=0.25,
                irrelevant_sample_seed=i,
                split_seed=i,
                vectorizer_type = vectorizer_type
                )
    X_train, X_test, y_train, y_test = grant_tagger.transform(data)
    grant_tagger.fit(X_train, y_train)
    results_train = grant_tagger.evaluate(X_train, y_train, average='weighted')
    results_test = grant_tagger.evaluate(X_test, y_test, average='weighted')

In [None]:
# Single fixed-seed run: vectorizer_type='tfidf', sample_not_relevant=40.
test_setting(data, 'tfidf', 40)

In [None]:
# Single fixed-seed run: vectorizer_type='tfidf', sample_not_relevant=320.
test_setting(data, 'tfidf', 320)

In [None]:
# Single fixed-seed run: vectorizer_type='count', sample_not_relevant=40.
test_setting(data, 'count', 40)

In [None]:
# Single fixed-seed run: vectorizer_type='count', sample_not_relevant=880.
test_setting(data, 'count', 880)

## Testing the BERT vectorizer


BERT vectorizer with a Naive Bayes model


In [None]:
%%capture
# Sweep with vectorizer_type='bert' and run_experiment's default model_type:
# sample_not_relevant takes the values 40, 80, ..., 560, each averaged over 10 repeats.
# The %%capture magic at the top of this cell suppresses the cell's output.
bert_bayes_experiments_results = run_experiment(
    range(40, 600, 40), num_repeats=10, vectorizer_type='bert'
)


In [None]:
# Tabulate the bert + default-model sweep (one row per sample_not_relevant value) and plot it.
# This overwrites the `experiments_results_df` from the previous experiment.
experiments_results_df = pd.DataFrame(bert_bayes_experiments_results)
plot_results(experiments_results_df)

BERT vectorizer with an SVM model


In [None]:
%%capture
# Sweep with vectorizer_type='bert' and model_type='SVM':
# sample_not_relevant takes the values 40, 80, ..., 560, each averaged over 10 repeats.
# The %%capture magic at the top of this cell suppresses the cell's output.
bert_svm_experiments_results = run_experiment(
    range(40, 600, 40), num_repeats=10, vectorizer_type='bert', model_type='SVM'
)


In [None]:
# Tabulate the bert + SVM sweep (one row per sample_not_relevant value) and plot it.
# This overwrites the `experiments_results_df` from the previous experiment.
experiments_results_df = pd.DataFrame(bert_svm_experiments_results)
plot_results(experiments_results_df)

BERT vectorizer with a logistic regression model


In [None]:
%%capture
# Sweep with vectorizer_type='bert' and model_type='log_reg':
# sample_not_relevant takes the values 40, 80, ..., 560, each averaged over 10 repeats.
# The %%capture magic at the top of this cell suppresses the cell's output.
bert_logreg_experiments_results = run_experiment(
    range(40, 600, 40), num_repeats=10, vectorizer_type='bert', model_type='log_reg'
)


In [None]:
# Tabulate the bert + log_reg sweep (one row per sample_not_relevant value) and plot it.
# This overwrites the `experiments_results_df` from the previous experiment.
experiments_results_df = pd.DataFrame(bert_logreg_experiments_results)
plot_results(experiments_results_df)