In [1]:
#default_exp verbatims/core

In [2]:
#all_slow

In [3]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Verbatims - Core

> This module defines the training configuration objects for all verbatim ML tasks, DataBlocks, and helper methods to build DataLoaders for each

In [4]:
#export
import os, datetime
import sklearn.metrics as skm
from tritonlytics_ai.utils import *

from fastai.text.all import *
from transformers import *
from blurr.utils import *
from blurr.data.core import *
from blurr.modeling.core import *

In [5]:
#hide
import pdb, gc

# pandas and plotting config
import seaborn as sns
sns.set_style('whitegrid')

plt.rcParams['figure.figsize'] = (9,6)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

from nbdev.showdoc import *
from fastcore.test import *

In [6]:
#hide
from fastai import __version__ as fa_version
from torch import __version__ as pt_version
from transformers import __version__ as hft_version

print(f'Using pytorch {pt_version}')
print(f'Using fastai {fa_version}')
print(f'Using transformers {hft_version}')

Using pytorch 1.6.0
Using fastai 2.0.16
Using transformers 3.3.1


In [7]:
#cuda
torch.cuda.set_device(1)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


## Sentiment

Basic Configuration

In [8]:
#export
m_pre_sentiment = ''
m_suf_sentiment = '_multilabel_hf'

# Default settings for the sentiment task. The helper functions in this module merge a
# caller-supplied `train_config` on top of this dict, so any key can be overridden per call.
sentiment_train_config = dict(
    # prefix / suffix used to build the model artifact filenames below
    m_pre=m_pre_sentiment,
    m_suf=m_suf_sentiment,

    # data: batch size, text column(s) forming the model input, CSV locations and DataLoaders cache
    batch_size=8,
    corpus_cols=['answer_text'],
    corpus_suf='_ans',
    train_data=SENTIMENT_CLS_PATH/'train.csv',
    valid_data=SENTIMENT_CLS_PATH/'test.csv',
    cache_data_path=SENTIMENT_CLS_PATH/'data_cls_sentiment.pkl',

    # NOTE(review): the `opt_*` keys are not read anywhere in this notebook; presumably
    # f-beta / threshold-search settings consumed by the training code -- confirm
    opt_beta=1,
    opt_beta_average='weighted',
    opt_beta_sample_weight=None,
    opt_start=0.08,
    opt_end=0.7,

    # NOTE(review): presumably consumed by a save-best-model callback elsewhere -- confirm
    save_model_monitor='fbeta_score',
    save_model_comp=np.greater,
    save_model_filename=f'{m_pre_sentiment}cls_bestmodel{m_suf_sentiment}',
    export_filename=f'{m_pre_sentiment}export_clas{m_suf_sentiment}.pkl',

    learner_path=SENTIMENT_CLS_PATH,
)

Prepare the data source

In [9]:
train_df = pd.read_csv(sentiment_train_config['train_data'])
valid_df = pd.read_csv(sentiment_train_config['valid_data'])

In [10]:
train_df.head(2)

Unnamed: 0,id,question_ans_id,answer_text,answer_text_non_english,language,survey_id,survey_type_id,benchmark_survey_type,client_id,rsp_id,question_category_abbr,question_text,question_class,question_category_id,question_report_abbr,question_category_label,benchmark_level1,benchmark_level2,benchmark_level3,client_benchmark_level,group_code,group_id,group_level1_code,group_level1_name,group_level2_code,group_level2_name,group_level3_code,group_level3_name,group_level4_code,group_level4_name,group_level5_code,group_level5_name,group_level6_code,group_level6_name,group_level7_code,group_level7_name,group_level8_code,group_level8_name,overall_sentiment,is_very_positive,is_positive,is_very_negative,is_negative,is_suggestion,feels_threatened,has_profanity,is_nonsense
0,19309,1877,Our department had an MSO who was great at finances but lacked leadership skills. Many members ...,,English,118,9,SAW,UCSD,88342,,"59. If you would like to elaborate on your responses above, or if you have any additional feedba...",Verbatim,201.0,Comments re Work Environment at UCSD,Other,,,,1.0,,280,,,,,,,,,,,,,,,,,2.0,0,0,0,1,0,0,0,0
1,305515,24623,Just tell us what's wrong.. but we have no funding to correct issues.,,English,129,22,CSS,CALPOLYSLO,181859,EH&S,Let us know your suggestions on how the EH&S can better meet the needs of the Campus.,Verbatim-Dept-Improve,325.0,EHS_Improve,"Environmental, Health and Safety",Safety,"Safety, Risk & Sustainability","Env, Health & Safety",1.0,-1.0,-1,-1.0,,,,,,,,,,,,,,,,2.0,0,0,0,1,0,0,0,0


Remove any rows where the "corpus_cols" are NaN

In [11]:
train_df.dropna(subset=sentiment_train_config['corpus_cols'], inplace=True)
valid_df.dropna(subset=sentiment_train_config['corpus_cols'], inplace=True)

In [12]:
train_df.feels_threatened.value_counts(), valid_df.feels_threatened.value_counts()

(0    14032
 1      353
 Name: feels_threatened, dtype: int64,
 0    1558
 1      43
 Name: feels_threatened, dtype: int64)

In [13]:
#hide

# If we want to add a "labels" column with all the labels space delimited (for example, as we would with
# non-encoded labels)
# train_df['labels'] = train_df[SENT_LABELS[1:]].apply(lambda row: ' '.join(row.columns[row.values == 1]), axis=1)
# valid_df['labels'] = valid_df[SENT_LABELS[1:]].apply(lambda row: ' '.join(row.columns[row.values == 1], axis=1)

train_df['labels'] = train_df[SENT_LABELS[1:]].apply(lambda x: ' '.join(x.index[x.astype(bool)]), axis=1)
valid_df['labels'] = valid_df[SENT_LABELS[1:]].apply(lambda x: ' '.join(x.index[x.astype(bool)]), axis=1)

train_df[['labels'] + SENT_LABELS[1:]].head()

Unnamed: 0,labels,is_very_positive,is_positive,is_very_negative,is_negative,is_suggestion,feels_threatened,has_profanity,is_nonsense
0,is_negative,0,0,0,1,0,0,0,0
1,is_negative,0,0,0,1,0,0,0,0
2,is_positive,0,1,0,0,0,0,0,0
3,is_positive,0,1,0,0,0,0,0,0
4,is_negative,0,0,0,1,0,0,0,0


In [14]:
train_df['is_valid'] = False
valid_df['is_valid'] = True
df = pd.concat([train_df, valid_df])
len(df)

15986

In [15]:
#export
def get_sentiment_train_data(train_config={}, trg_labels=SENT_LABELS[1:]):
    """Load the sentiment training (and, if configured, validation) data as one DataFrame.

    `train_config` entries override the defaults in `sentiment_train_config`.
    Rows with a missing value in any of the configured `corpus_cols` are dropped.
    Each frame gets two extra columns:
      * `labels`   -- space-delimited names of the `trg_labels` columns that are truthy for the row
      * `is_valid` -- False for rows from `train_data`, True for rows from `valid_data`

    Returns the training frame alone when no `valid_data` is configured, otherwise the
    training and validation frames concatenated (original row indices are kept).
    """
    config = {**sentiment_train_config, **train_config}

    def _load(csv_path, is_valid):
        # read one split and decorate it with the `labels` / `is_valid` columns
        frame = pd.read_csv(csv_path)
        frame.dropna(subset=config['corpus_cols'], inplace=True)
        frame['labels'] = frame[trg_labels].apply(
            lambda row: ' '.join(row.index[row.astype(bool)]), axis=1)
        frame['is_valid'] = is_valid
        return frame

    train_df = _load(config['train_data'], False)

    valid_path = config.get('valid_data')
    if valid_path is None:
        return train_df

    return pd.concat([train_df, _load(valid_path, True)])

In [16]:
df = get_sentiment_train_data()
# A length is always >= 0, so `operator.ge` could never fail; require at least one row instead.
test(len(df), 0, operator.gt)

Using the mid-level `DataBlock` API

In [17]:
#hide
# Build the Hugging Face architecture/config/tokenizer/model used by the sentiment examples below.
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)
# One output per sentiment target (the same columns used as the DataBlock vocab below);
# this previously used the S@W standard-theme label count by mistake.
config.num_labels = len(SENT_LABELS[1:])

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, 
                                                                               task=task, 
                                                                               config=config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [18]:
#hide
def get_x(inp):
    "Join the configured sentiment corpus columns of a row into one ': '-separated string"
    corpus_cols = sentiment_train_config['corpus_cols']
    return ': '.join(inp[corpus_cols].values)

# inputs: Hugging Face tokenized text; targets: already one-hot encoded sentiment columns
blocks = (HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer),
          MultiCategoryBlock(encoded=True, vocab=SENT_LABELS[1:]))

# rows are assigned to train/valid by the boolean `is_valid` column
dblock = DataBlock(
    blocks=blocks,
    get_x=get_x,
    get_y=ColReader(SENT_LABELS[1:]),
    splitter=ColSplitter(col='is_valid'),
)

In [19]:
#hide
dls = dblock.dataloaders(df, bs=sentiment_train_config['batch_size'])

In [20]:
#hide
print(f'The Target vocab has ({len(dls.vocab)} items)')

The Target vocab has (8 items)


In [21]:
#hide
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape

(2, torch.Size([8, 512]), torch.Size([8, 8]))

In [22]:
#hide
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"So another survey has come around and I anticipate it will have the same effect as all the other surveys … basically nothing will change, or should I say any changes will not be for the good of the staff. This years rant will consist of two items. First lets discuss the eternal issue of lack of pay. We all know that working for a University results in a paycheck 15 to 20% less than typical outside jobs of the same type but right now I know of people that are 20 to 30% under the going rate while we sit and watch upper echelon employees get anywhere from $2000 to over $50,000 dollars pay raises since 2008. We are still trying to figure out how some people can get $251,249.96 in “Extra Pay”. It is understood that keeping quality people is important and pay is a very big incentive to keep these people but the message it sends is that we have plenty of money for people that already earn plenty of money but we cant cough up a quarter more an hour for the custodians or the grounds people let alone actually pay anyone else near what they are worth. It has been shown time and time again that to achieve higher wages your best bet is to get another job and be prepared to leave, then (and only then) the purse strings open. This sends the message that there is no reason for loyalty to the University just be a mercenary and you will get the pay increase, meanwhile the employee that is dedicated and long term receives nothing for that dedication and loyalty. I want to see what the reason for no pay raises this year will be. In the last several years it has been “The State keeps cutting our budget” … yes the state that only is about 10% of the UC budget effects every pay scale except when someone threatens to leave. Now that Prop 30 has passed, which I believe had a lot of help due to the veiled threat of “Vote for this or you will lose your job”, what will be the new invented reason for cutbacks. 
Hers one you can use … “Well, actually you guys are not really State employees you are Reagent employees and we cant afford to give you raises … unless you are already earning over $100,00 a year, then we got plenty of money. BTW it is interesting that the Treasurer of the Regents has earned an average of over $280,000 in “Extra Pay” over the last 5 years on top of her $470,000 base salary.",is_very_negative;is_negative;feels_threatened
1,"I feel UCSD favors minorities PERIOD - I have seen a lot of reverse discrimination. We have a joke that whenever a minority is hired we call it ""diversity strikes again"". It is blatant when we have first hand knowledge that a positionis filled with a minority person only because they are a minority, with absolutely NO relation to their skill level. All qtns asked to confirm a hire are based on whether there was a minority in the interview pool. How about if they are qualified??? Then the diversity awards actually awards people for hiring minorities....who cares if they can't do the job? (Yeah, we win, we've hired 2 minorities.) I also feel people take complete advantage of stress/disability leave when it is their personal lives that are stressed out, NOT their work life. They just do not want to do their job. It truly affects all of us who have to pick up the pieces. If they can't handle the job, get another one, don't make everyone else stress out, just because the have some quack doctor signing a piece of paper. UCSD is great for people who know how to work the system. Having these people take advantage of it causes major morale problems. I feel that the lowly people like myself deserve raises. Money is always miraculously found when its needed. Some people are paid just because they threaten to sue if someone looks at them wrong, and then there are the ones, like myself, who actually want to earn a decent living and work hard to do it. I am flabberghasted when I see certain people doing nothing and getting paid the BIG bucks. I often wonder if HR is for the employee or on the side of the University. I don't feel like we are supported by the HR department. They seem more intent on screwing whomever they can, than doing what's best for the employee. I don't feel they can be trusted. I also don't understand why we have a separate HR department. What for??? Support staff do not make enough to live on. It is so frustrating. 
Don't think I'm a disgruntled employee, I thoroughly like my boss, like the work I do, and like being associated with the University. There are just a lot of work inequities because UCSD is so afraid of being sued and also the pay inequities between like positions. Some places give 25% pay increases, others say they can only give",is_positive;is_very_negative;is_negative;feels_threatened


In [23]:
#hide
# save dataloaders
# torch.save(dls, SENTIMENT_CLS_PATH/f'data_cls_sentiment_multilabel_hf.pkl')
# dls = torch.load(SENTIMENT_CLS_PATH/f'data_cls_sentiment_multilabel_hf.pkl')

In [24]:
#export
def get_sentiment_train_x(inp, corpus_cols):
    """Return the values of `corpus_cols` in `inp`, joined into one string with ': '."""
    col_values = inp[corpus_cols].values
    return ': '.join(col_values)

def get_sentiment_train_dls(df, hf_arch, hf_tokenizer, vocab=SENT_LABELS[1:], train_config={}, use_cache=True):
    """Return the DataLoaders for the multi-label sentiment task, loading them from cache when possible.

    Args:
        df: DataFrame holding the configured `corpus_cols`, one column per entry in `vocab`,
            and a boolean `is_valid` column used to split train/validation.
        hf_arch, hf_tokenizer: passed through to `HF_TextBlock`.
        vocab: names of the (already encoded) target columns.
        train_config: overrides merged on top of `sentiment_train_config`.
        use_cache: when True and the file at `cache_data_path` exists, the cached DataLoaders
            are returned as-is (with `bs` reset) and `df`/`vocab` are not consulted. Pass
            False to force a rebuild, which also refreshes the cache file.

    Returns:
        The loaded or freshly built DataLoaders.
    """
    config = {**sentiment_train_config, **train_config}
    cache_path = config.get('cache_data_path')

    if use_cache and cache_path is not None and os.path.isfile(cache_path):
        # torch.load unpickles the file: only point `cache_data_path` at trusted locations.
        dls = torch.load(cache_path)
        # NOTE(review): this sets `bs` on the DataLoaders container; confirm it reaches the
        # underlying train/valid loaders if the cached batch size can differ.
        dls.bs = config['batch_size']
        # Previously the cached object was discarded and the DataLoaders were always rebuilt.
        return dls

    blocks = (
        HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer),
        MultiCategoryBlock(encoded=True, vocab=vocab)
    )

    dblock = DataBlock(blocks=blocks,
                       get_x=partial(get_sentiment_train_x, corpus_cols=config['corpus_cols']),
                       get_y=ColReader(vocab),
                       splitter=ColSplitter(col='is_valid'))

    dls = dblock.dataloaders(df, bs=config['batch_size'])
    # (re)write the cache whenever we had to build, so the next cached call is current
    if cache_path is not None: torch.save(dls, cache_path)

    return dls

Tests

In [25]:
df = get_sentiment_train_data()
dls = get_sentiment_train_dls(df, hf_arch, hf_tokenizer)

test_eq(dls.bs, sentiment_train_config['batch_size'])
test_eq(len(SENT_LABELS[1:]), len(dls.vocab))

b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [26]:
dls = get_sentiment_train_dls(df, hf_arch, hf_tokenizer, use_cache=False)

test_eq(dls.bs, sentiment_train_config['batch_size'])
test_eq(len(SENT_LABELS[1:]), len(dls.vocab))

b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [27]:
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"So another survey has come around and I anticipate it will have the same effect as all the other surveys … basically nothing will change, or should I say any changes will not be for the good of the staff. This years rant will consist of two items. First lets discuss the eternal issue of lack of pay. We all know that working for a University results in a paycheck 15 to 20% less than typical outside jobs of the same type but right now I know of people that are 20 to 30% under the going rate while we sit and watch upper echelon employees get anywhere from $2000 to over $50,000 dollars pay raises since 2008. We are still trying to figure out how some people can get $251,249.96 in “Extra Pay”. It is understood that keeping quality people is important and pay is a very big incentive to keep these people but the message it sends is that we have plenty of money for people that already earn plenty of money but we cant cough up a quarter more an hour for the custodians or the grounds people let alone actually pay anyone else near what they are worth. It has been shown time and time again that to achieve higher wages your best bet is to get another job and be prepared to leave, then (and only then) the purse strings open. This sends the message that there is no reason for loyalty to the University just be a mercenary and you will get the pay increase, meanwhile the employee that is dedicated and long term receives nothing for that dedication and loyalty. I want to see what the reason for no pay raises this year will be. In the last several years it has been “The State keeps cutting our budget” … yes the state that only is about 10% of the UC budget effects every pay scale except when someone threatens to leave. Now that Prop 30 has passed, which I believe had a lot of help due to the veiled threat of “Vote for this or you will lose your job”, what will be the new invented reason for cutbacks. 
Hers one you can use … “Well, actually you guys are not really State employees you are Reagent employees and we cant afford to give you raises … unless you are already earning over $100,00 a year, then we got plenty of money. BTW it is interesting that the Treasurer of the Regents has earned an average of over $280,000 in “Extra Pay” over the last 5 years on top of her $470,000 base salary.",is_very_negative;is_negative;feels_threatened
1,"I am concerned about the lack of support for groups that are less in number than others. Although university leadership does its best for everyone, sometimes when one group is supported it appears that another group is not. I've had Latino and Jewish students in my office sharing their fear of fellow students who hated them because of their ethnic or religious background. I remember one Caucasian student who after taking a History course and learning about American injustices against Latinos was too embarrassed to admit he was White. I had to remind him of the good things Americans have done. I've seen several students be ashamed of their Arts and Humanities major because so much attention is being given to STEM majors by the university and media. STEM students sometimes have made fun of A&H or Social Science students because they somehow think they are smarter or will be making more money after graduation. Students should not be feeling scared or inferior to any other student regardless of ethnic, political or religious background, or because they chose to major in Classical Studies. We need to do more for any group or individual that feels threatened or inferior in any way- do more in our teaching, advertising, campus articles, events, counseling, advising- in all services the university provides. The sense of superiority and arrogance sometimes found in a few faculty, staff and students should be set straight and not overlooked. Our all of students can win at life.",is_very_negative;is_negative;feels_threatened


## Standard Themes - S@W

Basic Configuration

In [28]:
#export
m_pre_standard_themes_saw = ''
m_suf_standard_themes_saw = '_multilabel_hf'

# Default settings for the S@W standard-themes task. The helper functions in this module merge a
# caller-supplied `train_config` on top of this dict, so any key can be overridden per call.
saw_standard_themes_train_config = dict(
    # prefix / suffix used to build the model artifact filenames below
    m_pre=m_pre_standard_themes_saw,
    m_suf=m_suf_standard_themes_saw,

    # data: batch size, text column(s) forming the model input, CSV locations and DataLoaders cache
    batch_size=8,
    corpus_cols=['answer_text'],
    corpus_suf='_ans',
    train_data=STANDARD_THEME_SAW_PATH/'train.csv',
    valid_data=STANDARD_THEME_SAW_PATH/'test.csv',
    cache_data_path=STANDARD_THEME_SAW_PATH/'data_cls_standard_themes_saw.pkl',

    # NOTE(review): the `opt_*` keys are not read anywhere in this notebook; presumably
    # f-beta / threshold-search settings consumed by the training code -- confirm
    opt_beta=0.5,
    opt_beta_average='weighted',
    opt_beta_sample_weight=None,
    opt_start=0.08,
    opt_end=0.7,

    # NOTE(review): presumably consumed by a save-best-model callback elsewhere -- confirm
    save_model_monitor='precision_score',
    save_model_comp=np.greater,
    save_model_filename=f'{m_pre_standard_themes_saw}cls_bestmodel{m_suf_standard_themes_saw}',
    export_filename=f'{m_pre_standard_themes_saw}export_clas{m_suf_standard_themes_saw}.pkl',

    learner_path=STANDARD_THEME_SAW_PATH,
)

Prepare the data source

In [29]:
train_df = pd.read_csv(saw_standard_themes_train_config['train_data'])
valid_df = pd.read_csv(saw_standard_themes_train_config['valid_data'])

In [30]:
train_df.head(2)

Unnamed: 0,id,question_ans_id,answer_text,answer_text_non_english,language,survey_id,survey_type_id,benchmark_survey_type,client_id,rsp_id,question_category_abbr,question_text,question_class,question_category_id,question_report_abbr,question_category_label,benchmark_level1,benchmark_level2,benchmark_level3,client_benchmark_level,group_code,group_id,group_level1_code,group_level1_name,group_level2_code,group_level2_name,group_level3_code,group_level3_name,group_level4_code,group_level4_name,group_level5_code,group_level5_name,group_level6_code,group_level6_name,group_level7_code,group_level7_name,group_level8_code,group_level8_name,adequate_staffing,advancement_and_training_opportunities,appropriate_stress_work_assigned_equitably,benefits,better_ways_recognized_participate_in_decisions,career_advancement,committed_to_diversity,communicates_essential_information,ethical_conduct_perform_responsibilities_spirit_of_cooperation,evaluated_fairly,experienced_discrimination,facilities_workspace_safety,faculty_value_contributions,favoritism_cliques,fear_of_retaliation_negative_consequences,feel_valued_by_department,flexibility_work_life_balance,good_use_of_skills,have_necessary_tools,have_voice_within_my_institution_valued_member_of_my_institution,internal_processes_effective,parking_transportation,salary_pay,satisfied_with_diversity_progams,supervisor_effectiveness_resolves_staff_issues
0,588941,1877,I enjoy our department potlucks and social activities. I am grateful that our department leader...,,English,212,9,SAW,UCSD,449396,,"59. If you would like to elaborate on your responses above, or if you have any additional feedba...",Verbatim,201,Comments re Work Environment at UCSD,Other,,,,1,24000.0,3834,999999.0,UC San Diego,8000.0,MARINE SCIENCES,24000.0,DIRECTORS OFFICE-SIO,,,,,,,,,,,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1,402308,1877,"UCSD needs to keep up with the modern workplace in order to get stronger, more competitive candi...",,English,160,9,SAW,UCSD,272347,,"59. If you would like to elaborate on your responses above, or if you have any additional feedba...",Verbatim,201,Comments re Work Environment at UCSD,Other,,,,1,31404.0,3761,999999.0,UC San Diego,30000.0,Student Affairs,31400.0,Student Life,31404.0,Student Life Business Office,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [31]:
train_df.survey_id.value_counts()

212    1635
160     909
396     304
398     284
401      12
Name: survey_id, dtype: int64

Remove any rows where the "corpus_cols" are NaN

In [32]:
train_df.dropna(subset=saw_standard_themes_train_config['corpus_cols'], inplace=True)
valid_df.dropna(subset=saw_standard_themes_train_config['corpus_cols'], inplace=True)

In [33]:
#hide
STANDARD_THEME_SAW_LABELS

['adequate_staffing',
 'advancement_and_training_opportunities',
 'appropriate_stress_work_assigned_equitably',
 'benefits',
 'better_ways_recognized_participate_in_decisions',
 'career_advancement',
 'committed_to_diversity',
 'communicates_essential_information',
 'ethical_conduct_perform_responsibilities_spirit_of_cooperation',
 'evaluated_fairly',
 'experienced_discrimination',
 'facilities_workspace_safety',
 'faculty_value_contributions',
 'favoritism_cliques',
 'fear_of_retaliation_negative_consequences',
 'feel_valued_by_department',
 'flexibility_work_life_balance',
 'good_use_of_skills',
 'have_necessary_tools',
 'have_voice_within_my_institution_valued_member_of_my_institution',
 'internal_processes_effective',
 'parking_transportation',
 'salary_pay',
 'satisfied_with_diversity_progams',
 'supervisor_effectiveness_resolves_staff_issues']

In [34]:
#hide

# If we want to add a "labels" column with all the labels space delimited (for example, as we would with
# non-encoded labels)
train_df['labels'] = train_df[STANDARD_THEME_SAW_LABELS].apply(lambda x: ' '.join(x.index[x.astype(bool)]), axis=1)
valid_df['labels'] = valid_df[STANDARD_THEME_SAW_LABELS].apply(lambda x: ' '.join(x.index[x.astype(bool)]), axis=1)

train_df[['labels'] + STANDARD_THEME_SAW_LABELS].head()

Unnamed: 0,labels,adequate_staffing,advancement_and_training_opportunities,appropriate_stress_work_assigned_equitably,benefits,better_ways_recognized_participate_in_decisions,career_advancement,committed_to_diversity,communicates_essential_information,ethical_conduct_perform_responsibilities_spirit_of_cooperation,evaluated_fairly,experienced_discrimination,facilities_workspace_safety,faculty_value_contributions,favoritism_cliques,fear_of_retaliation_negative_consequences,feel_valued_by_department,flexibility_work_life_balance,good_use_of_skills,have_necessary_tools,have_voice_within_my_institution_valued_member_of_my_institution,internal_processes_effective,parking_transportation,salary_pay,satisfied_with_diversity_progams,supervisor_effectiveness_resolves_staff_issues
0,committed_to_diversity feel_valued_by_department satisfied_with_diversity_progams,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1,have_voice_within_my_institution_valued_member_of_my_institution,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,ethical_conduct_perform_responsibilities_spirit_of_cooperation experienced_discrimination,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,adequate_staffing salary_pay,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,appropriate_stress_work_assigned_equitably faculty_value_contributions,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [35]:
train_df['is_valid'] = False
valid_df['is_valid'] = True
df = pd.concat([train_df, valid_df])
len(df)

3494

In [36]:
#export
def get_saw_standard_theme_train_data(train_config={}, trg_labels=STANDARD_THEME_SAW_LABELS):
    """Load the S@W standard-theme training (and, if configured, validation) data as one DataFrame.

    `train_config` entries override the defaults in `saw_standard_themes_train_config`.
    Rows with a missing value in any of the configured `corpus_cols` are dropped.
    Each frame gets two extra columns:
      * `labels`   -- space-delimited names of the `trg_labels` columns that are truthy for the row
      * `is_valid` -- False for rows from `train_data`, True for rows from `valid_data`

    Returns the training frame alone when no `valid_data` is configured, otherwise the
    training and validation frames concatenated (original row indices are kept).
    """
    config = {**saw_standard_themes_train_config, **train_config}

    def _load(csv_path, is_valid):
        # read one split and decorate it with the `labels` / `is_valid` columns
        frame = pd.read_csv(csv_path)
        frame.dropna(subset=config['corpus_cols'], inplace=True)
        frame['labels'] = frame[trg_labels].apply(
            lambda row: ' '.join(row.index[row.astype(bool)]), axis=1)
        frame['is_valid'] = is_valid
        return frame

    train_df = _load(config['train_data'], False)

    valid_path = config.get('valid_data')
    if valid_path is None:
        return train_df

    return pd.concat([train_df, _load(valid_path, True)])

In [37]:
df = get_saw_standard_theme_train_data()
# A length is always >= 0, so `operator.ge` could never fail; require at least one row instead.
test(len(df), 0, operator.gt)

Using the mid-level `DataBlock` API

In [38]:
#hide
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)
config.num_labels = len(STANDARD_THEME_SAW_LABELS)

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, 
                                                                               task=task, 
                                                                               config=config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [39]:
#hide
def get_x(inp):
    "Join the configured S@W corpus columns of a row into one ': '-separated string"
    corpus_cols = saw_standard_themes_train_config['corpus_cols']
    return ': '.join(inp[corpus_cols].values)

# inputs: Hugging Face tokenized text; targets: already one-hot encoded standard-theme columns
blocks = (HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer),
          MultiCategoryBlock(encoded=True, vocab=STANDARD_THEME_SAW_LABELS))

# rows are assigned to train/valid by the boolean `is_valid` column
dblock = DataBlock(
    blocks=blocks,
    get_x=get_x,
    get_y=ColReader(STANDARD_THEME_SAW_LABELS),
    splitter=ColSplitter(col='is_valid'),
)

In [40]:
#hide
dls = dblock.dataloaders(df, bs=saw_standard_themes_train_config['batch_size'])

In [41]:
#hide
print(f'The Target vocab has ({len(dls.vocab)} items)')

The Target vocab has (25 items)


In [42]:
#hide
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape

(2, torch.Size([8, 456]), torch.Size([8, 25]))

In [43]:
#hide
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. 
She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",ethical_conduct_perform_responsibilities_spirit_of_cooperation;evaluated_fairly;favoritism_cliques;fear_of_retaliation_negative_consequences;supervisor_effectiveness_resolves_staff_issues
1,"1.\tSensitivity and bias training for SLBO management – The selection and search for the person to fill Evelyns position seemed inequitable. Evelyn having a say in her successor may not have been against campus hiring policies but did not seem appropriate since she had inside knowledge of some of the candidates and a possible bias for or against other candidates. \r\n I applied for the position because my applying was suggested by another member of Evelyns team that was not on the committee.\r\nAnna, a member of the committee, was also under the leadership of Evelyn. Anna, who now has been promoted into Jamies positions would not be in Jamies position if he had not gotten Evelyns position. In addition, there was no one in the room from HR ensuring that the interviews were fair and equal. I think that this Cluster is large enough to warrant additional oversite with the filling of positions. \r\nThough it may not matter to some but within the SLBO all of senior management (HR remains neutral) is now male while 99 % of the rest of the staff is female. \r\n2.\tWhen work is removed from my desk work of equal or greater substance should replace it (the work removed).\r\n3.\tTo remove budgetary work from me and replace it with transactional work is belittling along with a misuse of my high level skill set, experience, and education. I am often the person who trains or provides information to new staff on our tools within Financial Link. \r\n4.\tI want to see equality in secession training for management positions within the SLBO or other departments in the cluster.",advancement_and_training_opportunities;committed_to_diversity;favoritism_cliques


In [44]:
#hide
# save dataloaders
# torch.save(dls, STANDARD_THEME_SAW_PATH/f'data_cls_standard_themes_saw_multilabel_hf.pkl')
# dls = torch.load(STANDARD_THEME_SAW_PATH/f'data_cls_standard_themes_saw_multilabel_hf.pkl')

In [45]:
#export
def get_saw_standard_theme_train_x(inp, corpus_cols):
    """Return the model input text for one row: its `corpus_cols` values joined with ': '."""
    col_values = inp[corpus_cols].values
    return ': '.join(col_values)

def get_saw_standard_theme_train_dls(df, hf_arch, hf_tokenizer, vocab=STANDARD_THEME_SAW_LABELS, 
                                     train_config={}, use_cache=True):
    """Build (or load from cache) the `DataLoaders` for the SAW standard-themes multilabel task.

    Args:
        df: DataFrame with the text columns named in `corpus_cols`, one column per label
            in `vocab`, and an `is_valid` column used to split train/valid.
        hf_arch: architecture identifier handed to `HF_TextBlock`.
        hf_tokenizer: tokenizer handed to `HF_TextBlock`.
        vocab: label column names; used as the target vocab and as the columns read for `y`.
        train_config: overrides merged on top of `saw_standard_themes_train_config`.
        use_cache: when True and the file at `cache_data_path` exists, the cached
            `DataLoaders` are returned and `df` is not processed.

    Returns:
        The `DataLoaders` for the task.
    """
    config = {**saw_standard_themes_train_config, **train_config}
    cache_path = config.get('cache_data_path')

    # Cache hit: return immediately. Previously the loaded object was discarded because
    # execution fell through and rebuilt (and re-saved) the DataLoaders on every call.
    # NOTE(review): the cache is not keyed on `df`, tokenizer or `vocab`, and `torch.load`
    # unpickles the file, so only load cache files written by this project.
    if use_cache and cache_path is not None and os.path.isfile(cache_path):
        dls = torch.load(cache_path)
        # NOTE(review): confirm that assigning `bs` here re-batches the underlying loaders
        dls.bs = config['batch_size']
        return dls

    blocks = (
        HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer),
        MultiCategoryBlock(encoded=True, vocab=vocab)
    )

    dblock = DataBlock(blocks=blocks,
                       get_x=partial(get_saw_standard_theme_train_x, corpus_cols=config['corpus_cols']),
                       get_y=ColReader(vocab),
                       splitter=ColSplitter(col='is_valid'))

    dls = dblock.dataloaders(df, bs=config['batch_size'])

    # Freshly built loaders always refresh the cache file (also when use_cache=False)
    if cache_path is not None: torch.save(dls, cache_path)

    return dls

Tests

In [46]:
# Build the SAW DataLoaders with the default arguments (use_cache=True)
df = get_saw_standard_theme_train_data()
dls = get_saw_standard_theme_train_dls(df, hf_arch, hf_tokenizer)

# Batch size and target vocab size must match the SAW config and label list
test_eq(dls.bs, saw_standard_themes_train_config['batch_size'])
test_eq(len(STANDARD_THEME_SAW_LABELS), len(dls.vocab))

# A batch has two elements; the second (targets) has one column per label
b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [47]:
# Same checks as above, but with use_cache=False
dls = get_saw_standard_theme_train_dls(df, hf_arch, hf_tokenizer, use_cache=False)

test_eq(dls.bs, saw_standard_themes_train_config['batch_size'])
test_eq(len(STANDARD_THEME_SAW_LABELS), len(dls.vocab))

# A batch has two elements; the second (targets) has one column per label
b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [48]:
# NOTE(review): the rendered batch shows raw survey verbatims that name individuals;
# consider clearing this output before sharing the notebook.
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. 
She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",ethical_conduct_perform_responsibilities_spirit_of_cooperation;evaluated_fairly;favoritism_cliques;fear_of_retaliation_negative_consequences;supervisor_effectiveness_resolves_staff_issues
1,"My team colleagues frequently gossip, talk, laugh around me (and I can hear them). They also frequently withhold information necessary for team work and for me to successfully do my job. They also complain their work which furthers a negative environment. This team is oriented towards dysfunctional individuals, not on the team or service for customers. This has caused problem with anxiety and insomnia which has been very difficult for me. I've had to spend a lot of time personally taking care of myself to offset the negative impact this has created. I am a service oriented, transparent individual and pride myself on treating others with kindness regardless of their position.\r\n\r\nOccasionally I've had mobility issues and brought ADA compliance issues to team members' attention like no elevator access due to a mechanical issue and the only option was climbing 3 flights of stairs one way. Instead of this being addressed as an ADA issued, I was targeted spoken to with increased volume and told I could call Facilities Management myself if I didn't like the response or blown off like I was un-useful because I couldn't assist with a request requiring mobility when this is not an issue caused by me. This is completely unacceptable. \r\n\r\nAlthough I am not old, I've also experienced ageism because I have senior level experience in position. I've also been mocked by 2 colleges because I'm a grandparent. This is completely unacceptable. This survey does not include age discrimination.",appropriate_stress_work_assigned_equitably;communicates_essential_information;ethical_conduct_perform_responsibilities_spirit_of_cooperation;experienced_discrimination;favoritism_cliques;have_voice_within_my_institution_valued_member_of_my_institution


## Standard Themes - CSS

Basic Configuration

In [49]:
#export
# Prefix/suffix interpolated into the saved-model and export filenames below
m_pre_standard_themes_css = ''
m_suf_standard_themes_css = '_multilabel_hf'

# Default training configuration for the CSS standard-themes task.
# `get_css_standard_theme_train_data` and `get_css_standard_theme_train_dls` merge caller
# overrides (`train_config`) on top of this dict.
css_standard_themes_train_config = {
    'm_pre': m_pre_standard_themes_css,
    'm_suf': m_suf_standard_themes_css,
    
    'batch_size': 8,                    # passed as `bs` when building the DataLoaders
    'corpus_cols': ['answer_text'],     # text column(s) joined to form the model input; rows with NaN here are dropped
    'corpus_suf': '_ans',
    'train_data': STANDARD_THEME_CSS_PATH/'train.csv',
    'valid_data': STANDARD_THEME_CSS_PATH/'test.csv',
    'cache_data_path': STANDARD_THEME_CSS_PATH/'data_cls_standard_themes_css.pkl',  # pickled DataLoaders cache
    
    # NOTE(review): presumably F-beta / threshold-search settings consumed outside this section; confirm
    'opt_beta': 0.5, 
    'opt_beta_average': 'weighted',
    'opt_beta_sample_weight': None,
    'opt_start': 0.08, 
    'opt_end': 0.7,
    
    # NOTE(review): presumably read by a save-model callback and the export step elsewhere; confirm
    'save_model_monitor': 'precision_score', 
    'save_model_comp': np.greater,
    'save_model_filename': f'{m_pre_standard_themes_css}cls_bestmodel{m_suf_standard_themes_css}',
    'export_filename': f'{m_pre_standard_themes_css}export_clas{m_suf_standard_themes_css}.pkl',

    'learner_path': STANDARD_THEME_CSS_PATH
}

Prepare the data source

In [50]:
# Load the train/validation CSVs named in the CSS training config
train_df = pd.read_csv(css_standard_themes_train_config['train_data'])
valid_df = pd.read_csv(css_standard_themes_train_config['valid_data'])

In [51]:
train_df.head(2)

Unnamed: 0,id,question_ans_id,answer_text,answer_text_non_english,language,survey_id,survey_type_id,benchmark_survey_type,client_id,rsp_id,question_category_abbr,question_text,question_class,question_category_id,question_report_abbr,question_category_label,benchmark_level1,benchmark_level2,benchmark_level3,client_benchmark_level,group_code,group_id,group_level1_code,group_level1_name,group_level2_code,group_level2_name,group_level3_code,group_level3_name,group_level4_code,group_level4_name,group_level5_code,group_level5_name,group_level6_code,group_level6_name,group_level7_code,group_level7_name,group_level8_code,group_level8_name,accessible_to_customers,consistency_in_policies_information,cost_fees,courteous_professional_staff,effective_communications,effectively_uses_websites_online_documentation,helpful_staff,knowledgeable_staff,moving_in_a_positive_direction,overall_satisfaction,process_improvement,provides_effective_advice_guidance,provides_training_on_processes_applications,resolves_problems_effectively,responds_to_requests_within_an_acceptable_time,understands_my_needs_and_requirements
0,596266,12491,Just continue to communicate with individuals in the areas affected.,,English,215,15,CSS-FACULTY-STAFF-ONLY,UCSD,453250,Bldg Main & Repair,Let us know your suggestions on how to improve Building Maintenance and Repair Services.,Verbatim-Dept-Improve,134.0,Building_Improve,Building Maintenance & Repair Services,Facilities,"Building, Custodial, Facilities Maintenance",Facilities Maintenance,1.0,-1.0,-1,-1.0,,,,,,,,,,,,,,,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,594413,12281,Delaying payment by 2 weeks because of late timesheets is very difficult for staff,,English,215,15,CSS-FACULTY-STAFF-ONLY,UCSD,456790,Payroll,Let us know your suggestions on how to improve Payroll.,Verbatim-Dept-Improve,88.0,Payroll_Improve,Payroll,Human Resources,Payroll Services,Payroll,1.0,-1.0,-1,-1.0,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [52]:
train_df.survey_id.value_counts()

215    2734
131       1
124       1
Name: survey_id, dtype: int64

Remove any rows where the "corpus_cols" are NaN

In [53]:
train_df.dropna(subset=css_standard_themes_train_config['corpus_cols'], inplace=True)
valid_df.dropna(subset=css_standard_themes_train_config['corpus_cols'], inplace=True)

In [54]:
#hide
STANDARD_THEME_CSS_LABELS

['accessible_to_customers',
 'consistency_in_policies_information',
 'cost_fees',
 'courteous_professional_staff',
 'effective_communications',
 'effectively_uses_websites_online_documentation',
 'helpful_staff',
 'knowledgeable_staff',
 'moving_in_a_positive_direction',
 'overall_satisfaction',
 'process_improvement',
 'provides_effective_advice_guidance',
 'provides_training_on_processes_applications',
 'resolves_problems_effectively',
 'responds_to_requests_within_an_acceptable_time',
 'understands_my_needs_and_requirements']

In [55]:
#hide

# If we want to add a "labels" column with all the labels space delimited (for example, as we would with
# non-encoded labels): for each row, join the names of the label columns whose value is truthy
train_df['labels'] = train_df[STANDARD_THEME_CSS_LABELS].apply(lambda x: ' '.join(x.index[x.astype(bool)]), axis=1)
valid_df['labels'] = valid_df[STANDARD_THEME_CSS_LABELS].apply(lambda x: ' '.join(x.index[x.astype(bool)]), axis=1)

train_df[['labels'] + STANDARD_THEME_CSS_LABELS].head()

Unnamed: 0,labels,accessible_to_customers,consistency_in_policies_information,cost_fees,courteous_professional_staff,effective_communications,effectively_uses_websites_online_documentation,helpful_staff,knowledgeable_staff,moving_in_a_positive_direction,overall_satisfaction,process_improvement,provides_effective_advice_guidance,provides_training_on_processes_applications,resolves_problems_effectively,responds_to_requests_within_an_acceptable_time,understands_my_needs_and_requirements
0,effective_communications,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,resolves_problems_effectively,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,courteous_professional_staff,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,consistency_in_policies_information resolves_problems_effectively,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,responds_to_requests_within_an_acceptable_time,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [56]:
# Flag each split, then stack both into one frame so `ColSplitter(col='is_valid')` can separate them
train_df['is_valid'] = False
valid_df['is_valid'] = True
df = pd.concat([train_df, valid_df])
len(df)

3041

In [57]:
#export
def get_css_standard_theme_train_data(train_config={}, trg_labels=STANDARD_THEME_CSS_LABELS):
    """Load the CSS standard-themes training data as a single DataFrame.

    `train_config` overrides are merged on top of `css_standard_themes_train_config`.
    Each split is read from CSV, rows with NaN in `corpus_cols` are dropped, a
    space-delimited `labels` column is built from `trg_labels`, and an `is_valid`
    flag is set. If `valid_data` is configured, train and validation rows are
    concatenated; otherwise only the training rows are returned.
    """
    config = {**css_standard_themes_train_config, **train_config}

    def _load_split(csv_path, is_valid):
        # Read one split and add the derived `labels` / `is_valid` columns
        split_df = pd.read_csv(csv_path)
        split_df.dropna(subset=config['corpus_cols'], inplace=True)
        split_df['labels'] = split_df[trg_labels].apply(
            lambda row: ' '.join(row.index[row.astype(bool)]), axis=1
        )
        split_df['is_valid'] = is_valid
        return split_df

    train_df = _load_split(config['train_data'], False)

    valid_path = config.get('valid_data')
    if valid_path is None:
        return train_df

    return pd.concat([train_df, _load_split(valid_path, True)])

In [58]:
df = get_css_standard_theme_train_data()
# `len(df) >= 0` can never fail; require at least one row so the check is meaningful
test(len(df), 0, operator.gt)

Using the mid-level `DataBlocks` API

In [59]:
#hide
# Sequence-classification setup with one output per CSS theme label
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)
config.num_labels = len(STANDARD_THEME_CSS_LABELS)

# Fetch the blurr/Hugging Face objects (arch, config, tokenizer, model) for the checkpoint
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, 
                                                                               task=task, 
                                                                               config=config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [60]:
#hide
# Inputs are tokenized text; targets are the already-encoded (0/1) label columns
blocks = (
    HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer), 
    MultiCategoryBlock(encoded=True, vocab=STANDARD_THEME_CSS_LABELS)
)

# Join the configured corpus columns of a row into one ': '-separated string.
# NOTE(review): `get_x` is defined again later in this notebook for the meta task.
def get_x(inp): return ': '.join(inp[css_standard_themes_train_config['corpus_cols']].values)

# Rows are assigned to train/valid by the `is_valid` column
dblock = DataBlock(blocks=blocks, 
                   get_x=get_x, 
                   get_y=ColReader(STANDARD_THEME_CSS_LABELS), 
                   splitter=ColSplitter(col='is_valid'))

In [61]:
#hide
dls = dblock.dataloaders(df, bs=css_standard_themes_train_config['batch_size'])

In [62]:
#hide
print(f'The Target vocab has ({len(dls.vocab)} items)')

The Target vocab has (16 items)


In [63]:
#hide
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape

(2, torch.Size([8, 324]), torch.Size([8, 16]))

In [64]:
#hide
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"Expand Shuttle Services throughout the Greater San Diego area beyond UCSD campus stops. \r\nI recently relocated to San Diego from Seattle, while in Seattle I worked for Microsoft for quite some time. Microsoft has developed a network of shuttles that connects neighborhoods directly to their campuses that run regularly Monday-Friday. San Diego MTS is not optimized to transport people to UCSD's campus so there are obviously many holes. Even once the trolley expansion is complete key neighborhoods will still not have a convenient option to get to UCSD via public transportation. Neighborhood-specific shuttles would help address these issues directly.\r\n\r\nCampus-wide create a mandate for all eligible employees to work from home at least one day a week.\r\nWhen I started with UCSD I was very surprised to find out how far behind UCSD is in terms of letting employees work from home. Having worked in the tech industry in Seattle my previous employer allowed a very flexible schedule, we would work from home as we saw fit, which usually amounted to approx. 2 days a week.\r\nPersonally, for my position at UCSD I could work from home 3-4 days a week joining all meetings via Zoom. The old school mentality of supervisors in my department makes it so my colleagues and I have to suffer through the awful commute to/from campus and pay for parking 5 days a week. I suspect this old school mentality is present in many parts of the campus, which indirectly tells employees that they are not trusted.",process_improvement
1,"Last year the vanpool program was running smoothly and I gave it glowing reviews.Then with only six weeks notice the program changed and we had to contract our own vanpool. There was little explanation for the change and the whole thing was very badly handled by the University. The gas cards don't work half the time and we have had to come out of pocket for gasoline in addition to what we've already paid (after tax) without reimbursement or credit from Enterprise to date. This is NOT acceptable. \r\nIn addition Sorrento Valley Coaster Shuttle service has been shut down by the University for the North County commuters. This University talks about lowering our carbon footprint but removes the very programs designed to do so. It's disgusting!!! \r\nMore people have been put on the roads to campus while parking is quickly disappearing for those who work so hard to make UCSD a world-class university. There seems to be no plan in place for fixing the mess that has been made. If there is a plan it is being poorly communicated. These changes should have been made AFTER the trolley service was up and running, but now people are scrambling on their own to fix a problem that was manufactured by Commute Solutions. Everything about how these programs were handled is WRONG!!!\r\nThe only good thing is that we are still able to get 10 days per quarter for parking if needed and there is the free ride home program for emergencies. Let's not mess that up please.",effective_communications


In [65]:
#hide
# save dataloaders
# torch.save(dls, STANDARD_THEME_CSS_PATH/f'data_cls_standard_themes_css_multilabel_hf.pkl')
# dls = torch.load(STANDARD_THEME_CSS_PATH/f'data_cls_standard_themes_css_multilabel_hf.pkl')

In [66]:
#export
def get_css_standard_theme_train_x(inp, corpus_cols):
    """Return the model input text for one row: its `corpus_cols` values joined with ': '."""
    col_values = inp[corpus_cols].values
    return ': '.join(col_values)

def get_css_standard_theme_train_dls(df, hf_arch, hf_tokenizer, vocab=STANDARD_THEME_CSS_LABELS, 
                                     train_config={}, use_cache=True):
    """Build (or load from cache) the `DataLoaders` for the CSS standard-themes multilabel task.

    Args:
        df: DataFrame with the text columns named in `corpus_cols`, one column per label
            in `vocab`, and an `is_valid` column used to split train/valid.
        hf_arch: architecture identifier handed to `HF_TextBlock`.
        hf_tokenizer: tokenizer handed to `HF_TextBlock`.
        vocab: label column names; used as the target vocab and as the columns read for `y`.
        train_config: overrides merged on top of `css_standard_themes_train_config`.
        use_cache: when True and the file at `cache_data_path` exists, the cached
            `DataLoaders` are returned and `df` is not processed.

    Returns:
        The `DataLoaders` for the task.
    """
    config = {**css_standard_themes_train_config, **train_config}
    cache_path = config.get('cache_data_path')

    # Cache hit: return immediately. Previously the loaded object was discarded because
    # execution fell through and rebuilt (and re-saved) the DataLoaders on every call.
    # NOTE(review): the cache is not keyed on `df`, tokenizer or `vocab`, and `torch.load`
    # unpickles the file, so only load cache files written by this project.
    if use_cache and cache_path is not None and os.path.isfile(cache_path):
        dls = torch.load(cache_path)
        # NOTE(review): confirm that assigning `bs` here re-batches the underlying loaders
        dls.bs = config['batch_size']
        return dls

    blocks = (
        HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer),
        MultiCategoryBlock(encoded=True, vocab=vocab)
    )

    dblock = DataBlock(blocks=blocks,
                       get_x=partial(get_css_standard_theme_train_x, corpus_cols=config['corpus_cols']),
                       get_y=ColReader(vocab),
                       splitter=ColSplitter(col='is_valid'))

    dls = dblock.dataloaders(df, bs=config['batch_size'])

    # Freshly built loaders always refresh the cache file (also when use_cache=False)
    if cache_path is not None: torch.save(dls, cache_path)

    return dls

Tests

In [67]:
# Build the CSS DataLoaders with the default arguments (use_cache=True)
df = get_css_standard_theme_train_data()
dls = get_css_standard_theme_train_dls(df, hf_arch, hf_tokenizer)

# Check against the CSS config (not the SAW config) so the test tracks the task under test
test_eq(dls.bs, css_standard_themes_train_config['batch_size'])
test_eq(len(STANDARD_THEME_CSS_LABELS), len(dls.vocab))

# A batch has two elements; the second (targets) has one column per label
b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [68]:
# Same checks with use_cache=False
dls = get_css_standard_theme_train_dls(df, hf_arch, hf_tokenizer, use_cache=False)

# Check against the CSS config (not the SAW config) so the test tracks the task under test
test_eq(dls.bs, css_standard_themes_train_config['batch_size'])
test_eq(len(STANDARD_THEME_CSS_LABELS), len(dls.vocab))

# A batch has two elements; the second (targets) has one column per label
b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [69]:
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"Expand Shuttle Services throughout the Greater San Diego area beyond UCSD campus stops. \r\nI recently relocated to San Diego from Seattle, while in Seattle I worked for Microsoft for quite some time. Microsoft has developed a network of shuttles that connects neighborhoods directly to their campuses that run regularly Monday-Friday. San Diego MTS is not optimized to transport people to UCSD's campus so there are obviously many holes. Even once the trolley expansion is complete key neighborhoods will still not have a convenient option to get to UCSD via public transportation. Neighborhood-specific shuttles would help address these issues directly.\r\n\r\nCampus-wide create a mandate for all eligible employees to work from home at least one day a week.\r\nWhen I started with UCSD I was very surprised to find out how far behind UCSD is in terms of letting employees work from home. Having worked in the tech industry in Seattle my previous employer allowed a very flexible schedule, we would work from home as we saw fit, which usually amounted to approx. 2 days a week.\r\nPersonally, for my position at UCSD I could work from home 3-4 days a week joining all meetings via Zoom. The old school mentality of supervisors in my department makes it so my colleagues and I have to suffer through the awful commute to/from campus and pay for parking 5 days a week. I suspect this old school mentality is present in many parts of the campus, which indirectly tells employees that they are not trusted.",process_improvement
1,"I have had three visits to the parking office and both have been a terrible experience. The first time, the lady at the counter did not allow me to get the eco pass trial for the summer, on my second visit the staff member confirmed that I actually had the right to it and it was her mistake. During that second visit, most of the staff (young people) were not proactive or really wanted to work. Some of them were speaking in a bad way to each other (either joking or not) and were rude to the young lady that was helping me. That caused a very bad impression to me because it seems even the supervisor was not helpful. The good thing of this second visit was that the lady allowed to get one moth of the eco pass. My last visit was to try to change the dates of my eco pass trial and all the staff was terrible at this, they did not know how to help so they gave me a phone number to call, when I called the same person that was helping me replied and said that he picked up the phone but that was not his phone so that I should try again, I tried and of course no one replied this time. I finally get the name of the person in charge that coincidentally left early that day and left no one in charge to help me with my problem. I get desperate and decided not to insist on this issue. I only have a bad experience from this office. I suggest you have an audit to this office, especially focusing on knowledge of the staff and customer service.",courteous_professional_staff


## Standard Themes - Meta

Basic configuration

In [70]:
#export
# Prefix/suffix interpolated into the saved-model and export filenames below
m_pre_standard_themes_meta = ''
m_suf_standard_themes_meta = '_multitask_hf'

# Default training configuration for the meta standard-themes task.
# `get_meta_standard_theme_train_data` and `get_meta_standard_theme_train_dls` merge caller
# overrides (`train_config`) on top of this dict.
meta_standard_themes_train_config = {
    'm_pre': m_pre_standard_themes_meta,
    'm_suf': m_suf_standard_themes_meta,
    
    'batch_size': 8,                            # passed as `bs` when building the DataLoaders
    'corpus_cols': ['theme', 'answer_text'],    # columns combined into the model input; rows with NaN here are dropped
    'corpus_suf': '_multitask',
    'train_data': STANDARD_THEME_META_PATH/'train.csv',
    'valid_data': STANDARD_THEME_META_PATH/'test.csv',
    'cache_data_path': STANDARD_THEME_META_PATH/'data_mm_standard_themes_meta.pkl',  # pickled DataLoaders cache
    
    # NOTE(review): presumably F-beta / threshold-search settings consumed outside this section; confirm
    'opt_beta': 0.5, 
    'opt_beta_average': 'binary',
    'opt_beta_sample_weight': None,
    'opt_start': 0.08, 
    'opt_end': 0.7,
    
    # NOTE(review): presumably read by a save-model callback and the export step elsewhere; confirm
    'save_model_monitor': 'valid_loss', 
    'save_model_comp': np.less,
    'save_model_filename': f'{m_pre_standard_themes_meta}mm_bestmodel{m_suf_standard_themes_meta}',
    'export_filename': f'{m_pre_standard_themes_meta}export_mm{m_suf_standard_themes_meta}.pkl',

    'learner_path': STANDARD_THEME_META_PATH
}

Prepare the data source

In [71]:
# Load the train/validation CSVs via the meta training config (it points at the same files)
train_df = pd.read_csv(meta_standard_themes_train_config['train_data'])
valid_df = pd.read_csv(meta_standard_themes_train_config['valid_data'])

In [72]:
train_df.head(2)

Unnamed: 0,id,question_ans_id,answer_text,answer_text_non_english,language,survey_id,survey_type_id,benchmark_survey_type,client_id,rsp_id,question_category_abbr,question_text,question_class,question_category_id,question_report_abbr,question_category_label,benchmark_level1,benchmark_level2,benchmark_level3,client_benchmark_level,group_code,group_id,group_level1_code,group_level1_name,group_level2_code,group_level2_name,group_level3_code,group_level3_name,group_level4_code,group_level4_name,group_level5_code,group_level5_name,group_level6_code,group_level6_name,group_level7_code,group_level7_name,group_level8_code,group_level8_name,standard_theme_id,theme,url_friendly_theme,theme_display_order,avg_sentiment,is_example
0,594369,12281,Handling things such as child support and wage garnishment/levies is inconsistent and has led to...,,English,215,15,CSS-FACULTY-STAFF-ONLY,UCSD,453733,Payroll,Let us know your suggestions on how to improve Payroll.,Verbatim-Dept-Improve,88.0,Payroll_Improve,Payroll,Human Resources,Payroll Services,Payroll,1.0,-1.0,-1,-1.0,,,,,,,,,,,,,,,,35,"Consistency in Policies, Information",ConsistencyInPoliciesInformation,1,2.0,0
1,589686,2576,Diversity is such an important part o UC mission and yet some places its not seen as much it is ...,,English,212,9,SAW,UCSD,447092,C&B,21. If you would like to elaborate on any of your answers to the conduct and behavioral question...,Verbatim,1240.0,Conduct & Behavioral - Comments,Conduct & Behavioral,,,,3.0,90890.0,3662,999999.0,UC San Diego,90000.0,VICE CHANCELLOR HEALTH SCIENCES,93000.0,SCHOOL OF MEDICINE,90890.0,DERMATOLOGY,,,,,,,,,17,Satisfied with Diversity Progams,SatisfiedWithDiversityProgams,1,2.5,0


In [73]:
train_df.is_example.dtype

dtype('int64')

Remove any rows where the "corpus_cols" are NaN

In [74]:
train_df.dropna(subset=meta_standard_themes_train_config['corpus_cols'], inplace=True)
valid_df.dropna(subset=meta_standard_themes_train_config['corpus_cols'], inplace=True)

In [75]:
#hide
# list(set(train_df.theme.unique()))

In [76]:
# Flag each split, then stack both into one frame so `ColSplitter(col='is_valid')` can separate them
train_df['is_valid'] = False
valid_df['is_valid'] = True
df = pd.concat([train_df, valid_df])
len(df)

10484

In [77]:
#export
def get_meta_standard_theme_train_data(train_config={}):
    """Load the meta standard-themes training data as a single DataFrame.

    `train_config` overrides are merged on top of `meta_standard_themes_train_config`.
    Each split is read from CSV, rows with NaN in `corpus_cols` are dropped, and an
    `is_valid` flag is set. If `valid_data` is configured, train and validation rows
    are concatenated; otherwise only the training rows are returned.
    """
    config = {**meta_standard_themes_train_config, **train_config}

    def _load_split(csv_path, is_valid):
        # Read one split, drop rows without corpus text, and tag it
        split_df = pd.read_csv(csv_path)
        split_df.dropna(subset=config['corpus_cols'], inplace=True)
        split_df['is_valid'] = is_valid
        return split_df

    train_df = _load_split(config['train_data'], False)

    valid_path = config.get('valid_data')
    if valid_path is None:
        return train_df

    return pd.concat([train_df, _load_split(valid_path, True)])

In [78]:
df = get_meta_standard_theme_train_data()
# `len(df) >= 0` can never fail; require at least one row so the check is meaningful
test(len(df), 0, operator.gt)

Using the mid-level `DataBlocks` API

In [79]:
#hide
# Sequence-classification setup for the meta task, using the checkpoint's default config
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)

# Fetch the blurr/Hugging Face objects (arch, config, tokenizer, model) for the checkpoint
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, 
                                                                               task=task, 
                                                                               config=config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [80]:
#hide
# One text input and two targets: a regression target and a categorical target
blocks = (
    HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer), 
    RegressionBlock(),
    CategoryBlock()
)

# Join the configured corpus columns of a row into one ': '-separated string.
# NOTE(review): the exported `get_meta_standard_theme_train_x` formats the text differently
# ('theme: ... comment: ...'); this scratch version is only used in this exploratory cell.
def get_x(inp): return ': '.join(inp[meta_standard_themes_train_config['corpus_cols']].values)

# `n_inp=1` marks the first block as the only input; the remaining two are targets
dblock = DataBlock(blocks=blocks, 
                   get_x=get_x, 
                   get_y=[ColReader('avg_sentiment'), ColReader('is_example')], 
                   splitter=ColSplitter(col='is_valid'), 
                   n_inp=1)

In [81]:
#hide
# Use the meta task's own batch size rather than the SAW config's
dls = dblock.dataloaders(df, bs=meta_standard_themes_train_config['batch_size'])

In [82]:
dls.vocab

(#2) [0,1]

In [83]:
dls.before_batch[0].hf_tokenizer.vocab_size

50265

In [84]:
#hide
print((f'The inputs vocab ({dls.before_batch[0].hf_tokenizer.vocab_size} items), '
       f'and the targets ({len(dls.vocab)} items)'))

The inputs vocab (50265 items), and the targets (2 items)


In [85]:
#hide
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape, b[2].shape, b[1].type(), b[2].type()

(3,
 torch.Size([8, 465]),
 torch.Size([8]),
 torch.Size([8]),
 'torch.cuda.FloatTensor',
 'torch.cuda.LongTensor')

In [86]:
#hide
# NOTE(review): the rendered batch shows raw survey verbatims that name individuals;
# consider clearing this output before sharing the notebook.
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,text_,category
0,"Supervisor Effectiveness/Resolves Staff Issues: In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. 
A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",1.0,0
1,"Evaluated Fairly: In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. 
She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",1.0,0


In [87]:
#export
def get_meta_standard_theme_train_x(inp, corpus_cols):
    "Build the model input string for one row: `'theme: '` followed by the `corpus_cols` values joined with `' comment: '`"
    # pull the configured columns out of the row, in the order given
    pieces = inp[corpus_cols].values
    return 'theme: ' + ' comment: '.join(pieces)

def get_meta_standard_theme_train_dls(df, hf_arch, hf_tokenizer, train_config={}, use_cache=True):
    """Return the `DataLoaders` for the meta standard-theme task, loading them from cache when allowed.

    Args:
        df: DataFrame holding the columns named in `config['corpus_cols']` plus
            `avg_sentiment`, `is_example` and `is_valid`.
        hf_arch: architecture name forwarded to `HF_TextBlock`.
        hf_tokenizer: tokenizer forwarded to `HF_TextBlock`.
        train_config: overrides merged on top of `meta_standard_themes_train_config`
            (`None` is treated as "no overrides").
        use_cache: when True and the file at `config['cache_data_path']` exists, the
            previously saved `DataLoaders` are returned instead of being rebuilt from `df`.
            Pass False to force a rebuild (which also refreshes the cache file).

    Returns:
        A `DataLoaders` whose batches are `(inputs, regression target, category target)`.
    """
    # `train_config` is only read, never mutated, so the shared `{}` default is harmless
    config = {**meta_standard_themes_train_config, **(train_config or {})}
    cache_path = config.get('cache_data_path')

    # Cache hit: return immediately. Without this early return the loaded object was
    # discarded and the DataLoaders were always rebuilt, making `use_cache` a no-op.
    if use_cache and cache_path is not None and os.path.isfile(cache_path):
        # NOTE(review): torch.load unpickles the file -- only point `cache_data_path` at trusted files
        dls = torch.load(cache_path)
        dls.bs = config['batch_size']
        return dls

    blocks = (
        HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer),
        RegressionBlock(),
        CategoryBlock()
    )

    # n_inp=1 -> the text block is the only input; the other two blocks are targets
    dblock = DataBlock(blocks=blocks,
                       get_x=partial(get_meta_standard_theme_train_x, corpus_cols=config['corpus_cols']),
                       get_y=[ColReader('avg_sentiment'), ColReader('is_example')],
                       splitter=ColSplitter(col='is_valid'),
                       n_inp=1)

    dls = dblock.dataloaders(df, bs=config['batch_size'])

    # persist the freshly built DataLoaders so the next cached call can reuse them
    if cache_path is not None: torch.save(dls, cache_path)

    return dls

Tests

In [88]:
# Build the DataLoaders with the stock configuration (cache allowed)
df = get_meta_standard_theme_train_data()
dls = get_meta_standard_theme_train_dls(df, hf_arch, hf_tokenizer)

# batch size must match the configured value; the vocab is expected to hold two entries
expected_bs = meta_standard_themes_train_config['batch_size']
test_eq(dls.bs, expected_bs)
test_eq(2, len(dls.vocab))

# a batch holds the inputs, then the regression target, then the classification target
b = dls.one_batch()
test_eq(len(b), 3)
test_eq([t.shape[0] for t in b[1:]], [dls.bs, dls.bs])

In [89]:
# Same checks as for the cache-enabled call, this time with the cache bypassed
dls = get_meta_standard_theme_train_dls(df, hf_arch, hf_tokenizer, use_cache=False)

# batch size must match the configured value; the vocab is expected to hold two entries
expected_bs = meta_standard_themes_train_config['batch_size']
test_eq(dls.bs, expected_bs)
test_eq(2, len(dls.vocab))

# a batch holds the inputs, then the regression target, then the classification target
b = dls.one_batch()
test_eq(len(b), 3)
test_eq([t.shape[0] for t in b[1:]], [dls.bs, dls.bs])

In [90]:
# Visual sanity check: render at most two samples from a batch
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,text_,category
0,"theme: Supervisor Effectiveness/Resolves Staff Issues comment: In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. 
A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",1.0,0
1,"theme: Supervisor Effectiveness/Resolves Staff Issues comment: The change from Aline to Dave Kimber has been a meaningful improvement for our department. Aline was pervasively corrosive, but Dave approaches employees with respect and is working toward resolving many of the problems she introduced; a change for which I'm grateful. Working for Dr. Sirlin is a truly exceptional experience. He actively and consistently works to improve diversity and inclusion within our lab, treats employees well, fosters our career growth, and is the best example I have worked with of a male leader in a position of power who treats female colleagues and employees as equals. Dr. Norbash is also an exemplar of inclusion and respect and sets a resoundingly positive and progressive tone for our department. It is a pleasure to work under the leadership of Drs. Sirlin and Norbash and I consistently learn from them. I would like to see improvement in the procedures for employee promotion and growth. The current structures make it difficult to promote my employees who have earned promotion. E.g., an employee shouldn't have to go through open recruitment and wait eight months for the job card they are already performing. If campus would like to better engage HS employees in the their mission, one idea would be providing educational resources that are more affordable, such as undergraduate courses for staff development, or really any affordable educational resources. I liked Dr. Chung's message asking for our prospective faith and positivity; Dr. Chung, I'm in.",3.0,0


## Cleanup

In [91]:
#hide
from nbdev.export import notebook2script
# Run nbdev's notebook-to-script conversion; the cell output lists each converted notebook
notebook2script()

Converted 00_utils.ipynb.
Converted 02a_verbatims-core.ipynb.
Converted 02b_verbatims-sentiment.ipynb.
Converted 02c_verbatims-standard-themes-saw-training.ipynb.
Converted 02d_verbatims-standard-themes-css-training.ipynb.
Converted 02e_verbatims-standard-themes-meta-training.ipynb.
Converted 99_verbatims-inference.ipynb.
Converted index.ipynb.
