In [1]:
#default_exp verbatims/core

In [2]:
#all_slow

In [3]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Verbatims - Core

> This module defines the training configuration objects for all verbatim ML tasks, DataBlocks, and helper methods to build DataLoaders for each

In [4]:
#export
import os, datetime
import sklearn.metrics as skm
from tritonlytics_ai.utils import *

from fastai.text.all import *
from transformers import *
from blurr.utils import *
from blurr.data.core import *
from blurr.modeling.core import *

from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

In [5]:
#hide
import pdb, gc

# pandas and plotting config
import seaborn as sns
sns.set_style('whitegrid')

plt.rcParams['figure.figsize'] = (9,6)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

from nbdev.showdoc import *
from fastcore.test import *

In [6]:
#hide
from fastai import __version__ as fa_version
from torch import __version__ as pt_version
from transformers import __version__ as hft_version

print(f'Using pytorch {pt_version}')
print(f'Using fastai {fa_version}')
print(f'Using transformers {hft_version}')

Using pytorch 1.7.1+cu110
Using fastai 2.2.7
Using transformers 4.3.3


In [7]:
#cuda
# Prefer GPU #1 (the device this notebook was run on), but fall back to GPU #0 on a
# single-GPU machine instead of raising on an invalid device ordinal.
n_gpus = torch.cuda.device_count()
if n_gpus > 0:
    torch.cuda.set_device(1 if n_gpus > 1 else 0)
    print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')
else:
    print('No CUDA device available; running on CPU')

Using GPU #1: GeForce GTX 1080 Ti


## Sentiment

Basic Configuration

In [8]:
#export
# Prefix / suffix / base name that every sentiment artifact filename is composed from
m_pre_sentiment = ''
m_suf_sentiment = '_multilabel_hf'
base_model_name_sentiment = 'verbatim_sent'

# Default training configuration for the sentiment task. The getters in this module merge
# caller overrides on top of it via `{**sentiment_train_config, **train_config}`.
sentiment_train_config = dict(
    # --- naming ---
    m_pre=m_pre_sentiment,
    m_suf=m_suf_sentiment,
    base_model_name=base_model_name_sentiment,

    # --- data: passed as `max_length`/`bs`, read with `pd.read_csv`, joined into the input text ---
    max_seq_length=None,
    batch_size=8,
    corpus_cols=['answer_text'],
    corpus_suf='_ans',
    train_data=SENTIMENT_CLS_PATH/'train.csv',
    valid_data=SENTIMENT_CLS_PATH/'test.csv',
    cache_data_path=SENTIMENT_CLS_PATH/f'data_{base_model_name_sentiment}.pkl',

    # --- threshold optimisation settings (consumed outside this section -- TODO confirm usage) ---
    opt_beta=1,
    opt_beta_average='weighted',
    opt_beta_sample_weight=None,
    opt_start=0.08,
    opt_end=0.7,

    # --- checkpointing / export (consumed outside this section -- TODO confirm usage) ---
    save_model_monitor='fbeta_score',
    save_model_comp=np.greater,
    save_model_filename=f'{m_pre_sentiment}{base_model_name_sentiment}{m_suf_sentiment}_bestmodel',
    export_filename=f'{m_pre_sentiment}{base_model_name_sentiment}{m_suf_sentiment}_export.pkl',

    learner_path=SENTIMENT_CLS_PATH,
)

Prepare the data source

In [9]:
# Read the train and validation CSVs named in the sentiment config, in that order
train_df, valid_df = (
    pd.read_csv(sentiment_train_config[split_key]) for split_key in ('train_data', 'valid_data')
)

In [10]:
train_df.head(2)

Unnamed: 0,id,question_ans_id,answer_text,answer_text_non_english,language,survey_id,survey_type_id,benchmark_survey_type,client_id,rsp_id,question_category_abbr,question_text,question_class,question_category_id,question_report_abbr,question_category_label,benchmark_level1,benchmark_level2,benchmark_level3,client_benchmark_level,group_code,group_id,group_level1_code,group_level1_name,group_level2_code,group_level2_name,group_level3_code,group_level3_name,group_level4_code,group_level4_name,group_level5_code,group_level5_name,group_level6_code,group_level6_name,group_level7_code,group_level7_name,group_level8_code,group_level8_name,overall_sentiment,is_very_positive,is_positive,is_very_negative,is_negative,is_suggestion,feels_threatened,has_profanity,is_nonsense
0,72661,1877,The major area of concern for me is the communication between the divisions/department and the p...,,English,110,9,SAW,UCSD,179154,SAT,"If you would like to elaborate on your responses above, or if you have any additional feedback r...",Verbatim,114.0,Comments re Work Environment at UCSD,Satisfaction with UC San Diego,,,,1.0,90860.0,3659,999999.0,UC San Diego,90000.0,VICE CHANCELLOR HEALTH SCIENCES,93000.0,SCHOOL OF MEDICINE,90800.0,MEDICINE,90860.0,CARDIOLOGY,,,,,,,2.0,0,0,0,1,0,0,0,0
1,162330,8475,"First and foremost, there needs to be more parking spots. FOR UNDERGRADUATE STUDENTS - not visit...",,English,132,7,CSS-STUDENT-ONLY,UCSD,190733,Parking,Let us know your suggestions on how to improve Campus Parking,Verbatim-Dept-Improve,9.0,Parking-Improve,Campus Parking,Parking & Transportation,Parking & Transportation,"Parking, Commuter Services",1.0,-1.0,-1,-1.0,,,,,,,,,,,,,,,,2.0,0,0,0,1,1,0,0,0


Remove any rows where the "corpus_cols" are NaN

In [11]:
# Drop, in place, every row that is missing any of the corpus columns
for split_df in (train_df, valid_df):
    split_df.dropna(subset=sentiment_train_config['corpus_cols'], inplace=True)

In [12]:
train_df.feels_threatened.value_counts(), valid_df.feels_threatened.value_counts()

(0    14039
 1      351
 Name: feels_threatened, dtype: int64,
 0    1560
 1      36
 Name: feels_threatened, dtype: int64)

In [13]:
#hide

# Add a "labels" column holding the space-delimited names of every label flagged on the row
# (for example, as we would with non-encoded labels)
def _active_label_names(row): return ' '.join(row.index[row.astype(bool)])

train_df['labels'] = train_df[SENT_LABELS[1:]].apply(_active_label_names, axis=1)
valid_df['labels'] = valid_df[SENT_LABELS[1:]].apply(_active_label_names, axis=1)

train_df[['labels'] + SENT_LABELS[1:]].head()

Unnamed: 0,labels,is_very_positive,is_positive,is_very_negative,is_negative,is_suggestion,feels_threatened,has_profanity,is_nonsense
0,is_negative,0,0,0,1,0,0,0,0
1,is_negative is_suggestion,0,0,0,1,1,0,0,0
2,is_negative,0,0,0,1,0,0,0,0
3,is_negative,0,0,0,1,0,0,0,0
4,is_negative,0,0,0,1,0,0,0,0


In [14]:
# Tag each split so a column-based splitter can separate them again, then stack them
for split_df, valid_flag in ((train_df, False), (valid_df, True)):
    split_df['is_valid'] = valid_flag
df = pd.concat([train_df, valid_df])
df.shape[0]

15986

In [15]:
#export
def get_sentiment_train_data(train_config={}, trg_labels=SENT_LABELS[1:]):
    """Load the sentiment training (and optional validation) CSV into a single DataFrame.

    `train_config` entries override `sentiment_train_config`. For each split, rows with a NaN in
    any of `corpus_cols` are dropped, the index is reset, a `labels` column is added containing
    the space-joined names of the truthy `trg_labels` columns, and `is_valid` is set to `False`
    (train) or `True` (validation).

    Returns the training frame alone when `valid_data` is absent or `None`; otherwise the
    concatenation of both frames. Each split keeps its own 0..n-1 index, so index labels repeat
    in the concatenated result.
    """
    cfg = {**sentiment_train_config, **train_config}

    def _load_split(csv_path, is_valid):
        # chained equivalents of the in-place dropna / reset_index
        split_df = pd.read_csv(csv_path).dropna(subset=cfg['corpus_cols']).reset_index(drop=True)
        split_df['labels'] = split_df[trg_labels].apply(
            lambda row: ' '.join(row.index[row.astype(bool)]), axis=1)
        split_df['is_valid'] = is_valid
        return split_df

    train_df = _load_split(cfg['train_data'], False)
    if cfg.get('valid_data') is None:
        return train_df

    return pd.concat([train_df, _load_split(cfg['valid_data'], True)])

In [16]:
df = get_sentiment_train_data()
# `len(df) >= 0` is always true, so the check must be strict to catch an empty load
test(len(df), 0, operator.gt)

Using the mid-level `DataBlocks` API

In [17]:
#hide
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)
# One output per sentiment label: the target vocab used below is SENT_LABELS[1:],
# not the S@W standard-theme label list
config.num_labels = len(SENT_LABELS[1:])

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, 
                                                                               task=task, 
                                                                               config=config)

In [18]:
#hide
def get_x(inp):
    "Join the configured corpus column value(s) of one row into a single input string"
    return ': '.join(inp[sentiment_train_config['corpus_cols']].values)

# inputs: HF-tokenized text; targets: already one-hot encoded sentiment columns
blocks = (
    HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model),
    MultiCategoryBlock(encoded=True, vocab=SENT_LABELS[1:]),
)

dblock = DataBlock(
    blocks=blocks,
    get_x=get_x,
    get_y=ColReader(SENT_LABELS[1:]),
    splitter=ColSplitter(col='is_valid'),
)

In [19]:
#hide
# Seed (fastai `set_seed`) before building the DataLoaders; must run before the next line
set_seed(TL_RAND_SEED)
# Batch size comes from the sentiment config; num_workers=0 keeps loading in the main process
dls = dblock.dataloaders(df, bs=sentiment_train_config['batch_size'], num_workers=0)

In [20]:
#hide
print(f'The Target vocab has ({len(dls.vocab)} items)')

The Target vocab has (8 items)


In [21]:
#hide
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape

(2, torch.Size([8, 512]), torch.Size([8, 8]))

In [22]:
#hide
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"So another survey has come around and I anticipate it will have the same effect as all the other surveys … basically nothing will change, or should I say any changes will not be for the good of the staff. This years rant will consist of two items. First lets discuss the eternal issue of lack of pay. We all know that working for a University results in a paycheck 15 to 20% less than typical outside jobs of the same type but right now I know of people that are 20 to 30% under the going rate while we sit and watch upper echelon employees get anywhere from $2000 to over $50,000 dollars pay raises since 2008. We are still trying to figure out how some people can get $251,249.96 in “Extra Pay”. It is understood that keeping quality people is important and pay is a very big incentive to keep these people but the message it sends is that we have plenty of money for people that already earn plenty of money but we cant cough up a quarter more an hour for the custodians or the grounds people let alone actually pay anyone else near what they are worth. It has been shown time and time again that to achieve higher wages your best bet is to get another job and be prepared to leave, then (and only then) the purse strings open. This sends the message that there is no reason for loyalty to the University just be a mercenary and you will get the pay increase, meanwhile the employee that is dedicated and long term receives nothing for that dedication and loyalty. I want to see what the reason for no pay raises this year will be. In the last several years it has been “The State keeps cutting our budget” … yes the state that only is about 10% of the UC budget effects every pay scale except when someone threatens to leave. Now that Prop 30 has passed, which I believe had a lot of help due to the veiled threat of “Vote for this or you will lose your job”, what will be the new invented reason for cutbacks. 
Hers one you can use … “Well, actually you guys are not really State employees you are Reagent employees and we cant afford to give you raises … unless you are already earning over $100,00 a year, then we got plenty of money. BTW it is interesting that the Treasurer of the Regents has earned an average of over $280,000 in “Extra Pay” over the last 5 years on top of her $470,000 base salary.",is_very_negative;is_negative;feels_threatened
1,"Staff should receive an EQUITY INCREASE.The effectiveness of the supervisor is an area that needs improving. Supervisors should not use their status to threaten subordinates in order to gain control. Supervisors should be responsible for their area by assisting their staff to succeed. They should not set their area up for failure. Supervisors should be professional and not carry negative attitudes because it affects the morale of the staff in their area. Supervisors should not excessively socialize during business hours - while subordinates are reprimanded if they socialize, supervisors get away without getting reprimanded. Supervisors should be as productive as, or even more productive than the staff in their area. Lack of effort and motivation from supervisors create feelings of bad morale. Supervisors should not abuse their staff by demanding high productivity, when the supervisors themself do not contribute to their success. Some supervisors do not have enough work to do - they delegate every work they have to their subordinates - even though their subordinates are extremely overworked, and then the supervisors are socializing among other supervisors for hours and they do not get reprimanded for being unproductive because the subordinates do have anyone they can go to, to report the unproductiveness of their supervisor.It is difficult to respect a supervisor if they do not have much knowledge in their area (when staff asks questions to their supervisor, they will not assist the staff, but the supervisors will simply say ""I don't know"" and they will not provide any guidance nor any resources to help the staff.) Yet this supervisor will demand that their subordinates respect them - respect is earned, it should not be taken for granted.A supervisor should not become a supervisor, just because they have been in the department for xx number of years. 
Currently, there is a supervisor who does not have much knowledge about their area, they stay afloat only because of the dedicated and knowledgeable staff that work under them; however, that supervisor does not see it that way and takes everything for granted and tries to intimidate the staff by threatening them. Intimidation and threats by a supervisor should not be a means to try to motivate staff - intimidation and threats will have a negative impact on staff. Supervisors should be respectful of deadlines in their area, even though they do not have any work to do for the deadline.",is_very_negative;is_negative;is_suggestion;feels_threatened


In [23]:
#hide
# save dataloaders
# torch.save(dls, STANDARD_THEME_SAW_PATH/f'data_cls_sentiment_multilabel_hf.pkl')
# dls = torch.load(STANDARD_THEME_SAW_PATH/f'data_cls_sentiment_multilabel_hf.pkl')

In [24]:
#export
def get_sentiment_train_x(inp, corpus_cols):
    "Build the input text for one row by joining its `corpus_cols` values with ': '"
    col_values = inp[corpus_cols].values
    return ': '.join(col_values)

def get_sentiment_train_dls(df, hf_arch, hf_config, hf_tokenizer, hf_model, 
                            vocab=SENT_LABELS[1:], train_config={}, use_cache=True):
    """Return the sentiment `DataLoaders`, reusing a pickled copy when allowed.

    `train_config` entries override `sentiment_train_config`. When `use_cache` is true and a file
    exists at `cache_data_path`, that file is loaded, its `bs` is set to the configured
    `batch_size`, and it is returned without consulting `df`, `vocab` or the HF objects.
    Otherwise the `DataLoaders` are built from `df` (split on its `is_valid` column, targets read
    from the `vocab` columns) and, whenever a `cache_data_path` is configured, saved there --
    this also happens when `use_cache=False`.
    """
    cfg = {**sentiment_train_config, **train_config}
    cache_file = cfg.get('cache_data_path')

    # NOTE(review): torch.load unpickles the file -- only point `cache_data_path` at trusted files
    if use_cache and cache_file is not None and os.path.isfile(cache_file):
        cached_dls = torch.load(cache_file)
        cached_dls.bs = cfg['batch_size']
        return cached_dls

    text_block = HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, max_length=cfg['max_seq_length'])
    target_block = MultiCategoryBlock(encoded=True, vocab=vocab)

    dblock = DataBlock(
        blocks=(text_block, target_block),
        get_x=partial(get_sentiment_train_x, corpus_cols=cfg['corpus_cols']),
        get_y=ColReader(vocab),
        splitter=ColSplitter(col='is_valid'),
    )

    # seed immediately before building the DataLoaders
    set_seed(TL_RAND_SEED)
    dls = dblock.dataloaders(df, bs=cfg['batch_size'], num_workers=0)
    if cache_file is not None:
        torch.save(dls, cache_file)

    return dls

Tests

In [25]:
# With the default use_cache=True this loads the pickled DataLoaders if the cache file already
# exists; otherwise it builds them from `df` and writes the cache
df = get_sentiment_train_data()
dls = get_sentiment_train_dls(df, hf_arch, hf_config, hf_tokenizer, hf_model)

# batch size and target vocab size must agree with the config and the label list
test_eq(dls.bs, sentiment_train_config['batch_size'])
test_eq(len(SENT_LABELS[1:]), len(dls.vocab))

# a batch has two elements, and the second has one column per label
b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [26]:
# use_cache=False skips loading any existing cache file and rebuilds the DataLoaders from `df`
dls = get_sentiment_train_dls(df, hf_arch, hf_config, hf_tokenizer, hf_model, use_cache=False)

# same invariants as the cached path
test_eq(dls.bs, sentiment_train_config['batch_size'])
test_eq(len(SENT_LABELS[1:]), len(dls.vocab))

b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [27]:
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"So another survey has come around and I anticipate it will have the same effect as all the other surveys … basically nothing will change, or should I say any changes will not be for the good of the staff. This years rant will consist of two items. First lets discuss the eternal issue of lack of pay. We all know that working for a University results in a paycheck 15 to 20% less than typical outside jobs of the same type but right now I know of people that are 20 to 30% under the going rate while we sit and watch upper echelon employees get anywhere from $2000 to over $50,000 dollars pay raises since 2008. We are still trying to figure out how some people can get $251,249.96 in “Extra Pay”. It is understood that keeping quality people is important and pay is a very big incentive to keep these people but the message it sends is that we have plenty of money for people that already earn plenty of money but we cant cough up a quarter more an hour for the custodians or the grounds people let alone actually pay anyone else near what they are worth. It has been shown time and time again that to achieve higher wages your best bet is to get another job and be prepared to leave, then (and only then) the purse strings open. This sends the message that there is no reason for loyalty to the University just be a mercenary and you will get the pay increase, meanwhile the employee that is dedicated and long term receives nothing for that dedication and loyalty. I want to see what the reason for no pay raises this year will be. In the last several years it has been “The State keeps cutting our budget” … yes the state that only is about 10% of the UC budget effects every pay scale except when someone threatens to leave. Now that Prop 30 has passed, which I believe had a lot of help due to the veiled threat of “Vote for this or you will lose your job”, what will be the new invented reason for cutbacks. 
Hers one you can use … “Well, actually you guys are not really State employees you are Reagent employees and we cant afford to give you raises … unless you are already earning over $100,00 a year, then we got plenty of money. BTW it is interesting that the Treasurer of the Regents has earned an average of over $280,000 in “Extra Pay” over the last 5 years on top of her $470,000 base salary.",is_very_negative;is_negative;feels_threatened
1,"Staff should receive an EQUITY INCREASE.The effectiveness of the supervisor is an area that needs improving. Supervisors should not use their status to threaten subordinates in order to gain control. Supervisors should be responsible for their area by assisting their staff to succeed. They should not set their area up for failure. Supervisors should be professional and not carry negative attitudes because it affects the morale of the staff in their area. Supervisors should not excessively socialize during business hours - while subordinates are reprimanded if they socialize, supervisors get away without getting reprimanded. Supervisors should be as productive as, or even more productive than the staff in their area. Lack of effort and motivation from supervisors create feelings of bad morale. Supervisors should not abuse their staff by demanding high productivity, when the supervisors themself do not contribute to their success. Some supervisors do not have enough work to do - they delegate every work they have to their subordinates - even though their subordinates are extremely overworked, and then the supervisors are socializing among other supervisors for hours and they do not get reprimanded for being unproductive because the subordinates do have anyone they can go to, to report the unproductiveness of their supervisor.It is difficult to respect a supervisor if they do not have much knowledge in their area (when staff asks questions to their supervisor, they will not assist the staff, but the supervisors will simply say ""I don't know"" and they will not provide any guidance nor any resources to help the staff.) Yet this supervisor will demand that their subordinates respect them - respect is earned, it should not be taken for granted.A supervisor should not become a supervisor, just because they have been in the department for xx number of years. 
Currently, there is a supervisor who does not have much knowledge about their area, they stay afloat only because of the dedicated and knowledgeable staff that work under them; however, that supervisor does not see it that way and takes everything for granted and tries to intimidate the staff by threatening them. Intimidation and threats by a supervisor should not be a means to try to motivate staff - intimidation and threats will have a negative impact on staff. Supervisors should be respectful of deadlines in their area, even though they do not have any work to do for the deadline.",is_very_negative;is_negative;is_suggestion;feels_threatened


## Standard Themes - S@W

Basic Configuration

In [28]:
#export
# Prefix / suffix / base name that every S@W standard-theme artifact filename is composed from
m_pre_standard_themes_saw = ''
m_suf_standard_themes_saw = '_multilabel_hf'
base_model_name_standard_themes_saw = 'verbatim_standard_theme_saw'

# Default training configuration for the S@W standard-themes task. The getters in this module
# merge caller overrides on top of it via `{**saw_standard_themes_train_config, **train_config}`.
saw_standard_themes_train_config = dict(
    # --- naming ---
    m_pre=m_pre_standard_themes_saw,
    m_suf=m_suf_standard_themes_saw,
    base_model_name=base_model_name_standard_themes_saw,

    # --- data: passed as `max_length`/`bs`, read with `pd.read_csv`, joined into the input text ---
    max_seq_length=None,
    batch_size=8,
    corpus_cols=['answer_text'],
    corpus_suf='_ans',
    train_data=STANDARD_THEME_SAW_PATH/'train.csv',
    valid_data=STANDARD_THEME_SAW_PATH/'test.csv',
    cache_data_path=STANDARD_THEME_SAW_PATH/f'data_{base_model_name_standard_themes_saw}.pkl',

    # --- threshold optimisation settings (consumed outside this section -- TODO confirm usage) ---
    opt_beta=0.5,
    opt_beta_average='weighted',
    opt_beta_sample_weight=None,
    opt_start=0.08,
    opt_end=0.7,

    # --- checkpointing / export (consumed outside this section -- TODO confirm usage) ---
    save_model_monitor='precision_score',
    save_model_comp=np.greater,
    save_model_filename=f'{m_pre_standard_themes_saw}{base_model_name_standard_themes_saw}{m_suf_standard_themes_saw}_bestmodel',
    export_filename=f'{m_pre_standard_themes_saw}{base_model_name_standard_themes_saw}{m_suf_standard_themes_saw}_export.pkl',

    learner_path=STANDARD_THEME_SAW_PATH,
)

Prepare the data source

In [29]:
# Read the train and validation CSVs named in the S@W standard-themes config, in that order
train_df, valid_df = (
    pd.read_csv(saw_standard_themes_train_config[split_key]) for split_key in ('train_data', 'valid_data')
)

In [30]:
train_df.head(2)

Unnamed: 0,id,question_ans_id,answer_text,answer_text_non_english,language,survey_id,survey_type_id,benchmark_survey_type,client_id,rsp_id,question_category_abbr,question_text,question_class,question_category_id,question_report_abbr,question_category_label,benchmark_level1,benchmark_level2,benchmark_level3,client_benchmark_level,group_code,group_id,group_level1_code,group_level1_name,group_level2_code,group_level2_name,group_level3_code,group_level3_name,group_level4_code,group_level4_name,group_level5_code,group_level5_name,group_level6_code,group_level6_name,group_level7_code,group_level7_name,group_level8_code,group_level8_name,adequate_staffing,advancement_and_training_opportunities,appropriate_stress_work_assigned_equitably,benefits,better_ways_recognized_participate_in_decisions,career_advancement,committed_to_diversity,communicates_essential_information,ethical_conduct_perform_responsibilities_spirit_of_cooperation,evaluated_fairly,experienced_discrimination,facilities_workspace_safety,faculty_value_contributions,favoritism_cliques,fear_of_retaliation_negative_consequences,feel_valued_by_department,flexibility_work_life_balance,good_use_of_skills,have_necessary_tools,have_voice_within_my_institution_valued_member_of_my_institution,internal_processes_effective,parking_transportation,salary_pay,satisfied_with_diversity_progams,supervisor_effectiveness_resolves_staff_issues
0,589315,1877,"From day one, my department and colleagues made me feel welcomed. Most importantly, I sincerely ...",,English,212,9,SAW,UCSD,451357,SAT,"If you would like to elaborate on your responses above, or if you have any additional feedback r...",Verbatim,114,Comments re Work Environment at UCSD,Satisfaction with UC San Diego,,,,1,11926.0,3267,999999.0,UC San Diego,10000.0,ACADEMIC AFFAIRS,10017.0,EVC,11926.0,EVC - TEACHING + LEARNING COMMONS,,,,,,,,,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,682696,1877,"I am overly satisfied with how my department is ran. My supervisor is very supportive, my Chair...",,English,401,9,SAW,UCSD,492788,SAT,"If you would like to elaborate on your responses above, or if you have any additional feedback r...",Verbatim,114,Comments re Work Environment at UCSD,Satisfaction with UC San Diego,,,,1,91510.0,3691,999999.0,UC San Diego,90000.0,VICE CHANCELLOR HEALTH SCIENCES,93000.0,SCHOOL OF MEDICINE,91500.0,PHARMACOLOGY,91510.0,BUSINESS OFFICE - STAFF,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1


In [31]:
train_df.survey_id.value_counts()

212    1629
401    1425
160     914
396     319
398     290
Name: survey_id, dtype: int64

Remove any rows where the "corpus_cols" are NaN

In [32]:
# Drop, in place, every row that is missing any of the corpus columns
for split_df in (train_df, valid_df):
    split_df.dropna(subset=saw_standard_themes_train_config['corpus_cols'], inplace=True)

In [33]:
#hide
STANDARD_THEME_SAW_LABELS

['adequate_staffing',
 'advancement_and_training_opportunities',
 'appropriate_stress_work_assigned_equitably',
 'benefits',
 'better_ways_recognized_participate_in_decisions',
 'career_advancement',
 'committed_to_diversity',
 'communicates_essential_information',
 'ethical_conduct_perform_responsibilities_spirit_of_cooperation',
 'evaluated_fairly',
 'experienced_discrimination',
 'facilities_workspace_safety',
 'faculty_value_contributions',
 'favoritism_cliques',
 'fear_of_retaliation_negative_consequences',
 'feel_valued_by_department',
 'flexibility_work_life_balance',
 'good_use_of_skills',
 'have_necessary_tools',
 'have_voice_within_my_institution_valued_member_of_my_institution',
 'internal_processes_effective',
 'parking_transportation',
 'salary_pay',
 'satisfied_with_diversity_progams',
 'supervisor_effectiveness_resolves_staff_issues']

In [34]:
#hide

# Add a "labels" column holding the space-delimited names of every theme flagged on the row
# (for example, as we would with non-encoded labels)
def _active_label_names(row): return ' '.join(row.index[row.astype(bool)])

train_df['labels'] = train_df[STANDARD_THEME_SAW_LABELS].apply(_active_label_names, axis=1)
valid_df['labels'] = valid_df[STANDARD_THEME_SAW_LABELS].apply(_active_label_names, axis=1)

train_df[['labels'] + STANDARD_THEME_SAW_LABELS].head()

Unnamed: 0,labels,adequate_staffing,advancement_and_training_opportunities,appropriate_stress_work_assigned_equitably,benefits,better_ways_recognized_participate_in_decisions,career_advancement,committed_to_diversity,communicates_essential_information,ethical_conduct_perform_responsibilities_spirit_of_cooperation,evaluated_fairly,experienced_discrimination,facilities_workspace_safety,faculty_value_contributions,favoritism_cliques,fear_of_retaliation_negative_consequences,feel_valued_by_department,flexibility_work_life_balance,good_use_of_skills,have_necessary_tools,have_voice_within_my_institution_valued_member_of_my_institution,internal_processes_effective,parking_transportation,salary_pay,satisfied_with_diversity_progams,supervisor_effectiveness_resolves_staff_issues
0,ethical_conduct_perform_responsibilities_spirit_of_cooperation feel_valued_by_department,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,feel_valued_by_department supervisor_effectiveness_resolves_staff_issues,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
2,fear_of_retaliation_negative_consequences,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,communicates_essential_information internal_processes_effective,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,satisfied_with_diversity_progams,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [35]:
# Tag each split so a column-based splitter can separate them again, then stack them
for split_df, valid_flag in ((train_df, False), (valid_df, True)):
    split_df['is_valid'] = valid_flag
df = pd.concat([train_df, valid_df])
df.shape[0]

5086

In [36]:
#export
def get_saw_standard_theme_train_data(train_config={}, trg_labels=STANDARD_THEME_SAW_LABELS):
    """Load the S@W standard-theme training (and optional validation) CSV into a single DataFrame.

    `train_config` entries override `saw_standard_themes_train_config`. For each split, rows with
    a NaN in any of `corpus_cols` are dropped, the index is reset, a `labels` column is added
    containing the space-joined names of the truthy `trg_labels` columns, and `is_valid` is set
    to `False` (train) or `True` (validation).

    Returns the training frame alone when `valid_data` is absent or `None`; otherwise the
    concatenation of both frames. Each split keeps its own 0..n-1 index, so index labels repeat
    in the concatenated result.
    """
    cfg = {**saw_standard_themes_train_config, **train_config}

    def _load_split(csv_path, is_valid):
        # chained equivalents of the in-place dropna / reset_index
        split_df = pd.read_csv(csv_path).dropna(subset=cfg['corpus_cols']).reset_index(drop=True)
        split_df['labels'] = split_df[trg_labels].apply(
            lambda row: ' '.join(row.index[row.astype(bool)]), axis=1)
        split_df['is_valid'] = is_valid
        return split_df

    train_df = _load_split(cfg['train_data'], False)
    if cfg.get('valid_data') is None:
        return train_df

    return pd.concat([train_df, _load_split(cfg['valid_data'], True)])

In [37]:
df = get_saw_standard_theme_train_data()
# `len(df) >= 0` is always true, so the check must be strict to catch an empty load
test(len(df), 0, operator.gt)

Using the mid-level `DataBlocks` API

In [38]:
#hide
pretrained_model_name = "roberta-base"
task = HF_TASKS_AUTO.SequenceClassification

# one output per S@W standard theme
config = AutoConfig.from_pretrained(pretrained_model_name)
config.num_labels = len(STANDARD_THEME_SAW_LABELS)

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(
    pretrained_model_name, task=task, config=config
)

In [39]:
#hide
def get_x(inp):
    "Join the configured corpus column value(s) of one row into a single input string"
    return ': '.join(inp[saw_standard_themes_train_config['corpus_cols']].values)

# inputs: HF-tokenized text; targets: already one-hot encoded standard-theme columns
blocks = (
    HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model),
    MultiCategoryBlock(encoded=True, vocab=STANDARD_THEME_SAW_LABELS),
)

dblock = DataBlock(
    blocks=blocks,
    get_x=get_x,
    get_y=ColReader(STANDARD_THEME_SAW_LABELS),
    splitter=ColSplitter(col='is_valid'),
)

In [40]:
#hide
# Seed (fastai `set_seed`) before building the DataLoaders; must run before the next line
set_seed(TL_RAND_SEED)
# Batch size comes from the S@W config; num_workers=0 keeps loading in the main process
dls = dblock.dataloaders(df, bs=saw_standard_themes_train_config['batch_size'], num_workers=0)

In [41]:
#hide
print(f'The Target vocab has ({len(dls.vocab)} items)')

The Target vocab has (25 items)


In [42]:
#hide
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape

(2, torch.Size([8, 456]), torch.Size([8, 25]))

In [43]:
#hide
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. 
She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",ethical_conduct_perform_responsibilities_spirit_of_cooperation;evaluated_fairly;favoritism_cliques;fear_of_retaliation_negative_consequences;supervisor_effectiveness_resolves_staff_issues
1,"Not sure what exactly contributes to this but UCSD is a very segregated and stoic environment. I am comfortable emailing or calling people who I've made connections with as it relates to my work but not certain that I could say about others or the willingness to want to engage with others/collaborate especially within my department. \r\n\r\nSecondly, while my supervisor might be ""newer"" to supervising, she has never given us the opportunity to offer feedback, doesn't build a relationship with us, mentor us, or effectively use meeting time to delegate tasks and give clear deadlines. I think she needs training, feedback, and people to hold her accountable. This is not new due to the pandemic, this has always been her style. \r\n\r\nGenerally, diversity, equity, and inclusion are terms university leaders use to address these issues when incidents arise. To this day, I have not seen my department discuss how they plan to implement diversity, equity, or inclusion. We don't have any Black, Latino, or Native faculty or students to my knowledge and when asked to the previous chair if we have a committee or task force dedicated to this, they said we have a couple of staff/faculty but no known mission (to my understanding). To our faculty of admissions, we are diverse because we bring in so many international students. To me this is just one facet of diversity, not everything diversity can be. To me, this indicates our faculty are not trained nor are not called to be trained on these concepts.",committed_to_diversity;supervisor_effectiveness_resolves_staff_issues


In [44]:
#hide
# save dataloaders
# torch.save(dls, STANDARD_THEME_SAW_PATH/f'data_cls_standard_themes_saw_multilabel_hf.pkl')
# dls = torch.load(STANDARD_THEME_SAW_PATH/f'data_cls_standard_themes_saw_multilabel_hf.pkl')

In [45]:
#export
def get_saw_standard_theme_train_x(inp, corpus_cols):
    "Build the model input text for a row by joining its `corpus_cols` values with ': '"
    column_values = inp[corpus_cols].values
    return ': '.join(column_values)

def get_saw_standard_theme_train_dls(df, hf_arch, hf_config, hf_tokenizer, hf_model, 
                                     vocab=STANDARD_THEME_SAW_LABELS, train_config={}, use_cache=True):
    """Return the `DataLoaders` for the SAW standard themes multilabel task, reusing a cached copy when allowed.

    `train_config` entries override `saw_standard_themes_train_config`. When `use_cache` is true and the
    configured 'cache_data_path' is an existing file, that file is loaded and returned (only `bs` is
    re-assigned); `df`, `vocab` and the `hf_*` objects are not consulted in that case. Otherwise the loaders
    are built from `df` and, whenever a cache path is configured, written to it (even if `use_cache=False`).
    """
    cfg = {**saw_standard_themes_train_config, **train_config}
    dls_cache_file = cfg.get('cache_data_path')

    cache_hit = use_cache and dls_cache_file is not None and os.path.isfile(dls_cache_file)
    if cache_hit:
        # torch.load unpickles the file, so the cache path should only ever point at trusted files
        cached_dls = torch.load(dls_cache_file)
        # NOTE(review): this assigns `bs` on the loaded object itself; confirm that it also changes the
        # batch size used by the underlying train/valid loaders
        cached_dls.bs = cfg['batch_size']
        return cached_dls

    text_block = HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, max_length=cfg['max_seq_length'])
    label_block = MultiCategoryBlock(encoded=True, vocab=vocab)
    row_to_text = partial(get_saw_standard_theme_train_x, corpus_cols=cfg['corpus_cols'])

    dblock = DataBlock(blocks=(text_block, label_block),
                       get_x=row_to_text,
                       get_y=ColReader(vocab),
                       splitter=ColSplitter(col='is_valid'))

    # seed immediately before the loaders are created
    set_seed(TL_RAND_SEED)
    dls = dblock.dataloaders(df, bs=cfg['batch_size'], num_workers=0)
    if dls_cache_file is not None: torch.save(dls, dls_cache_file)

    return dls

Tests

In [46]:
# build (or load from cache) the SAW DataLoaders with the default configuration
df = get_saw_standard_theme_train_data()
dls = get_saw_standard_theme_train_dls(df=df, hf_arch=hf_arch, hf_config=hf_config,
                                       hf_tokenizer=hf_tokenizer, hf_model=hf_model)

# batch size and target vocab must match the SAW defaults
test_eq(dls.bs, saw_standard_themes_train_config['batch_size'])
test_eq(len(STANDARD_THEME_SAW_LABELS), len(dls.vocab))

# a batch is (inputs, targets) with one target column per label
b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [47]:
# same checks as above, but force a rebuild instead of reading the cached DataLoaders
dls = get_saw_standard_theme_train_dls(df=df, hf_arch=hf_arch, hf_config=hf_config,
                                       hf_tokenizer=hf_tokenizer, hf_model=hf_model,
                                       use_cache=False)

# batch size and target vocab must match the SAW defaults
test_eq(dls.bs, saw_standard_themes_train_config['batch_size'])
test_eq(len(STANDARD_THEME_SAW_LABELS), len(dls.vocab))

# a batch is (inputs, targets) with one target column per label
b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [48]:
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. 
She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",ethical_conduct_perform_responsibilities_spirit_of_cooperation;evaluated_fairly;favoritism_cliques;fear_of_retaliation_negative_consequences;supervisor_effectiveness_resolves_staff_issues
1,"Not sure what exactly contributes to this but UCSD is a very segregated and stoic environment. I am comfortable emailing or calling people who I've made connections with as it relates to my work but not certain that I could say about others or the willingness to want to engage with others/collaborate especially within my department. \r\n\r\nSecondly, while my supervisor might be ""newer"" to supervising, she has never given us the opportunity to offer feedback, doesn't build a relationship with us, mentor us, or effectively use meeting time to delegate tasks and give clear deadlines. I think she needs training, feedback, and people to hold her accountable. This is not new due to the pandemic, this has always been her style. \r\n\r\nGenerally, diversity, equity, and inclusion are terms university leaders use to address these issues when incidents arise. To this day, I have not seen my department discuss how they plan to implement diversity, equity, or inclusion. We don't have any Black, Latino, or Native faculty or students to my knowledge and when asked to the previous chair if we have a committee or task force dedicated to this, they said we have a couple of staff/faculty but no known mission (to my understanding). To our faculty of admissions, we are diverse because we bring in so many international students. To me this is just one facet of diversity, not everything diversity can be. To me, this indicates our faculty are not trained nor are not called to be trained on these concepts.",committed_to_diversity;supervisor_effectiveness_resolves_staff_issues


## Standard Themes - CSS

Basic Configuration

In [49]:
#export
# Naming pieces for the CSS standard themes model artifacts (prefix + base name + suffix)
m_pre_standard_themes_css = ''
m_suf_standard_themes_css = '_multilabel_hf'
base_model_name_standard_themes_css = 'verbatim_standard_theme_css'

# Default training configuration for the CSS standard themes task. The data/DataLoaders helpers in this
# section merge a caller-supplied `train_config` dict over these defaults.
css_standard_themes_train_config = {
    'm_pre': m_pre_standard_themes_css,
    'm_suf': m_suf_standard_themes_css,
    'base_model_name': base_model_name_standard_themes_css,
    
    # `max_seq_length` is forwarded as `max_length` to `HF_TextBlock`; `batch_size` is the `bs` of the loaders
    'max_seq_length': None,
    'batch_size': 8,
    # text columns joined into the model input; rows with NaN in any of them are dropped when loading the CSVs
    'corpus_cols': ['answer_text'],
    'corpus_suf': '_ans',
    # training CSV and (optional) validation CSV read by `get_css_standard_theme_train_data`
    'train_data': STANDARD_THEME_CSS_PATH/'train.csv',
    'valid_data': STANDARD_THEME_CSS_PATH/'test.csv',
    # file the built DataLoaders are saved to / loaded from by `get_css_standard_theme_train_dls`
    'cache_data_path': STANDARD_THEME_CSS_PATH/f'data_{base_model_name_standard_themes_css}.pkl',
    
    # NOTE(review): the `opt_*` entries are not read anywhere in this section; presumably they are
    # threshold-optimization (F-beta) settings used at training/inference time -- confirm
    'opt_beta': 0.5, 
    'opt_beta_average': 'weighted',
    'opt_beta_sample_weight': None,
    'opt_start': 0.08, 
    'opt_end': 0.7,
    
    # NOTE(review): presumably consumed by the training code to pick/save the best model and to name the
    # exported learner -- confirm against the training notebook
    'save_model_monitor': 'precision_score', 
    'save_model_comp': np.greater,
    'save_model_filename': f'{m_pre_standard_themes_css}{base_model_name_standard_themes_css}{m_suf_standard_themes_css}_bestmodel',
    'export_filename': f'{m_pre_standard_themes_css}{base_model_name_standard_themes_css}{m_suf_standard_themes_css}_export.pkl',

    'learner_path': STANDARD_THEME_CSS_PATH
}

Prepare the data source

In [50]:
train_df = pd.read_csv(css_standard_themes_train_config['train_data'])
valid_df = pd.read_csv(css_standard_themes_train_config['valid_data'])

In [51]:
train_df.head(2)

Unnamed: 0,id,question_ans_id,answer_text,answer_text_non_english,language,survey_id,survey_type_id,benchmark_survey_type,client_id,rsp_id,question_category_abbr,question_text,question_class,question_category_id,question_report_abbr,question_category_label,benchmark_level1,benchmark_level2,benchmark_level3,client_benchmark_level,group_code,group_id,group_level1_code,group_level1_name,group_level2_code,group_level2_name,group_level3_code,group_level3_name,group_level4_code,group_level4_name,group_level5_code,group_level5_name,group_level6_code,group_level6_name,group_level7_code,group_level7_name,group_level8_code,group_level8_name,accessible_to_customers,consistency_in_policies_information,cost_fees,courteous_professional_staff,effective_communications,effectively_uses_websites_online_documentation,helpful_staff,knowledgeable_staff,moving_in_a_positive_direction,overall_satisfaction,process_improvement,provides_effective_advice_guidance,provides_training_on_processes_applications,resolves_problems_effectively,responds_to_requests_within_an_acceptable_time,understands_my_needs_and_requirements
0,602983,38342,"In the past 1-2 years, the office changed their policy to only give international postdocs J1 vi...",,English,215,15,CSS-FACULTY-STAFF-ONLY,UCSD,454199,ISPO,Let us know your suggestions on how to improve the International Students & Programs Office.,Verbatim-Dept-Improve,636.0,ISPO_Improve,International Students & Programs Office,Student,Global Education,International Program,1.0,-1.0,-1,-1.0,,,,,,,,,,,,,,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,597736,12605,"The custodians are kind and competent people. However, the unit, at least in the research buildi...",,English,215,15,CSS-FACULTY-STAFF-ONLY,UCSD,454041,Custodial Services,Let us know your suggestions on how to improve Custodial Services.,Verbatim-Dept-Improve,140.0,Custodial_Improve,Custodial Services,Facilities,"Building, Custodial, Facilities Maintenance",Custodial Services,1.0,-1.0,-1,-1.0,,,,,,,,,,,,,,,,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [52]:
train_df.survey_id.value_counts()

215    2818
124       1
131       1
Name: survey_id, dtype: int64

Remove any rows where the "corpus_cols" are NaN

In [53]:
train_df.dropna(subset=css_standard_themes_train_config['corpus_cols'], inplace=True)
valid_df.dropna(subset=css_standard_themes_train_config['corpus_cols'], inplace=True)

In [54]:
#hide
STANDARD_THEME_CSS_LABELS

['accessible_to_customers',
 'consistency_in_policies_information',
 'cost_fees',
 'courteous_professional_staff',
 'effective_communications',
 'effectively_uses_websites_online_documentation',
 'helpful_staff',
 'knowledgeable_staff',
 'moving_in_a_positive_direction',
 'overall_satisfaction',
 'process_improvement',
 'provides_effective_advice_guidance',
 'provides_training_on_processes_applications',
 'resolves_problems_effectively',
 'responds_to_requests_within_an_acceptable_time',
 'understands_my_needs_and_requirements']

In [55]:
#hide

# If we want to add a "labels" column with all the labels space delimited (for example, as we would with
# non-encoded labels)
train_df['labels'] = train_df[STANDARD_THEME_CSS_LABELS].apply(lambda x: ' '.join(x.index[x.astype(bool)]), axis=1)
valid_df['labels'] = valid_df[STANDARD_THEME_CSS_LABELS].apply(lambda x: ' '.join(x.index[x.astype(bool)]), axis=1)

train_df[['labels'] + STANDARD_THEME_CSS_LABELS].head()

Unnamed: 0,labels,accessible_to_customers,consistency_in_policies_information,cost_fees,courteous_professional_staff,effective_communications,effectively_uses_websites_online_documentation,helpful_staff,knowledgeable_staff,moving_in_a_positive_direction,overall_satisfaction,process_improvement,provides_effective_advice_guidance,provides_training_on_processes_applications,resolves_problems_effectively,responds_to_requests_within_an_acceptable_time,understands_my_needs_and_requirements
0,consistency_in_policies_information,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,courteous_professional_staff,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,courteous_professional_staff,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,resolves_problems_effectively,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,helpful_staff,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [56]:
train_df['is_valid'] = False
valid_df['is_valid'] = True
df = pd.concat([train_df, valid_df])
len(df)

3137

In [57]:
#export
def get_css_standard_theme_train_data(train_config={}, trg_labels=STANDARD_THEME_CSS_LABELS):
    """Load the CSS standard theme training data as a single `DataFrame`.

    `train_config` entries override `css_standard_themes_train_config`. Each CSV has the rows with NaN in
    any of the 'corpus_cols' removed, its index reset, a space-delimited `labels` column built from the
    truthy `trg_labels` columns, and an `is_valid` flag (False for 'train_data', True for 'valid_data').
    The validation rows are appended only when 'valid_data' is configured and not None.
    """
    cfg = {**css_standard_themes_train_config, **train_config}

    def _load_split(csv_path, is_valid):
        # read one split, keep only rows with complete corpus text and renumber them from 0
        split_df = pd.read_csv(csv_path).dropna(subset=cfg['corpus_cols']).reset_index(drop=True)
        split_df['labels'] = split_df[trg_labels].apply(lambda row: ' '.join(row.index[row.astype(bool)]), axis=1)
        split_df['is_valid'] = is_valid
        return split_df

    train_part = _load_split(cfg['train_data'], is_valid=False)
    if cfg.get('valid_data') is None: return train_part

    valid_part = _load_split(cfg['valid_data'], is_valid=True)
    return pd.concat([train_part, valid_part])

In [58]:
df = get_css_standard_theme_train_data()
# `len(df) >= 0` can never fail; require at least one row so an empty/failed load is actually caught
test(len(df), 0, operator.gt)

Using the mid-level `DataBlocks` API

In [59]:
#hide
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)
config.num_labels = len(STANDARD_THEME_CSS_LABELS)

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, 
                                                                               task=task, 
                                                                               config=config)

In [60]:
#hide
def get_x(inp):
    "Join the configured CSS corpus columns of a row into one ': '-separated string"
    text_cols = css_standard_themes_train_config['corpus_cols']
    return ': '.join(inp[text_cols].values)

# inputs: tokenized text; targets: one-hot encoded theme columns
blocks = (
    HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model),
    MultiCategoryBlock(encoded=True, vocab=STANDARD_THEME_CSS_LABELS),
)

dblock = DataBlock(
    blocks=blocks,
    get_x=get_x,
    get_y=ColReader(STANDARD_THEME_CSS_LABELS),
    splitter=ColSplitter(col='is_valid'),
)

In [61]:
#hide
set_seed(TL_RAND_SEED)
dls = dblock.dataloaders(df, bs=css_standard_themes_train_config['batch_size'], num_workers=0)

In [62]:
#hide
print(f'The Target vocab has ({len(dls.vocab)} items)')

The Target vocab has (16 items)


In [63]:
#hide
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape

(2, torch.Size([8, 331]), torch.Size([8, 16]))

In [64]:
#hide
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"Expand Shuttle Services throughout the Greater San Diego area beyond UCSD campus stops. \r\nI recently relocated to San Diego from Seattle, while in Seattle I worked for Microsoft for quite some time. Microsoft has developed a network of shuttles that connects neighborhoods directly to their campuses that run regularly Monday-Friday. San Diego MTS is not optimized to transport people to UCSD's campus so there are obviously many holes. Even once the trolley expansion is complete key neighborhoods will still not have a convenient option to get to UCSD via public transportation. Neighborhood-specific shuttles would help address these issues directly.\r\n\r\nCampus-wide create a mandate for all eligible employees to work from home at least one day a week.\r\nWhen I started with UCSD I was very surprised to find out how far behind UCSD is in terms of letting employees work from home. Having worked in the tech industry in Seattle my previous employer allowed a very flexible schedule, we would work from home as we saw fit, which usually amounted to approx. 2 days a week.\r\nPersonally, for my position at UCSD I could work from home 3-4 days a week joining all meetings via Zoom. The old school mentality of supervisors in my department makes it so my colleagues and I have to suffer through the awful commute to/from campus and pay for parking 5 days a week. I suspect this old school mentality is present in many parts of the campus, which indirectly tells employees that they are not trusted.",process_improvement
1,"The fact that maintenance crews tried to diagnose the symptoms of the problem and not the root of the problems I have requested something to be weary about. I worked in the Math Department and there was a leak coming from the 3rd floor into a 2nd floor storage room. This has resulted into damaged and rusty filing cabinets. \r\n\r\nMy department has placed requests numerous times over this issue over the years and nothing has been resolved. We had determined that it is cracked concrete allowing the water to leak down below from the sanitation machines above that will occasionally leak water. The project needs new resealing of the concrete and flooring on the 3rd floor and instead maintenance has suggested we cover the filing cabinets or ""just wait it out."" This is only treating the symptom and not the root of the problem. The problem was also passed off to 2 other facilities maintenance employees. My impression is there are two out of 4 tickets still open for this issue to this day.\r\n\r\nThe maintenance urgent phone line was very respectable and very friendly. They apologized on behalf of the maintenance workers for the lack of communication/follow-up/work.\r\n\r\nFacilities Maintenance on Campus is well-known to have a poor reputation of communication/responsibility/accountability/responsiveness. Perhaps this is a big reason as to why certain departments take matter into their own hands.",courteous_professional_staff;knowledgeable_staff;resolves_problems_effectively;responds_to_requests_within_an_acceptable_time


In [65]:
#hide
# save dataloaders
# torch.save(dls, STANDARD_THEME_CSS_PATH/f'data_cls_standard_themes_css_multilabel_hf.pkl')
# dls = torch.load(STANDARD_THEME_CSS_PATH/f'data_cls_standard_themes_css_multilabel_hf.pkl')

In [66]:
#export
def get_css_standard_theme_train_x(inp, corpus_cols):
    "Build the model input text for a row by joining its `corpus_cols` values with ': '"
    column_values = inp[corpus_cols].values
    return ': '.join(column_values)

def get_css_standard_theme_train_dls(df, hf_arch, hf_config, hf_tokenizer, hf_model, 
                                     vocab=STANDARD_THEME_CSS_LABELS, train_config={}, use_cache=True):
    """Return the `DataLoaders` for the CSS standard themes multilabel task, reusing a cached copy when allowed.

    `train_config` entries override `css_standard_themes_train_config`. When `use_cache` is true and the
    configured 'cache_data_path' is an existing file, that file is loaded and returned (only `bs` is
    re-assigned); `df`, `vocab` and the `hf_*` objects are not consulted in that case. Otherwise the loaders
    are built from `df` and, whenever a cache path is configured, written to it (even if `use_cache=False`).
    """
    cfg = {**css_standard_themes_train_config, **train_config}
    dls_cache_file = cfg.get('cache_data_path')

    cache_hit = use_cache and dls_cache_file is not None and os.path.isfile(dls_cache_file)
    if cache_hit:
        # torch.load unpickles the file, so the cache path should only ever point at trusted files
        cached_dls = torch.load(dls_cache_file)
        # NOTE(review): this assigns `bs` on the loaded object itself; confirm that it also changes the
        # batch size used by the underlying train/valid loaders
        cached_dls.bs = cfg['batch_size']
        return cached_dls

    text_block = HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, max_length=cfg['max_seq_length'])
    label_block = MultiCategoryBlock(encoded=True, vocab=vocab)
    row_to_text = partial(get_css_standard_theme_train_x, corpus_cols=cfg['corpus_cols'])

    dblock = DataBlock(blocks=(text_block, label_block),
                       get_x=row_to_text,
                       get_y=ColReader(vocab),
                       splitter=ColSplitter(col='is_valid'))

    # seed immediately before the loaders are created
    set_seed(TL_RAND_SEED)
    dls = dblock.dataloaders(df, bs=cfg['batch_size'], num_workers=0)
    if dls_cache_file is not None: torch.save(dls, dls_cache_file)

    return dls

Tests

In [67]:
# build (or load from cache) the CSS DataLoaders with the default configuration
df = get_css_standard_theme_train_data()
dls = get_css_standard_theme_train_dls(df, hf_arch, hf_config, hf_tokenizer, hf_model)

# compare against the CSS config (previously the SAW config, which only passed because both use bs=8)
test_eq(dls.bs, css_standard_themes_train_config['batch_size'])
test_eq(len(STANDARD_THEME_CSS_LABELS), len(dls.vocab))

# a batch is (inputs, targets) with one target column per label
b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [68]:
# same checks, but force a rebuild instead of reading the cached DataLoaders
dls = get_css_standard_theme_train_dls(df, hf_arch, hf_config, hf_tokenizer, hf_model, use_cache=False)

# compare against the CSS config (previously the SAW config, which only passed because both use bs=8)
test_eq(dls.bs, css_standard_themes_train_config['batch_size'])
test_eq(len(STANDARD_THEME_CSS_LABELS), len(dls.vocab))

# a batch is (inputs, targets) with one target column per label
b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [69]:
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"Expand Shuttle Services throughout the Greater San Diego area beyond UCSD campus stops. \r\nI recently relocated to San Diego from Seattle, while in Seattle I worked for Microsoft for quite some time. Microsoft has developed a network of shuttles that connects neighborhoods directly to their campuses that run regularly Monday-Friday. San Diego MTS is not optimized to transport people to UCSD's campus so there are obviously many holes. Even once the trolley expansion is complete key neighborhoods will still not have a convenient option to get to UCSD via public transportation. Neighborhood-specific shuttles would help address these issues directly.\r\n\r\nCampus-wide create a mandate for all eligible employees to work from home at least one day a week.\r\nWhen I started with UCSD I was very surprised to find out how far behind UCSD is in terms of letting employees work from home. Having worked in the tech industry in Seattle my previous employer allowed a very flexible schedule, we would work from home as we saw fit, which usually amounted to approx. 2 days a week.\r\nPersonally, for my position at UCSD I could work from home 3-4 days a week joining all meetings via Zoom. The old school mentality of supervisors in my department makes it so my colleagues and I have to suffer through the awful commute to/from campus and pay for parking 5 days a week. I suspect this old school mentality is present in many parts of the campus, which indirectly tells employees that they are not trusted.",process_improvement
1,"The fact that maintenance crews tried to diagnose the symptoms of the problem and not the root of the problems I have requested something to be weary about. I worked in the Math Department and there was a leak coming from the 3rd floor into a 2nd floor storage room. This has resulted into damaged and rusty filing cabinets. \r\n\r\nMy department has placed requests numerous times over this issue over the years and nothing has been resolved. We had determined that it is cracked concrete allowing the water to leak down below from the sanitation machines above that will occasionally leak water. The project needs new resealing of the concrete and flooring on the 3rd floor and instead maintenance has suggested we cover the filing cabinets or ""just wait it out."" This is only treating the symptom and not the root of the problem. The problem was also passed off to 2 other facilities maintenance employees. My impression is there are two out of 4 tickets still open for this issue to this day.\r\n\r\nThe maintenance urgent phone line was very respectable and very friendly. They apologized on behalf of the maintenance workers for the lack of communication/follow-up/work.\r\n\r\nFacilities Maintenance on Campus is well-known to have a poor reputation of communication/responsibility/accountability/responsiveness. Perhaps this is a big reason as to why certain departments take matter into their own hands.",courteous_professional_staff;knowledgeable_staff;resolves_problems_effectively;responds_to_requests_within_an_acceptable_time


## Standard Themes - Meta

Basic configuration

In [70]:
#export
# Naming pieces for the meta standard themes model artifacts (prefix + base name + suffix)
m_pre_standard_themes_meta = ''
m_suf_standard_themes_meta = '_multitask_hf'
base_model_name_standard_themes_meta = 'verbatim_standard_theme_meta'

# Default training configuration for the meta standard themes task. The data/DataLoaders helpers in this
# section merge a caller-supplied `train_config` dict over these defaults.
meta_standard_themes_train_config = {
    'm_pre': m_pre_standard_themes_meta,
    'm_suf': m_suf_standard_themes_meta,
    'base_model_name': base_model_name_standard_themes_meta,
    
    # `max_seq_length` is forwarded as `max_length` to `HF_TextBlock`; `batch_size` is the `bs` of the loaders
    'max_seq_length': None,
    'batch_size': 8,
    # columns combined into the model input (theme first, then the comment text); rows with NaN in any of
    # them are dropped when loading the CSVs
    'corpus_cols': ['theme', 'answer_text'],
    'corpus_suf': '_multitask',
    # training CSV and (optional) validation CSV read by `get_meta_standard_theme_train_data`
    'train_data': STANDARD_THEME_META_PATH/'train.csv',
    'valid_data': STANDARD_THEME_META_PATH/'test.csv',
    # file the built DataLoaders are saved to / loaded from by `get_meta_standard_theme_train_dls`
    'cache_data_path': STANDARD_THEME_META_PATH/f'data_{base_model_name_standard_themes_meta}.pkl',
    
    # NOTE(review): the `opt_*` entries are not read anywhere in this section; presumably they are
    # threshold-optimization (F-beta) settings used at training/inference time -- confirm
    'opt_beta': 0.5, 
    'opt_beta_average': 'binary',
    'opt_beta_sample_weight': None,
    'opt_start': 0.08, 
    'opt_end': 0.7,
    
    # NOTE(review): presumably consumed by the training code to pick/save the best model and to name the
    # exported learner -- confirm against the training notebook
    'save_model_monitor': 'valid_loss', 
    'save_model_comp': np.less,
    'save_model_filename': f'{m_pre_standard_themes_meta}{base_model_name_standard_themes_meta}{m_suf_standard_themes_meta}_bestmodel',
    'export_filename': f'{m_pre_standard_themes_meta}{base_model_name_standard_themes_meta}{m_suf_standard_themes_meta}_export.pkl',

    'learner_path': STANDARD_THEME_META_PATH
}

Prepare the data source

In [71]:
# read the paths from the config (as the other sections do) so this exploration follows any config change
train_df = pd.read_csv(meta_standard_themes_train_config['train_data'])
valid_df = pd.read_csv(meta_standard_themes_train_config['valid_data'])

In [72]:
train_df.head(2)

Unnamed: 0,id,question_ans_id,answer_text,answer_text_non_english,language,survey_id,survey_type_id,benchmark_survey_type,client_id,rsp_id,question_category_abbr,question_text,question_class,question_category_id,question_report_abbr,question_category_label,benchmark_level1,benchmark_level2,benchmark_level3,client_benchmark_level,group_code,group_id,group_level1_code,group_level1_name,group_level2_code,group_level2_name,group_level3_code,group_level3_name,group_level4_code,group_level4_name,group_level5_code,group_level5_name,group_level6_code,group_level6_name,group_level7_code,group_level7_name,group_level8_code,group_level8_name,standard_theme_id,theme,url_friendly_theme,theme_display_order,avg_sentiment,is_example
0,660454,93069,"""Academics at UC ANR value my contributions.""\r\n""Staff members at UC ANR value my contributions...",,English,396,47,SAW,UCANR,480552,,Please provide any additional feedback regarding the work environment at UC ANR. Your comments w...,Verbatim-Comments,1141.0,Comments re Work Environment at UC ANR,Comments,,,,3.0,250400.0,6984,999999.0,UC Agriculture & Natural Resources,200000.0,AVP Programs and Initiatives,250000.0,Strategic Institutes and Statewide Programs,250400.0,Statewide IPM,,,,,,,,,10,Have Voice within my Institution/Valued Member of my Institution,HaveVoiceWithinMyInstitutionValuedMemberOfMyInstitution,1,2.0,0
1,589692,2576,*The MSO of this department consistently takes unfair advantage of power dynamics to intimidate ...,,English,212,9,SAW,UCSD,447156,C&B,If you would like to elaborate on any of your answers to the conduct and behavioral questions ab...,Verbatim,1240.0,Conduct & Behavioral - Comments,Conduct & Behavioral,,,,3.0,10104.0,3437,999999.0,UC San Diego,10000.0,ACADEMIC AFFAIRS,10002.0,DIVISIONS/SCHOOLS,10003.0,ARTS & HUMANITIES,10104.0,MUSIC,,,,,,,19,Supervisor Effectiveness/Resolves Staff Issues,SupervisorEffectivenessResolvesStaffIssues,1,1.0,0


In [73]:
train_df.is_example.dtype

dtype('int64')

Remove any rows where the "corpus_cols" are NaN

In [74]:
train_df.dropna(subset=meta_standard_themes_train_config['corpus_cols'], inplace=True)
valid_df.dropna(subset=meta_standard_themes_train_config['corpus_cols'], inplace=True)

In [75]:
#hide
# list(set(train_df.theme.unique()))

In [76]:
train_df['is_valid'] = False
valid_df['is_valid'] = True
df = pd.concat([train_df, valid_df])
len(df)

13413

In [77]:
#export
def get_meta_standard_theme_train_data(train_config={}):
    """Load the meta standard theme training data as a single `DataFrame`.

    `train_config` entries override `meta_standard_themes_train_config`. Each CSV has the rows with NaN in
    any of the 'corpus_cols' removed, its index reset, and an `is_valid` flag added (False for 'train_data',
    True for 'valid_data'). The validation rows are appended only when 'valid_data' is configured and not None.
    """
    cfg = {**meta_standard_themes_train_config, **train_config}

    def _load_split(csv_path, is_valid):
        # read one split, keep only rows with complete corpus text and renumber them from 0
        split_df = pd.read_csv(csv_path).dropna(subset=cfg['corpus_cols']).reset_index(drop=True)
        split_df['is_valid'] = is_valid
        return split_df

    train_part = _load_split(cfg['train_data'], is_valid=False)
    if cfg.get('valid_data') is None: return train_part

    valid_part = _load_split(cfg['valid_data'], is_valid=True)
    return pd.concat([train_part, valid_part])

In [78]:
df = get_meta_standard_theme_train_data()
# `len(df) >= 0` can never fail; require at least one row so an empty/failed load is actually caught
test(len(df), 0, operator.gt)

Using the mid-level `DataBlocks` API

In [79]:
#hide
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, 
                                                                               task=task, 
                                                                               config=config)

In [80]:
#hide
def get_x(inp):
    "Join the configured meta corpus columns of a row into one ': '-separated string"
    text_cols = meta_standard_themes_train_config['corpus_cols']
    return ': '.join(inp[text_cols].values)

# one text input and two targets: `avg_sentiment` (regression) and `is_example` (category)
blocks = (
    HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model),
    RegressionBlock(),
    CategoryBlock(),
)

dblock = DataBlock(
    blocks=blocks,
    get_x=get_x,
    get_y=[ColReader('avg_sentiment'), ColReader('is_example')],
    splitter=ColSplitter(col='is_valid'),
    n_inp=1,
)

In [81]:
#hide
set_seed(TL_RAND_SEED)
# use the meta task's own batch size (this previously read the SAW config by copy-paste)
dls = dblock.dataloaders(df, bs=meta_standard_themes_train_config['batch_size'], num_workers=0)

In [82]:
dls.vocab

[0, 1]

In [83]:
dls.before_batch[0].hf_tokenizer.vocab_size

50265

In [84]:
#hide
print((f'The inputs vocab ({dls.before_batch[0].hf_tokenizer.vocab_size} items), '
       f'and the targets ({len(dls.vocab)} items)'))

The inputs vocab (50265 items), and the targets (2 items)


In [85]:
#hide
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape, b[2].shape, b[1].type(), b[2].type()

(3,
 torch.Size([8, 465]),
 torch.Size([8]),
 torch.Size([8]),
 'torch.cuda.FloatTensor',
 'torch.cuda.LongTensor')

In [86]:
#hide
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,text_,category
0,"Supervisor Effectiveness/Resolves Staff Issues: In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. 
A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",1.0,0
1,"Have Voice within my Institution/Valued Member of my Institution: It is tough to answer the questions this year. \r\nI used to know my resources, and feel positively about training. I do not feel the same since ESR. RSC has limited job functionality because OFC was rolled-out without appropriate reporting and or cost transfers. We have spent many frustrating hours trying to figure out how to access the systems and functions that are up and running (Path). We are left on the front lines with faculty and departments with inadequate tools to help.\r\nThe issues above do not make me feel valued as a member of the UC San Diego community or like leadership listens. It feels like no one even stopped to think about what a fund manager does, and how they will be able to continue those functions in the new systems. \r\nI realize now it is hard for me to comment on inclusion. I think we do a good job, or at least try to, but I am not in a group that would have ever felt excluded based on my background or orientation. \r\nAmount of work remains an issue. The job expands to the hours allotted. We could do a lot better, more thorough and thoughtful work if we had some space to breath and think. Instead we are running from one fire to the next. Many areas had to learn 1 new ESR system. We had to learn ALL of them. It's WAY too much. On top of already having more than enough work to fill every minute of a 40 hr week. And we have learned that somewhere in the ESR process, it was decided (without our knowledge/input/concurrence) that we would take on more duties. Yay.",2.0,0


In [87]:
#export
def get_meta_standard_theme_train_x(inp, corpus_cols):
    """Build the model input text for one row.

    The values of `corpus_cols` are read from `inp` in the order given, joined
    with `' comment: '` and prefixed with `'theme: '`. With two columns this
    produces `'theme: <first> comment: <second>'`.
    """
    fields = inp[corpus_cols].values
    return f"theme: {' comment: '.join(fields)}"

def get_meta_standard_theme_train_dls(df, hf_arch, hf_config, hf_tokenizer, hf_model, 
                                      train_config={}, use_cache=True):
    """Return the `DataLoaders` used to train the meta standard-theme model.

    The effective configuration is `meta_standard_themes_train_config` with any
    keys in `train_config` taking precedence.

    When `use_cache` is true, a `cache_data_path` is configured and that file
    exists, the `DataLoaders` are loaded from it, their `bs` attribute is set to
    the configured batch size, and they are returned without touching `df`.

    Otherwise a `DataBlock` is built with one text input and two targets
    (`avg_sentiment` as regression, `is_example` as category), split on the
    `is_valid` column. The result is written to `cache_data_path` whenever one
    is configured, regardless of `use_cache`.

    `hf_arch`, `hf_config`, `hf_tokenizer` and `hf_model` are forwarded unchanged
    to `HF_TextBlock`.
    """
    config = {**meta_standard_themes_train_config, **train_config}
    cache_path = config.get('cache_data_path')

    # Fast path: reuse previously serialized DataLoaders.
    # NOTE: torch.load unpickles the file, so only point this at trusted caches.
    if use_cache and cache_path is not None and os.path.isfile(cache_path):
        cached_dls = torch.load(cache_path)
        cached_dls.bs = config['batch_size']
        return cached_dls

    text_block = HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model,
                              max_length=config['max_seq_length'])

    dblock = DataBlock(
        blocks=(text_block, RegressionBlock(), CategoryBlock()),
        get_x=partial(get_meta_standard_theme_train_x, corpus_cols=config['corpus_cols']),
        get_y=[ColReader('avg_sentiment'), ColReader('is_example')],
        splitter=ColSplitter(col='is_valid'),
        n_inp=1,
    )

    # Seed immediately before building the loaders.
    set_seed(TL_RAND_SEED)
    dls = dblock.dataloaders(df, bs=config['batch_size'], num_workers=0)

    if cache_path is not None:
        torch.save(dls, cache_path)

    return dls

Tests

In [88]:
# Load the meta standard-theme training data.
df = get_meta_standard_theme_train_data()

# Default call (use_cache=True): may return DataLoaders loaded from the cache
# file if one is configured and already exists on disk.
dls = get_meta_standard_theme_train_dls(df, hf_arch, hf_config, hf_tokenizer, hf_model)

# Batch size should match the default training config; vocab is expected to have 2 entries.
test_eq(dls.bs, meta_standard_themes_train_config['batch_size'])
test_eq(2, len(dls.vocab))

# A batch is (input, regression target, classification target) since n_inp=1 with two get_y readers.
b = dls.one_batch()
test_eq(len(b), 3)
test_eq(b[1].shape[0], dls.bs)    # = regression task
test_eq(b[2].shape[0], dls.bs)    # = classification task

In [89]:
# Same checks as the default call, but with use_cache=False so the DataLoaders
# are always rebuilt from `df` instead of being read from the cache file.
dls = get_meta_standard_theme_train_dls(df, hf_arch, hf_config, hf_tokenizer, hf_model, use_cache=False)

# Batch size should match the default training config; vocab is expected to have 2 entries.
test_eq(dls.bs, meta_standard_themes_train_config['batch_size'])
test_eq(2, len(dls.vocab))

# A batch is (input, regression target, classification target).
b = dls.one_batch()
test_eq(len(b), 3)
test_eq(b[1].shape[0], dls.bs)    # = regression task
test_eq(b[2].shape[0], dls.bs)    # = classification task

In [90]:
# Visual sanity check: display at most two samples from the freshly built DataLoaders.
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,text_,category
0,"theme: Supervisor Effectiveness/Resolves Staff Issues comment: In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. 
A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",1.0,0
1,"theme: Have Voice within my Institution/Valued Member of my Institution comment: It is tough to answer the questions this year. \r\nI used to know my resources, and feel positively about training. I do not feel the same since ESR. RSC has limited job functionality because OFC was rolled-out without appropriate reporting and or cost transfers. We have spent many frustrating hours trying to figure out how to access the systems and functions that are up and running (Path). We are left on the front lines with faculty and departments with inadequate tools to help.\r\nThe issues above do not make me feel valued as a member of the UC San Diego community or like leadership listens. It feels like no one even stopped to think about what a fund manager does, and how they will be able to continue those functions in the new systems. \r\nI realize now it is hard for me to comment on inclusion. I think we do a good job, or at least try to, but I am not in a group that would have ever felt excluded based on my background or orientation. \r\nAmount of work remains an issue. The job expands to the hours allotted. We could do a lot better, more thorough and thoughtful work if we had some space to breath and think. Instead we are running from one fire to the next. Many areas had to learn 1 new ESR system. We had to learn ALL of them. It's WAY too much. On top of already having more than enough work to fill every minute of a 40 hr week. And we have learned that somewhere in the ESR process, it was decided (without our knowledge/input/concurrence) that we would take on more duties. Yay.",2.0,0


## Cleanup

In [91]:
#hide
# Run nbdev's notebook-to-script conversion; the "Converted ..." lines below are its output.
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 02a_verbatims-core.ipynb.
Converted 02b_verbatims-sentiment.ipynb.
Converted 02c_verbatims-standard-themes-saw-training.ipynb.
Converted 02d_verbatims-standard-themes-css-training.ipynb.
Converted 02e_verbatims-standard-themes-meta-training.ipynb.
Converted 99_verbatims-inference.ipynb.
Converted index.ipynb.
