In [1]:
#default_exp verbatims/core

In [2]:
#all_slow

In [3]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Verbatims - Core

> This module defines the training configuration objects for all verbatim ML tasks, DataBlocks, and helper methods to build DataLoaders for each

In [4]:
#export
import os, datetime
import sklearn.metrics as skm
from tritonlytics_ai.utils import *

from fastai.text.all import *
from transformers import *
from blurr.utils import *
from blurr.data.core import *
from blurr.modeling.core import *

In [5]:
#hide
import pdb, gc

# pandas and plotting config
import seaborn as sns
sns.set_style('whitegrid')

plt.rcParams['figure.figsize'] = (9,6)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

from nbdev.showdoc import *
from fastcore.test import *

In [6]:
#hide
from fastai import __version__ as fa_version
from torch import __version__ as pt_version
from transformers import __version__ as hft_version

print(f'Using pytorch {pt_version}')
print(f'Using fastai {fa_version}')
print(f'Using transformers {hft_version}')

Using pytorch 1.6.0
Using fastai 2.0.16
Using transformers 3.3.1


In [7]:
#cuda
torch.cuda.set_device(1)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


## Standard Themes - S@W

Basic Configuration

In [8]:
#export
# Filename prefix / suffix spliced into the model and export filenames built below.
m_pre_standard_themes_saw = ''
m_suf_standard_themes_saw = '_multilabel_hf'

# Default training configuration for the S@W standard-themes task.
# The data/DataLoaders helpers in this module merge caller-supplied overrides on top of this dict.
saw_standard_themes_train_config = {
    'm_suf': m_suf_standard_themes_saw,
    'm_pre': m_pre_standard_themes_saw,
    
    # batch size handed to `dblock.dataloaders(..., bs=...)`
    'batch_size': 8,
    # columns joined into the model input text; rows with NaN in any of them are dropped on load
    'corpus_cols': ['answer_text'],
    # NOTE(review): not read anywhere in this notebook - presumably consumed by the training code
    'corpus_suf': '_ans',
    # CSVs read by `get_saw_standard_theme_train_data`; 'valid_data' may be None to skip the validation split
    'train_data': STANDARD_THEME_SAW_PATH/'train.csv',
    'valid_data': STANDARD_THEME_SAW_PATH/'test.csv',
    # file the built DataLoaders are saved to / loaded from via torch.save / torch.load
    'cache_data_path': STANDARD_THEME_SAW_PATH/'data_cls_standard_themes_saw.pkl',
    
    # NOTE(review): the 'opt_*' keys are not read in this notebook - they look like F-beta /
    # threshold-search settings for the training code; confirm against their consumer
    'opt_beta': 0.5, 
    'opt_beta_average': 'weighted',
    'opt_beta_sample_weight': None,
    'opt_start': 0.08, 
    'opt_end': 0.7,
    
    # NOTE(review): presumably the metric name and comparison op used when saving the best model
    # during training - not read in this notebook; confirm against the training code
    'save_model_monitor': 'precision_score', 
    'save_model_comp': np.greater,
    'save_model_filename': f'{m_pre_standard_themes_saw}cls_bestmodel{m_suf_standard_themes_saw}',
    'export_filename': f'{m_pre_standard_themes_saw}export_clas{m_suf_standard_themes_saw}.pkl',

    # NOTE(review): presumably the base path given to the Learner - not read in this notebook
    'learner_path': STANDARD_THEME_SAW_PATH
}

Prepare the data source

In [9]:
train_df = pd.read_csv(saw_standard_themes_train_config['train_data'])
valid_df = pd.read_csv(saw_standard_themes_train_config['valid_data'])

In [10]:
train_df.head(2)

Unnamed: 0,id,question_ans_id,answer_text,answer_text_non_english,language,survey_id,survey_type_id,benchmark_survey_type,client_id,rsp_id,question_category_abbr,question_text,question_class,question_category_id,question_report_abbr,question_category_label,benchmark_level1,benchmark_level2,benchmark_level3,client_benchmark_level,group_code,group_id,group_level1_code,group_level1_name,group_level2_code,group_level2_name,group_level3_code,group_level3_name,group_level4_code,group_level4_name,group_level5_code,group_level5_name,group_level6_code,group_level6_name,group_level7_code,group_level7_name,group_level8_code,group_level8_name,adequate_staffing,advancement_and_training_opportunities,appropriate_stress_work_assigned_equitably,benefits,better_ways_recognized_participate_in_decisions,career_advancement,committed_to_diversity,communicates_essential_information,ethical_conduct_perform_responsibilities_spirit_of_cooperation,evaluated_fairly,experienced_discrimination,facilities_workspace_safety,faculty_value_contributions,favoritism_cliques,fear_of_retaliation_negative_consequences,feel_valued_by_department,flexibility_work_life_balance,good_use_of_skills,have_necessary_tools,have_voice_within_my_institution_valued_member_of_my_institution,internal_processes_effective,parking_transportation,salary_pay,satisfied_with_diversity_progams,supervisor_effectiveness_resolves_staff_issues
0,588941,1877,I enjoy our department potlucks and social activities. I am grateful that our department leader...,,English,212,9,SAW,UCSD,449396,,"59. If you would like to elaborate on your responses above, or if you have any additional feedba...",Verbatim,201,Comments re Work Environment at UCSD,Other,,,,1,24000.0,3834,999999.0,UC San Diego,8000.0,MARINE SCIENCES,24000.0,DIRECTORS OFFICE-SIO,,,,,,,,,,,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1,402308,1877,"UCSD needs to keep up with the modern workplace in order to get stronger, more competitive candi...",,English,160,9,SAW,UCSD,272347,,"59. If you would like to elaborate on your responses above, or if you have any additional feedba...",Verbatim,201,Comments re Work Environment at UCSD,Other,,,,1,31404.0,3761,999999.0,UC San Diego,30000.0,Student Affairs,31400.0,Student Life,31404.0,Student Life Business Office,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [11]:
train_df.survey_id.value_counts()

212    1635
160     909
396     304
398     284
401      12
Name: survey_id, dtype: int64

Remove any rows where the "corpus_cols" are NaN

In [12]:
train_df.dropna(subset=saw_standard_themes_train_config['corpus_cols'], inplace=True)
valid_df.dropna(subset=saw_standard_themes_train_config['corpus_cols'], inplace=True)

In [13]:
#hide
STANDARD_THEME_SAW_LABELS

['adequate_staffing',
 'advancement_and_training_opportunities',
 'appropriate_stress_work_assigned_equitably',
 'benefits',
 'better_ways_recognized_participate_in_decisions',
 'career_advancement',
 'committed_to_diversity',
 'communicates_essential_information',
 'ethical_conduct_perform_responsibilities_spirit_of_cooperation',
 'evaluated_fairly',
 'experienced_discrimination',
 'facilities_workspace_safety',
 'faculty_value_contributions',
 'favoritism_cliques',
 'fear_of_retaliation_negative_consequences',
 'feel_valued_by_department',
 'flexibility_work_life_balance',
 'good_use_of_skills',
 'have_necessary_tools',
 'have_voice_within_my_institution_valued_member_of_my_institution',
 'internal_processes_effective',
 'parking_transportation',
 'salary_pay',
 'satisfied_with_diversity_progams',
 'supervisor_effectiveness_resolves_staff_issues']

In [14]:
#hide

# If we want to add a "labels" column with all the labels space delimited (for example, as we would with
# non-encoded labels)
train_df['labels'] = train_df[STANDARD_THEME_SAW_LABELS].apply(lambda x: ' '.join(x.index[x.astype(bool)]), axis=1)
valid_df['labels'] = valid_df[STANDARD_THEME_SAW_LABELS].apply(lambda x: ' '.join(x.index[x.astype(bool)]), axis=1)

train_df[['labels'] + STANDARD_THEME_SAW_LABELS].head()

Unnamed: 0,labels,adequate_staffing,advancement_and_training_opportunities,appropriate_stress_work_assigned_equitably,benefits,better_ways_recognized_participate_in_decisions,career_advancement,committed_to_diversity,communicates_essential_information,ethical_conduct_perform_responsibilities_spirit_of_cooperation,evaluated_fairly,experienced_discrimination,facilities_workspace_safety,faculty_value_contributions,favoritism_cliques,fear_of_retaliation_negative_consequences,feel_valued_by_department,flexibility_work_life_balance,good_use_of_skills,have_necessary_tools,have_voice_within_my_institution_valued_member_of_my_institution,internal_processes_effective,parking_transportation,salary_pay,satisfied_with_diversity_progams,supervisor_effectiveness_resolves_staff_issues
0,committed_to_diversity feel_valued_by_department satisfied_with_diversity_progams,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1,have_voice_within_my_institution_valued_member_of_my_institution,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,ethical_conduct_perform_responsibilities_spirit_of_cooperation experienced_discrimination,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,adequate_staffing salary_pay,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,appropriate_stress_work_assigned_equitably faculty_value_contributions,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
train_df['is_valid'] = False
valid_df['is_valid'] = True
df = pd.concat([train_df, valid_df])
len(df)

3494

In [16]:
#export
def get_saw_standard_theme_train_data(train_config={}, trg_labels=STANDARD_THEME_SAW_LABELS):
    """Load the S@W standard-themes training data (and validation data, if configured).

    Args:
        train_config: overrides merged on top of `saw_standard_themes_train_config`.
        trg_labels: label column names; the names of the truthy ones are space-joined
            into a `labels` column for each row.

    Returns:
        A DataFrame with `labels` and `is_valid` columns added. When `config['valid_data']`
        is set, the validation rows (`is_valid=True`) are concatenated after the training
        rows (`is_valid=False`); otherwise only the training frame is returned.
    """
    config = {**saw_standard_themes_train_config, **train_config}

    def _read_split(csv_path, is_valid):
        # Drop rows missing any corpus column, then append `labels` and `is_valid` (in that order).
        split_df = pd.read_csv(csv_path)
        split_df.dropna(subset=config['corpus_cols'], inplace=True)
        split_df['labels'] = split_df[trg_labels].apply(
            lambda row: ' '.join(row.index[row.astype(bool)]), axis=1)
        split_df['is_valid'] = is_valid
        return split_df

    train_split = _read_split(config['train_data'], False)

    valid_path = config.get('valid_data')
    if valid_path is None:
        return train_split

    valid_split = _read_split(valid_path, True)
    return pd.concat([train_split, valid_split])

In [17]:
df = get_saw_standard_theme_train_data()
# A length is always >= 0, so comparing with `operator.ge` could never fail;
# require at least one row so that an empty load is actually caught.
test(len(df), 0, operator.gt)

Using the mid-level `DataBlocks` API

In [18]:
#hide
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)
config.num_labels = len(STANDARD_THEME_SAW_LABELS)

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, 
                                                                               task=task, 
                                                                               config=config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [19]:
#hide
blocks = (
    HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer), 
    MultiCategoryBlock(encoded=True, vocab=STANDARD_THEME_SAW_LABELS)
)

def get_x(inp):
    """Join the S@W corpus columns of one row into a single ': '-separated string."""
    corpus_cols = saw_standard_themes_train_config['corpus_cols']
    return ': '.join(inp[corpus_cols].values)

dblock = DataBlock(blocks=blocks, 
                   get_x=get_x, 
                   get_y=ColReader(STANDARD_THEME_SAW_LABELS), 
                   splitter=ColSplitter(col='is_valid'))

In [20]:
#hide
dls = dblock.dataloaders(df, bs=saw_standard_themes_train_config['batch_size'])

In [21]:
#hide
print(f'The Target vocab has ({len(dls.vocab)} items)')

The Target vocab has (25 items)


In [22]:
#hide
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape

(2, torch.Size([8, 456]), torch.Size([8, 25]))

In [23]:
#hide
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. 
She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",ethical_conduct_perform_responsibilities_spirit_of_cooperation;evaluated_fairly;favoritism_cliques;fear_of_retaliation_negative_consequences;supervisor_effectiveness_resolves_staff_issues
1,"The campus should ensure that each dept:\r\n-carefully reviews job descriptions and knows who is doing what work so that staff can be appropriately recognized and compensated for the work they do, as opposed to that recognition and compensation going to someone else.\r\n-has succession plans for each area; as staff leave/retire this should make it possible for current staff to move up. The campus should ensure staff are not required to leave their unit simply to advance professionally.\r\n-MSO mentors the managers within the unit so that the managers can be the most effective. MSOs also need to actually allow supervisors to supervise.\r\n-head (staff and faculty) is held to a non-tolerance standard for speaking negatively about staff within their unit and/or calling names. \r\n-head (staff and faculty) and each manager is transparent and timely in the communication of information to members of the unit. Instead of excuses and apologies, corrective actions should be taken and repeats of problematic situations should not reoccur.\r\n-has adequate funds and adequate space for the members of their unit, and the customers the unit serves. All staff areas within the unit should have comparable accommodations.\r\n-is safe for staff. Panic buttons/alarms, safety and active shooter training, and appropriate access barriers are absolutely necessary for all areas and in all units.\r\n-allows flex-schedules and work-from-home in as much as possible. “Service” should be reimagined to match current technology.",fear_of_retaliation_negative_consequences;salary_pay


In [24]:
#hide
# save dataloaders
# torch.save(dls, STANDARD_THEME_SAW_PATH/f'data_cls_standard_themes_saw_multilabel_hf.pkl')
# dls = torch.load(STANDARD_THEME_SAW_PATH/f'data_cls_standard_themes_saw_multilabel_hf.pkl')

In [25]:
#export
def get_saw_standard_theme_train_x(inp, corpus_cols):
    """Build the model input text for one row.

    Args:
        inp: a row supporting `inp[corpus_cols].values` (e.g. a pandas Series).
        corpus_cols: list of column names whose string values make up the text.

    Returns:
        The values of `corpus_cols`, in order, joined with ': '.
    """
    text_parts = inp[corpus_cols].values
    return ': '.join(text_parts)

def get_saw_standard_theme_train_dls(df, hf_arch, hf_tokenizer, vocab=STANDARD_THEME_SAW_LABELS, 
                                     train_config={}, use_cache=True):
    """Build (or load from cache) the `DataLoaders` for the S@W standard-themes multi-label task.

    Args:
        df: DataFrame holding the corpus columns, one column per label in `vocab`, and an
            `is_valid` column used to split train/validation.
        hf_arch: architecture identifier forwarded to `HF_TextBlock`.
        hf_tokenizer: tokenizer forwarded to `HF_TextBlock`.
        vocab: label column names; used as the target vocab and as the columns read for `y`.
        train_config: overrides merged on top of `saw_standard_themes_train_config` (not mutated).
        use_cache: when True and a file exists at `config['cache_data_path']`, the cached
            `DataLoaders` are returned instead of being rebuilt from `df`.

    Returns:
        The `DataLoaders` for the task. Freshly built loaders are also written to
        `config['cache_data_path']` when that key is present and not None.
    """
    config = {**saw_standard_themes_train_config, **train_config}
    cache_path = config.get('cache_data_path')

    # Cache hit: return immediately. Previously the loaded object was discarded because
    # execution fell through and rebuilt (and re-saved) the DataLoaders on every call.
    if use_cache and cache_path is not None and os.path.isfile(cache_path):
        # torch.load unpickles the file - only point 'cache_data_path' at files you trust.
        dls = torch.load(cache_path)
        # NOTE(review): this sets `bs` on the DataLoaders wrapper; confirm it also reaches the
        # underlying train/valid loaders if the cached batch size can differ from the config.
        dls.bs = config['batch_size']
        return dls

    blocks = (
        HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer),
        MultiCategoryBlock(encoded=True, vocab=vocab)
    )

    dblock = DataBlock(blocks=blocks,
                       get_x=partial(get_saw_standard_theme_train_x, corpus_cols=config['corpus_cols']),
                       get_y=ColReader(vocab),
                       splitter=ColSplitter(col='is_valid'))

    dls = dblock.dataloaders(df, bs=config['batch_size'])

    # The cache is refreshed on every rebuild, including when `use_cache=False`.
    if cache_path is not None: torch.save(dls, cache_path)

    return dls

Tests

In [26]:
df = get_saw_standard_theme_train_data()
dls = get_saw_standard_theme_train_dls(df, hf_arch, hf_tokenizer)

test_eq(dls.bs, saw_standard_themes_train_config['batch_size'])
test_eq(len(STANDARD_THEME_SAW_LABELS), len(dls.vocab))

b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [27]:
dls = get_saw_standard_theme_train_dls(df, hf_arch, hf_tokenizer, use_cache=False)

test_eq(dls.bs, saw_standard_themes_train_config['batch_size'])
test_eq(len(STANDARD_THEME_SAW_LABELS), len(dls.vocab))

b = dls.one_batch()
test_eq(len(b), 2)
test_eq(b[1].shape[1], len(dls.vocab))

In [28]:
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. 
She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",ethical_conduct_perform_responsibilities_spirit_of_cooperation;evaluated_fairly;favoritism_cliques;fear_of_retaliation_negative_consequences;supervisor_effectiveness_resolves_staff_issues
1,"In high-visibility areas, workload inequities have gone down, but in smaller units with less exposure to outside groups, these imbalances are fairly common. Over time this poisons office culture and causes our more skilled, liked, and hardworking staff to leave the department. The inability of middle-management to redress these issues contributes to our high turnover rates and poor perception among the greater UCSD community.\r\n\r\nFor years we've heard that groups within our department, VCHS, and beyond are too'silo'd.' While it may be that duplication of effort has decreased with some services becoming centralized, the spirits of communication, cooperation, and especially collaboration are not alive and well. More than ever groups jealously guard their secrets and responsibilities, or even burn bridges with others over petty displays of power. Rumor is rife and discretion lacking.\r\n\r\nWe are all family at UCSD, and word gets around fast. Our challenge is to clean up staff workloads and get our middle-management some leadership training. Many are soft skill-deficient, and in misguided attempts to entrench themselves in the confines of their immediate staff for protection, they lose respect and the ability to work with others.\r\n\r\nLastly, hard skills are easy to pick up. We really ought to be recruiting people for their character, work ethic, and desire to contribute to the mission. We should encourage, promote, and demonstrate that mission to all employees and get us working together.",appropriate_stress_work_assigned_equitably;committed_to_diversity;communicates_essential_information;supervisor_effectiveness_resolves_staff_issues


## Standard Themes - Meta

Basic configuration

In [29]:
#export
# Filename prefix / suffix spliced into the model and export filenames built below.
m_pre_standard_themes_meta = ''
m_suf_standard_themes_meta = '_multitask_hf'

# Default training configuration for the standard-themes "meta" multi-task model.
# The data/DataLoaders helpers in this module merge caller-supplied overrides on top of this dict.
meta_standard_themes_train_config = {
    'm_pre': m_pre_standard_themes_meta,
    'm_suf': m_suf_standard_themes_meta,
    
    # batch size handed to `dblock.dataloaders(..., bs=...)`
    'batch_size': 8,
    # columns joined into the model input text; rows with NaN in any of them are dropped on load
    'corpus_cols': ['theme', 'answer_text'],
    # NOTE(review): not read anywhere in this notebook - presumably consumed by the training code
    'corpus_suf': '_multitask',
    # CSVs read by `get_meta_standard_theme_train_data`; 'valid_data' may be None to skip the validation split
    'train_data': STANDARD_THEME_META_PATH/'train.csv',
    'valid_data': STANDARD_THEME_META_PATH/'test.csv',
    # file the built DataLoaders are saved to / loaded from via torch.save / torch.load
    'cache_data_path': STANDARD_THEME_META_PATH/'data_mm_standard_themes_meta.pkl',
    
    # NOTE(review): the 'opt_*' keys are not read in this notebook - they look like F-beta /
    # threshold-search settings for the training code; confirm against their consumer
    'opt_beta': 0.5, 
    'opt_beta_average': 'binary',
    'opt_beta_sample_weight': None,
    'opt_start': 0.08, 
    'opt_end': 0.7,
    
    # NOTE(review): presumably the metric name and comparison op used when saving the best model
    # during training - not read in this notebook; confirm against the training code
    'save_model_monitor': 'valid_loss', 
    'save_model_comp': np.less,
    'save_model_filename': f'{m_pre_standard_themes_meta}mm_bestmodel{m_suf_standard_themes_meta}',
    'export_filename': f'{m_pre_standard_themes_meta}export_mm{m_suf_standard_themes_meta}.pkl',

    # NOTE(review): presumably the base path given to the Learner - not read in this notebook
    'learner_path': STANDARD_THEME_META_PATH
}

Prepare the data source

In [30]:
# Read the paths from the task config (as the S@W section does) so this cell cannot drift
# from what `get_meta_standard_theme_train_data` loads.
train_df = pd.read_csv(meta_standard_themes_train_config['train_data'])
valid_df = pd.read_csv(meta_standard_themes_train_config['valid_data'])

In [31]:
train_df.head(2)

Unnamed: 0,id,question_ans_id,answer_text,answer_text_non_english,language,survey_id,survey_type_id,benchmark_survey_type,client_id,rsp_id,question_category_abbr,question_text,question_class,question_category_id,question_report_abbr,question_category_label,benchmark_level1,benchmark_level2,benchmark_level3,client_benchmark_level,group_code,group_id,group_level1_code,group_level1_name,group_level2_code,group_level2_name,group_level3_code,group_level3_name,group_level4_code,group_level4_name,group_level5_code,group_level5_name,group_level6_code,group_level6_name,group_level7_code,group_level7_name,group_level8_code,group_level8_name,standard_theme_id,theme,url_friendly_theme,theme_display_order,avg_sentiment,is_example
0,594369,12281,Handling things such as child support and wage garnishment/levies is inconsistent and has led to...,,English,215,15,CSS-FACULTY-STAFF-ONLY,UCSD,453733,Payroll,Let us know your suggestions on how to improve Payroll.,Verbatim-Dept-Improve,88.0,Payroll_Improve,Payroll,Human Resources,Payroll Services,Payroll,1.0,-1.0,-1,-1.0,,,,,,,,,,,,,,,,35,"Consistency in Policies, Information",ConsistencyInPoliciesInformation,1,2.0,0
1,589686,2576,Diversity is such an important part o UC mission and yet some places its not seen as much it is ...,,English,212,9,SAW,UCSD,447092,C&B,21. If you would like to elaborate on any of your answers to the conduct and behavioral question...,Verbatim,1240.0,Conduct & Behavioral - Comments,Conduct & Behavioral,,,,3.0,90890.0,3662,999999.0,UC San Diego,90000.0,VICE CHANCELLOR HEALTH SCIENCES,93000.0,SCHOOL OF MEDICINE,90890.0,DERMATOLOGY,,,,,,,,,17,Satisfied with Diversity Progams,SatisfiedWithDiversityProgams,1,2.5,0


In [32]:
train_df.is_example.dtype

dtype('int64')

Remove any rows where the "corpus_cols" are NaN

In [33]:
train_df.dropna(subset=meta_standard_themes_train_config['corpus_cols'], inplace=True)
valid_df.dropna(subset=meta_standard_themes_train_config['corpus_cols'], inplace=True)

In [34]:
#hide
# list(set(train_df.theme.unique()))

In [35]:
train_df['is_valid'] = False
valid_df['is_valid'] = True
df = pd.concat([train_df, valid_df])
len(df)

10484

In [36]:
#export
def get_meta_standard_theme_train_data(train_config={}):
    """Load the standard-themes "meta" training data (and validation data, if configured).

    Args:
        train_config: overrides merged on top of `meta_standard_themes_train_config`.

    Returns:
        A DataFrame with an `is_valid` column added. When `config['valid_data']` is set,
        the validation rows (`is_valid=True`) are concatenated after the training rows
        (`is_valid=False`); otherwise only the training frame is returned.
    """
    config = {**meta_standard_themes_train_config, **train_config}

    def _read_split(csv_path, is_valid):
        # Drop rows missing any corpus column, then flag which split the rows belong to.
        split_df = pd.read_csv(csv_path)
        split_df.dropna(subset=config['corpus_cols'], inplace=True)
        split_df['is_valid'] = is_valid
        return split_df

    train_split = _read_split(config['train_data'], False)

    valid_path = config.get('valid_data')
    if valid_path is None:
        return train_split

    valid_split = _read_split(valid_path, True)
    return pd.concat([train_split, valid_split])

In [37]:
df = get_meta_standard_theme_train_data()
# A length is always >= 0, so comparing with `operator.ge` could never fail;
# require at least one row so that an empty load is actually caught.
test(len(df), 0, operator.gt)

Using the mid-level `DataBlocks` API

In [38]:
#hide
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, 
                                                                               task=task, 
                                                                               config=config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [39]:
#hide
blocks = (
    HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer), 
    RegressionBlock(),
    CategoryBlock()
)

def get_x(inp):
    """Join the Meta corpus columns of one row into a single ': '-separated string."""
    corpus_cols = meta_standard_themes_train_config['corpus_cols']
    return ': '.join(inp[corpus_cols].values)

dblock = DataBlock(blocks=blocks, 
                   get_x=get_x, 
                   get_y=[ColReader('avg_sentiment'), ColReader('is_example')], 
                   splitter=ColSplitter(col='is_valid'), 
                   n_inp=1)

In [40]:
#hide
# Use the Meta task's own batch size; this cell previously read it from the S@W config.
dls = dblock.dataloaders(df, bs=meta_standard_themes_train_config['batch_size'])

In [41]:
dls.vocab

(#2) [0,1]

In [42]:
dls.before_batch[0].hf_tokenizer.vocab_size

50265

In [43]:
#hide
print((f'The inputs vocab ({dls.before_batch[0].hf_tokenizer.vocab_size} items), '
       f'and the targets ({len(dls.vocab)} items)'))

The inputs vocab (50265 items), and the targets (2 items)


In [44]:
#hide
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape, b[2].shape, b[1].type(), b[2].type()

(3,
 torch.Size([8, 465]),
 torch.Size([8]),
 torch.Size([8]),
 'torch.cuda.FloatTensor',
 'torch.cuda.LongTensor')

In [45]:
#hide
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,text_,category
0,"Supervisor Effectiveness/Resolves Staff Issues: In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. 
A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",1.0,0
1,"Fear of Retaliation, Negative Consequences: The campus should ensure that each dept:\r\n-carefully reviews job descriptions and knows who is doing what work so that staff can be appropriately recognized and compensated for the work they do, as opposed to that recognition and compensation going to someone else.\r\n-has succession plans for each area; as staff leave/retire this should make it possible for current staff to move up. The campus should ensure staff are not required to leave their unit simply to advance professionally.\r\n-MSO mentors the managers within the unit so that the managers can be the most effective. MSOs also need to actually allow supervisors to supervise.\r\n-head (staff and faculty) is held to a non-tolerance standard for speaking negatively about staff within their unit and/or calling names. \r\n-head (staff and faculty) and each manager is transparent and timely in the communication of information to members of the unit. Instead of excuses and apologies, corrective actions should be taken and repeats of problematic situations should not reoccur.\r\n-has adequate funds and adequate space for the members of their unit, and the customers the unit serves. All staff areas within the unit should have comparable accommodations.\r\n-is safe for staff. Panic buttons/alarms, safety and active shooter training, and appropriate access barriers are absolutely necessary for all areas and in all units.\r\n-allows flex-schedules and work-from-home in as much as possible. “Service” should be reimagined to match current technology.",3.0,0


In [46]:
#export
def get_meta_standard_theme_train_x(inp, corpus_cols):
    """Build the model input text for one row of the meta standard-theme data.

    The values of `inp[corpus_cols]` are joined with ' comment: ' and the
    result is prefixed with 'theme: '. With two columns this yields
    'theme: <first value> comment: <second value>'.

    inp:         a row supporting list-based indexing with a `.values` result
                 (e.g. a pandas Series)
    corpus_cols: list of the column names to pull the text pieces from
    """
    text_parts = inp[corpus_cols].values
    joined_text = ' comment: '.join(text_parts)
    return 'theme: ' + joined_text

def get_meta_standard_theme_train_dls(df, hf_arch, hf_tokenizer, train_config={}, use_cache=True):
    """Return the DataLoaders for the meta standard-theme task, loading them from cache when possible.

    df:           DataFrame holding the columns named in `config['corpus_cols']` plus
                  `avg_sentiment`, `is_example` and `is_valid`
    hf_arch:      architecture identifier forwarded to `HF_TextBlock`
    hf_tokenizer: tokenizer forwarded to `HF_TextBlock`
    train_config: overrides merged on top of `meta_standard_themes_train_config`
                  (only read, never mutated; `None` is treated as "no overrides")
    use_cache:    when True and the file at `config['cache_data_path']` exists, the
                  pickled DataLoaders are loaded and returned instead of being rebuilt

    Freshly built DataLoaders are written to `config['cache_data_path']` whenever that
    key is configured, even if `use_cache` is False (i.e. the cache gets refreshed).
    """
    config = {**meta_standard_themes_train_config, **(train_config or {})}
    cache_path = config.get('cache_data_path')

    # Short-circuit on a cache hit. Previously the loaded object was discarded because
    # execution fell through and rebuilt (and re-saved) the DataLoaders every time.
    if use_cache and cache_path is not None and os.path.isfile(cache_path):
        # NOTE(review): torch.load unpickles arbitrary objects -- only point the cache
        # path at files produced by this function.
        dls = torch.load(cache_path)
        dls.bs = config['batch_size']   # honor a batch size that differs from the cached one
        return dls

    # One text input followed by two targets: a regression value and a category
    blocks = (
        HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer),
        RegressionBlock(),
        CategoryBlock()
    )

    dblock = DataBlock(blocks=blocks,
                       get_x=partial(get_meta_standard_theme_train_x, corpus_cols=config['corpus_cols']),
                       get_y=[ColReader('avg_sentiment'), ColReader('is_example')],
                       splitter=ColSplitter(col='is_valid'),
                       n_inp=1)

    dls = dblock.dataloaders(df, bs=config['batch_size'])
    if cache_path is not None: torch.save(dls, cache_path)

    return dls

Tests

In [47]:
# Load the meta standard-theme training data and build its DataLoaders
# (use_cache is left at its default of True).
df = get_meta_standard_theme_train_data()

dls = get_meta_standard_theme_train_dls(df, hf_arch, hf_tokenizer)

# The batch size must match the one in the default training config.
test_eq(dls.bs, meta_standard_themes_train_config['batch_size'])
# NOTE(review): presumably the two classes of the `is_example` target -- confirm
test_eq(2, len(dls.vocab))

# A batch holds three items: one input followed by two targets.
b = dls.one_batch()
test_eq(len(b), 3)
test_eq(b[1].shape[0], dls.bs)    # = regression task
test_eq(b[2].shape[0], dls.bs)    # = classification task

In [48]:
# Same checks as the previous cell, but with use_cache=False so the DataLoaders
# are not taken from the cache file.
dls = get_meta_standard_theme_train_dls(df, hf_arch, hf_tokenizer, use_cache=False)

# The batch size must match the one in the default training config.
test_eq(dls.bs, meta_standard_themes_train_config['batch_size'])
# NOTE(review): presumably the two classes of the `is_example` target -- confirm
test_eq(2, len(dls.vocab))

# A batch holds three items: one input followed by two targets.
b = dls.one_batch()
test_eq(len(b), 3)
test_eq(b[1].shape[0], dls.bs)    # = regression task
test_eq(b[2].shape[0], dls.bs)    # = classification task

In [49]:
# Display up to two samples (max_n=2) from the DataLoaders as a visual sanity check
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,text_,category
0,"theme: Supervisor Effectiveness/Resolves Staff Issues comment: In the Enterprise Network and Telecommunications group of ITS, the environment continues to be toxic, retaliatory, abusive, and discriminatory as in past years. Under the direction of James Seddon, supervisor Malerie Samadi harasses and talks down to employees. Manipulates and edits official documentation to make staff look and give lower performance appraisal ratings or retaliate against them. Also, use the same practice to provide their friends with higher performance ratings and award them with higher merit increases. Senior management continues to harbor this behavior without any consequence and or accountability. James and Malerie exploit minorities and give preferential treatment to personnel hired by them. Regularly abuse the power that the University of California gives them and exercise nepotism because they are both product of such practices.\r\nOther supervisors like Ynez Hicks also participates in the same practices of despotism, nepotism, and favoritism. \r\nIt is unacceptable that this behavior and practices continue to be used by management without any accountability. Principles of the community are ignored continuously daily. When this type of situation is brought up to Human Resources, they ignore them because no one is enforcing the UC Principles. James Seddon and Malerie Samadi in the Datacom group don't care about staff promotion, compensation, and well being. They only care about themselves and their friends. Hiring practices are unfair; they manipulate the process so they can hire barely qualified personnel into experienced positions. Existing staff is overworked because newer personnel can not pull their weight, yet Malerie and James make it look like they are in their appraisals. They mentally abuse staff and minimize their work performance. Recently one of the team member past away while working at home. 
A stroke caused by the stress and the pressure that Malerie was putting on David Ramirez. She used him to get her promotion to supervisor and make her look good in front of others.\r\nBoth James and Malerie are the perfect examples of bad management. Somehow they continue to occupy their positions, and Senior Management doesn't do anything about it. \r\nThe lowest level the Datacom team has been in years, all because of James and Malerie's arrogance and lack of ethics.",1.0,0
1,"theme: Advancement and Training Opportunities comment: 1.\tSensitivity and bias training for SLBO management – The selection and search for the person to fill Evelyns position seemed inequitable. Evelyn having a say in her successor may not have been against campus hiring policies but did not seem appropriate since she had inside knowledge of some of the candidates and a possible bias for or against other candidates. \r\n I applied for the position because my applying was suggested by another member of Evelyns team that was not on the committee.\r\nAnna, a member of the committee, was also under the leadership of Evelyn. Anna, who now has been promoted into Jamies positions would not be in Jamies position if he had not gotten Evelyns position. In addition, there was no one in the room from HR ensuring that the interviews were fair and equal. I think that this Cluster is large enough to warrant additional oversite with the filling of positions. \r\nThough it may not matter to some but within the SLBO all of senior management (HR remains neutral) is now male while 99 % of the rest of the staff is female. \r\n2.\tWhen work is removed from my desk work of equal or greater substance should replace it (the work removed).\r\n3.\tTo remove budgetary work from me and replace it with transactional work is belittling along with a misuse of my high level skill set, experience, and education. I am often the person who trains or provides information to new staff on our tools within Financial Link. \r\n4.\tI want to see equality in secession training for management positions within the SLBO or other departments in the cluster.",3.0,0


## Cleanup

In [50]:
#hide
# Run nbdev's notebook export; the output below lists each notebook it converted
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 02a_verbatims-core.ipynb.
Converted 02c_verbatims-standard-themes-saw-training.ipynb.
Converted 02e_verbatims-standard-themes-meta-training.ipynb.
Converted 99_verbatims-inference.ipynb.
Converted Untitled.ipynb.
Converted index.ipynb.
