In [1]:
# default_exp inference

In [2]:
#all_slow

In [3]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# inference

> All inference-related methods

In [4]:
#export
import datetime, warnings, gc
from inspect import signature
import sklearn.metrics as skm

from fastai.text.all import *

from tritonlytics_ai.utils import *
from tritonlytics_ai.verbatims.core import *

In [5]:
#hide
import pdb

from nbdev.showdoc import *
from fastcore.test import *

In [6]:
#hide
from fastai import __version__ as fa_version
from torch import __version__ as pt_version
from transformers import __version__ as hft_version

# Report the library versions this notebook was executed against (one per line).
print(
    f'Using pytorch {pt_version}',
    f'Using fastai {fa_version}',
    f'Using transformers {hft_version}',
    sep='\n',
)

Using pytorch 1.6.0
Using fastai 2.0.16
Using transformers 3.3.1


In [7]:
#cuda
# Make CUDA device index 1 the current device for this process.
# NOTE(review): the index is hard-coded; this will fail on a machine with fewer than two GPUs.
torch.cuda.set_device(1)
# Echo the index and name of the device that is now current.
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


## Utility 

In [8]:
#export
def concat_pool(raw_outputs):
    """Build a concat-pooled feature vector from the last position along dim 1 of `raw_outputs`.

    Args:
        raw_outputs: a 3-D tensor (the `permute(0, 2, 1)` below requires exactly three
            dims), e.g. (bs, n_hid, emb_sz).

    Returns:
        A 2-D tensor of shape (bs, 3 * emb_sz): the last output, the max-pooled and the
        avg-pooled features concatenated along dim 1, in that order.

    NOTE(review): only the final position of dim 1 is kept before pooling, so both pools
    run over a length-1 axis and all three concatenated pieces hold the same values.
    If pooling over the whole axis was intended, this needs revisiting - confirm with callers.
    """
    # Keep only the final position of dim 1, preserving it as a singleton axis: (bs, 1, emb_sz)
    final_step = raw_outputs[:, -1, None]
    n_items = final_step.size(0)

    # The 1-D adaptive pools reduce over the last axis, so move the singleton axis there
    channels_first = final_step.permute(0, 2, 1)
    pooled_avg = F.adaptive_avg_pool1d(channels_first, 1).view(n_items, -1)
    pooled_max = F.adaptive_max_pool1d(channels_first, 1).view(n_items, -1)

    pieces = [final_step[:, -1, :], pooled_max, pooled_avg]
    return torch.cat(pieces, dim=1)

## What models should be run?

In [9]:
# Read the inference request file and parse it; later cells read its 'models' and 'data' entries via `f`.
inference_json_text = (RAW_DATA_PATH/'verbatim-inference.json').read_text()
f = json.loads(inference_json_text)

In [10]:
# Tabulate the 'models' entry of the parsed inference JSON and preview the first rows.
models_df = pd.DataFrame(f['models'])
models_df.head()

Unnamed: 0,model_run_id,survey_id,model_id,model_name,model_type_id,model_type_name
0,126,401,6,20201023_verbatim_sent_multilabel_hf_export.pkl,2,verbatim-classification-sentiment
1,127,401,8,20201023_verbatim_standard_theme_saw_multilabel_hf_export.pkl,4,verbatim-classification-saw-themes


In [11]:
# Tabulate the 'data' entry of the parsed inference JSON (the rows to run inference on)
# and report how many rows were received.
inf_df = pd.DataFrame(f['data'])
print(inf_df.shape[0])

7610


In [12]:
# Today's date formatted as YYYYMMDD.
# NOTE(review): this call needs `datetime` to be the class, while the import cell brings in the
# module - presumably one of the star imports rebinds the name; verify. The prediction cells
# visible below pass literal date stamps instead of this value.
yyyymmdd = datetime.today().strftime("%Y%m%d")

# Run on CUDA device 1 when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:1


## Sentiment

In [13]:
from tritonlytics_ai.verbatims.sentiment.training import get_sentiment_preds

# Run the sentiment model over the inference rows.
# NOTE(review): yyyymmdd=20201023 presumably selects the exported model by its date stamp - confirm.
sentiment_outputs = get_sentiment_preds(inf_df, device=device, yyyymmdd=20201023)
sent_df, inf_probs, inf_labels = sentiment_outputs

# Sanity check: frame shape, probability tensor shape, number of labels
print(sent_df.shape, inf_probs.shape, len(inf_labels))

0
(7482, 90) torch.Size([7482, 8]) 8


## Standard Themes - S@W

In [14]:
from tritonlytics_ai.verbatims.standard_themes_saw.training import get_standard_theme_preds as get_saw_theme_preds

# Run the S@W standard-theme model over the inference rows.
# `inf_probs` / `inf_labels` are rebound here, replacing the values from the previous section.
# NOTE(review): yyyymmdd=20201023 presumably selects the exported model by its date stamp - confirm.
saw_outputs = get_saw_theme_preds(inf_df, device=device, yyyymmdd=20201023)
saw_themes_df, inf_probs, inf_labels = saw_outputs

# Sanity check: frame shape, probability tensor shape, number of labels
print(saw_themes_df.shape, inf_probs.shape, len(inf_labels))

0
(7482, 124) torch.Size([7482, 25]) 25


In [15]:
# One `pred_<label>` column name per label currently held in `inf_labels`
# (i.e. whatever the most recently executed prediction cell assigned).
pred_cols = [f'pred_{lbl}' for lbl in inf_labels]

# Preview the text next to its predicted theme flags for the first two rows
saw_themes_df.head(2)[['answer_text', *pred_cols]]

Unnamed: 0,answer_text,pred_adequate_staffing,pred_advancement_and_training_opportunities,pred_appropriate_stress_work_assigned_equitably,pred_benefits,pred_better_ways_recognized_participate_in_decisions,pred_career_advancement,pred_committed_to_diversity,pred_communicates_essential_information,pred_ethical_conduct_perform_responsibilities_spirit_of_cooperation,...,pred_feel_valued_by_department,pred_flexibility_work_life_balance,pred_good_use_of_skills,pred_have_necessary_tools,pred_have_voice_within_my_institution_valued_member_of_my_institution,pred_internal_processes_effective,pred_parking_transportation,pred_salary_pay,pred_satisfied_with_diversity_progams,pred_supervisor_effectiveness_resolves_staff_issues
0,"I appreciate the work environment that UC San Diego provides. I find that the leadership of my department, and the leadership throughout the University, is exceptional and creates a great workplace and a place that I excited to finish my career. I feel like my efforts are well recognized and meaningful and that I am making a difference.",0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,"Some of the few negatives that I have would be the reaction of campus clients towards the ESR changes. Many of them have acted spoiled and expect my department to answer all of their questions immediately without first attempting to find the solution to the problem themselves or consulting Blink/Knowledge Base Articles. Some campus clients have been over the top in their complaints and seem to only want to complain. Others have been disrespectful when interacting with me.\r\n\r\nThe other issue is the compensation - the salary that I am receiving, while ok for a new hire, is barely enough ...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Standard Themes - CSS

In [16]:
from tritonlytics_ai.verbatims.standard_themes_css.training import get_standard_theme_preds as get_css_theme_preds

# Run the CSS standard-theme model over the inference rows.
# `inf_probs` / `inf_labels` are rebound here, replacing the values from the previous section.
# NOTE(review): yyyymmdd=20201024 presumably selects the exported model by its date stamp - confirm.
css_outputs = get_css_theme_preds(inf_df, device=device, yyyymmdd=20201024)
css_themes_df, inf_probs, inf_labels = css_outputs

# Sanity check: frame shape, probability tensor shape, number of labels
print(css_themes_df.shape, inf_probs.shape, len(inf_labels))

0
(7482, 106) torch.Size([7482, 16]) 16


In [17]:
# One `pred_<label>` column name per label currently held in `inf_labels`
# (i.e. whatever the most recently executed prediction cell assigned).
pred_cols = [f'pred_{lbl}' for lbl in inf_labels]

# Preview the text next to its predicted theme flags for the first two rows
css_themes_df.head(2)[['answer_text', *pred_cols]]

Unnamed: 0,answer_text,pred_accessible_to_customers,pred_consistency_in_policies_information,pred_cost_fees,pred_courteous_professional_staff,pred_effective_communications,pred_effectively_uses_websites_online_documentation,pred_helpful_staff,pred_knowledgeable_staff,pred_moving_in_a_positive_direction,pred_overall_satisfaction,pred_process_improvement,pred_provides_effective_advice_guidance,pred_provides_training_on_processes_applications,pred_resolves_problems_effectively,pred_responds_to_requests_within_an_acceptable_time,pred_understands_my_needs_and_requirements
0,"I appreciate the work environment that UC San Diego provides. I find that the leadership of my department, and the leadership throughout the University, is exceptional and creates a great workplace and a place that I excited to finish my career. I feel like my efforts are well recognized and meaningful and that I am making a difference.",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Some of the few negatives that I have would be the reaction of campus clients towards the ESR changes. Many of them have acted spoiled and expect my department to answer all of their questions immediately without first attempting to find the solution to the problem themselves or consulting Blink/Knowledge Base Articles. Some campus clients have been over the top in their complaints and seem to only want to complain. Others have been disrespectful when interacting with me.\r\n\r\nThe other issue is the compensation - the salary that I am receiving, while ok for a new hire, is barely enough ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Standard Themes - Meta

In [18]:
from tritonlytics_ai.verbatims.standard_themes_meta.training import get_standard_theme_meta_preds, build_meta_inf_df
from tritonlytics_ai.verbatims.standard_themes_meta.training import sentiment_mse, is_example_acc
from tritonlytics_ai.verbatims.standard_themes_meta.training import Meta_MM, Meta_MM_HF_BaseModelCallback

# Use the F2 threshold value for `theme_prob_threshold`: the goal is to keep every theme
# probability above it, so that anything exceeding it (e.g., F1, F0.5) can be identified later.
meta_fixed_cols = list(TASK_LM_DTYPES_SC.keys())
meta_inf_df = build_meta_inf_df(
    saw_themes_df,
    theme_prob_threshold=0.13,
    fixed_cols=meta_fixed_cols,
)

# Run the meta model over the rows built from the S@W theme predictions.
# `inf_labels` is rebound here, replacing the value from the previous section.
# NOTE(review): yyyymmdd=20201025 presumably selects the exported model by its date stamp - confirm.
meta_outputs = get_standard_theme_meta_preds(
    meta_inf_df,
    device=device,
    yyyymmdd=20201025,
)
meta_df, inf_probs_sent, inf_probs_is_example, inf_labels = meta_outputs

# Sanity check: frame shape, both probability tensor shapes, number of labels
print(meta_df.shape, inf_probs_sent.shape, inf_probs_is_example.shape, len(inf_labels))

0
1000
(16489, 55) torch.Size([16489, 1]) torch.Size([16489, 2]) 2


In [19]:
# Preview the meta model output: theme, text, theme probability, average-sentiment score and is-example flag.
meta_df[['theme', 'answer_text', 'theme_prob', 'prob_avg_sentiment', 'pred_is_example']].head()

Unnamed: 0,theme,answer_text,theme_prob,prob_avg_sentiment,pred_is_example
0,Adequate Staffing,"Some of the few negatives that I have would be the reaction of campus clients towards the ESR changes. Many of them have acted spoiled and expect my department to answer all of their questions immediately without first attempting to find the solution to the problem themselves or consulting Blink/Knowledge Base Articles. Some campus clients have been over the top in their complaints and seem to only want to complain. Others have been disrespectful when interacting with me.\r\n\r\nThe other issue is the compensation - the salary that I am receiving, while ok for a new hire, is barely enough ...",0.145658,1.978378,0
1,Adequate Staffing,"I probably work on one of the best and hardest working team on campus, but it would be nice to have a couple more plumbers or BMWs to help us out with our work load.",0.527928,2.845432,0
2,Adequate Staffing,"Due to our staffing shortage we are required to work overtime. The department recently limited the amount of hours we can earn as compensatory time off to use at a later time. This has created an atmosphere where employees are no longer as willing to cover shifts if they are unable to work for comp time. The department believes that this practice will save money, however, the amount of overtime being paid out is substantially more than it was when we were able to earn and use comp time. In my experience, employees try to use their earned comp time when there is no impact on staffing so...",0.162934,2.642706,0
3,Adequate Staffing,"With regards to pay - as a PA we are not paid equally compared to our NP counterpart. We do the sam exact job and paid less. It needs to be changed and I am hopeful the university can recognize this and make changes to relflect equality amongst the profession. As APPs we should be treated equal. With regards to our job, we are not utilized appropriate in the UC sector and thus not practicing to the top of our license. I know management is aware of this and there are being changes made. \r\n\r\nI personally feel there are too many employees and as a result making the system more complicated...",0.664323,2.158186,0
4,Adequate Staffing,It has been difficult to keep up with our workload without student research assistants.,0.447336,2.195768,0


## Cleanup

In [20]:
#hide
from nbdev.export import notebook2script
# nbdev export step; the log it prints lists each notebook it converted.
# NOTE(review): the "Playground" cells below sit after this call - confirm that ordering is intentional.
notebook2script()

Converted 00_utils.ipynb.
Converted 02a_verbatims-core.ipynb.
Converted 02b_verbatims-sentiment.ipynb.
Converted 02c_verbatims-standard-themes-saw-training.ipynb.
Converted 02d_verbatims-standard-themes-css-training.ipynb.
Converted 02e_verbatims-standard-themes-meta-training.ipynb.
Converted 99_verbatims-inference.ipynb.
Converted index.ipynb.


## Playground

In [21]:
# Every probability / prediction column produced by the sentiment model
other_cols = [col for col in sent_df.columns if col.startswith(('prob', 'pred'))]

# Keep only the identifying keys plus the model outputs
final_sent_df = sent_df[['id', 'survey_id', 'question_ans_id', *other_cols]]

print(len(final_sent_df), len(sent_df))
final_sent_df.head()

7482 7482


Unnamed: 0,id,survey_id,question_ans_id,prob_is_very_positive,prob_is_positive,prob_is_very_negative,prob_is_negative,prob_is_suggestion,prob_feels_threatened,prob_has_profanity,prob_is_nonsense,pred_is_very_positive,pred_is_positive,pred_is_very_negative,pred_is_negative,pred_is_suggestion,pred_feels_threatened,pred_has_profanity,pred_is_nonsense
0,683411,401,1877,0.479776,0.994643,0.000794,0.002152,0.003524,0.000467,0.000177,0.001386,0,1,0,0,0,0,0,0
1,683412,401,1877,0.003697,0.02198,0.423642,0.990766,0.026343,0.049515,0.00072,0.001643,0,0,0,1,0,0,0,0
2,683413,401,1877,0.047683,0.418201,0.601509,0.978111,0.016698,0.026635,0.00125,0.001042,0,0,1,1,0,0,0,0
3,683414,401,1877,0.002403,0.039993,0.043385,0.912123,0.040625,0.079433,4e-05,0.010515,0,0,0,1,0,0,0,0
4,683415,401,1877,0.010977,0.497433,0.056186,0.943342,0.083735,0.027257,0.000786,0.000971,0,0,0,1,0,0,0,0


In [22]:
# Write the trimmed sentiment results to CSV without the index column.
# NOTE(review): the output path is hard-coded to the user's home directory; consider a configurable output dir.
final_sent_df.to_csv('~/sent_inf.csv', index=False)

In [23]:
# Every probability / prediction column produced by the S@W theme model
other_theme_cols = [col for col in saw_themes_df.columns if col.startswith(('prob', 'pred'))]

# Every meta-model output column (is-example, sentiment, probability and prediction columns)
other_meta_cols = [
    col for col in meta_df.columns
    if col.startswith(('is_example', 'sentiment', 'prob', 'pred'))
]

# Attach the per-answer S@W theme outputs to each meta row, joining on the answer `id`
# (default inner join).
final_themes = pd.merge(
    meta_df[['id', 'survey_id', 'question_ans_id', 'theme', 'url_friendly_theme', 'theme_prob'] + other_meta_cols],
    saw_themes_df[['id'] + other_theme_cols],
    on='id'
)

print(len(final_themes), len(meta_df))

# The merge must neither drop nor duplicate meta rows. Enforce what the print above only
# lets a reader eyeball, so a bad join cannot silently reach the exported CSV.
assert len(final_themes) == len(meta_df), (
    f'merge changed the row count: {len(final_themes)} merged rows vs {len(meta_df)} meta rows'
)

final_themes.head()

16489 16489


Unnamed: 0,id,survey_id,question_ans_id,theme,url_friendly_theme,theme_prob,prob_avg_sentiment,prob_is_example,pred_is_example,sentiment_mse,...,pred_feel_valued_by_department,pred_flexibility_work_life_balance,pred_good_use_of_skills,pred_have_necessary_tools,pred_have_voice_within_my_institution_valued_member_of_my_institution,pred_internal_processes_effective,pred_parking_transportation,pred_salary_pay,pred_satisfied_with_diversity_progams,pred_supervisor_effectiveness_resolves_staff_issues
0,683412,401,1877,Adequate Staffing,AdequateStaffing,0.145658,1.978378,0.009127,0,0.445856,...,0,0,0,0,0,0,0,1,0,0
1,683412,401,1877,Appropriate Stress Work Assigned Equitably,AppropriateStressWorkAssignedEquitably,0.172275,1.971118,0.008935,0,0.445856,...,0,0,0,0,0,0,0,1,0,0
2,683412,401,1877,Fear Of Retaliation Negative Consequences,FearOfRetaliationNegativeConsequences,0.5877,1.961309,0.008048,0,0.445856,...,0,0,0,0,0,0,0,1,0,0
3,683412,401,1877,Salary Pay,SalaryPay,0.811004,1.938436,0.009826,0,0.445856,...,0,0,0,0,0,0,0,1,0,0
4,683929,401,1877,Adequate Staffing,AdequateStaffing,0.527928,2.845432,0.011065,0,0.445856,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Write the merged theme results to CSV without the index column.
# NOTE(review): the output path is hard-coded to the user's home directory; consider a configurable output dir.
final_themes.to_csv('~/themes_inf.csv', index=False)

In [25]:
# List every column available on the meta model output frame.
meta_df.columns

Index(['index', 'id', 'question_ans_id', 'answer_text',
       'answer_text_non_english', 'language', 'survey_id', 'survey_type_id',
       'benchmark_survey_type', 'client_id', 'rsp_id',
       'question_category_abbr', 'question_text', 'question_class',
       'question_category_id', 'question_report_abbr',
       'question_category_label', 'benchmark_level1', 'benchmark_level2',
       'benchmark_level3', 'client_benchmark_level', 'group_code', 'group_id',
       'group_level1_code', 'group_level1_name', 'group_level2_code',
       'group_level2_name', 'group_level3_code', 'group_level3_name',
       'group_level4_code', 'group_level4_name', 'group_level5_code',
       'group_level5_name', 'group_level6_code', 'group_level6_name',
       'group_level7_code', 'group_level7_name', 'group_level8_code',
       'group_level8_name', 'theme', 'theme_prob', 'url_friendly_theme',
       'prob_avg_sentiment', 'prob_is_example', 'pred_is_example',
       'valid_loss', 'sentiment_mse', 'sentime