# Tritonlytics - Redaction (using spaCy models)

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append('..')

In [2]:
import html, pdb
from collections import Counter, defaultdict
import multiprocessing as mp
import tqdm

from fastai.text import *
from tritonlytics import Metrics as metrics_util, DataGeneration as dg_util, PandasUtil as pd_util
from tritonlytics.callbacks import RocAucEvaluation

import dill as pickle

import spacy
spacy_en = spacy.load('en')
spacy_es = spacy.load('es')

from wordcloud import WordCloud, STOPWORDS

# pandas and plotting config
# seaborn is imported here only to apply the 'whitegrid' style
import seaborn as sns
sns.set_style('whitegrid')

# default figure size for plots in this notebook
# NOTE(review): `plt` is not imported explicitly anywhere in view -- presumably it
# comes from the `from fastai.text import *` star import; confirm
plt.rcParams['figure.figsize'] = (9,6)

# raise pandas' display limits (rows, columns, total width, per-cell width)
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.max_colwidth', 100),
):
    pd.set_option(_option, _value)

In [3]:
# NOTE(review): `__version__` is not defined in this notebook; presumably it is
# fastai's version string exposed by the star import above -- confirm
print(f'fastai version: {__version__}')

fastai version: 1.0.45


In [4]:
# index of the CUDA device this notebook should run on
GPU_ID = 1

# Only select the device when it actually exists: calling set_device with a
# hard-coded index on a CPU-only or single-GPU machine raises and stops a
# "Restart & Run All" at this cell.
if torch.cuda.is_available() and GPU_ID < torch.cuda.device_count():
    torch.cuda.set_device(GPU_ID)
    print(f'Using GPU #{torch.cuda.current_device()}')
else:
    print(f'GPU #{GPU_ID} is not available; continuing without selecting a CUDA device')

Using GPU #1


## Configuration

In [5]:
# data locations: the raw verbatims are read from RAW_DATA_PATH, and the
# redaction output is written under PATH
RAW_DATA_PATH = Path('../data/raw')
PATH = Path('../data/entity_identification')

# make sure the working sub-directories exist (PATH itself is created as a parent)
for _subdir in ('models', 'tmp'):
    (PATH/_subdir).mkdir(parents=True, exist_ok=True)

In [6]:
# [child for child in PATH.iterdir()]
# %ls {str(PATH)}

In [7]:
# dataframe config
verbatims_raw_filename = 'verbatims-raw.csv'
verbatims_clean_filename = 'verbatims-clean-entities.csv'

# basic columns -> dtype used when reading the raw CSV
col_dtypes = {
    'AnswerText': str, 'AnswerText_NonEnglish': str, 'Language': str,
    'SurveyTypeID': int, 'BenchmarkSurveyType': str, 'ClientId': str,
    'QuestionReportAbbr': str, 'QuestionText': str, 'QuestionClass': str,
    'QuestionCategoryID': float, 'QuestionCategoryAbbr': str, 'QuestionCategoryLabel': str,
    'BenchmarkLevel1': str, 'BenchmarkLevel2': str, 'BenchmarkLevel3': str,
    'ClientBenchmarkLevel': str,
    'GroupCode': float, 'GroupName': str,
}
# group levels 1-8 each contribute a float code column followed by a str name column
col_dtypes.update({
    f'GroupLevel{level}{suffix}': kind
    for level in range(1, 9)
    for suffix, kind in (('Code', float), ('Name', str))
})
col_dtypes['TagCount'] = int

# sentiment and entity labels (all read as int); key order defines LABELS_SENT
sent_dtypes = dict.fromkeys(
    [
        'OverallSentiment', 'IsVeryPositive', 'IsPositive', 'IsVeryNegative', 'IsNegative',
        'IsSuggestion', 'FeelsThreatened', 'HasProfanity', 'IsNonsense',
    ],
    int,
)

# date columns
date_cols = ['LastTaggedOn']

# combined column -> dtype mapping: basic columns first, then the labels
dtypes = dict(col_dtypes)
dtypes.update(sent_dtypes)


# string columns (that can be used in language modeling), labels and classes
# -- these are exactly the basic columns typed as `str`, in declaration order
TXT_COLS = [name for name, kind in col_dtypes.items() if kind is str]

LABELS_SENT = list(sent_dtypes)
LABELS = LABELS_SENT

# NOTE(review): 'VeryPositive' has no space while 'Very Negative' does -- confirm
# whether that spelling is intentional before relying on these class names
CLASSES = [['Very Negative', 'Negative', 'Neutral', 'Positive', 'VeryPositive'], ['no', 'yes']]

## Utility Methods

## Data Preparation

Review the data and clean up as necessary

In [8]:
# SurveyID whose verbatims are loaded and redacted in the cells below
survey_id = 199 # 199 latest UW

In [9]:
# read the raw verbatims with the configured dtypes, then keep an independent
# copy of only the rows that belong to `survey_id`
df = (
    pd.read_csv(RAW_DATA_PATH/verbatims_raw_filename, dtype=dtypes, parse_dates=date_cols)
      .loc[lambda frame: frame.SurveyID == survey_id]
      .copy()
)

# row count, followed by a one-row preview
display(len(df), df.head(1))

21360

Unnamed: 0,Id,SurveyID,QuestionAnsID,RspID,QuestionCategoryID,GroupID,AnswerText,AnswerText_NonEnglish,Language,SurveyTypeID,BenchmarkSurveyType,ClientId,QuestionReportAbbr,QuestionClass,QuestionText,QuestionCategoryAbbr,QuestionCategoryLabel,BenchmarkLevel1,BenchmarkLevel2,BenchmarkLevel3,ClientBenchmarkLevel,GroupCode,GroupName,GroupLevel1Code,GroupLevel1Name,GroupLevel2Code,GroupLevel2Name,GroupLevel3Code,GroupLevel3Name,GroupLevel4Code,GroupLevel4Name,GroupLevel5Code,GroupLevel5Name,GroupLevel6Code,GroupLevel6Name,GroupLevel7Code,GroupLevel7Name,GroupLevel8Code,GroupLevel8Name,OverallSentiment,IsVeryPositive,IsPositive,IsVeryNegative,IsNegative,IsSuggestion,FeelsThreatened,HasProfanity,IsNonsense,TagCount,LastTaggedOn
323189,424934,199,27977,384580,361.0,-1,"Unfortunately, I don't find the academic coaching helpful. Even though the issues are outlined ...",,English,24,CSS,UW,HousingDisabilities_Experience,Verbatim-Dept-Question,Is there anything else we should know about your experience with Disability Resources for Students?,Acad Housing Students w Disabilities,Academic & Housing Accommodations for Students with Disabilities,Student,Student Life,Student Activities,1,-1.0,,-1.0,,,,,,,,,,,,,,,,3,0,0,0,0,0,0,0,0,0,NaT


Replace new lines with spaces (currently disabled: the cells below are commented out)

In [10]:
# df.loc[(pd.notnull(df.AnswerText)) & (df.AnswerText.str.contains(r"\\r\\n")), 
#        'AnswerText'] = df.AnswerText.str.replace(r"\r\n", ' ', regex=False)

In [11]:
# len(df.loc[(pd.notnull(df.AnswerText)) & (df.AnswerText.str.contains(r"\\r\\n"))])

In [12]:
# df.loc[(pd.notnull(df.AnswerText_NonEnglish)) & (df.AnswerText_NonEnglish.str.contains(r"\\r\\n")), 
#        'AnswerText_NonEnglish'] = df.AnswerText_NonEnglish.str.replace(r"\r\n", ' ', regex=False)

In [13]:
# len(df.loc[(pd.notnull(df.AnswerText_NonEnglish)) & (df.AnswerText_NonEnglish.str.contains(r"\\r\\n"))])

In [14]:
# df.head(2)

## Redaction methods

In [15]:
# language name -> loaded spaCy pipeline; any language without an entry maps to None
spacy_models = defaultdict(lambda: None, English=spacy_en, Spanish=spacy_es)
# language name -> column holding the text to redact; every language other than
# English falls back to the non-English answer column
lang_cols = defaultdict(lambda: 'AnswerText_NonEnglish', English='AnswerText')

In [16]:
# literal strings to redact in addition to the model's entities, keyed by the label
# used in the `<LABEL>` placeholder (currently empty; the trailing comment shows the shape)
pre_identified_entities = {} #{ 'TITLE': ['Mr.', 'Mrs.', 'Dr.'] }
# entity labels (as reported by the spaCy model) whose text gets redacted
entities = ['PERSON']

In [17]:
def get_redacted_text(df, entities_to_remove=['PERSON'], pre_identified_entities_to_remove={}, 
                      lang_col='Language', models=None, text_cols=None):
    """Replace sensitive entities in each row's answer text with `<LABEL>` placeholders.

    For every row, the language in `lang_col` selects both the NLP model and the
    column that holds the text. Entities found by the model whose label is in
    `entities_to_remove` are replaced everywhere they occur in the text, and then
    every literal in `pre_identified_entities_to_remove` is replaced as well.

    Parameters
    ----------
    df : DataFrame with an 'Id' column, the `lang_col` column and the text columns.
    entities_to_remove : collection of entity labels to redact (only read, never mutated).
    pre_identified_entities_to_remove : dict of label -> iterable of literal strings
        to redact (only read, never mutated).
    lang_col : name of the column holding each row's language.
    models : optional mapping of language -> callable returning an object with `.ents`
        (each entity exposing `.text` and `.label_`). Defaults to the notebook-level
        `spacy_models`.
    text_cols : optional mapping of language -> text column name, indexed with `[]`.
        Defaults to the notebook-level `lang_cols`.

    Returns
    -------
    list of dict, one `{'Id': ..., 'RedactedAnswerText': ...}` per processed row.
    Rows whose language has no model, or whose text is missing, are skipped and do
    not appear in the result.
    """
    # fall back to the notebook-level lookups when the caller does not supply any
    if models is None: models = spacy_models
    if text_cols is None: text_cols = lang_cols

    redacted_items = []

    for _, row in df.iterrows():
        lang = row[lang_col]

        # get the correct spacy model for the language; unsupported languages are skipped
        spacy_fn = models.get(lang)
        if spacy_fn is None: continue

        # only process the text field if something is there. The null check has to
        # happen BEFORE str(): str(NaN) is the non-empty string 'nan', which would
        # otherwise be emitted as if it were a real answer.
        raw_txt = row[text_cols[lang]]
        if pd.isnull(raw_txt): continue
        txt = str(raw_txt)

        doc = spacy_fn(txt)

        # 1. replace entities found by the model. This runs before the literal
        #    replacements so that an entity containing a pre-identified token
        #    (e.g. "Dr. Smith" with "Dr." pre-identified) is still found in the text.
        #    Longest entities go first so that a short entity ("Ann") cannot break up
        #    a longer one ("Ann Lee") and leave part of the name behind.
        found = {
            (ent.text, ent.label_) for ent in doc.ents
            # whitespace-only spans carry nothing to redact
            if ent.label_ in entities_to_remove and ent.text.strip()
        }
        for ent_text, ent_label in sorted(found, key=lambda e: (-len(e[0]), e)):
            txt = txt.replace(ent_text, f'<{ent_label}>')

        # 2. replace the pre-identified literals that are still present
        for ent_label, ent_values in pre_identified_entities_to_remove.items():
            for ent_value in ent_values:
                # replacing '' would insert the placeholder between every character
                if ent_value: txt = txt.replace(ent_value, f'<{ent_label}>')

        redacted_items.append({ 'Id': row['Id'], 'RedactedAnswerText': txt })

    return redacted_items

In [18]:
# redact the survey frame; `items` is a list of {'Id', 'RedactedAnswerText'} dicts
items = get_redacted_text(df, entities, pre_identified_entities)

In [19]:
# one row per redaction result; the shape shows how many rows were processed
redacted_df = pd.DataFrame(items)
redacted_df.shape

(21360, 2)

In [20]:
# Attach the redacted text to the survey rows by Id. A RedactedAnswerText column
# left over from an earlier run of this cell is dropped first: without that,
# re-running the cell yields RedactedAnswerText_x / RedactedAnswerText_y and the
# later cells that read `df.RedactedAnswerText` fail. The merge keeps pandas'
# default inner join, so rows without a redaction result are not retained.
df = df.drop(columns=['RedactedAnswerText'], errors='ignore').merge(redacted_df, on='Id')
df.head(2)

Unnamed: 0,Id,SurveyID,QuestionAnsID,RspID,QuestionCategoryID,GroupID,AnswerText,AnswerText_NonEnglish,Language,SurveyTypeID,BenchmarkSurveyType,ClientId,QuestionReportAbbr,QuestionClass,QuestionText,QuestionCategoryAbbr,QuestionCategoryLabel,BenchmarkLevel1,BenchmarkLevel2,BenchmarkLevel3,ClientBenchmarkLevel,GroupCode,GroupName,GroupLevel1Code,GroupLevel1Name,GroupLevel2Code,GroupLevel2Name,GroupLevel3Code,GroupLevel3Name,GroupLevel4Code,GroupLevel4Name,GroupLevel5Code,GroupLevel5Name,GroupLevel6Code,GroupLevel6Name,GroupLevel7Code,GroupLevel7Name,GroupLevel8Code,GroupLevel8Name,OverallSentiment,IsVeryPositive,IsPositive,IsVeryNegative,IsNegative,IsSuggestion,FeelsThreatened,HasProfanity,IsNonsense,TagCount,LastTaggedOn,RedactedAnswerText
0,424934,199,27977,384580,361.0,-1,"Unfortunately, I don't find the academic coaching helpful. Even though the issues are outlined ...",,English,24,CSS,UW,HousingDisabilities_Experience,Verbatim-Dept-Question,Is there anything else we should know about your experience with Disability Resources for Students?,Acad Housing Students w Disabilities,Academic & Housing Accommodations for Students with Disabilities,Student,Student Life,Student Activities,1,-1.0,,-1.0,,,,,,,,,,,,,,,,3,0,0,0,0,0,0,0,0,0,NaT,"Unfortunately, I don't find the academic coaching helpful. Even though the issues are outlined ..."
1,424936,199,27977,384822,361.0,-1,Its apparent that theyre understaffed.,,English,24,CSS,UW,HousingDisabilities_Experience,Verbatim-Dept-Question,Is there anything else we should know about your experience with Disability Resources for Students?,Acad Housing Students w Disabilities,Academic & Housing Accommodations for Students with Disabilities,Student,Student Life,Student Activities,1,-1.0,,-1.0,,,,,,,,,,,,,,,,3,0,0,0,0,0,0,0,0,0,NaT,Its apparent that theyre understaffed.


### Save redacted results

In [21]:
import datetime
# today's date as YYYYMMDD; it becomes part of the output filename
file_prefix = format(datetime.date.today(), "%Y%m%d")

In [22]:
# write the merged frame (original columns plus RedactedAnswerText) under PATH,
# named by survey id and run date, without the index column
df.to_csv(PATH.joinpath(f'{survey_id}_{file_prefix}_verbatims_redaction.csv'), index=False)

## Review redacted results

In [23]:
# preview the first two rows in which at least one <PERSON> placeholder was inserted
df.loc[df['RedactedAnswerText'].str.contains('<PERSON>')].head(2)

Unnamed: 0,Id,SurveyID,QuestionAnsID,RspID,QuestionCategoryID,GroupID,AnswerText,AnswerText_NonEnglish,Language,SurveyTypeID,BenchmarkSurveyType,ClientId,QuestionReportAbbr,QuestionClass,QuestionText,QuestionCategoryAbbr,QuestionCategoryLabel,BenchmarkLevel1,BenchmarkLevel2,BenchmarkLevel3,ClientBenchmarkLevel,GroupCode,GroupName,GroupLevel1Code,GroupLevel1Name,GroupLevel2Code,GroupLevel2Name,GroupLevel3Code,GroupLevel3Name,GroupLevel4Code,GroupLevel4Name,GroupLevel5Code,GroupLevel5Name,GroupLevel6Code,GroupLevel6Name,GroupLevel7Code,GroupLevel7Name,GroupLevel8Code,GroupLevel8Name,OverallSentiment,IsVeryPositive,IsPositive,IsVeryNegative,IsNegative,IsSuggestion,FeelsThreatened,HasProfanity,IsNonsense,TagCount,LastTaggedOn,RedactedAnswerText
30,425050,199,27977,397945,361.0,-1,I understand that there has been a significant staff turn over in the DRS office. I question why...,,English,24,CSS,UW,HousingDisabilities_Experience,Verbatim-Dept-Question,Is there anything else we should know about your experience with Disability Resources for Students?,Acad Housing Students w Disabilities,Academic & Housing Accommodations for Students with Disabilities,Student,Student Life,Student Activities,1,-1.0,,-1.0,,,,,,,,,,,,,,,,3,0,0,0,0,0,0,0,0,0,NaT,I understand that there has been a significant staff turn over in the DRS office. I question why...
74,425232,199,28321,400398,369.0,-1,"This is a program I believe in, and can get more exposure in my opinion. I don't like it, for in...",,English,24,CSS,UW,BusDivPurAw_Experience,Verbatim-Dept-Question,Is there anything else we should know about your experience with Purchasing Awareness?,Bus Div Purchasing Awareness,Business Diversity Purchasing Awareness,Finance,Procurement & Contracts,Business Outreach,1,-1.0,,-1.0,,,,,,,,,,,,,,,,3,0,0,0,0,0,0,0,0,0,NaT,"This is a program I believe in, and can get more exposure in my opinion. I don't like it, for in..."
