# Tritonlytics Entity Identification

Experiments related entity identification models using Tritonlytics verbatims.  The objective here is define ways to identify entities models are already aware of, train models to identify custom entities, and also improve both.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append('..')

In [2]:
import html, pdb
from collections import Counter, defaultdict
import multiprocessing as mp

from fastai.text import *
import tqdm

from tritonlytics import Metrics as metrics_util, DataGeneration as dg_util, PandasUtil as pd_util
from tritonlytics.callbacks import RocAucEvaluation

import dill as pickle

import spacy
from spacy.matcher import Matcher
spacy_en = spacy.load('en')
spacy_es = spacy.load('es')

from wordcloud import WordCloud, STOPWORDS

# pandas and plotting config
import seaborn as sns
sns.set_style('whitegrid')

plt.rcParams['figure.figsize'] = (9,6)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

In [3]:
print(f'fastai version: {__version__}')

fastai version: 1.0.59


In [4]:
torch.cuda.set_device(1)
print(f'Using GPU #{torch.cuda.current_device()}')

Using GPU #1


## Utility

In [5]:
def convert_to_snakecase(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower().replace('__', '_') 

## Configuration

In [6]:
# various default, LM, and classification paths
RAW_DATA_PATH = Path('../data/raw')
PATH = Path('../data/ner')

(PATH/'models').mkdir(parents=True, exist_ok=True)
(PATH/'tmp').mkdir(exist_ok=True)

In [7]:
# [child for child in PATH.iterdir()]
# %ls {str(PATH)}

In [8]:
# dataframe config
verbatims_raw_filename = 'verbatims-raw.csv'
verbatims_clean_filename = 'verbatims-clean-entities.csv'

# basic columns
# basic columns
lm_dtypes = { 
    'Id': int, 'QuestionAnsID': int, 'AnswerText': str, 'AnswerText_NonEnglish': str, 'Language': str,
    
    'SurveyID': int, 'SurveyTypeID': int, 'BenchmarkSurveyType': str, 'ClientId': str,'RspID': int,
    
    'QuestionCategoryAbbr': str, 'QuestionText': str, 'QuestionClass': str, 
    
    'QuestionCategoryID': float, 'QuestionReportAbbr': str, 'QuestionCategoryLabel': str, 
    'BenchmarkLevel1': str, 'BenchmarkLevel2': str, 'BenchmarkLevel3': str, 'ClientBenchmarkLevel': str,
    
    'GroupCode': float, 'GroupID': str, 
    'GroupLevel1Code': float, 'GroupLevel1Name': str,
    'GroupLevel2Code': float, 'GroupLevel2Name': str,
    'GroupLevel3Code': float, 'GroupLevel3Name': str,
    'GroupLevel4Code': float, 'GroupLevel4Name': str,
    'GroupLevel5Code': float, 'GroupLevel5Name': str,
    'GroupLevel6Code': float, 'GroupLevel6Name': str,
    'GroupLevel7Code': float, 'GroupLevel7Name': str,
    'GroupLevel8Code': float, 'GroupLevel8Name': str,
}

lm_dtypes_sc = { convert_to_snakecase(k):v for k,v in lm_dtypes.items() }

# sentiment and entity labels
sent_dtypes = { 
    'overall_sentiment': int, 
    'is_very_positive': int, 'is_positive': int, 'is_very_negative': int, 'is_negative' : int, 
    'is_suggestion' : int, 'feels_threatened' : int, 'has_profanity' : int, 'is_nonsense' : int 
}

# standard css themes
standard_theme_css_dtypes = { 
    'accessible_to_customers': int,
    'consistency_in_policies_information': int,
    'cost_fees': int,
    'courteous_professional_staff': int,
    'effective_communications': int,
    'effectively_uses_websites_online_documentation': int,
    'helpful_staff': int,
    'knowledgeable_staff': int,
    'moving_in_a_positive_direction': int,
    'overall_satisfaction': int,
    'process_improvement': int,
    'provides_effective_advice_guidance': int,
    'provides_training_on_processes_applications': int,
    'resolves_problems_effectively': int,
    'responds_to_requests_within_an_acceptable_time': int,
    'understands_my_needs_and_requirements': int
}

# standard saw themes
standard_theme_saw_dtypes = { 
    'adequate_staffing': int,
    'advancement_and_training_opportunities': int,
    'appropriate_stress_work_assigned_equitably': int,
    'benefits': int,
    'better_ways_recognized_participate_in_decisions': int,
    'career_advancement': int,
    'committed_to_diversity': int,
    'communicates_essential_information': int,
    'ethical_conduct_perform_responsibilities_spirit_of_cooperation': int,
    'evaluated_fairly': int,
    'experienced_discrimination': int,
    'facilities_workspace_safety': int,
    'faculty_value_contributions': int,
    'favoritism_cliques': int,
    'fear_of_retaliation_negative_consequences': int,
    'feel_valued_by_department': int,
    'flexibility_work_life_balance': int,
    'good_use_of_skills': int,
    'have_necessary_tools': int,
    'have_voice_on_campus_valued_member_of_ucsd': int,
    'internal_processes_effective': int,
    'parking_transportation': int,
    'salary_pay': int,
    'satisfied_with_diversity_progams': int,
    'supervisor_effectiveness_resolves_staff_issues': int
}

standard_theme_meta_dtypes = {
    'standard_theme_id': int,
    'theme': str,
    'url_friendly_theme': str,
    'theme_display_order': int,
    'avg_sentiment': float,
    'is_example': int
}

# date columns
date_cols = []


SENT_LABELS = list(sent_dtypes.keys())
STANDARD_THEME_CSS_LABELS = list(standard_theme_css_dtypes.keys())
STANDARD_THEME_SAW_LABELS = list(standard_theme_saw_dtypes.keys())
STANDARD_THEME_META_LABELS = list(standard_theme_meta_dtypes.keys())[-2:]

## Data Preparation

Review data and cleanup as necessary

In [9]:
lm_cols = list(lm_dtypes.keys())
                   
df = pd.read_csv(RAW_DATA_PATH/verbatims_raw_filename, dtype=lm_dtypes, parse_dates=[])
df = df[lm_cols]

display(len(df))
display(df.head(1))

592134

Unnamed: 0,Id,QuestionAnsID,AnswerText,AnswerText_NonEnglish,Language,SurveyID,SurveyTypeID,BenchmarkSurveyType,ClientId,RspID,QuestionCategoryAbbr,QuestionText,QuestionClass,QuestionCategoryID,QuestionReportAbbr,QuestionCategoryLabel,BenchmarkLevel1,BenchmarkLevel2,BenchmarkLevel3,ClientBenchmarkLevel,GroupCode,GroupID,GroupLevel1Code,GroupLevel1Name,GroupLevel2Code,GroupLevel2Name,GroupLevel3Code,GroupLevel3Name,GroupLevel4Code,GroupLevel4Name,GroupLevel5Code,GroupLevel5Name,GroupLevel6Code,GroupLevel6Name,GroupLevel7Code,GroupLevel7Name,GroupLevel8Code,GroupLevel8Name
0,172,1906,Parking services needs serious revamping,,English,16,7,CSS-STUDENT-ONLY,UCSD,38767,Classification,Any additional general comments?,Verbatim-Dept-Question,16.0,Comments-General,Classification Details,,,,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,


Verify that the data is in an expected format for learning and do any cleanup (address missing values, incorrect data, incorrect datatypes, etc...)

In [10]:
df.describe()

Unnamed: 0,Id,QuestionAnsID,SurveyID,SurveyTypeID,RspID,QuestionCategoryID,GroupCode,GroupLevel1Code,GroupLevel2Code,GroupLevel3Code,GroupLevel4Code,GroupLevel5Code,GroupLevel6Code,GroupLevel7Code,GroupLevel8Code
count,592134.0,592134.0,592134.0,592134.0,592134.0,539643.0,576154.0,576154.0,23185.0,22677.0,17564.0,10189.0,2435.0,102.0,0.0
mean,297246.984073,25976.61431,141.32988,17.570009,253234.660744,308.365923,5288.785,37512.596262,131996.5,135527.4,114943.3,68131.922073,42561.823819,197009.529412,
std,170808.319567,22276.229679,51.328348,10.226696,114808.147009,292.761719,98376.88,188425.189373,477950.6,476305.1,511073.7,81499.186058,66132.633292,1213.092397,
min,1.0,1877.0,9.0,6.0,7065.0,5.0,-1.0,-1.0,2.0,802.0,2.0,26.0,147.0,196431.0,
25%,149406.25,8471.0,109.0,7.0,169172.0,24.0,-1.0,-1.0,10000.0,10002.0,10003.0,826.0,462.0,196432.0,
50%,297439.5,20627.0,138.0,15.0,225857.0,222.0,-1.0,-1.0,50000.0,50704.0,31405.0,90453.0,10305.0,196432.0,
75%,445019.75,35121.0,198.0,29.0,383480.0,497.0,-1.0,-1.0,90000.0,104000.0,96000.0,91672.0,90412.0,196432.0,
max,593052.0,87860.0,214.0,41.0,455677.0,1068.0,7000017.0,1000000.0,7000000.0,7000003.0,7000017.0,702340.0,304622.0,199533.0,


Replace new lines with space

In [11]:
df.loc[(pd.notnull(df.AnswerText)) & (df.AnswerText.str.contains(r"\\r\\n")), 
       'AnswerText'] = df.AnswerText.str.replace(r"\r\n", ' ', regex=False)

In [12]:
len(df.loc[(pd.notnull(df.AnswerText)) & (df.AnswerText.str.contains(r"\\r\\n"))])

0

In [13]:
df.loc[(pd.notnull(df.AnswerText_NonEnglish)) & (df.AnswerText_NonEnglish.str.contains(r"\\r\\n")), 
       'AnswerText_NonEnglish'] = df.AnswerText_NonEnglish.str.replace(r"\r\n", ' ', regex=False)

In [14]:
len(df.loc[(pd.notnull(df.AnswerText_NonEnglish)) & (df.AnswerText_NonEnglish.str.contains(r"\\r\\n"))])

0

In [15]:
df.head(2)

Unnamed: 0,Id,QuestionAnsID,AnswerText,AnswerText_NonEnglish,Language,SurveyID,SurveyTypeID,BenchmarkSurveyType,ClientId,RspID,QuestionCategoryAbbr,QuestionText,QuestionClass,QuestionCategoryID,QuestionReportAbbr,QuestionCategoryLabel,BenchmarkLevel1,BenchmarkLevel2,BenchmarkLevel3,ClientBenchmarkLevel,GroupCode,GroupID,GroupLevel1Code,GroupLevel1Name,GroupLevel2Code,GroupLevel2Name,GroupLevel3Code,GroupLevel3Name,GroupLevel4Code,GroupLevel4Name,GroupLevel5Code,GroupLevel5Name,GroupLevel6Code,GroupLevel6Name,GroupLevel7Code,GroupLevel7Name,GroupLevel8Code,GroupLevel8Name
0,172,1906,Parking services needs serious revamping,,English,16,7,CSS-STUDENT-ONLY,UCSD,38767,Classification,Any additional general comments?,Verbatim-Dept-Question,16.0,Comments-General,Classification Details,,,,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,
1,173,1906,shogun eatery in Price Center is the best. A number of staff in Wendy's are not very courteous ...,,English,16,7,CSS-STUDENT-ONLY,UCSD,38811,Classification,Any additional general comments?,Verbatim-Dept-Question,16.0,Comments-General,Classification Details,,,,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,


Save cleaned file

In [16]:
df.to_csv(PATH/verbatims_clean_filename, index=False)

## Metadata and Entity Identification

**Metadata**

We'll capture a bunch of metadata that may be helpful in building various models (e.g., length of document, # of words, # of puncutation marks, # of captializations, # of entities, etc...)

**Entity Identification**

We'll use spacy to capture various entities (e.g., names, organizations, events, etc...) within each verbatim. Based on tagged verbatims, we can further improve the spacy models to identify specific entities like org names, contact info, and person names.  Save this data in the DW and it may be useful in building models that improve the accuracy for predicting the `LABELS_ENT` dependent variables

In [17]:
chunksize = 24000

lang_cols = collections.defaultdict(lambda: 'AnswerText_NonEnglish', { 'English': 'AnswerText'})

spacy_models = collections.defaultdict(lambda: None, { 'English': spacy_en, 'Spanish': spacy_es })

spacy_matchers = collections.defaultdict(lambda: None, { 
    'English': Matcher(spacy_en.vocab), 
    'Spanish': Matcher(spacy_es.vocab) })

In [18]:
title_pattern = [
    {'LOWER': {
        'IN': [
            'mr.', 'ms.', 'mrs.', 'miss', 'dr.', 'prof.', 'doctor' 'professor',
            'mr', 'ms', 'mrs', 'dr', 'prof', 
            'associate', 'chancellor', 'vc', 'vice chancellor', 'avc', 'evc', 'assistant vice chancellor',
            'executive', 'director', 'dean', 'president', 'coach', 'reverend', 'provost', 'regent',
            'cfo', 'ceo', 'cao', 'coo', 'cio', 'cto', 'chief', 'officer', 'captain', 'sergeant',
            'president'
        ]
    }}
]

profanity_pattern = [
    {'LEMMA': {
        'IN': [
            'fuck', 'shit', 'damn', 'ass', 'bitch', 'cunt', 'dick', 'cock',
            'pussy', 'fag', 'slut', 'douche', 'bastard'
        ]
    }}
]


for k, matcher in spacy_matchers.items():
    matcher.add("ENT_TITLE", None, title_pattern)
    matcher.add("ENT_PROFANITY", None, profanity_pattern)

In [19]:
def get_meta(df, id_cols=['id'], lang_col='lang'):
    
    rows = []
    ent_rows = []
    
    pbar = tqdm.tqdm(df.iterrows())

    for index, row in pbar:
        # get the correct spacy model for the language
        spacy_fn = spacy_models[row[lang_col]]
        if (spacy_fn == None): continue;
            
        matcher = spacy_matchers[row[lang_col]]
            
        # only process the text field if something is there
        txt_col = lang_cols[row[lang_col]]
        txt = str(row[txt_col])
        if (txt == None): continue
            
        # will prepend dictionary of ids to both cols and ent_cols dicts
        idd = OrderedDict({ el:row[el] for el in id_cols })
        
        cols = {
            **idd,
            'token_count': 0,
            'named_entity_count': 0,
            'word_count': 0,
            'unique_word_count': 0,
            'unique_word_pct': 0.0,
            'upper_word_count': 0,
            'avg_word_len': 0.0,
            'char_count': 0,
            'unique_char_count': 0,
            'upper_char_count': 0,
            
            'stopwords_count': 0,
            'punctuation_count': 0,
            'punctuation_pct': 0.0,
            'symbol_count': 0,
            'number_count': 0,
            'alpha_count': 0,
            'noun_count': 0,
            'verb_count': 0,
            'adj_count': 0,
            'proper_name_count': 0,

            # named entity types
            'named_entity_count': 0,
            'ENT_PERSON': [],
            'ENT_NORP': [],
            'ENT_FACILITY': [],
            'ENT_ORG': [],
            'ENT_LOC': [],         # include GPEs in LOC,
            'ENT_PRODUCT': [],
            'ENT_EVENT': [],
            'ENT_WORK_OF_ART': [],
            'ENT_LAW': [],
            'ENT_LANGUAGE': [],
            'ENT_DATE': [],
            'ENT_TIME': [],
            'ENT_PERCENT': [],
            'ENT_MONEY': [],
            'ENT_QUANTITY': [],
            'ENT_ORDINAL': [],
            'ENT_CARDINAL': [],
            
            'ENT_TITLE': [],
            'ENT_PROFANITY': []
        }
        
        # grab tokens, entities, and word tokens
        tokens = spacy_fn(txt)
        ents = tokens.ents
        words = txt.split()
        
        # get counts
        cols['token_count'] = len(tokens)
        cols['ent_count'] = len(list(set([ e.text for e in ents if f'ENT_{e.label_}' in cols ])))
        cols['word_count'] = len(words)
        cols['unique_word_count'] = len(set(words))
        cols['upper_word_count'] = len([w for w in words if (w.isupper())])
        cols['avg_word_len'] = np.mean([len(w) for w in words]) if len(words) > 0 else 0
        
        cols['char_count'] = len(txt)
        cols['unique_char_count'] = len(set(txt))
        cols['upper_char_count'] = len([c for c in txt if (c.isupper())])
        
        for t in tokens:
            if (t.is_punct): cols['punctuation_count'] += 1
            if (t.is_stop): cols['stopwords_count'] += 1
            if (t.pos_ == 'SYM'): cols['symbol_count'] += 1
            if (t.pos_ == 'NUM'): cols['number_count'] += 1
            if (t.is_alpha): cols['alpha_count'] += 1
                
            if (t.pos_ == 'NOUN') : cols['noun_count'] += 1
            if (t.pos_ == 'VERB') : cols['verb_count'] += 1
            if (t.pos_ == 'ADJ') : cols['adj_count'] += 1
            if (t.pos_ == 'PROPN') : cols['proper_name_count'] += 1
                
        for ent in ents:
            ent_d = { **idd, 'Text': txt, 'Language': row[lang_col], 'Label': ent.label_, 'Value': ent.text }
            ent_rows.append(ent_d)
            
            ent_label = f'ENT_{ent.label_}'
            
            if (ent_label in cols):
                if ent.text.strip() == '': continue
                    
                if (ent_label == 'ENT_GPE' or ent_label == 'ENT_LOC'):
                    cols['ENT_LOC'].append(ent.text)
                else:
                    cols[ent_label].append(ent.text)
        
        matches = matcher(tokens)
        for match_id, start, end in matches: 
            string_id = spacy_fn.vocab.strings[match_id]  # Get string representation
            cols[string_id].append(tokens[start:end].text)
            
        for k in cols.keys():
            if (k.startswith('ENT_')): cols[k] = ','.join(list(set(cols[k])))
            
        cols['unique_word_pct'] = (cols['unique_word_count'] + 1) / (cols['word_count'] + 1)
        cols['unique_char_pct'] = (cols['unique_char_count'] + 1) / (cols['char_count'] + 1)
        cols['punctuation_pct'] = (cols['punctuation_count'] + 1) / (cols['token_count'] + 1)
        
        rows.append(cols)
        
    return (pd.merge(df, pd.DataFrame(rows, columns=cols.keys()), on=id_cols), 
                      pd.DataFrame(ent_rows, columns=ent_rows[0].keys()))


**Why I'm using pd.merge instead of pd.concat**

I was using `pd concat([d1, d2], axis=1)` originally to merge the df chunk with `rows`, ***but*** that method joins on index and will stack the dataframes on top of each other if it can't match them.  

Each `chunk` retains its original index (e.g, 0-24000 and then 24001-48000 and so forth) while rows will be 0-24000 each time through, resulting in the first chunk being processed correctly (will have len of 24000) while all remaining chunks will not (they will have len of 48,000 do to the behavior of `pd.concat`)

In [20]:
meta_iter = pd.read_csv(PATH/verbatims_clean_filename, dtype=lm_dtypes, parse_dates=[], chunksize=chunksize)

### Different ways to process the .csv file

In [21]:
# How zipping works with lists of tuples using * and not using it

# x = [(1,2), (4,8), (16,32)]
# print(list(zip(*x)))
# print(list(zip(x)))

This is the most basic approach and takes about 90 minutes to run on a dataset of some 400k+ records.

In [22]:
# %%time

# meta_info = list(zip(*[ get_meta(sub_df, ['Id'], 'Language') for i, sub_df in enumerate(meta_iter) ]))

# meta_df, ent_df = pd.concat(meta_info[0]), pd.concat(meta_info[1])

The following two approaches demonstrate a couple of ways to use multiple CPUs in the task.  Each takes about 50 minutes to run.

`processes=mp.cpu_count()-1 or 1` = says "use all processors -1 but at least one"

In [23]:
# %%time
# import multiprocessing as mp
# pool = mp.Pool(processes=mp.cpu_count()-1 or 1)
# results = pool.starmap(get_meta, [ (sub_df, ['Id'], 'Language') for i, sub_df in enumerate(meta_iter) ])
# pool.close()

# meta_info = list(zip(*results))
# meta_df, ent_df = pd.concat(meta_info[0]), pd.concat(meta_info[1])

In [24]:
%%time

import multiprocessing as mp
pool = mp.Pool(processes=4) #mp.cpu_count()-1 or 1)
results = [ pool.apply_async(get_meta, args=(sub_df, ['Id'], 'Language')) for i, sub_df in enumerate(meta_iter) ]
pool.close()
           
meta_info = list(zip(*[ p.get() for p in results ]))
meta_df, ent_df = pd.concat(meta_info[0]), pd.concat(meta_info[1])

21091it [10:02, 42.71it/s]
24000it [10:25, 38.36it/s]
24000it [10:47, 37.09it/s]
24000it [11:12, 35.70it/s]
24000it [08:53, 45.02it/s]
24000it [08:26, 47.37it/s]
24000it [10:42, 37.33it/s]
24000it [10:34, 37.83it/s]
24000it [08:56, 44.76it/s]
24000it [10:01, 39.92it/s]
24000it [09:39, 41.42it/s]
24000it [10:40, 37.50it/s]
24000it [09:13, 43.37it/s]
24000it [10:25, 38.36it/s]
24000it [10:05, 39.66it/s]
24000it [12:50, 31.14it/s]
24000it [08:36, 46.51it/s]
24000it [08:50, 45.26it/s]
24000it [11:23, 35.13it/s]
24000it [10:04, 39.71it/s]
15100it [06:14, 42.16it/s]
24000it [11:36, 34.47it/s]
24000it [12:19, 32.47it/s]
16134it [13:02, 20.63it/s]
24000it [18:32, 21.58it/s]


CPU times: user 55 s, sys: 31.7 s, total: 1min 26s
Wall time: 1h 11min 59s


### Review and Clean up

In [25]:
ent_df.head()

Unnamed: 0,Id,Text,Language,Label,Value
0,173,shogun eatery in Price Center is the best. A number of staff in Wendy's are not very courteous ...,English,ORG,Price Center
1,173,shogun eatery in Price Center is the best. A number of staff in Wendy's are not very courteous ...,English,ORG,Wendy's
2,176,Please keep the Hillcrest/Medical Center shuttle stop on campus at Russell Lane. The old stop b...,English,ORG,the Hillcrest/Medical Center
3,176,Please keep the Hillcrest/Medical Center shuttle stop on campus at Russell Lane. The old stop b...,English,PERSON,Russell Lane
4,176,Please keep the Hillcrest/Medical Center shuttle stop on campus at Russell Lane. The old stop b...,English,ORG,the Medical School


In [26]:
print(len(meta_df), len(ent_df))

display(meta_df.head(2))
display(ent_df.head(2))

593386 401895


Unnamed: 0,Id,QuestionAnsID,AnswerText,AnswerText_NonEnglish,Language,SurveyID,SurveyTypeID,BenchmarkSurveyType,ClientId,RspID,QuestionCategoryAbbr,QuestionText,QuestionClass,QuestionCategoryID,QuestionReportAbbr,QuestionCategoryLabel,BenchmarkLevel1,BenchmarkLevel2,BenchmarkLevel3,ClientBenchmarkLevel,GroupCode,GroupID,GroupLevel1Code,GroupLevel1Name,GroupLevel2Code,GroupLevel2Name,GroupLevel3Code,GroupLevel3Name,GroupLevel4Code,GroupLevel4Name,GroupLevel5Code,GroupLevel5Name,GroupLevel6Code,GroupLevel6Name,GroupLevel7Code,GroupLevel7Name,GroupLevel8Code,GroupLevel8Name,token_count,named_entity_count,word_count,unique_word_count,unique_word_pct,upper_word_count,avg_word_len,char_count,unique_char_count,upper_char_count,stopwords_count,punctuation_count,punctuation_pct,symbol_count,number_count,alpha_count,noun_count,verb_count,adj_count,proper_name_count,ENT_PERSON,ENT_NORP,ENT_FACILITY,ENT_ORG,ENT_LOC,ENT_PRODUCT,ENT_EVENT,ENT_WORK_OF_ART,ENT_LAW,ENT_LANGUAGE,ENT_DATE,ENT_TIME,ENT_PERCENT,ENT_MONEY,ENT_QUANTITY,ENT_ORDINAL,ENT_CARDINAL,ENT_TITLE,ENT_PROFANITY,ent_count,unique_char_pct
0,172,1906,Parking services needs serious revamping,,English,16,7,CSS-STUDENT-ONLY,UCSD,38767,Classification,Any additional general comments?,Verbatim-Dept-Question,16.0,Comments-General,Classification Details,,,,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,,5,0,5,5,1.0,0,7.2,40,17,1,1,0,0.166667,0,0,5,3,1,1,0,,,,,,,,,,,,,,,,,,,,0,0.439024
1,173,1906,shogun eatery in Price Center is the best. A number of staff in Wendy's are not very courteous ...,,English,16,7,CSS-STUDENT-ONLY,UCSD,38811,Classification,Any additional general comments?,Verbatim-Dept-Question,16.0,Comments-General,Classification Details,,,,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,,33,0,30,28,0.935484,2,4.066667,152,28,5,18,1,0.058824,0,0,30,6,1,3,3,,,,"Wendy's,Price Center",,,,,,,,,,,,,,,,2,0.189542


Unnamed: 0,Id,Text,Language,Label,Value
0,173,shogun eatery in Price Center is the best. A number of staff in Wendy's are not very courteous ...,English,ORG,Price Center
1,173,shogun eatery in Price Center is the best. A number of staff in Wendy's are not very courteous ...,English,ORG,Wendy's


1. Remove where value = NaN because it identified "N/A" as an ORG

In [27]:
ent_df = ent_df[pd.notnull(ent_df.Value)]

In [28]:
len(ent_df[pd.isnull(ent_df.Value)])

0

2. Remove rows were model says it found something, but the Value = ""

In [29]:
ent_df = ent_df[ent_df.Value.str.strip() != '']

In [30]:
len(ent_df[ent_df.Value.str.strip() == ''])

0

In [31]:
len(ent_df), len(meta_df)

(401895, 593386)

### Save predictions

In [32]:
import datetime
file_suffix = datetime.date.today().strftime("%Y%m%d")

In [33]:
meta_df.to_csv(PATH/f'{file_suffix}_verbatims_meta.csv', index=False)
ent_df.to_csv(PATH/f'{file_suffix}_verbatims_entities.csv', index=False)

## Review results

NER dataset

In [34]:
ent_df = pd.read_csv(PATH/f'{file_suffix}_verbatims_entities.csv')
ent_df.head()

Unnamed: 0,Id,Text,Language,Label,Value
0,173,shogun eatery in Price Center is the best. A number of staff in Wendy's are not very courteous ...,English,ORG,Price Center
1,173,shogun eatery in Price Center is the best. A number of staff in Wendy's are not very courteous ...,English,ORG,Wendy's
2,176,Please keep the Hillcrest/Medical Center shuttle stop on campus at Russell Lane. The old stop b...,English,ORG,the Hillcrest/Medical Center
3,176,Please keep the Hillcrest/Medical Center shuttle stop on campus at Russell Lane. The old stop b...,English,PERSON,Russell Lane
4,176,Please keep the Hillcrest/Medical Center shuttle stop on campus at Russell Lane. The old stop b...,English,ORG,the Medical School


In [35]:
ent_df[(ent_df.Label == 'PERSON') & (ent_df.Value.str.contains('kevin chou', case=False))][:5]

Unnamed: 0,Id,Text,Language,Label,Value
12153,22828,Kevin Chou's and his team have been extremely helpful in all situations. Kevin's response time ...,English,PERSON,Kevin Chou's
12251,22900,We are very fortunate to have Kevin Chou leading this group and to have a great group of staff w...,English,PERSON,Kevin Chou
14272,24489,"Kevin Chou and the Reserach IT team are great; responsive, caring, creative and knowledgable. E...",English,PERSON,Kevin Chou
14290,24498,"The best thing to happen to ACT in years is Kevin Chou; he is remarkably bright, enthusiastic, a...",English,PERSON,Kevin Chou
16883,25289,"Kevin Chou is reallyl great. But the unit is plagued by turnover, and folks that do not know th...",English,PERSON,Kevin Chou


METADATA dataset

In [36]:
meta_df = pd.read_csv(PATH/f'{file_suffix}_verbatims_meta.csv', dtype=lm_dtypes, parse_dates=date_cols)
meta_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Id,QuestionAnsID,AnswerText,AnswerText_NonEnglish,Language,SurveyID,SurveyTypeID,BenchmarkSurveyType,ClientId,RspID,QuestionCategoryAbbr,QuestionText,QuestionClass,QuestionCategoryID,QuestionReportAbbr,QuestionCategoryLabel,BenchmarkLevel1,BenchmarkLevel2,BenchmarkLevel3,ClientBenchmarkLevel,GroupCode,GroupID,GroupLevel1Code,GroupLevel1Name,GroupLevel2Code,GroupLevel2Name,GroupLevel3Code,GroupLevel3Name,GroupLevel4Code,GroupLevel4Name,GroupLevel5Code,GroupLevel5Name,GroupLevel6Code,GroupLevel6Name,GroupLevel7Code,GroupLevel7Name,GroupLevel8Code,GroupLevel8Name,token_count,named_entity_count,word_count,unique_word_count,unique_word_pct,upper_word_count,avg_word_len,char_count,unique_char_count,upper_char_count,stopwords_count,punctuation_count,punctuation_pct,symbol_count,number_count,alpha_count,noun_count,verb_count,adj_count,proper_name_count,ENT_PERSON,ENT_NORP,ENT_FACILITY,ENT_ORG,ENT_LOC,ENT_PRODUCT,ENT_EVENT,ENT_WORK_OF_ART,ENT_LAW,ENT_LANGUAGE,ENT_DATE,ENT_TIME,ENT_PERCENT,ENT_MONEY,ENT_QUANTITY,ENT_ORDINAL,ENT_CARDINAL,ENT_TITLE,ENT_PROFANITY,ent_count,unique_char_pct
0,172,1906,Parking services needs serious revamping,,English,16,7,CSS-STUDENT-ONLY,UCSD,38767,Classification,Any additional general comments?,Verbatim-Dept-Question,16.0,Comments-General,Classification Details,,,,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,,5,0,5,5,1.0,0,7.2,40,17,1,1,0,0.166667,0,0,5,3,1,1,0,,,,,,,,,,,,,,,,,,,,0,0.439024
1,173,1906,shogun eatery in Price Center is the best. A number of staff in Wendy's are not very courteous ...,,English,16,7,CSS-STUDENT-ONLY,UCSD,38811,Classification,Any additional general comments?,Verbatim-Dept-Question,16.0,Comments-General,Classification Details,,,,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,,33,0,30,28,0.935484,2,4.066667,152,28,5,18,1,0.058824,0,0,30,6,1,3,3,,,,"Wendy's,Price Center",,,,,,,,,,,,,,,,2,0.189542
2,174,1906,I would love an iPod!,,English,16,7,CSS-STUDENT-ONLY,UCSD,38877,Classification,Any additional general comments?,Verbatim-Dept-Question,16.0,Comments-General,Classification Details,,,,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,,6,0,5,5,1.0,1,3.4,21,14,2,3,1,0.285714,0,0,5,0,2,0,1,,,,,,,,,,,,,,,,,,,,0,0.681818
3,175,1906,I'd really like to win that ipod!!! =),,English,16,7,CSS-STUDENT-ONLY,UCSD,38893,Classification,Any additional general comments?,Verbatim-Dept-Question,16.0,Comments-General,Classification Details,,,,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,,12,0,8,8,1.0,0,3.875,38,20,1,5,3,0.307692,0,0,7,0,3,0,1,,,,,,,,,,,,,,,,,,,,0,0.538462
4,176,1906,Please keep the Hillcrest/Medical Center shuttle stop on campus at Russell Lane. The old stop b...,,English,16,7,CSS-STUDENT-ONLY,UCSD,38950,Classification,Any additional general comments?,Verbatim-Dept-Question,16.0,Comments-General,Classification Details,,,,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,,36,0,28,25,0.896552,0,4.714286,161,32,10,15,5,0.162162,1,0,29,4,3,1,7,Russell Lane,,,"the Medical School,the Hillcrest/Medical Center",,,,,,,,,,,,,,,,3,0.203704


In [37]:
len(meta_df[meta_df.ENT_PROFANITY.notnull()])

602

In [40]:
meta_df[meta_df.ENT_PROFANITY.notnull()].head()

Unnamed: 0,Id,QuestionAnsID,AnswerText,AnswerText_NonEnglish,Language,SurveyID,SurveyTypeID,BenchmarkSurveyType,ClientId,RspID,QuestionCategoryAbbr,QuestionText,QuestionClass,QuestionCategoryID,QuestionReportAbbr,QuestionCategoryLabel,BenchmarkLevel1,BenchmarkLevel2,BenchmarkLevel3,ClientBenchmarkLevel,GroupCode,GroupID,GroupLevel1Code,GroupLevel1Name,GroupLevel2Code,GroupLevel2Name,GroupLevel3Code,GroupLevel3Name,GroupLevel4Code,GroupLevel4Name,GroupLevel5Code,GroupLevel5Name,GroupLevel6Code,GroupLevel6Name,GroupLevel7Code,GroupLevel7Name,GroupLevel8Code,GroupLevel8Name,token_count,named_entity_count,word_count,unique_word_count,unique_word_pct,upper_word_count,avg_word_len,char_count,unique_char_count,upper_char_count,stopwords_count,punctuation_count,punctuation_pct,symbol_count,number_count,alpha_count,noun_count,verb_count,adj_count,proper_name_count,ENT_PERSON,ENT_NORP,ENT_FACILITY,ENT_ORG,ENT_LOC,ENT_PRODUCT,ENT_EVENT,ENT_WORK_OF_ART,ENT_LAW,ENT_LANGUAGE,ENT_DATE,ENT_TIME,ENT_PERCENT,ENT_MONEY,ENT_QUANTITY,ENT_ORDINAL,ENT_CARDINAL,ENT_TITLE,ENT_PROFANITY,ent_count,unique_char_pct
1395,5235,1900,The wireless internet in the basement of the library sucks. The single is so weak that most of ...,,English,25,7,CSS-STUDENT-ONLY,UCSD,78664,,Any additional comments for ELECTRONIC COMMUNICATIONS?,Verbatim-Dept-Question,15.0,ECommunications-Comments,ELECTRONIC COMMUNICATIONS,,,,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,,46,0,39,30,0.775,0,4.384615,211,26,3,24,4,0.106383,0,0,40,10,4,6,0,,,,,,,,,,,,,,,,,,,damn,0,0.127358
2247,6124,1900,"I came to campus at I-House to study, and unfortunately I had to join the resnet network instead...",,English,25,7,CSS-STUDENT-ONLY,UCSD,79839,,Any additional comments for ELECTRONIC COMMUNICATIONS?,Verbatim-Dept-Question,15.0,ECommunications-Comments,ELECTRONIC COMMUNICATIONS,,,,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,,56,0,48,36,0.755102,5,4.145833,247,37,10,30,5,0.105263,0,1,49,10,8,1,3,,,,"I-House,UCSD",,,,,,,,20 more minutes,,,,,,,ass,3,0.153226
3025,7201,1894,It sucks: permit price astronomical & not worth it considering if I want a parking space during ...,,English,25,7,CSS-STUDENT-ONLY,UCSD,80665,Parking,Any additional comments for CAMPUS PARKING?,Verbatim-Dept-Question,9.0,Parking-Comments,Campus Parking,Parking & Transportation,Parking & Transportation,"Parking, Commuter Services",1,-1.0,-1,-1.0,,,,,,,,,,,,,,,,51,0,42,38,0.906977,2,4.833333,244,32,6,25,6,0.134615,0,0,43,9,8,2,3,,Nazi,,,,,,,,,the week days,,,,,,,,bitches,2,0.134694
3436,7623,1894,"I don't know why everyone is bitching about parking, tell them to go park in east or regents and...",,English,25,7,CSS-STUDENT-ONLY,UCSD,79061,Parking,Any additional comments for CAMPUS PARKING?,Verbatim-Dept-Question,9.0,Parking-Comments,Campus Parking,Parking & Transportation,Parking & Transportation,"Parking, Commuter Services",1,-1.0,-1,-1.0,,,,,,,,,,,,,,,,34,0,31,29,0.9375,1,4.354839,165,28,1,19,2,0.085714,0,0,31,7,6,1,1,,,,,,,,,,,,,,,,,,,"bitching,asses",0,0.174699
3441,7628,1896,"im still pissed about getting caught by that little mexican kid for stealing a cup of powerade, ...",,English,25,7,CSS-STUDENT-ONLY,UCSD,79061,Housing,Any additional comments for HOUSING SERVICES?,Verbatim-Dept-Question,11.0,Housing-Comments,Housing Maintenance and Custodial,Facilities,"Building, Custodial, Facilities Maintenance",Facilities Maintenance,1,-1.0,-1,-1.0,,,,,,,,,,,,,,,,34,0,31,28,0.90625,0,4.129032,158,26,0,17,2,0.085714,0,0,32,7,7,3,0,,mexican,,,,,,,,,,,,,,,,,ass,1,0.169811


In [38]:
len(meta_df[meta_df.ENT_TITLE.notnull()])

8060

In [39]:
meta_df[meta_df.ENT_TITLE.notnull()].head()

Unnamed: 0,Id,QuestionAnsID,AnswerText,AnswerText_NonEnglish,Language,SurveyID,SurveyTypeID,BenchmarkSurveyType,ClientId,RspID,QuestionCategoryAbbr,QuestionText,QuestionClass,QuestionCategoryID,QuestionReportAbbr,QuestionCategoryLabel,BenchmarkLevel1,BenchmarkLevel2,BenchmarkLevel3,ClientBenchmarkLevel,GroupCode,GroupID,GroupLevel1Code,GroupLevel1Name,GroupLevel2Code,GroupLevel2Name,GroupLevel3Code,GroupLevel3Name,GroupLevel4Code,GroupLevel4Name,GroupLevel5Code,GroupLevel5Name,GroupLevel6Code,GroupLevel6Name,GroupLevel7Code,GroupLevel7Name,GroupLevel8Code,GroupLevel8Name,token_count,named_entity_count,word_count,unique_word_count,unique_word_pct,upper_word_count,avg_word_len,char_count,unique_char_count,upper_char_count,stopwords_count,punctuation_count,punctuation_pct,symbol_count,number_count,alpha_count,noun_count,verb_count,adj_count,proper_name_count,ENT_PERSON,ENT_NORP,ENT_FACILITY,ENT_ORG,ENT_LOC,ENT_PRODUCT,ENT_EVENT,ENT_WORK_OF_ART,ENT_LAW,ENT_LANGUAGE,ENT_DATE,ENT_TIME,ENT_PERCENT,ENT_MONEY,ENT_QUANTITY,ENT_ORDINAL,ENT_CARDINAL,ENT_TITLE,ENT_PROFANITY,ent_count,unique_char_pct
41,299,1906,Textbooks (particularly Science) in the UCSD Bookstore are often being sold at higher prices tha...,,English,16,7,CSS-STUDENT-ONLY,UCSD,39066,Classification,Any additional general comments?,Verbatim-Dept-Question,16.0,Comments-General,Classification Details,,,,1.0,-1.0,-1,-1.0,,,,,,,,,,,,,,,,229,0,199,142,0.715,4,5.095477,1216,49,37,99,20,0.091304,1,0,204,43,26,18,25,"Ventana,Eleanor Roosevelt College",,,"Oceanview Terrace,Residence Life Office,Panda Express",,,,,,,each quarter,"hours,the evening",,hundreds of dollars,,,,dean,,9,0.041085
110,1622,27172,"Greater use of our Faculty and students through community based Internships, Expanded Projects ...",,English,131,23,,UCSD,186586,,Is there a way that UC San Diego could more effectively support regional economic growth?,Verbatim-Dept-Improve,,APLU_Improve,,,,,,-1.0,-1,-1.0,,,,,,,,,,,,,,,,89,0,81,64,0.792683,4,5.271605,508,44,28,25,10,0.122222,0,1,77,24,7,5,14,Mary,,,"Internships,RMP,EV,Tele Medicine,the Food nexus inUrban Planning,Research VC",,the Med Student,,,,,,,,,,,,VC,,8,0.088409
227,1739,27172,"Bring departmental Chairs together (I.e., Geriatrics, Psychiatry, Neurosciences, Family Medicine...",,English,131,23,,UCSD,187117,,Is there a way that UC San Diego could more effectively support regional economic growth?,Verbatim-Dept-Improve,,APLU_Improve,,,,,,-1.0,-1,-1.0,,,,,,,,,,,,,,,,128,0,106,85,0.803738,3,5.943396,735,48,28,46,18,0.147287,2,1,106,25,13,8,16,Howard Feldman,,,"UCSD,Chairs,UCSF",,,,,,,,,,#3 killer,,,,Dean,,5,0.066576
269,1781,27172,"Make the University more of a true ""player"" in the San Diego community at all levels, not just t...",,English,131,23,,UCSD,186898,,Is there a way that UC San Diego could more effectively support regional economic growth?,Verbatim-Dept-Improve,,APLU_Improve,,,,,,-1.0,-1,-1.0,,,,,,,,,,,,,,,,225,0,193,134,0.695876,3,4.911917,1140,46,22,111,25,0.115044,1,1,196,47,24,19,10,Trump,,,NCAA Division,,,,,,,,,10%,,,,,President,,3,0.041192
304,1816,27171,"Jacobs School of Engineering, Dean's office, Corporate Relations Program offerings, Corporate Af...",,English,131,23,,UCSD,186667,,"In your opinion, what UC San Diego programs or resources make the most significant contributions...",Verbatim,,APLU_Significant,,,,,,-1.0,-1,-1.0,,,,,,,,,,,,,,,,74,0,59,49,0.833333,0,6.084746,417,42,37,18,14,0.2,0,0,59,6,2,3,35,,,,"von Liebig Center,Team Internship Program,Corporate Relations Program,Corporate Affiliates Progr...",,,,,,,,,,,,,,Dean,,9,0.102871
