# Tritonlytics Verbatims Data Prep & EDA

Prepare the verbatims data for models and perform exploratory data analysis (EDA).  This notebook will generate the necessary files for both our Language Model and Classification work.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import html, pdb, requests
from collections import Counter, defaultdict
import multiprocessing as mp

from tritonlytics_ai.utils import *
from sklearn import model_selection
from fastai import __version__ as fa2_version
from fastai.text.all import *

# from fastai import *        
# from fastai.text import *  

# from tritonlytics import Metrics as metrics_util, DataGeneration as dg_util, PandasUtil as pd_util
# from tritonlytics.evaluation import *
# from tritonlytics.callbacks import RocAucEvaluation

# import dill as pickle

import spacy
spacy_en = spacy.load('en_core_web_sm')
spacy_es = spacy.load('es_core_news_sm')

from wordcloud import WordCloud

# pandas and plotting config
import seaborn as sns
sns.set_style('whitegrid')

%matplotlib inline
plt.rcParams['figure.figsize'] = (9,6)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

In [3]:
print(f'fastai version: {fa2_version}')

fastai version: 2.0.12


In [None]:
# Pin the notebook to GPU #1 when the machine has one. Fall back to GPU #0 on
# single-GPU machines and to the CPU when CUDA is unavailable, instead of raising.
GPU_ID = 1

if torch.cuda.is_available():
    torch.cuda.set_device(GPU_ID if torch.cuda.device_count() > GPU_ID else 0)
    print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name(torch.cuda.current_device())}')
else:
    print('CUDA is not available; continuing on the CPU')

## Download  latest training data

Note: The LM training data is too big, so it is uploaded as a CSV.

In [None]:
verbatims_clean_filename = 'verbatims-clean.csv'
verbatims_raw_filename = 'verbatims-raw.csv'

model_types = [
    'verbatim-classification-sentiment',
    'verbatim-classification-metadata-standardthemes',
    'verbatim-classification-css-themes',
    'verbatim-classification-saw-themes',
    'verbatim-summarization-adhoc-themes'
]

In [None]:
# Disabled: fetch the training data for every entry in `model_types` from the
# tritonlytics-admin API and save it under RAW_DATA_PATH.
# NOTE(review): this writes '<model_type>-raw.csv', while the classification
# sections below read '<model_type>-raw.json' -- confirm which format is current.
# for model_type in model_types:
#     response = requests.get(f'https://tritonlytics-admin/api/ml/models/{model_type}/training-data')
#     data = response.json()
    
#     pd.DataFrame(data).to_csv(RAW_DATA_PATH/f'{model_type}-raw.csv', index=False)

## Data Preparation

### Language model

#### Review

In [None]:
chunksize = 24000

In [None]:
# Columns kept for language-model work: exactly the keys of TASK_LM_DTYPES.
lm_cols = [*TASK_LM_DTYPES]

# Read the raw verbatims with explicit dtypes, then narrow to the LM columns.
df = pd.read_csv(
    RAW_DATA_PATH/verbatims_raw_filename,
    dtype=TASK_LM_DTYPES,
    parse_dates=[],
)[lm_cols]

# Sanity check: row count followed by the first record.
display(len(df), df.head(1))

In [None]:
df.describe()

In [None]:
pd_advanced_describe(df, include='all')

#### Clean up

In [None]:
df = df.loc[df['Language'] == 'English', lm_cols]

**NOTES** 
- *(wtg3 2/22/2019): Removing this for now as it hasn't proved useful*

Add clean (stop word removed), lemmatized, and clean_lemmatized columns

In [None]:
# Language name -> loaded spaCy pipeline; any other language resolves to None.
spacy_models = defaultdict(lambda: None, { 'English': spacy_en, 'Spanish': spacy_es })
# Language name -> column holding the text; everything except English uses 'AnswerText_NonEnglish'.
lang_cols = defaultdict(lambda: 'AnswerText_NonEnglish', { 'English': 'AnswerText'})

In [None]:
def add_cols(df, id_cols=['id'], lang_col='lang'):
    """Add stop-word-free and lemmatized variants of each row's text.

    For every row, the spaCy pipeline registered in the module-level
    ``spacy_models`` for the row's language parses the text column named by
    the module-level ``lang_cols``. Three columns are produced per text column:
    ``<col>_Cleaned`` (stop words removed), ``<col>_Lemmatized`` and
    ``<col>_Cleaned_Lemmatized``.

    Args:
        df: DataFrame holding the id columns, the language column and the
            text column(s).
        id_cols: Columns used to join the new columns back onto ``df``.
        lang_col: Name of the column holding each row's language.

    Returns:
        ``df`` inner-joined with the new columns on ``id_cols``. Rows that were
        skipped (no spaCy model for the language, or missing text) are dropped
        by the join. Pre-existing columns with the same names are replaced.
        When no row could be processed an empty frame with ``df``'s columns is
        returned.
    """
    # Work on a copy so the (shared) default list can never be mutated.
    id_cols = list(id_cols)
    rows = []

    for _, row in progress_bar(df.iterrows(), total=len(df)):
        # get the correct spacy model for the language
        spacy_fn = spacy_models[row[lang_col]]
        if spacy_fn is None:
            continue

        # only process the text field if something is there; test for a missing
        # value *before* str(), otherwise NaN would be parsed as the word "nan"
        txt_col = lang_cols[row[lang_col]]
        raw_txt = row[txt_col]
        if pd.isna(raw_txt):
            continue

        tokens = spacy_fn(str(raw_txt))
        non_stop_tokens = [t for t in tokens if not t.is_stop]

        # ids first, so the new columns can be merged back onto df
        cols = OrderedDict((el, row[el]) for el in id_cols)

        # add different versions of text
        cols[f'{txt_col}_Cleaned'] = ' '.join(t.text for t in non_stop_tokens)
        cols[f'{txt_col}_Lemmatized'] = ' '.join(t.lemma_ for t in tokens)
        cols[f'{txt_col}_Cleaned_Lemmatized'] = ' '.join(t.lemma_ for t in non_stop_tokens)

        rows.append(cols)

    if not rows:
        # Nothing was processed: mirror the inner join, which would keep no rows.
        return df.iloc[0:0].copy()

    # Let pandas take the union of keys over all rows; using only the last
    # row's keys would drop the columns generated for other languages.
    new_cols_df = pd.DataFrame(rows)

    # overwrite any existing columns with new values
    df = pd.merge(df, new_cols_df, on=id_cols, suffixes=('_x', ''))
    return df.loc[:, ~df.columns.str.endswith('_x')]
                                                   

In [None]:
# %%time

# df = add_cols(df, ['id'], 'language')
# df.head()

#### Create train/validation sets

In [None]:
np.random.seed(42)
idxs = np.random.permutation(len(df))
df = df.iloc[idxs]

In [None]:
trn_docs, val_docs = model_selection.train_test_split(df, test_size=0.1, random_state=42)
len(trn_docs), len(val_docs)

In [None]:
# Persist the full LM corpus first, then each split restricted to the LM columns.
pd.concat([trn_docs, val_docs]).to_csv(LM_PATH/'all.csv', index=False)

for split_file, split_docs in (('train.csv', trn_docs), ('test.csv', val_docs)):
    split_docs[lm_cols].to_csv(LM_PATH/split_file, index=False)

### Sentiment classification model

#### Review

In [None]:
chunksize = 24000

In [None]:
# Columns to keep: the TASK_LM_DTYPES_SC keys followed by the sentiment keys.
sent_cols = [*TASK_LM_DTYPES_SC, *TASK_SENTIMENT_DTYPES]

# NOTE(review): dtypes come from TASK_LM_DTYPES while the column list uses
# TASK_LM_DTYPES_SC -- confirm the two maps are meant to differ.
df = pd.read_json(
    RAW_DATA_PATH/'verbatim-classification-sentiment-raw.json',
    dtype={**TASK_LM_DTYPES, **TASK_SENTIMENT_DTYPES},
    convert_dates=date_cols,
)[sent_cols]

# Sanity check: row count followed by the first record.
display(len(df), df.head(1))

In [None]:
df.describe()

In [None]:
pd_advanced_describe(df, include='all')

#### Clean up

Ensure that all expected binary labels are between 0 and 1

In [None]:
# ensure binary labels are between 0 and 1
df[SENT_LABELS[1:]] = df[SENT_LABELS[1:]].clip(0, 1)

If a verbatim is tagged as very negative then it should also be considered negative (same with very positive), so ensure that is the case

In [None]:
# A "very" label implies its base label: is_very_negative=1 forces is_negative=1,
# and is_very_positive=1 forces is_positive=1.
df.loc[df.is_very_negative == 1, 'is_negative'] = 1
df.loc[df.is_very_positive == 1, 'is_positive'] = 1

We are going to programmatically determine Overall Sentiment (1-5) based on binary labels because reviewers weren't labeling this field to start with.  We'll want to revisit this later

In [None]:
# Blank out every overall sentiment so the next cell re-derives it from the
# binary labels. Bracket assignment is used on purpose: attribute-style
# assignment only updates an existing column, and on a missing column it would
# just set a plain Python attribute on the DataFrame object.
df['overall_sentiment'] = np.nan

In [None]:
# Derive overall_sentiment (1-5) from the binary labels wherever it is still null.
# Rules run in order; each one only fills rows that no earlier rule has filled.
very_pos, pos = df.is_very_positive, df.is_positive
very_neg, neg = df.is_very_negative, df.is_negative

sentiment_rules = [
    # very positive with nothing negative
    (5, (very_pos == 1) & (very_neg == 0) & (neg == 0)),
    # very negative with nothing positive
    (1, (very_neg == 1) & (very_pos == 0) & (pos == 0)),
    # positive only
    (4, (very_pos == 0) & (pos == 1) & (very_neg == 0) & (neg == 0)),
    # very positive tempered by a (non-very) negative
    (4, (very_pos == 1) & (very_neg == 0) & (neg == 1)),
    # negative only
    (2, (very_pos == 0) & (pos == 0) & (very_neg == 0) & (neg == 1)),
    # very negative tempered by a (non-very) positive
    (2, (very_neg == 1) & (very_pos == 0) & (pos == 1)),
]

for score, matches in sentiment_rules:
    df.loc[pd.isna(df.overall_sentiment) & matches, 'overall_sentiment'] = score

# default to 3-neutral
df.loc[pd.isna(df.overall_sentiment), 'overall_sentiment'] = 3

In [None]:
# display(df[df.overall_sentiment == 2.0].head())

#### Create train/validation sets

In [None]:
np.random.seed(42)
idxs = np.random.permutation(len(df))
df = df.iloc[idxs]

In [None]:
trn_docs, val_docs = model_selection.train_test_split(df, test_size=0.1, random_state=42)
len(trn_docs), len(val_docs)

In [None]:
# Write the combined set plus the train/test splits for the sentiment classifier.
pd.concat([trn_docs, val_docs]).to_csv(SENTIMENT_CLS_PATH/'all.csv', index=False)
trn_docs[sent_cols].to_csv(SENTIMENT_CLS_PATH/'train.csv', index=False)
val_docs[sent_cols].to_csv(SENTIMENT_CLS_PATH/'test.csv', index=False)

# Save the label names one per line. The context manager closes (and flushes)
# the file, which the bare open(...).writelines(...) form never did explicitly.
with (SENTIMENT_CLS_PATH/'labels_sent.txt').open('w') as labels_file:
    labels_file.writelines(f'{c}\n' for c in SENT_LABELS)

### Standard Theme classification model - CSS

#### Review

In [None]:
chunksize = 24000

In [None]:
# Columns to keep: the TASK_LM_DTYPES_SC keys followed by the CSS theme keys.
# (The name `sent_cols` is reused from the sentiment section.)
sent_cols = [*TASK_LM_DTYPES_SC, *TASK_STANDARD_THEME_CSS_DTYPES]

# NOTE(review): dtypes come from TASK_LM_DTYPES while the column list uses
# TASK_LM_DTYPES_SC -- confirm the two maps are meant to differ.
df = pd.read_json(
    RAW_DATA_PATH/'verbatim-classification-css-themes-raw.json',
    dtype={**TASK_LM_DTYPES, **TASK_STANDARD_THEME_CSS_DTYPES},
    convert_dates=date_cols,
)[sent_cols]

# Sanity check: row count followed by the first record.
display(len(df), df.head(1))

In [None]:
df.describe()

In [None]:
pd_advanced_describe(df, include='all')

#### Clean up

Ensure that all expected binary labels are between 0 and 1

In [None]:
# ensure binary labels are between 0 and 1
df[STANDARD_THEME_CSS_LABELS] = df[STANDARD_THEME_CSS_LABELS].clip(0, 1)

#### Create train/validation sets

In [None]:
np.random.seed(42)
idxs = np.random.permutation(len(df))
df = df.iloc[idxs]

In [None]:
trn_docs, val_docs = model_selection.train_test_split(df, test_size=0.1, random_state=42)
len(trn_docs), len(val_docs)

In [None]:
# Write the combined set plus the train/test splits for the CSS theme classifier.
pd.concat([trn_docs, val_docs]).to_csv(STANDARD_THEME_CSS_PATH/'all.csv', index=False)
trn_docs[sent_cols].to_csv(STANDARD_THEME_CSS_PATH/'train.csv', index=False)
val_docs[sent_cols].to_csv(STANDARD_THEME_CSS_PATH/'test.csv', index=False)

# Save the label names one per line; the context manager closes the file handle.
# NOTE(review): the other theme sections write 'labels.txt' -- 'labels_.txt'
# looks like a typo, but it is kept as-is in case a consumer reads this name.
with (STANDARD_THEME_CSS_PATH/'labels_.txt').open('w') as labels_file:
    labels_file.writelines(f'{c}\n' for c in STANDARD_THEME_CSS_LABELS)

### Standard Theme classification model - S@W

#### Review

In [None]:
chunksize = 24000

In [None]:
# Columns to keep: the TASK_LM_DTYPES_SC keys followed by the S@W theme keys.
# (The name `sent_cols` is reused from the sentiment section.)
sent_cols = [*TASK_LM_DTYPES_SC, *TASK_STANDARD_THEME_SAW_DTYPES]

# NOTE(review): dtypes come from TASK_LM_DTYPES while the column list uses
# TASK_LM_DTYPES_SC -- confirm the two maps are meant to differ.
df = pd.read_json(
    RAW_DATA_PATH/'verbatim-classification-saw-themes-raw.json',
    dtype={**TASK_LM_DTYPES, **TASK_STANDARD_THEME_SAW_DTYPES},
    convert_dates=date_cols,
)[sent_cols]

# Sanity check: row count followed by the first record.
display(len(df), df.head(1))

In [None]:
df.describe()

In [None]:
pd_advanced_describe(df, include='all')

#### Clean up

Ensure that all expected binary labels are between 0 and 1

In [None]:
# ensure binary labels are between 0 and 1
df[STANDARD_THEME_SAW_LABELS] = df[STANDARD_THEME_SAW_LABELS].clip(0, 1)

#### Create train/validation sets

In [None]:
np.random.seed(42)
idxs = np.random.permutation(len(df))
df = df.iloc[idxs]

In [None]:
trn_docs, val_docs = model_selection.train_test_split(df, test_size=0.1, random_state=42)
len(trn_docs), len(val_docs)

In [None]:
# Write the combined set plus the train/test splits for the S@W theme classifier.
pd.concat([trn_docs, val_docs]).to_csv(STANDARD_THEME_SAW_PATH/'all.csv', index=False)
trn_docs[sent_cols].to_csv(STANDARD_THEME_SAW_PATH/'train.csv', index=False)
val_docs[sent_cols].to_csv(STANDARD_THEME_SAW_PATH/'test.csv', index=False)

# Save the label names one per line. The context manager closes (and flushes)
# the file, which the bare open(...).writelines(...) form never did explicitly.
with (STANDARD_THEME_SAW_PATH/'labels.txt').open('w') as labels_file:
    labels_file.writelines(f'{c}\n' for c in STANDARD_THEME_SAW_LABELS)

### Standard Theme classification model - Metadata

#### Review

In [None]:
chunksize = 24000

In [None]:
# Columns to keep: the TASK_LM_DTYPES_SC keys followed by the metadata theme keys.
# (The name `sent_cols` is reused from the sentiment section.)
sent_cols = [*TASK_LM_DTYPES_SC, *TASK_STANDARD_THEME_META_DTYPES]

# NOTE(review): dtypes come from TASK_LM_DTYPES while the column list uses
# TASK_LM_DTYPES_SC -- confirm the two maps are meant to differ.
df = pd.read_json(
    RAW_DATA_PATH/'verbatim-classification-metadata-standardthemes-raw.json',
    dtype={**TASK_LM_DTYPES, **TASK_STANDARD_THEME_META_DTYPES},
    convert_dates=date_cols,
)[sent_cols]

# Sanity check: row count followed by the first record.
display(len(df), df.head(1))

In [None]:
df.describe()

In [None]:
pd_advanced_describe(df, include='all')

#### Clean up

Ensure that all expected binary labels are between 0 and 1

In [None]:
# Clamp the is_example label into the [0, 1] range.
df['is_example'] = df['is_example'].clip(lower=0, upper=1)

# A missing average sentiment falls back to 3.0.
df['avg_sentiment'] = df['avg_sentiment'].fillna(3.0)

#### Create train/validation sets

In [None]:
np.random.seed(42)
idxs = np.random.permutation(len(df))
df = df.iloc[idxs]

In [None]:
trn_docs, val_docs = model_selection.train_test_split(df, test_size=0.1, random_state=42)
len(trn_docs), len(val_docs)

In [None]:
# Write the combined set plus the train/test splits for the metadata theme classifier.
pd.concat([trn_docs, val_docs]).to_csv(STANDARD_THEME_META_PATH/'all.csv', index=False)
trn_docs[sent_cols].to_csv(STANDARD_THEME_META_PATH/'train.csv', index=False)
val_docs[sent_cols].to_csv(STANDARD_THEME_META_PATH/'test.csv', index=False)

# Save the label names one per line. The context manager closes (and flushes)
# the file, which the bare open(...).writelines(...) form never did explicitly.
with (STANDARD_THEME_META_PATH/'labels.txt').open('w') as labels_file:
    labels_file.writelines(f'{c}\n' for c in STANDARD_THEME_META_LABELS)

## Playground

In [None]:
df = pd.read_csv(LM_PATH/'all.csv', dtype=TASK_LM_DTYPES)

print(f'# Examples: {len(df)}')
df.head(1)

### Classification datasets

In [None]:
# Reload the sentiment classification files: the combined set and both splits.
# NOTE(review): 'all.csv' is parsed with TASK_LM_DTYPES_SC but the two splits
# with TASK_LM_DTYPES -- confirm the difference is intentional.
cls_df = pd.read_csv(SENTIMENT_CLS_PATH/'all.csv', dtype={**TASK_LM_DTYPES_SC, **TASK_SENTIMENT_DTYPES})
trn_cls_df = pd.read_csv(SENTIMENT_CLS_PATH/'train.csv', dtype={**TASK_LM_DTYPES, **TASK_SENTIMENT_DTYPES})
val_cls_df = pd.read_csv(SENTIMENT_CLS_PATH/'test.csv', dtype={**TASK_LM_DTYPES, **TASK_SENTIMENT_DTYPES})

# Show the first record of the combined set.
cls_df.head(1)

Train/Test split

In [None]:
n_train_sz, n_test_sz = len(trn_cls_df), len(val_cls_df)
n_total = n_train_sz + n_test_sz

print('\tTrain\tTest')
print(f'Size:\t{n_train_sz} | {n_test_sz}')
print(f'%:\t{round(n_train_sz / n_total, 2)}   | {round(n_test_sz / n_total, 2)}')

Let's look at the **class distribution**

Define the labels we want to predict including a "None" label for those comments with 0's for each label

In [None]:
label_cols = list(TASK_SENTIMENT_DTYPES.keys())[1:]
cls_df['None'] = 1 - cls_df[label_cols].max(axis=1)

In [None]:
# Per-column counts of each value for every label plus the synthetic 'None' column.
value_counts_df = cls_df[label_cols + ['None']].apply(pd.Series.value_counts)
display(value_counts_df)

# Count straight from the 'None' column so that a value that never occurs
# yields 0 instead of a KeyError (or NaN) from the table above.
n_clean = int((cls_df['None'] == 1).sum())
n_labeled = int((cls_df['None'] == 0).sum())

# Scale to a percentage *before* rounding: rounding first and multiplying by
# 100 afterwards leaks float noise (e.g. 0.29 * 100 -> 28.999999999999996).
p_labeled = round(100 * n_labeled / len(cls_df), 0) if len(cls_df) else 0.0
# Guard the ratio against a corpus with no clean (unlabeled) comments.
labeled_to_clean_ratio = round(n_labeled / n_clean, 2) if n_clean else float('inf')

print(f'Clean comments: {n_clean}')
print(f'Labeled comments: {n_labeled}({p_labeled}%)')
print(f'Ratio of labeled comments: {labeled_to_clean_ratio}')

In [None]:
# Number of comments carrying each label: the counts for value 1, looked up by
# index label rather than by row position.
x = value_counts_df.loc[1]

plt.figure(figsize=(14,6))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title('# Occurences per label')
plt.ylabel('# Occurences')
plt.xlabel('Labels')

rects = ax.patches
labels = x.values

# Annotate every bar with its count, centred just above the bar.
for rect, lbl in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, lbl, ha='center', va='bottom')

plt.show()

Let's see to what degree **comments are tagged with multiple labels**

In [None]:
# How many comments carry 0, 1, 2, ... labels, ordered by label count.
mult_labels_s = cls_df[label_cols].sum(axis=1).value_counts().sort_index()
# display(mult_labels_s)

x = mult_labels_s

plt.figure(figsize=(14,6))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title('# Labels per comment')
plt.ylabel('# of comments')
plt.xlabel('# of labels')

rects = ax.patches
labels = x.values

# Annotate every bar with its count, centred just above the bar.
for rect, lbl in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, lbl, ha='center', va='bottom')

plt.show()

Let's look at the **label correlations**

In [None]:
corr_df = cls_df[label_cols].corr()
# display(corr_df)

plt.figure(figsize=(12,10))
sns.heatmap(corr_df, xticklabels=corr_df.columns.values, yticklabels=corr_df.columns.values, annot=True)

Let's look at **examples of each label**

In [None]:
for lbl in label_cols:
    ex = cls_df[cls_df[lbl] == 1].answer_text.iloc[:2]
    print(f'{lbl.upper()}:\n{ex}\n-----------------------\n')

Let's look at **WordClouds for each label to identify the most frequent words for each label**

In [None]:
# Draw one stop-word-filtered word cloud per label from the comments tagged with it.
for lbl in label_cols:
    # A dedicated name keeps the notebook-level `df` from being clobbered, and
    # dropping missing answers keeps them from showing up as the word "nan".
    lbl_texts = cls_df.loc[cls_df[lbl] == 1, 'answer_text'].dropna()
    if lbl_texts.empty:
        # WordCloud.generate raises on an empty input, so skip labels with no text.
        print(f'No comments tagged {lbl.upper()}; skipping word cloud')
        continue

    wc = WordCloud(background_color='black', max_words=4000, stopwords=spacy.lang.en.STOP_WORDS)

    # see: https://stackoverflow.com/a/10880820/54818 on why ' '.join(txt) doesn't work
    wc.generate(",".join(map(str, lbl_texts.values)))

    plt.figure(figsize=(20,14))
    plt.axis('off')
    plt.title(f'Words in {lbl.upper()} comments', fontsize=20)
    plt.imshow(wc.recolor(colormap='viridis', random_state=42), alpha=0.98)
    plt.show()