# Concept Clean-Up

MetaMap extracts many spurious concepts, often triggered by very common words. This step produces a condensed dataframe containing only the concepts whose trigger words appear relatively *rarely* in the English language.

In [None]:
import pandas as pd
import os
import re
import requests
import string
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from nltk.corpus import stopwords

import gensim
import gensim.corpora as corpora
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import strip_numeric
from gensim.models import TfidfModel
from gensim.utils import lemmatize
from gensim.utils import simple_preprocess

import utils

### Paths

To begin, update the paths below to the input and output directories on your local computer.

In [None]:
# Input locations: the annotated tweet CSV and the directory of concept batch CSVs
tweets_path = "/path/to/thread_annotated_tweets.csv"
concepts_dir = "/path/to/concepts"

# Output location for the files written by this notebook.
# makedirs(exist_ok=True) creates missing parent directories as well and does not
# fail if the directory already exists (no separate exists() check needed).
output_dir = "intermediate_data"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load the tweet CSV into a dataframe; column dtypes are taken from utils.dtype_spec.
# NOTE(review): presumably '\n' is forced as the line terminator so that stray '\r'
# characters inside tweet text do not split rows — confirm.
tweets = pd.read_csv(
    tweets_path,
    lineterminator='\n',
    dtype=utils.dtype_spec,
)
tweets.head()

In [None]:
# Let's look at a sample of some concepts from the first batch in the concept directory
test_concepts = pd.read_csv(os.path.join(concepts_dir, "concepts_0.csv"))
# Show up to 10 random rows; sample() raises ValueError when n exceeds the number
# of rows, so cap n for small batches.
test_concepts.sample(n=min(10, len(test_concepts)))

## English Word Frequency

The frequencies of English words are derived from the Google Web Trillion Word Corpus, and provided in a [unigram frequencies TSV file](http://norvig.com/ngrams/count_1w.txt) by Peter Norvig.

In [None]:
# Download the English language frequencies file (skipped when a local copy exists)
english_freqs_path = "english_language_frequencies.tsv"

if not os.path.exists(english_freqs_path):
    print("Downloading English language frequencies...")
    # The timeout keeps the cell from hanging forever on an unresponsive server
    resp = requests.get("http://norvig.com/ngrams/count_1w.txt", timeout=60)
    # Fail loudly on an HTTP error. Otherwise the error page would be saved as the
    # frequency file and, because the file then exists, never be downloaded again.
    resp.raise_for_status()
    with open(english_freqs_path, "wb") as file:
        file.write(resp.content)
    print("Saved.")

In [None]:
# Load the tab-separated (word, count) table and index it by word.
# keep_default_na=False is required: by default pandas parses strings such as
# "null" and "nan" as missing values, which would turn those real English words
# into NaN index entries and lose their counts.
word_counts = pd.read_csv(
    english_freqs_path,
    delimiter='\t',
    header=None,
    names=['word', 'freq'],
    keep_default_na=False,
)
word_counts = word_counts.set_index('word')
word_counts.head()

In [None]:
# Memoizes the relevance per trigger phrase so repeated triggers are only scored once
relevance_cache = {}

def trigger_relevance(row):
    """
    Computes the relevance of the trigger_word column of the given row. Relevance
    is the average of the negative log frequencies of each word in the word_counts
    dataframe. Words that are not present in the word_counts dataframe are given
    a count of 1 (i.e. they contribute a relevance of 0).

    Words are the maximal runs of word characters in the trigger; punctuation and
    whitespace only act as separators and are never counted as (empty) words.
    A trigger without any word characters is scored like a single unknown word.

    Args:
        row: a mapping or dataframe row with a "trigger_word" string entry.

    Returns:
        The relevance of the trigger. Results are memoized in relevance_cache.
    """
    trigger = row["trigger_word"]
    if trigger not in relevance_cache:
        # findall (rather than re.split on \W) avoids the empty strings produced by
        # leading, trailing or consecutive separators. Those would be scored as
        # unknown words and pull the average towards 0 (i.e. "more relevant").
        components = re.findall(r"\w+", trigger) or [trigger]
        relevance_cache[trigger] = np.mean(
            [-np.log(word_counts.freq.get(comp, 1)) for comp in components]
        )

    return relevance_cache[trigger]

### Relevance Cutoff

What threshold should we choose for relevance? Let's take a look at some example concepts and what their trigger word relevances are.

In [None]:
# Keep only the rows that carry trigger information. The explicit copy makes the
# result an independent dataframe, so adding a column below does not raise
# SettingWithCopyWarning.
filtered_concepts = test_concepts[~pd.isna(test_concepts.trigger)].copy()

# Trigger words are stored in a hyphen-delimited format with the fourth component 
# corresponding to the actual trigger. For example: ["Test"-tx-1-"test"-noun-0]
filtered_concepts["trigger_word"] = filtered_concepts["trigger"].str.extract(r"\d-\"([^\"]+)\"-")[0].str.lower()
# Drop rows whose trigger string did not match the expected format
filtered_concepts = filtered_concepts[~pd.isna(filtered_concepts.trigger_word)]

# Show how often each trigger word occurs
filtered_concepts.trigger_word.value_counts()

In [None]:
# Compute the relevance of every distinct trigger word (one representative row per
# trigger). The copy makes unique_concepts an independent dataframe, so adding the
# relevance column does not raise SettingWithCopyWarning.
unique_concepts = filtered_concepts.drop_duplicates('trigger_word').copy()
print("Computing relevance for {} concepts...".format(len(unique_concepts)))
unique_concepts["relevance"] = unique_concepts.apply(trigger_relevance, axis=1)
print("Sorting...")
# Most relevant (closest to 0) first
unique_concepts = unique_concepts.sort_values(by='relevance', ascending=False)
print("Done.")

In [None]:
# Plot the distribution of trigger relevances
plt.figure()
plt.hist(unique_concepts['relevance'], bins=np.arange(-24, 1))
plt.xlabel("Relevance")
plt.ylabel("Count")
plt.show()

# Sample the concepts in each relevance range and print a few
for relevance_range in [(-25, -20), (-20, -15), (-15, -13), (-13, -10), (-10, -1), (-1, 1)]:
    print("Concept triggers with relevances between {} and {}:".format(*relevance_range))
    in_range = unique_concepts[(unique_concepts.relevance >= relevance_range[0]) &
                               (unique_concepts.relevance < relevance_range[1])]
    # sample() raises ValueError when asked for more rows than are available, which
    # happens for sparsely populated ranges, so show at most 5 of whatever is there.
    sample = in_range.sample(n=min(5, len(in_range)))
    print(sample[['preferred_name', 'trigger_word', 'relevance']])
    print("")

As we can see above, meaningful concepts seem to start appearing at a relevance of around -13. We therefore chose -13 as the cutoff for concept relevance.

The next cell loads all the concepts from each batch, filters them by relevance, and adds them to an overall dataframe. *Note:* This cell can take a long time to run.

In [None]:
# Let's write a condensed DF with only the concepts whose relevance is at or above
# the given threshold
relevance_threshold = -13.0

# Batches are collected in a list and concatenated once at the end; concatenating
# inside the loop would re-copy all previously collected rows on every iteration.
batches = []
concept_count = 0
batch_index = 0
path = os.path.join(concepts_dir, "concepts_{}.csv".format(batch_index))

# Batch files are numbered consecutively from 0; stop at the first missing one
while os.path.exists(path):
    sub_df = pd.read_csv(path)
    print("Processing {}, {} concepts so far".format(path, concept_count))

    # Copy so that adding columns below works on an independent dataframe and
    # does not raise SettingWithCopyWarning
    filtered_concepts = sub_df[~pd.isna(sub_df.trigger)].copy()
    # The trigger word is the quoted fourth component of the hyphen-delimited
    # trigger string, e.g. ["Test"-tx-1-"test"-noun-0] -> test
    filtered_concepts["trigger_word"] = filtered_concepts["trigger"].str.extract(r"\d-\"([^\"]+)\"-")[0].str.lower()
    filtered_concepts = filtered_concepts[~pd.isna(filtered_concepts.trigger_word)]
    filtered_concepts["relevance"] = filtered_concepts.apply(trigger_relevance, axis=1)
    filtered_concepts = filtered_concepts[filtered_concepts["relevance"] >= relevance_threshold]

    batches.append(filtered_concepts)
    concept_count += len(filtered_concepts)

    batch_index += 1
    path = os.path.join(concepts_dir, "concepts_{}.csv".format(batch_index))

if not batches:
    # Without this check the cell would fail later with an opaque error on None
    raise FileNotFoundError(
        "No concept batch files (concepts_0.csv, ...) found in {}".format(concepts_dir))

df = pd.concat(batches)

# Drop cases where the same concept is extracted multiple times from the same tweet
df = df.drop_duplicates(subset=["tweet_id", "cui"])
df.to_csv(os.path.join(output_dir, "all_concepts.csv"))

In [None]:
# How did we do?
print("Extracted {} concepts with {} unique trigger words/phrases.".format(len(df), len(df.trigger_word.unique())))

# NOTE(review): this histogram plots unique_concepts (the de-duplicated triggers of
# the sample batch), not the filtered df summarized above — confirm this is intended.
plt.figure()
plt.hist(unique_concepts['relevance'], bins=np.arange(-24, 1))
plt.xlabel("Relevance")
plt.ylabel("Count")
plt.title("Relevance Distribution")
plt.show()

# Look at a random sample of concepts, one row per distinct CUI. The sample size is
# capped because sample() raises ValueError when n exceeds the number of rows.
distinct_cuis = df.drop_duplicates('cui')
distinct_cuis.sample(n=min(20, len(distinct_cuis)))