# Concept Clean-Up

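The concepts extracted by MetaMap include many spurious ones, often triggered by very common words. This step produces a condensed dataframe containing only the concepts whose trigger words appear relatively *rarely* in general English usage. Rarity is approximated by enrichment: a concept is kept only if its trigger word is used at least as often (per tweet) in the relevant (doctor) tweets as in the irrelevant (non-doctor) tweets.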

In [None]:
import pandas as pd
import os
import re
import requests
import string
import pickle
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from nltk.corpus import stopwords

import gensim
import gensim.corpora as corpora
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import strip_numeric
from gensim.models import TfidfModel
from gensim.utils import lemmatize
from gensim.utils import simple_preprocess

import utils

### Paths

To begin, update the paths below to the input and output directories on your local computer.

In [None]:
# Input locations -- edit these to point at your local copies of the data.
tweets_path = "/path/to/thread_annotated_tweets.csv"
# Directory holding the numbered batch files concepts_0.csv, concepts_1.csv, ...
concepts_dir = "/path/to/concepts"
# Must contain relevant_word_counts.pkl and irrelevant_word_counts.pkl.
word_counts_dir = "/path/to/word_counts"

# Output location. makedirs(..., exist_ok=True) also creates any missing
# parent directories and does nothing when the directory already exists.
output_dir = "/path/to/output"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Read the tweet CSV file from the location configured in `tweets_path`.
# The path is deliberately not rebuilt here: deriving it from `base_dir`
# (which this notebook never defines) raised a NameError and discarded the
# user-configured path.
tweets = pd.read_csv(tweets_path, dtype=utils.dtype_spec, lineterminator='\n')
tweets.head()

In [None]:
# Peek at ten randomly chosen rows from the first concept batch file
first_batch_path = os.path.join(concepts_dir, "concepts_0.csv")
test_concepts = pd.read_csv(first_batch_path)
test_concepts.sample(10)

# Doctor vs. Non-Doctor Relevance

In [None]:
# Load the pickled word statistics for the relevant (doctor) and irrelevant
# (non-doctor) tweet sets. Each pickle is a dict with a "tweet_count" entry
# and a "word_counts" entry (a sequence of word -> count mappings).
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# point word_counts_dir at pickles you generated yourself.
doctor_pickle_path = os.path.join(word_counts_dir, "relevant_word_counts.pkl")
with open(doctor_pickle_path, "rb") as handle:
    doctor_info = pickle.load(handle)

non_doctor_pickle_path = os.path.join(word_counts_dir, "irrelevant_word_counts.pkl")
with open(non_doctor_pickle_path, "rb") as handle:
    non_doctor_info = pickle.load(handle)

doctor_tweet_count, doctor_word_counts = doctor_info["tweet_count"], doctor_info["word_counts"]
non_doctor_tweet_count, non_doctor_word_counts = non_doctor_info["tweet_count"], non_doctor_info["word_counts"]

# relevance[word] = smoothed per-tweet frequency in doctor tweets divided by
# the smoothed per-tweet frequency in non-doctor tweets. The additive term
# keeps the ratio finite for words that never occur in non-doctor tweets.
smoothing = 1e-3
relevance = {}
# presumably each position in word_counts is one n-gram size -- TODO confirm
for level, doctor_counts in enumerate(doctor_word_counts):
    for word, doctor_freq in doctor_counts.items():
        if word.lower() not in utils.FILTER_WORDS:
            non_doctor_freq = non_doctor_word_counts[level].get(word, 0)
            doctor_rate = doctor_freq / doctor_tweet_count + smoothing
            non_doctor_rate = non_doctor_freq / non_doctor_tweet_count + smoothing
            relevance[word] = doctor_rate / non_doctor_rate

def concept_enrichment(concept_row, relevance_map=None):
    """Return the enrichment score for a concept's trigger word.

    The trigger is normalized to its word tokens joined by single spaces.
    If that normalized phrase has a score it is returned directly; otherwise
    the result is the mean score of the individual tokens that have one.

    Args:
        concept_row: Object with a string ``trigger_word`` attribute (e.g. a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
        relevance_map: Optional mapping from word/phrase to enrichment score.
            Defaults to the module-level ``relevance`` dictionary.

    Returns:
        The enrichment score, or 0.0 when neither the phrase nor any of its
        tokens has a score.
    """
    scores = relevance if relevance_map is None else relevance_map

    # Split on *runs* of non-word characters and drop empty tokens. Splitting
    # on single characters leaves empty strings behind for punctuation or
    # repeated separators ("heart, attack", "flu."), which put doubled or
    # trailing spaces into the joined phrase so it could never match a key.
    words = [word for word in re.split(r"\W+", concept_row.trigger_word) if word]
    trigger = " ".join(words)

    if trigger in scores:
        return scores[trigger]

    # Fall back to averaging the scores of the tokens we do know about.
    word_rels = [scores[word] for word in words if word in scores]
    return float(np.mean(word_rels)) if word_rels else 0.0

In [None]:
# Walk the numbered batch files (concepts_0.csv, concepts_1.csv, ...) until the
# next one is missing, keeping only the doctor-enriched concepts from each.
batch_frames = []   # filtered concepts per batch; concatenated once at the end
concept_count = 0   # running total, used only for the progress message
batch_index = 0
path = os.path.join(concepts_dir, "concepts_{}.csv".format(batch_index))

while os.path.exists(path):
    sub_df = pd.read_csv(path)
    print("Processing {}, {} concepts so far".format(path, concept_count))

    # Drop rows with no trigger information. Copy so that the column
    # assignment below writes to an independent frame instead of a slice of
    # sub_df (avoids pandas' SettingWithCopyWarning).
    filtered_concepts = sub_df[~pd.isna(sub_df.trigger)].copy()

    # Extract the trigger word (the first quoted string following "<digit>-")
    filtered_concepts["trigger_word"] = filtered_concepts["trigger"].str.extract(r"\d-\"([^\"]+)\"-")[0].str.lower()
    filtered_concepts = filtered_concepts[~pd.isna(filtered_concepts.trigger_word)]

    # Filter by semtype and exclude certain words
    filtered_concepts = utils.filter_useful_concepts(filtered_concepts)

    # Compute enrichment
    filtered_concepts["enrichment"] = filtered_concepts.apply(concept_enrichment, axis=1)

    # Filter for only concepts that are MORE enriched in doctor tweets than non-doctor tweets
    filtered_concepts = filtered_concepts[filtered_concepts["enrichment"] >= 1.0]

    batch_frames.append(filtered_concepts)
    concept_count += len(filtered_concepts)

    batch_index += 1
    path = os.path.join(concepts_dir, "concepts_{}.csv".format(batch_index))

# Concatenate once rather than inside the loop, which re-copied all earlier
# rows on every iteration. df is None when no batch file was found.
df = pd.concat(batch_frames) if batch_frames else None

In [None]:
# Keep one row per preferred_name, ordered from most to least enriched
deduplicated = df.drop_duplicates('preferred_name')
unique_concepts = deduplicated.sort_values('enrichment', ascending=False)

# Show both ends of the ranking
summary_columns = ['trigger_word', 'preferred_name', 'enrichment']
print("Most enriched:", unique_concepts.head(20)[summary_columns])
print("Least enriched:", unique_concepts.tail(20)[summary_columns])

# Histogram of enrichment scores; the trailing semicolon suppresses the
# notebook echo of plt.hist's return value
plt.hist(unique_concepts.enrichment, bins=30);

In [None]:
# Count how often each trigger word occurs among unique concepts with enrichment >= 1.0
enriched_mask = unique_concepts.enrichment >= 1.0
unique_concepts.loc[enriched_mask, "trigger_word"].value_counts()

In [None]:
# Write the filtered concepts (index included) to concepts.csv in the output directory
concepts_output_path = os.path.join(output_dir, "concepts.csv")
df.to_csv(concepts_output_path)