In [2]:
from util import *
import matplotlib.pyplot as plt
import pandas as pd
import warnings
from data_cleaning import *
from pandarallel import pandarallel
from collections import defaultdict
from itertools import chain
from nltk.tokenize import word_tokenize
import nltk
# Start pandarallel so the Series.parallel_apply calls in later cells are available.
pandarallel.initialize()

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", None)
pd.set_option("precision", 3)
%pylab inline
%config InlineBackend.figure_formats = ['retina']

Populating the interactive namespace from numpy and matplotlib


In [4]:
# Load the raw records from the project's pickle file via the read_from_pickle helper.
# NOTE(review): presumably this unpickles the file — only load pickles from a trusted source.
fn = '../data/data.pickle'
data = read_from_pickle(fn)

Read data from "../data/data.pickle"


In [5]:
# Wrap the raw records in a DataFrame, run the project's data_cleaning step on
# it, and preview the first two rows.
df = data_cleaning(pd.DataFrame(data))
df.head(2)

Unnamed: 0,title,journal,keywords,abstract,first_author,last_author,year
0,Factors Associated With Concussion Nondisclosu...,Journal of athletic training,"coach influence"", ""pressure"", ""reporting behavior",Mandated concussion education has aimed to imp...,Morgan Anderson,Erica Beidler,2021
1,Extracellular vesicle concentrations of glial ...,Scientific reports,,Traumatic brain injury (TBI) is linked to long...,Spencer Flynn,Leighton Chan,2021


## **1. Clean texts**

In [5]:
# Strip punctuation from abstracts and titles; abstracts are additionally
# passed through remove_num (titles are not).
# NOTE: both columns are overwritten in place, so re-running this cell applies
# the cleaning to already-cleaned text.
abstract_no_punct = df['abstract'].parallel_apply(remove_punctuation)
df['abstract'] = abstract_no_punct.parallel_apply(remove_num)
df['title'] = df['title'].parallel_apply(remove_punctuation)

In [6]:
# Filter abstracts and titles through get_nouns_adj (presumably keeps nouns
# and adjectives — confirm against the helper). The columns are overwritten
# in place. An earlier variant of this cell called get_nouns instead.
for text_col in ('abstract', 'title'):
    df[text_col] = df[text_col].parallel_apply(get_nouns_adj)

In [7]:
# Combine title and abstract into one text column, separated by a single space.
df['title_and_abstract'] = df['title'] + ' ' + df['abstract']

In [8]:
# Lemmatize the combined title + abstract text with paragraph_lemma, one document per call.
df['title_and_abstract_lemma'] = df['title_and_abstract'].parallel_apply(paragraph_lemma)

In [9]:
# Preview the frame, now including the combined and lemmatized text columns.
df.head(2)


Unnamed: 0,title,journal,keywords,abstract,first_author,last_author,year,title_and_abstract,title_and_abstract_lemma
0,Factors Concussion Nondisclosure Collegiate St...,Journal of athletic training,"coach influence"", ""pressure"", ""reporting behavior",concussion education student athlete knowledge...,Morgan Anderson,Erica Beidler,2021,Factors Concussion Nondisclosure Collegiate St...,factor concussion nondisclosure collegiate stu...
1,Extracellular vesicle concentrations glial fib...,Scientific reports,,Traumatic brain injury TBI long term symptoms ...,Spencer Flynn,Leighton Chan,2021,Extracellular vesicle concentrations glial fib...,extracellular vesicle concentration glial fibr...


## **2. Generate n-grams**

In [10]:
# Split the lemmatized text into one Series per publication year.
# The range covers 1991 through 2021 inclusive; rows with any other year are
# not included in data_yearly.
years = range(1991, 2022)
data_yearly = defaultdict(list)
data_yearly.update(
    (year, df.loc[df['year'] == year, 'title_and_abstract_lemma'])
    for year in years
)


In [11]:
# Tokenize each year's documents and drop the most frequent abbreviations
# (e.g. 'mtbi', 'tbi') via del_abbreviation.
words_tkn = defaultdict(list)     # year -> flat list of tokens for that year
df_words_tkn = defaultdict(list)  # year -> Series of per-document token lists
for y in years:
    tokens_by_doc = (data_yearly[y]
                     .parallel_apply(word_tokenize)
                     .parallel_apply(del_abbreviation)
                    )

    df_words_tkn[y] = tokens_by_doc

    # Flatten with chain rather than `sum(...)`. Summing lists only works
    # while `%pylab` has replaced the builtin `sum` with NumPy's, and it
    # re-copies the growing list on every addition. chain is linear and gives
    # [] for a year without documents.
    # Assumes del_abbreviation returns a list of tokens per document.
    words_tkn[y] = list(chain.from_iterable(tokens_by_doc))


In [12]:
# Pool every year's tokens into one corpus-wide list, in year order.
# chain.from_iterable replaces `sum(list(...))`, which depended on `%pylab`
# shadowing the builtin `sum` and on NumPy accepting a ragged list of lists.
# NOTE(review): n-grams built from this flat list can span the boundary
# between two consecutive documents.
all_text = list(chain.from_iterable(words_tkn.values()))

In [13]:
# Count every 4-gram in the pooled corpus and keep the 100 most frequent.
df_quagrams = pd.Series(nltk.ngrams(all_text, 4)).value_counts().head(100)

In [14]:
# Keep the 4-grams whose count is strictly above the 95th percentile of the
# top-100 counts, joined into space-separated phrases.
thresh_quagrams = np.percentile(df_quagrams.values, 95)
quagrams = [
    ' '.join(gram)
    for gram, count in df_quagrams.items()
    if count > thresh_quagrams
]
quagrams

['mild traumatic brain injury',
 'severe traumatic brain injury',
 'patient traumatic brain injury',
 'injury traumatic brain injury',
 'traumatic brain injury patient']

In [15]:
# Count every 3-gram in the pooled corpus and keep the 100 most frequent.
df_trigrams = pd.Series(nltk.ngrams(all_text, 3)).value_counts().head(100)

In [16]:
# Keep the 3-grams whose count is strictly above the 81st percentile of the
# top-100 counts, joined into space-separated phrases.
thresh_trigrams = np.percentile(df_trigrams.values, 81)
trigrams = [
    ' '.join(gram)
    for gram, count in df_trigrams.items()
    if count > thresh_trigrams
]
trigrams

['traumatic brain injury',
 'mild traumatic brain',
 'severe traumatic brain',
 'patient traumatic brain',
 'glasgow coma scale',
 'spinal cord injury',
 'brain injury patient',
 'central nervous system',
 'injury traumatic brain',
 'intensive care unit',
 'brain injury traumatic',
 'blood brain barrier',
 'coma scale score',
 'glasgow outcome scale',
 'patient severe traumatic',
 'cerebral blood flow',
 'controlled cortical impact',
 'cerebral perfusion pressure',
 'posttraumatic stress disorder']

In [17]:
# Count every 2-gram in the pooled corpus and keep the 150 most frequent.
df_bigrams = pd.Series(nltk.ngrams(all_text, 2)).value_counts().head(150)

In [18]:
# Keep the 2-grams whose count is strictly above the 58th percentile of the
# top-150 counts, joined into space-separated phrases.
thresh_bigrams = np.percentile(df_bigrams.values, 58)
bigrams = [
    ' '.join(gram)
    for gram, count in df_bigrams.items()
    if count > thresh_bigrams
]
bigrams

['brain injury',
 'traumatic brain',
 'head injury',
 'mild traumatic',
 'long term',
 'severe traumatic',
 'spinal cord',
 'post traumatic',
 'patient traumatic',
 'intracranial pressure',
 'glasgow coma',
 'patient severe',
 'injury patient',
 'risk factor',
 'coma scale',
 'injury severity',
 'post injury',
 'age year',
 'white matter',
 'nervous system',
 'brain tissue',
 'present study',
 'control group',
 'magnetic resonance',
 'cord injury',
 'head trauma',
 'injury traumatic',
 'year old',
 'sport concussion',
 'intensive care',
 'significant difference',
 'moderate severe',
 'functional outcome',
 'emergency department',
 'central nervous',
 'quality life',
 'care unit',
 'axonal injury',
 'scale score',
 'confidence interval',
 'cohort study',
 'stress disorder',
 'systematic review',
 'cortical impact',
 'logistic regression',
 'clinical trial',
 'cell death',
 'post concussion',
 'outcome patient',
 'blood flow',
 'high school',
 'animal model',
 'blood brain',
 'head impac

In [27]:
# Bundle the per-year token Series, the selected n-gram phrases, and the
# DataFrame, then persist them with the project's save_as_pickle helper.
# NOTE: 'df_original' holds `df` as it stands now, i.e. with the title and
# abstract columns already overwritten by the cleaning cells above.
dict_ngrams = dict(bigrams=bigrams, trigrams=trigrams, quagrams=quagrams)
clean_data = dict(
    df_words_tkn=df_words_tkn,
    dict_ngrams=dict_ngrams,
    df_original=df,
)
fn = '../data/data_preprocessed.pickle'
save_as_pickle(fn, clean_data)

Saved data to "../data/data_preprocessed.pickle"
