### Import libraries

In [684]:
import pandas as pd 
import numpy as np # linear algebra
import matplotlib.pyplot as plt # plotting
import seaborn as sns

from sklearn import preprocessing

import nltk
nltk.download('stopwords')
nltk.download('words')

from nltk.corpus import stopwords
set(stopwords.words('english'))

from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.tag import pos_tag, pos_tag_sents
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer

import re
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/varshagarla/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/varshagarla/nltk_data...
[nltk_data]   Package words is already up-to-date!


### Read in csv file

In [685]:
df = pd.read_csv('abstracts_keywords.csv')

In [686]:
df.head()

Unnamed: 0.1,Unnamed: 0,PaperID,Abstract
0,15,16,"Cannabis use patterns vary considerably, with ..."
1,43,44,"To date, studies have highlighted cross-sectio..."
2,161,162,"In the United States (US), rates of teenage pr..."
3,235,236,The quality of the mother-child relationship i...
4,322,323,Perinatal depression affects 21-50% of women i...


In [687]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [688]:
df.head()

Unnamed: 0,PaperID,Abstract
0,16,"Cannabis use patterns vary considerably, with ..."
1,44,"To date, studies have highlighted cross-sectio..."
2,162,"In the United States (US), rates of teenage pr..."
3,236,The quality of the mother-child relationship i...
4,323,Perinatal depression affects 21-50% of women i...


In [689]:
print(f'Dataframe contains {df.shape[0]} rows')

Dataframe contains 805 rows


In [690]:
df.dtypes

PaperID      int64
Abstract    object
dtype: object

In [691]:
# Longest abstract length - words 
max_abstract_w = df['Abstract'].str.split().str.len().max()

# Shortest abstract length - words
min_abstract_w = df['Abstract'].str.split().str.len().min()

# Mean abstract length - words
mean_abstract_w = df['Abstract'].str.split().str.len().mean()

print("Max abstract length - words: ", max_abstract_w)
print("Min abstract length - words: ", min_abstract_w)
print("Mean abstract length - words: ", mean_abstract_w)

Max abstract length - words:  1392
Min abstract length - words:  101
Mean abstract length - words:  256.15403726708075


In [692]:
df.isnull().sum()

PaperID     0
Abstract    0
dtype: int64

### Check for duplicate rows

In [693]:
df.duplicated(subset=None, keep=False).sum()

158

In [694]:
# Looking for duplicated abstracts
df.duplicated(subset=['Abstract'], keep=False).sum()

158

In [695]:
df.duplicated(subset=['PaperID', 'Abstract'], keep=False).sum()

158

In [696]:
# Drop rows where PaperID and Abstract are the same
df.drop_duplicates(subset=['PaperID', 'Abstract'], keep='last', inplace=True)

In [697]:
# Confirm that PaperID + Abstract is now unique
df.duplicated(subset=['PaperID', 'Abstract'], keep=False).sum()

0

In [698]:
df.reset_index(drop=True, inplace=True)

In [699]:
print(f'Now we have {len(df)} observations in the dataset')

Now we have 726 observations in the dataset


### Preliminary Cleaning/Preprocessing of Text

#### Make lowercase

In [700]:
df['Abstract'] = df['Abstract'].astype(str).str.lower()

#### Remove numbers and punctuation

In [701]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x) #remove numbers
punc = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x) #remove punctuation

df['Abstract'] = df['Abstract'].map(alphanumeric).map(punc)

#### Remove "\r\n" new line

In [702]:
df['Abstract'] = df['Abstract'].replace('\r\n', '')

#### Look for more characters/words to remove (potential stop words)

In [703]:
df['Abstract'][9]

'we examined whether increased risk for adolescent tobacco and marijuana problems associated with childhood adhd is explained by key intermediary influences during adolescence and differs by gender longitudinal structural equation models examined mediating effects on problems with both substances  or each substance separately  through age   peer impairment  internalizing  and adolescent adhd symptoms in two twin samples  prospectively assessed since age    n\xa0 \xa0      whether these mediators contributed beyond mediating effects of early adolescent substance use was also considered  twin difference analyses further illuminated which mediators might be potentially causal direct effects of childhood adhd on age   tobacco and marijuana problems  i e   independent of included mediators  as well as effects of adolescent adhd symptoms were significant only for females  by contrast  mediation by peer impairment  evident particularly for marijuana  was relatively stronger for males than fem

In [704]:
# words with \ in them
# ©

In [705]:
df['Abstract'][0]

'the quality of the mother child relationship in the first year of life has far reaching implications across the life course  bornstein in annu rev psychol            yet little is known about predictors of maternal bonding and emotional availability in early infancy  in this study we examined the extent to which postnatal bonding  maternal mental health  and substance use at   weeks postpartum predicted mother infant bonding  self report  and mother emotional availability  observational  at   months of age data were obtained from an australian longitudinal cohort study of pregnancy  n\u2009 \u2009    data were collected during pregnancy  at birth  and postnatally at   weeks and   months the results show strong continuity between postnatal bonding at   weeks and   months  early postpartum stress and depression were associated with bonding at   months  however  the effect did not persist after adjustment for bonding at   weeks  tobacco use at   weeks  but no other indicators of mental h

In [706]:
df['Abstract'][10]

'the hypotheses were      pregnant women with bipolar disorder  bd  have less favorable pregnancy outcomes than unaffected women  and     psychotropic treated women with bd have better outcomes than un medicated women this prospective study included   mother infant dyads  women had bd without psychotropic exposure  bd np  n\u202f \u202f    bd with psychotropic treatment  bd p  n\u202f \u202f    or neither psychotropic exposure nor major mood disorder  comp  n\u202f \u202f    maternal characteristics were completed at   weeks gestation and evaluated for associations with delivery and birth outcomes  we performed multiple regressions on infant outcomes with adjustment for maternal age  race  employment status  use of illicit drugs and pre pregnancy bmi the bp p  bp np and comp groups varied significantly on sociodemographic characteristics  women with bd were more likely to be less educated  unemployed  single  and use tobacco and illicit drugs than women in the comp group  compared to w

In [707]:
# copyright
# published
# elsevier

In [708]:
df['Abstract'][500]

'despite recent research establishing high rates of comorbidity between hoarding disorder and alcohol use disorders  aud   no studies have systemically examined the associations between symptoms of hoarding and alcohol use  moreover  no studies have explored potential mechanisms that may help to explain these relations thus  the current study was designed to examine the associations between hoarding and aud symptoms  as well as the mediating role of emotion dysregulation self report measures of hoarding  depression  anxiety  alcohol use  and emotion regulation were collected in a sample of   adults  m age        sd           female  even when controlling for depression and anxiety  aud symptoms were associated with overall hoarding severity  β      p      as well as acquiring symptoms  β      p       further  the relation between hoarding symptoms and aud symptoms was mediated by difficulties regulating emotions  b         ci           conclusions importance  results of the current stu

In [709]:
# β

In [710]:
df['Abstract'][400]

'despite the well established association between problem gambling and adhd core categories of impulsivity hyperactivity and inattention  the link between parents  problem gambling and impulsivity hyperactivity inattention  ih i  behaviors in children has not been investigated  this study investigated the association between parents  problem gambling and children s ih i behaviors while controlling for potential confounding variables  a population based prospective cohort followed up from kindergarten to age    the quebec longitudinal study of kindergarten children  qlskc   provided data over three generations  among   participants at age    parents with a child aged  \xa0year or older  n\xa0 \xa0   mean age\xa0 \xa0   \xa0years  sd\xa0 \xa0     were selected  generalized linear models included measures of grandparents  and parents  problem gambling  parents  ih i behaviors in childhood  and a host of risk factors and comorbidities to predict ih i in children  intergenerational bivariat

In [711]:
df['Abstract'][300]

'while risk of premature death is most pronounced among persons with severe mental illness  also milder conditions are associated with increased all cause mortality  we examined non psychotic mental  npm  disorders and specific causes of natural death in a cohort of late adolescent men followed for up to   years prospective cohort study of swedish males  n        who took part in structured conscription interviews          men were diagnosed with npm disorders at or prior to conscription  median follow up time was   years  hrs for cause specific mortality were calculated using cox proportional hazards models risks in fully adjusted models were particularly elevated for death by infectious diseases  depressive and neurotic adjustment disorders  hr        \u2009ci     to       personality disorders  hr        \u2009ci     to      and alcohol related and other substance use disorders  hr        \u2009ci     to       as well as by gastrointestinal causes  depressive and neurotic adjustment

In [712]:
# author

In [713]:
df['Abstract'][50]

'waterpipe smoking is addictive and its use is increasing globally among youth  yet little is known about the factors associated with nicotine dependence  nd  among waterpipe smokers  we investigated the factors associated with nd symptoms among a sample of lebanese adolescents who smoke a waterpipe we collected data on factors potentially associated with nd  individual  socio demographic  environmental  smoking patterns  among   current  past    waterpipe smokers recruited from   and   school grades in lebanon  we assessed the loss of autonomy over tobacco using the hooked on nicotine checklist  honc   nd using the international classification of diseases    revision  icd     and the number of nd symptoms endorsed depressive symptoms  lower self esteem  and having at least one sibling who smokes a waterpipe were associated with the presence of nd symptoms  while enrollment in public schools  smoking a waterpipe ≥  per session  and believing that cigarette smoking is harmful to health 

In [714]:
# warranted
# ireland
# ltd
# rights
# reserved

In [715]:
df['Abstract'][49]

'community health workers  chw  may be effective in the delivery of tobacco dependence treatment with underserved groups  this study evaluated two evidence based chw models of treatment  it was hypothesized that smokers assigned to a chw face to face condition would have higher abstinence at   month posttreatment than smokers enrolled in chw referral to a state sponsored quitline condition  intrapersonal and treatment related factors associated with abstinence at   months were determined a group randomized trial was conducted with residents of   ohio appalachian counties with counties  n      randomized to either a chw face to face     or chw quitline  ql  condition  both conditions included behavioral counseling and free nicotine replacement therapy for   weeks  follow up data were collected at         and   month posttreatment  biochemically validated abstinence at   months served as the primary outcome seven hundred and seven participants were enrolled  n        n     chwql   baseli

In [716]:
# research
# ih

In [717]:
df['Abstract'][10]

'the hypotheses were      pregnant women with bipolar disorder  bd  have less favorable pregnancy outcomes than unaffected women  and     psychotropic treated women with bd have better outcomes than un medicated women this prospective study included   mother infant dyads  women had bd without psychotropic exposure  bd np  n\u202f \u202f    bd with psychotropic treatment  bd p  n\u202f \u202f    or neither psychotropic exposure nor major mood disorder  comp  n\u202f \u202f    maternal characteristics were completed at   weeks gestation and evaluated for associations with delivery and birth outcomes  we performed multiple regressions on infant outcomes with adjustment for maternal age  race  employment status  use of illicit drugs and pre pregnancy bmi the bp p  bp np and comp groups varied significantly on sociodemographic characteristics  women with bd were more likely to be less educated  unemployed  single  and use tobacco and illicit drugs than women in the comp group  compared to w

#### Remove instances of \u202f

In [718]:
df['Abstract'] = df['Abstract'].str.replace('\u202f', '')

In [719]:
df['Abstract'][10]

'the hypotheses were      pregnant women with bipolar disorder  bd  have less favorable pregnancy outcomes than unaffected women  and     psychotropic treated women with bd have better outcomes than un medicated women this prospective study included   mother infant dyads  women had bd without psychotropic exposure  bd np  n     bd with psychotropic treatment  bd p  n     or neither psychotropic exposure nor major mood disorder  comp  n     maternal characteristics were completed at   weeks gestation and evaluated for associations with delivery and birth outcomes  we performed multiple regressions on infant outcomes with adjustment for maternal age  race  employment status  use of illicit drugs and pre pregnancy bmi the bp p  bp np and comp groups varied significantly on sociodemographic characteristics  women with bd were more likely to be less educated  unemployed  single  and use tobacco and illicit drugs than women in the comp group  compared to women with bd np  women with bd p wer

In [720]:
df['Abstract'][300]

'while risk of premature death is most pronounced among persons with severe mental illness  also milder conditions are associated with increased all cause mortality  we examined non psychotic mental  npm  disorders and specific causes of natural death in a cohort of late adolescent men followed for up to   years prospective cohort study of swedish males  n        who took part in structured conscription interviews          men were diagnosed with npm disorders at or prior to conscription  median follow up time was   years  hrs for cause specific mortality were calculated using cox proportional hazards models risks in fully adjusted models were particularly elevated for death by infectious diseases  depressive and neurotic adjustment disorders  hr        \u2009ci     to       personality disorders  hr        \u2009ci     to      and alcohol related and other substance use disorders  hr        \u2009ci     to       as well as by gastrointestinal causes  depressive and neurotic adjustment

#### Remove instances of "       \u2009ci    "

In [721]:
df['Abstract'] = df['Abstract'].str.replace('       \u2009ci    ', '')

In [722]:
df['Abstract'][300]

'while risk of premature death is most pronounced among persons with severe mental illness  also milder conditions are associated with increased all cause mortality  we examined non psychotic mental  npm  disorders and specific causes of natural death in a cohort of late adolescent men followed for up to   years prospective cohort study of swedish males  n        who took part in structured conscription interviews          men were diagnosed with npm disorders at or prior to conscription  median follow up time was   years  hrs for cause specific mortality were calculated using cox proportional hazards models risks in fully adjusted models were particularly elevated for death by infectious diseases  depressive and neurotic adjustment disorders  hr  to       personality disorders  hr  to      and alcohol related and other substance use disorders  hr  to       as well as by gastrointestinal causes  depressive and neurotic adjustment disorders  hr  to       personality disorders  hr  to   

#### Remove any words containing numbers in the middle to handle other instances of those

In [723]:
df['Abstract'] = df['Abstract'].str.replace(r'\w*\d\w*', '')

  df['Abstract'] = df['Abstract'].str.replace(r'\w*\d\w*', '')


In [724]:
df['Abstract'][10]

'the hypotheses were      pregnant women with bipolar disorder  bd  have less favorable pregnancy outcomes than unaffected women  and     psychotropic treated women with bd have better outcomes than un medicated women this prospective study included   mother infant dyads  women had bd without psychotropic exposure  bd np  n     bd with psychotropic treatment  bd p  n     or neither psychotropic exposure nor major mood disorder  comp  n     maternal characteristics were completed at   weeks gestation and evaluated for associations with delivery and birth outcomes  we performed multiple regressions on infant outcomes with adjustment for maternal age  race  employment status  use of illicit drugs and pre pregnancy bmi the bp p  bp np and comp groups varied significantly on sociodemographic characteristics  women with bd were more likely to be less educated  unemployed  single  and use tobacco and illicit drugs than women in the comp group  compared to women with bd np  women with bd p wer

In [725]:
df['Abstract'][9]

'we examined whether increased risk for adolescent tobacco and marijuana problems associated with childhood adhd is explained by key intermediary influences during adolescence and differs by gender longitudinal structural equation models examined mediating effects on problems with both substances  or each substance separately  through age   peer impairment  internalizing  and adolescent adhd symptoms in two twin samples  prospectively assessed since age    n\xa0 \xa0      whether these mediators contributed beyond mediating effects of early adolescent substance use was also considered  twin difference analyses further illuminated which mediators might be potentially causal direct effects of childhood adhd on age   tobacco and marijuana problems  i e   independent of included mediators  as well as effects of adolescent adhd symptoms were significant only for females  by contrast  mediation by peer impairment  evident particularly for marijuana  was relatively stronger for males than fem

In [726]:
df['Abstract'][400]

'despite the well established association between problem gambling and adhd core categories of impulsivity hyperactivity and inattention  the link between parents  problem gambling and impulsivity hyperactivity inattention  ih i  behaviors in children has not been investigated  this study investigated the association between parents  problem gambling and children s ih i behaviors while controlling for potential confounding variables  a population based prospective cohort followed up from kindergarten to age    the quebec longitudinal study of kindergarten children  qlskc   provided data over three generations  among   participants at age    parents with a child aged  \xa0year or older  n\xa0 \xa0   mean age\xa0 \xa0   \xa0years  sd\xa0 \xa0     were selected  generalized linear models included measures of grandparents  and parents  problem gambling  parents  ih i behaviors in childhood  and a host of risk factors and comorbidities to predict ih i in children  intergenerational bivariat

#### Remove \xa0

In [727]:
df['Abstract'] = df['Abstract'].str.replace('\xa0', '')

In [728]:
df['Abstract'][400]

'despite the well established association between problem gambling and adhd core categories of impulsivity hyperactivity and inattention  the link between parents  problem gambling and impulsivity hyperactivity inattention  ih i  behaviors in children has not been investigated  this study investigated the association between parents  problem gambling and children s ih i behaviors while controlling for potential confounding variables  a population based prospective cohort followed up from kindergarten to age    the quebec longitudinal study of kindergarten children  qlskc   provided data over three generations  among   participants at age    parents with a child aged  year or older  n    mean age    years  sd      were selected  generalized linear models included measures of grandparents  and parents  problem gambling  parents  ih i behaviors in childhood  and a host of risk factors and comorbidities to predict ih i in children  intergenerational bivariate associations were observed bet

In [729]:
df['Abstract'][35]

'pediatric guidelines recommend that providers address a range of parental health issues  however  adherence to these guidelines has been suboptimal  drawing on a nationally representative sample of children s primary care physicians  we examined whether providers view parental issues as relevant to child health and whether they believe it is their personal responsibility to address them  issues included maternal depression  tobacco use  intimate partner violence  tdap  tetanus  diphtheria  and acellular pertussis  immunization  family planning  and health insurance  while the majority of respondents endorsed the relevance of these issues to child health  particularly for issues with an established evidencebase  significantly fewer felt responsible for addressing them  physicians who endorsed relevance or responsibility were almost always more likely to address these issues in their clinical practice  to advance parental health promotion practices  highlighting relevance to pediatric o

In [730]:
df.head()

Unnamed: 0,PaperID,Abstract
0,236,the quality of the mother child relationship i...
1,323,perinatal depression affects of women in ...
2,874,sexual minority sm young adults such as tho...
3,1098,the increasingly female face of chronic obstru...
4,1427,the prevalence of korean adolescents with depr...


#### Tokenize words to do additional preprocessing

In [731]:
df['Abstract_tokenized'] = df['Abstract'].apply(word_tokenize)

In [732]:
# Check for tokenization
df['Abstract_tokenized'][0]

['the',
 'quality',
 'of',
 'the',
 'mother',
 'child',
 'relationship',
 'in',
 'the',
 'first',
 'year',
 'of',
 'life',
 'has',
 'far',
 'reaching',
 'implications',
 'across',
 'the',
 'life',
 'course',
 'bornstein',
 'in',
 'annu',
 'rev',
 'psychol',
 'yet',
 'little',
 'is',
 'known',
 'about',
 'predictors',
 'of',
 'maternal',
 'bonding',
 'and',
 'emotional',
 'availability',
 'in',
 'early',
 'infancy',
 'in',
 'this',
 'study',
 'we',
 'examined',
 'the',
 'extent',
 'to',
 'which',
 'postnatal',
 'bonding',
 'maternal',
 'mental',
 'health',
 'and',
 'substance',
 'use',
 'at',
 'weeks',
 'postpartum',
 'predicted',
 'mother',
 'infant',
 'bonding',
 'self',
 'report',
 'and',
 'mother',
 'emotional',
 'availability',
 'observational',
 'at',
 'months',
 'of',
 'age',
 'data',
 'were',
 'obtained',
 'from',
 'an',
 'australian',
 'longitudinal',
 'cohort',
 'study',
 'of',
 'pregnancy',
 'n',
 'data',
 'were',
 'collected',
 'during',
 'pregnancy',
 'at',
 'birth',
 'and'

In [734]:
len(df['Abstract_tokenized'][0])

221

#### Remove english and custom stopwords

In [735]:
#Add words to stopwords list:

removethese=stopwords.words('english')
words = ['©', 'copyright', 'published', 'elsevier', 'β', 'author',
        'warranted', 'ireland', 'ltd', 'rights', 'reserved', 'research',
        'ih', 'hr', 'n', 'e', 'illuminated', 'particularly', 'may',
        'also', 'findings', 'b', 'v', 'study', 'c', 'apa']

words_2  = ['©', 'copyright', 'published', 'elsevier', 'β', 'author',
        'warranted', 'ireland', 'ltd', 'rights', 'reserved', 'research',
        'ih', 'hr', 'n', 'e', 'illuminated', 'particularly', 'may',
        'also', 'findings', 'b', 'v', 'study', 'c', 'apa', 'among' 'associate', 'association', 'logistic',
        'regression', 'cross', 'sectional', 'odds', 'analysis', 'examine', 'include', 'relate', 'sample',
           'collect', 'likely', 'significant', 'suggest']

words_3 = ['©', 'copyright', 'published', 'elsevier', 'β', 'author',
        'warranted', 'ireland', 'ltd', 'rights', 'reserved', 'research',
        'ih', 'hr', 'n', 'e', 'illuminated', 'particularly', 'may',
        'also', 'findings', 'b', 'v', 'study', 'c', 'apa', 'among' 'associate', 'odds',
           'analysis', 'examine', 'include', 'relate', 'sample', 'collect', 'likely', 'significant']

words_4  = ['©', 'copyright', 'published', 'elsevier', 'β', 'author',
        'warranted', 'ireland', 'ltd', 'rights', 'reserved', 'research',
        'ih', 'hr', 'n', 'e', 'illuminated', 'particularly', 'may',
        'also', 'findings', 'b', 'v', 'study', 'c', 'apa', 'among' 'associate', 'association', 'logistic',
        'regression', 'cross', 'sectional', 'odds', 'analysis', 'examine', 'include', 'relate', 'sample',
           'collect', 'likely', 'significant', 'suggest', 'permit', 'employer', 'commercial', 'unless', 'text',
            'article']

for i in words_4:
    removethese.append(i)
print(removethese)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

We will probably have to add more stopwords later

In [736]:
# Remove stop words:
df['Abstract_tokenized'] = df['Abstract_tokenized'].apply(lambda x: [item for item in x if item not in removethese])

In [737]:
# Check
df['Abstract_tokenized'][0]

# no longer has words like "is" and "a"

['quality',
 'mother',
 'child',
 'relationship',
 'first',
 'year',
 'life',
 'far',
 'reaching',
 'implications',
 'across',
 'life',
 'course',
 'bornstein',
 'annu',
 'rev',
 'psychol',
 'yet',
 'little',
 'known',
 'predictors',
 'maternal',
 'bonding',
 'emotional',
 'availability',
 'early',
 'infancy',
 'examined',
 'extent',
 'postnatal',
 'bonding',
 'maternal',
 'mental',
 'health',
 'substance',
 'use',
 'weeks',
 'postpartum',
 'predicted',
 'mother',
 'infant',
 'bonding',
 'self',
 'report',
 'mother',
 'emotional',
 'availability',
 'observational',
 'months',
 'age',
 'data',
 'obtained',
 'australian',
 'longitudinal',
 'cohort',
 'pregnancy',
 'data',
 'collected',
 'pregnancy',
 'birth',
 'postnatally',
 'weeks',
 'months',
 'results',
 'show',
 'strong',
 'continuity',
 'postnatal',
 'bonding',
 'weeks',
 'months',
 'early',
 'postpartum',
 'stress',
 'depression',
 'associated',
 'bonding',
 'months',
 'however',
 'effect',
 'persist',
 'adjustment',
 'bonding',
 

In [738]:
df['Abstract_tokenized'][9]

['examined',
 'whether',
 'increased',
 'risk',
 'adolescent',
 'tobacco',
 'marijuana',
 'problems',
 'associated',
 'childhood',
 'adhd',
 'explained',
 'key',
 'intermediary',
 'influences',
 'adolescence',
 'differs',
 'gender',
 'longitudinal',
 'structural',
 'equation',
 'models',
 'examined',
 'mediating',
 'effects',
 'problems',
 'substances',
 'substance',
 'separately',
 'age',
 'peer',
 'impairment',
 'internalizing',
 'adolescent',
 'adhd',
 'symptoms',
 'two',
 'twin',
 'samples',
 'prospectively',
 'assessed',
 'since',
 'age',
 'whether',
 'mediators',
 'contributed',
 'beyond',
 'mediating',
 'effects',
 'early',
 'adolescent',
 'substance',
 'use',
 'considered',
 'twin',
 'difference',
 'analyses',
 'mediators',
 'might',
 'potentially',
 'causal',
 'direct',
 'effects',
 'childhood',
 'adhd',
 'age',
 'tobacco',
 'marijuana',
 'problems',
 'independent',
 'included',
 'mediators',
 'well',
 'effects',
 'adolescent',
 'adhd',
 'symptoms',
 'females',
 'contrast',
 '

#### Remove non-english words

In [739]:
words = set(nltk.corpus.words.words())  #set of English words 

df['Abstract_tokenized']=df['Abstract_tokenized'].apply(lambda x: [item for item in x if item in words])

#### Stemming

In [740]:
# def stem_text(text):
#     doc_stemmed = []
#     stemmer = LancasterStemmer()
    
#     for word in text:
#         doc_stemmed.append(stemmer.stem(word))
        
#     return doc_stemmed

In [741]:
# df['Abstract_stemmed'] = df['Abstract_tokenized'].apply(stem_text)

In [742]:
# check
# df.head()

In [743]:
# check
# df['Abstract_stemmed'][2]

In [744]:
# The stemming is very aggressive and makes the words hard to interpret especially since many are scientific
# and the subject matters of all abstracts are very different

# for now, don't do stemming. we can try manual stemming later for specific words such as associate and association
# using str.replace

#### Observe parts of speech

In [745]:
#tagging words with parts of speech 
df['Abstract_tagged'] = pos_tag_sents(df['Abstract_tokenized'].tolist())

In [746]:
#inspect tagged words for one abstract
df['Abstract_tagged'][1]

[('depression', 'NN'),
 ('south', 'VBZ'),
 ('health', 'NN'),
 ('depressive', 'JJ'),
 ('change', 'NN'),
 ('time', 'NN'),
 ('well', 'RB'),
 ('period', 'NN'),
 ('low', 'JJ'),
 ('middle', 'JJ'),
 ('income', 'NN'),
 ('data', 'NNS'),
 ('enrolled', 'VBD'),
 ('population', 'NN'),
 ('based', 'VBN'),
 ('birth', 'NN'),
 ('cohort', 'NN'),
 ('south', 'VBD'),
 ('least', 'JJS'),
 ('depression', 'JJ'),
 ('pregnancy', 'NN'),
 ('depressive', 'NN'),
 ('measured', 'VBD'),
 ('continuously', 'RB'),
 ('postnatal', 'JJ'),
 ('depression', 'NN'),
 ('scale', 'NN'),
 ('group', 'NN'),
 ('based', 'VBN'),
 ('trajectory', 'NN'),
 ('used', 'VBN'),
 ('estimate', 'JJ'),
 ('depressive', 'JJ'),
 ('period', 'NN'),
 ('multinomial', 'JJ'),
 ('identify', 'JJ'),
 ('trajectory', 'NN'),
 ('group', 'NN'),
 ('membership', 'NN'),
 ('five', 'CD'),
 ('distinct', 'JJ'),
 ('trajectory', 'NN'),
 ('depressive', 'JJ'),
 ('moderate', 'JJ'),
 ('depressive', 'JJ'),
 ('pregnancy', 'NN'),
 ('minimal', 'JJ'),
 ('minimal', 'JJ'),
 ('pregnancy', 

In [747]:
df['Abstract_tagged'][1]

[('depression', 'NN'),
 ('south', 'VBZ'),
 ('health', 'NN'),
 ('depressive', 'JJ'),
 ('change', 'NN'),
 ('time', 'NN'),
 ('well', 'RB'),
 ('period', 'NN'),
 ('low', 'JJ'),
 ('middle', 'JJ'),
 ('income', 'NN'),
 ('data', 'NNS'),
 ('enrolled', 'VBD'),
 ('population', 'NN'),
 ('based', 'VBN'),
 ('birth', 'NN'),
 ('cohort', 'NN'),
 ('south', 'VBD'),
 ('least', 'JJS'),
 ('depression', 'JJ'),
 ('pregnancy', 'NN'),
 ('depressive', 'NN'),
 ('measured', 'VBD'),
 ('continuously', 'RB'),
 ('postnatal', 'JJ'),
 ('depression', 'NN'),
 ('scale', 'NN'),
 ('group', 'NN'),
 ('based', 'VBN'),
 ('trajectory', 'NN'),
 ('used', 'VBN'),
 ('estimate', 'JJ'),
 ('depressive', 'JJ'),
 ('period', 'NN'),
 ('multinomial', 'JJ'),
 ('identify', 'JJ'),
 ('trajectory', 'NN'),
 ('group', 'NN'),
 ('membership', 'NN'),
 ('five', 'CD'),
 ('distinct', 'JJ'),
 ('trajectory', 'NN'),
 ('depressive', 'JJ'),
 ('moderate', 'JJ'),
 ('depressive', 'JJ'),
 ('pregnancy', 'NN'),
 ('minimal', 'JJ'),
 ('minimal', 'JJ'),
 ('pregnancy', 

In [748]:
df['Abstract_tagged'][2]

[('sexual', 'JJ'),
 ('minority', 'NN'),
 ('young', 'JJ'),
 ('identify', 'VB'),
 ('gay', 'JJ'),
 ('bisexual', 'JJ'),
 ('well', 'RB'),
 ('smoking', 'VBG'),
 ('heterosexual', 'JJ'),
 ('young', 'JJ'),
 ('however', 'RB'),
 ('simultaneously', 'RB'),
 ('tested', 'VBN'),
 ('role', 'NN'),
 ('three', 'CD'),
 ('risk', 'NN'),
 ('depressive', 'JJ'),
 ('tobacco', 'NN'),
 ('marketing', 'NN'),
 ('cigarette', 'NN'),
 ('related', 'VBN'),
 ('social', 'JJ'),
 ('explain', 'NN'),
 ('tobacco', 'NN'),
 ('use', 'VBP'),
 ('longitudinal', 'JJ'),
 ('structural', 'JJ'),
 ('equation', 'NN'),
 ('modeling', 'VBG'),
 ('used', 'VBN'),
 ('explore', 'NN'),
 ('identity', 'NN'),
 ('past', 'IN'),
 ('day', 'NN'),
 ('cigarette', 'NN'),
 ('smoking', 'VBG'),
 ('one', 'CD'),
 ('year', 'NN'),
 ('later', 'RB'),
 ('three', 'CD'),
 ('risk', 'NN'),
 ('starting', 'VBG'),
 ('fall', 'NN'),
 ('three', 'CD'),
 ('every', 'DT'),
 ('young', 'JJ'),
 ('adult', 'NN'),
 ('college', 'NN'),
 ('assessed', 'VBD'),
 ('identity', 'NN'),
 ('depressive'

In [749]:
df['Abstract_tagged'][3]

[('increasingly', 'RB'),
 ('female', 'JJ'),
 ('face', 'NN'),
 ('chronic', 'JJ'),
 ('obstructive', 'JJ'),
 ('pulmonary', 'JJ'),
 ('disease', 'NN'),
 ('prevalence', 'NN'),
 ('among', 'IN'),
 ('men', 'NNS'),
 ('since', 'IN'),
 ('due', 'JJ'),
 ('part', 'NN'),
 ('tobacco', 'NN'),
 ('use', 'NN'),
 ('among', 'IN'),
 ('exposure', 'NN'),
 ('finding', 'VBG'),
 ('number', 'NN'),
 ('evidence', 'NN'),
 ('susceptibility', 'NN'),
 ('smoking', 'NN'),
 ('along', 'IN'),
 ('epidemiological', 'JJ'),
 ('phenotypic', 'NN'),
 ('thus', 'RB'),
 ('become', 'VB'),
 ('leading', 'JJ'),
 ('cause', 'NN'),
 ('death', 'NN'),
 ('clinical', 'JJ'),
 ('presentation', 'NN'),
 ('increasingly', 'RB'),
 ('pronounced', 'VBD'),
 ('marked', 'JJ'),
 ('tendency', 'NN'),
 ('towards', 'NNS'),
 ('anxiety', 'NN'),
 ('depression', 'NN'),
 ('undernutrition', 'NN'),
 ('cell', 'NN'),
 ('lung', 'NN'),
 ('cancer', 'NN'),
 ('especially', 'RB'),
 ('adenocarcinoma', 'JJ'),
 ('osteoporosis', 'NN'),
 ('quality', 'NN'),
 ('life', 'NN'),
 ('signif

In [750]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

While there are not many, it appears that adverbs ('RB') do not add informational value

#### Remove adverbs

In [751]:
#creating a mini function to remove proper nouns
def remove_adv(tagged_text):
    return [(word, tag) for word,tag in tagged_text if tag != 'RB'] #keeping tag for lemmatizing

In [752]:
#applying the remove adverb function
df['Abstract_tagged_no_adv'] = df['Abstract_tagged'].apply(remove_adv)

#### Lemmatizing

In [753]:
#creating a function to convert post_tags to WordNet friendly tags, then lemmatizing
def lemmatize_all(tagged_text):
    doc_lemm = []
    wnl = WordNetLemmatizer()
    for word, tag in (tagged_text):
        if tag.startswith("NN"):
            doc_lemm.append(wnl.lemmatize(word, wordnet.NOUN))
        elif tag.startswith('VB'):
            doc_lemm.append(wnl.lemmatize(word, wordnet.VERB))
        elif tag.startswith('JJ'):
            doc_lemm.append(wnl.lemmatize(word, wordnet.ADJ))
        else:
            doc_lemm.append(word)
    return doc_lemm

In [754]:
#applying the lemmatization function
df['Abstract_lemm'] = df['Abstract_tagged_no_adv'].apply(lemmatize_all)

In [755]:
#inspect one document to see if adverbs (and tags) have been removed
df['Abstract_lemm'][3]

['female',
 'face',
 'chronic',
 'obstructive',
 'pulmonary',
 'disease',
 'prevalence',
 'among',
 'men',
 'since',
 'due',
 'part',
 'tobacco',
 'use',
 'among',
 'exposure',
 'find',
 'number',
 'evidence',
 'susceptibility',
 'smoking',
 'along',
 'epidemiological',
 'phenotypic',
 'become',
 'leading',
 'cause',
 'death',
 'clinical',
 'presentation',
 'pronounce',
 'marked',
 'tendency',
 'towards',
 'anxiety',
 'depression',
 'undernutrition',
 'cell',
 'lung',
 'cancer',
 'adenocarcinoma',
 'osteoporosis',
 'quality',
 'life',
 'impact',
 'advanced',
 'explain',
 'involve',
 'role',
 'gas',
 'exchange',
 'smoking',
 'require',
 'appropriate',
 'therapeutic',
 'smoking',
 'cessation',
 'pulmonary',
 'rehabilitation',
 'long',
 'term',
 'oxygen',
 'therapy',
 'treatment',
 'great',
 'diagnosis',
 'men',
 'spirometry',
 'medical',
 'face',
 'serious',
 'public',
 'health',
 'problem',
 'need',
 'update',
 'adapt',
 'knowledge',
 'epidemiological']

### Final cleaning before modeling

In [756]:
#inspect dataframe one last time
df.head()

Unnamed: 0,PaperID,Abstract,Abstract_tokenized,Abstract_tagged,Abstract_tagged_no_adv,Abstract_lemm
0,236,the quality of the mother child relationship i...,"[quality, mother, child, relationship, first, ...","[(quality, NN), (mother, CC), (child, NN), (re...","[(quality, NN), (mother, CC), (child, NN), (re...","[quality, mother, child, relationship, first, ..."
1,323,perinatal depression affects of women in ...,"[depression, south, health, depressive, change...","[(depression, NN), (south, VBZ), (health, NN),...","[(depression, NN), (south, VBZ), (health, NN),...","[depression, south, health, depressive, change..."
2,874,sexual minority sm young adults such as tho...,"[sexual, minority, young, identify, gay, bisex...","[(sexual, JJ), (minority, NN), (young, JJ), (i...","[(sexual, JJ), (minority, NN), (young, JJ), (i...","[sexual, minority, young, identify, gay, bisex..."
3,1098,the increasingly female face of chronic obstru...,"[increasingly, female, face, chronic, obstruct...","[(increasingly, RB), (female, JJ), (face, NN),...","[(female, JJ), (face, NN), (chronic, JJ), (obs...","[female, face, chronic, obstructive, pulmonary..."
4,1427,the prevalence of korean adolescents with depr...,"[prevalence, depression, suicide, rate, among,...","[(prevalence, NN), (depression, NN), (suicide,...","[(prevalence, NN), (depression, NN), (suicide,...","[prevalence, depression, suicide, rate, among,..."


In [757]:
df.duplicated(subset=['PaperID', 'Abstract'], keep=False).sum()

0

In [758]:
#save df as csv just in case
#df.to_csv('abstracts_preprocessed.csv', index=False)

In [759]:
#drop redundant columns
df.drop(['Abstract_tokenized', 'Abstract_tagged', 'Abstract_tagged_no_adv'],
       axis=1,
       inplace=True)

In [760]:
#rename abstract lemm column
df.rename(columns={'Abstract_lemm':'Abstract_modeling'}, inplace=True)

In [761]:
df.head()

Unnamed: 0,PaperID,Abstract,Abstract_modeling
0,236,the quality of the mother child relationship i...,"[quality, mother, child, relationship, first, ..."
1,323,perinatal depression affects of women in ...,"[depression, south, health, depressive, change..."
2,874,sexual minority sm young adults such as tho...,"[sexual, minority, young, identify, gay, bisex..."
3,1098,the increasingly female face of chronic obstru...,"[female, face, chronic, obstructive, pulmonary..."
4,1427,the prevalence of korean adolescents with depr...,"[prevalence, depression, suicide, rate, among,..."


### Save as CSV for modeling

In [762]:
df.to_csv('abstracts_for_modeling_4.csv', index=False)