In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import glob
from tqdm import tqdm

# Importing drive method from colab for accessing google drive
from google.colab import drive

import re
import string

import spacy
import spacy.cli
from spacy.lang.en.stop_words import STOP_WORDS

import nltk
nltk.download('punkt') 

import textblob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams

from wordcloud import WordCloud
from collections import defaultdict
from collections import  Counter
from sklearn.feature_extraction.text import CountVectorizer


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# This requires authentication: follow the steps shown when the cell runs.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List the contents of the project folder on the mounted drive
# (the CSV inputs read below live in this folder).
!ls "/content/drive/My Drive/ML_projects/NLP_DRUG/"

DATA_NEG0.csv	   DATA_NEG5000.csv  DATA_POS1000.csv
DATA_NEG10000.csv  DATA_NEG6000.csv  encoder.pickle
DATA_NEG1000.csv   DATA_NEG7000.csv  NLP_classification_sentence
DATA_NEG2000.csv   DATA_NEG8000.csv  NLP_Drug_model.ipynb
DATA_NEG3000.csv   DATA_NEG9000.csv  NLP_Drug_NER_CRF.pickle
DATA_NEG4000.csv   DATA_POS0.csv     tokenizer.pickle


In [None]:
# Collect every CSV export in the project folder. glob returns paths in an
# arbitrary, filesystem-dependent order, so sort them (descending, keeping the
# earlier "reversed" intent) to make the concatenation reproducible.
data_files = sorted(
    glob.glob("/content/drive/My Drive/ML_projects/NLP_DRUG/*.csv"),
    reverse=True,
)

# ignore_index=True gives every row a unique label; without it each file
# contributes its own run of labels and label-based operations (e.g. drop)
# would hit rows from several files at once.
df = pd.concat((pd.read_csv(f) for f in data_files), ignore_index=True)

# Drop the first column of the files.
# NOTE(review): presumably the row index that was saved with each CSV - confirm.
df = df.drop(columns=df.columns[0])
df.head()

Unnamed: 0,articleID,publication_date,title,abstract
0,2871696,1986-02-01,Long-term neuroleptic therapy in chronic anore...,A patient is described with a 25-year history ...
1,7560456,1995-08-01,Bilateral inguinal scrofuloderma during steroi...,This report described a case of scrofuloderma ...
2,16053946,2005-08-02,Report of a patient with severe transfusion-re...,To report a patient with a large gastrointesti...
3,8667442,1996-05-01,"Milk-alkali syndrome induced by 1,25(OH)2D in ...",Milk-alkali syndrome was first described 70 ye...
4,19822103,2009-10-14,Beneficial effects of telmisartan in an HIV+ d...,In HIV-infected patients with metabolic disord...


In [None]:
# Placeholder rows hold the literal text "[]" in every field, so
# len(str(articleID)) == 2 identifies them.
df['check_ID'] = df['articleID'].apply(lambda x: len(str(x)))

print('drop empty rows')
print(min(df['check_ID']))
print(df[df['check_ID']==min(df['check_ID'])])

# Filter with a boolean mask rather than df.drop(<index labels>): the frame is a
# concatenation of several files, so row labels can repeat, and dropping by
# label would also remove valid rows that merely share a label with a
# placeholder row.
df = df[df['check_ID'] != 2]

print('check if dataframe drops the line with article_ID = []')
print(min(df['check_ID']))
# Remove the helper column (non in-place, so no write happens on a filtered view).
df = df.drop(columns=['check_ID'])

drop empty rows
2
    articleID publication_date title abstract  check_ID
556        []               []    []       []         2
936        []               []    []       []         2
check if dataframe drops the line with article_ID = []
5


In [None]:
# Remove fully identical rows, then keep only the first row for each title,
# and finally renumber the rows from 0.
df = (
    df.drop_duplicates()
      .drop_duplicates(subset='title', keep='first')
      .reset_index(drop=True)
)
df.head()

Unnamed: 0,articleID,publication_date,title,abstract
0,2871696,1986-02-01,Long-term neuroleptic therapy in chronic anore...,A patient is described with a 25-year history ...
1,7560456,1995-08-01,Bilateral inguinal scrofuloderma during steroi...,This report described a case of scrofuloderma ...
2,16053946,2005-08-02,Report of a patient with severe transfusion-re...,To report a patient with a large gastrointesti...
3,8667442,1996-05-01,"Milk-alkali syndrome induced by 1,25(OH)2D in ...",Milk-alkali syndrome was first described 70 ye...
4,19822103,2009-10-14,Beneficial effects of telmisartan in an HIV+ d...,In HIV-infected patients with metabolic disord...


In [None]:
# Sanity check: (rows, columns) of the de-duplicated frame
df.shape
# Optional checks, left disabled: value counts per column sorted in descending order
#-np.sort(-df['articleID'].value_counts())
#-np.sort(-df['title'].value_counts())
#-np.sort(-df['abstract'].value_counts())

(2343, 4)

In [None]:
# Count the missing values in each column
df.isnull().sum()

articleID           0
publication_date    0
title               0
abstract            0
dtype: int64

### Tag label for article title

## Split abstracts into sentences

In [None]:
def remouve_special_character(text):
  """Return `text` with every listed punctuation/symbol character replaced by a space.

  Periods are not part of the character class, so sentence-ending dots and
  decimal points are left untouched.
  """
  special_chars = r"[-()\"#/@;:<>{}`+=~|!?,]"
  cleaned = re.sub(special_chars, " ", text)
  return cleaned

In [None]:
# Build a cleaned copy of every abstract; the raw text stays in 'abstract'.
df['cleaned_abtract'] = df['abstract'].map(remouve_special_character)
df.head()

Unnamed: 0,articleID,publication_date,title,abstract,cleaned_abtract
0,2871696,1986-02-01,Long-term neuroleptic therapy in chronic anore...,A patient is described with a 25-year history ...,A patient is described with a 25 year history ...
1,7560456,1995-08-01,Bilateral inguinal scrofuloderma during steroi...,This report described a case of scrofuloderma ...,This report described a case of scrofuloderma ...
2,16053946,2005-08-02,Report of a patient with severe transfusion-re...,To report a patient with a large gastrointesti...,To report a patient with a large gastrointesti...
3,8667442,1996-05-01,"Milk-alkali syndrome induced by 1,25(OH)2D in ...",Milk-alkali syndrome was first described 70 ye...,Milk alkali syndrome was first described 70 ye...
4,19822103,2009-10-14,Beneficial effects of telmisartan in an HIV+ d...,In HIV-infected patients with metabolic disord...,In HIV infected patients with metabolic disord...


In [None]:
def seperate_into_sentence(text):
  """Split `text` into a list of sentences using NLTK's sentence tokenizer."""
  sentences = nltk.sent_tokenize(text)
  return sentences

# One list of sentences per cleaned abstract
df['sentences'] = df['cleaned_abtract'].apply(seperate_into_sentence)
df.head()

Unnamed: 0,articleID,publication_date,title,abstract,cleaned_abtract,sentences
0,2871696,1986-02-01,Long-term neuroleptic therapy in chronic anore...,A patient is described with a 25-year history ...,A patient is described with a 25 year history ...,[A patient is described with a 25 year history...
1,7560456,1995-08-01,Bilateral inguinal scrofuloderma during steroi...,This report described a case of scrofuloderma ...,This report described a case of scrofuloderma ...,[This report described a case of scrofuloderma...
2,16053946,2005-08-02,Report of a patient with severe transfusion-re...,To report a patient with a large gastrointesti...,To report a patient with a large gastrointesti...,[To report a patient with a large gastrointest...
3,8667442,1996-05-01,"Milk-alkali syndrome induced by 1,25(OH)2D in ...",Milk-alkali syndrome was first described 70 ye...,Milk alkali syndrome was first described 70 ye...,[Milk alkali syndrome was first described 70 y...
4,19822103,2009-10-14,Beneficial effects of telmisartan in an HIV+ d...,In HIV-infected patients with metabolic disord...,In HIV infected patients with metabolic disord...,[In HIV infected patients with metabolic disor...


In [None]:
# Inspect the raw abstract of the row labelled 15
df.loc[15,'abstract']

'We report two cases of lost seizure control associated with the generic substitution of carbamazepine, review pertinent literature, and discuss the impact of this substitution on patient care.\r\nCase studies, abstracts, and research publications identified in MEDLINE and bibliographic review.\r\nOne author reviewed cases supplied by the other authors and abstracted information from published literature sources.\r\nThe first case describes a 15-year-old boy who received valproic acid and carbamazepine for partial seizures. A change in government program policies caused him to receive generic carbamazepine. This resulted in loss of seizure control and a decrease in his serum carbamazepine concentration from 12.4 to 6.7 micrograms/mL. When his carbamazepine concentration returned to previous levels, seizure control was not reestablished. A second case involves a 21-year-old woman who substituted generic carbamazepine because of financial problems. After being seizure-free for at least f

In [None]:
# Inspect the sentence list produced for the same row (label 15)
df.loc[15,'sentences']

['We report two cases of lost seizure control associated with the generic substitution of carbamazepine  review pertinent literature  and discuss the impact of this substitution on patient care.',
 'Case studies  abstracts  and research publications identified in MEDLINE and bibliographic review.',
 'One author reviewed cases supplied by the other authors and abstracted information from published literature sources.',
 'The first case describes a 15 year old boy who received valproic acid and carbamazepine for partial seizures.',
 'A change in government program policies caused him to receive generic carbamazepine.',
 'This resulted in loss of seizure control and a decrease in his serum carbamazepine concentration from 12.4 to 6.7 micrograms mL.',
 'When his carbamazepine concentration returned to previous levels  seizure control was not reestablished.',
 'A second case involves a 21 year old woman who substituted generic carbamazepine because of financial problems.',
 'After being sei

In [None]:
# Title rows: one per article, with the title stored under the column name 'text'.
df1 = df[['articleID', 'publication_date', 'title']].rename(columns={'title': 'text'})

In [None]:
# Sentence rows: explode each article's sentence list so every sentence gets
# its own row, stored under the column name 'text'.
df2 = (
    df[['articleID', 'publication_date', 'sentences']]
    .rename(columns={'sentences': 'text'})
    .explode('text')
)
df2.head()

Unnamed: 0,articleID,publication_date,text
0,2871696,1986-02-01,A patient is described with a 25 year history ...
0,2871696,1986-02-01,Repeated relapse occurred despite a variety of...
0,2871696,1986-02-01,On this regime together with supportive psych...
0,2871696,1986-02-01,Multiple attempts to withdraw the medication r...
0,2871696,1986-02-01,Recently the patient has exhibited signs of t...


In [None]:
# Stack the title rows on top of the sentence rows with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df_final = pd.concat([df1, df2], ignore_index=True)
df_final.shape

(16834, 3)

In [None]:
# Spot check: show all rows (title and sentences) of a single article.
# NOTE(review): the comparison is against an int; if articleID holds strings
# this selects nothing - confirm the column dtype. The disabled line below
# looks like an earlier attempt at casting the column.
#df_final['articleID'] = df_final['articleID'].astype({"articleID": int})
df_final[df_final['articleID']==15529178]

In [None]:
# Group the rows of each article together and renumber them from 0.
# A stable sort is required here: the default quicksort may reorder rows that
# share an articleID, which scrambles the sentence order inside an article,
# and the fragment-merging step further down relies on neighbouring rows
# being consecutive sentences.
df_final = (
    df_final
    .sort_values(['articleID'], kind='mergesort')
    .reset_index(drop=True)
)

# Number of whitespace-separated words in each text
df_final['len_text'] = df_final['text'].apply(lambda x: len(str(x).split()))
df_final.head()

In [None]:
# Frequencies of the text lengths, sorted ascending (only the counts are shown,
# not the lengths they belong to)
np.sort(df_final['len_text'].value_counts())

In [None]:
# Shape of the subset of rows with fewer than 4 words
df_final[df_final['len_text']<4].shape

(209, 4)

In [None]:
# Re-attach very short texts (fewer than 4 words) that are really fragments of a
# neighbouring sentence, then remove the rows that were absorbed or discarded.
#
# Rules for a short text:
#   * starts with a lowercase letter -> append it to the previous surviving row
#     and drop it;
#   * starts with an uppercase letter and the next row starts with a lowercase
#     letter -> append the next row to it and drop the next row;
#   * starts with neither (digit, symbol, ...) -> drop it.
# Merges only happen between rows of the same article.
# Assumes unique, consecutive integer row labels, so that idx - 1 / idx + 1 are
# the neighbouring rows.
idx_to_drop = []
dropped = set()  # same content as idx_to_drop, for fast membership tests

for idx in df_final[df_final['len_text']<4].index:
  if idx in dropped:
    # Already absorbed by the row before it; do not merge it a second time.
    continue

  text = df_final.loc[idx, 'text']
  if not isinstance(text, str) or not text:
    # NaN (e.g. an article without sentences) or empty string: nothing to merge.
    idx_to_drop.append(idx)
    dropped.add(idx)
    continue

  if text[0].islower():
    # Continuation of the previous sentence. Step back over rows that have
    # already been dropped so the fragment lands in a row that survives.
    prev_idx = idx - 1
    while prev_idx in dropped:
      prev_idx -= 1
    if (prev_idx in df_final.index
        and df_final.loc[prev_idx, 'articleID'] == df_final.loc[idx, 'articleID']
        and isinstance(df_final.loc[prev_idx, 'text'], str)):
      df_final.loc[prev_idx, 'text'] += str(' ') + text
      idx_to_drop.append(idx)
      dropped.add(idx)

  elif text[0].isupper():
    # Start of a sentence that was cut short: absorb the next row when it
    # begins with a lowercase letter.
    next_idx = idx + 1
    if (next_idx in df_final.index
        and df_final.loc[next_idx, 'articleID'] == df_final.loc[idx, 'articleID']):
      next_text = df_final.loc[next_idx, 'text']
      if isinstance(next_text, str) and next_text and next_text[0].islower():
        df_final.loc[idx, 'text'] = text + str(' ') + next_text
        idx_to_drop.append(next_idx)
        dropped.add(next_idx)

  else:
    idx_to_drop.append(idx)
    dropped.add(idx)

# Actually remove the absorbed / discarded rows; their text now lives in a
# neighbouring row (or was judged to be noise).
df_final = df_final.drop(index=idx_to_drop)

len(idx_to_drop)

43

In [None]:
# Refresh the word counts (texts may have been extended by the merge step),
# then count the rows that still have fewer than 3 words.
df_final['len_text'] = df_final['text'].map(lambda s: len(str(s).split()))
len(df_final.loc[df_final['len_text'] < 3])

98

In [None]:
# Show the rows whose text has exactly two words
display(df_final[df_final['len_text']==2])

Unnamed: 0,index,articleID,publication_date,text,len_text


In [None]:
# Example of a short text: the row labelled 446
df_final.loc[446, 'text']

'Tamoxifen retinopathy.'

In [None]:
# Text of the row just before the example (label 445)
df_final.loc[446-1, 'text']

'These findings suggest that the corneal and retinal changes are the result of a toxic effect of tamoxifen when used in the doses and duration described.'

In [None]:
# Text of the row just after the example (label 447)
df_final.loc[446+1, 'text']

'Tamoxifen has been used as a chemotherapeutic agent with no serious side effects noted.'

In [None]:
# Keep only texts with at least 3 words. reset_index() is called without
# drop=True, so the old row labels are kept in a new 'index' column.
df_final = df_final.loc[df_final['len_text'] >= 3].reset_index()
# Check: no row with fewer than 3 words should remain
len(df_final.loc[df_final['len_text'] < 3])

0

## Tag a label for each sentence

In [None]:
# Load ADE-POS.txt as a list with one whitespace-stripped entry per line
with open('ADE-POS.txt') as f:
    content_POS = [line.strip() for line in f]

In [None]:
# Label a text 1 when it occurs as a substring of at least one ADE-POS line,
# otherwise 0.
#
# The lines are joined into one newline-separated string so that each text needs
# a single substring search instead of a Python-level scan over every line.
# The labels are identical to the line-by-line check: the stripped lines hold
# no newline, so a text without a newline can only match inside one line, and a
# text that does contain a newline could never match a single line anyway.
ade_text = '\n'.join(content_POS)

# Assigning a list is positional, so this does not depend on the row labels
# being exactly 0..n-1.
df_final['label'] = [
    int('\n' not in text and text in ade_text)
    for text in tqdm(df_final['text'], total=len(df_final))
]

#check values of label
df_final['label'].value_counts()

100%|██████████| 20882/20882 [19:58<00:00, 17.43it/s]


0    16688
1     4194
Name: label, dtype: int64

In [None]:
# Discard the 'index' column holding the old row labels
df_final = df_final.drop(columns='index')
df_final.head()

Unnamed: 0,articleID,publication_date,text,len_text,label
0,3393,1976-03-01,Treatment of tardive dyskinesia.,4,0
1,3393,1976-03-01,An effective schema for the treatment of tardi...,25,0
2,3393,1976-03-01,The former is thought to be related to central...,24,0
3,3393,1976-03-01,The pathogenesis of tardive dyskinesia is dist...,17,0
4,18795,1977-06-01,Asthma and urticaria during disodium cromoglyc...,10,0


In [None]:
# Save the labelled texts to a CSV file in the current working directory.
# Note: index=False is not passed, so the row index is written as the first,
# unnamed column of the file.
df_final.to_csv('data_to_NLP.csv')