### Download the data

In [None]:
!wget https://raw.githubusercontent.com/suvigyajain0101/CaseStudies/main/AdverseEventClassification/Data/AE_Data.csv

In [None]:
! pip install --quiet bertopic

In [3]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [4]:
# The download cell saves AE_Data.csv into the current working directory
# (/content on Colab), so a relative path works there and in any other environment.
df = pd.read_csv('AE_Data.csv')
df.head()

Unnamed: 0,title,abstract,label
0,antimicrobial impacts of essential oils on foo...,the antimicrobial activity of twelve essential...,0
1,purification and characterization of a cystein...,antimicrobial peptide (amp) crustin is a type ...,0
2,telavancin activity tested against gram-positi...,objectives: to reassess the activity of telava...,0
3,the in vitro antimicrobial activity of cymbopo...,background: it is well known that cymbopogon (...,0
4,screening currency notes for microbial pathoge...,fomites are a well-known source of microbial i...,0


## BERTopic - Out of the Box

In [33]:
# Term counting for the topic representations: unigrams and bigrams,
# with scikit-learn's built-in English stop-word list filtered out.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# Only the vectorizer is overridden here; no other BERTopic component is passed in.
model = BERTopic(
    language='english',
    vectorizer_model=vectorizer_model,
    verbose=True,
    calculate_probabilities=True,
)

In [6]:
# Fit the baseline model on the raw, uncleaned abstracts.
# The log below shows the stages: embeddings -> dimensionality reduction -> clustering.
topics, probs = model.fit_transform(df['abstract'])

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/130 [00:00<?, ?it/s]

2022-10-03 21:58:57,314 - BERTopic - Transformed documents to Embeddings
2022-10-03 21:59:37,079 - BERTopic - Reduced dimensionality
2022-10-03 21:59:38,495 - BERTopic - Clustered reduced embeddings


In [7]:
# Per-topic summary table; the displayed columns are Topic, Count and Name.
freq = model.get_topic_info()
# Show the first ten rows (topic -1 is listed first in the output below).
freq.head(10)

Unnamed: 0,Topic,Count,Name
0,-1,1066,-1_bacteria_strains_andotherspecies_activity
1,0,149,0_compounds_synthesized_derivatives_antibacterial
2,1,133,1_naocl_antimicrobial_microorganisms_agar
3,2,129,2____
4,3,125,3_extracts_extract_activity_plants
5,4,110,4_utis_urinary_mic90s_urinary tract
6,5,108,5_ciprofloxacin_mgl_active_fluoroquinolones
7,6,106,6_resistance_isolates_resistant_susceptibility
8,7,99,7_vaginal_women_bv_grade
9,8,84,8_lactic_yogurt_lactic acid_acid


In [8]:
# Bar-chart view of the fitted topics; the returned figure is rendered as the cell output.
model.visualize_barchart()

In [9]:
# Intertopic distance map of the fitted topics (used to spot overlapping topics).
model.visualize_topics()

In [10]:
# Hierarchy view of the fitted topics — presumably a dendrogram of topic similarity; see the BERTopic docs.
model.visualize_hierarchy()

### Data Cleaning

The out-of-the-box solution has a few evident issues:
1. Too many topics - ideally we'd like to summarize the data in fewer topics
2. Records with null text - records with no text need to be removed
3. Overlapping topics - the Intertopic Distance Map shows overlapping topics, which shouldn't be the case

To address these issues, let's start with data cleaning:
1. Lower-case the entire corpus
2. Remove stop words, plus the dataset-specific tokens `##padding##` and `ti-` / `ti -`
3. Remove punctuation
4. Lemmatize the corpus
5. Remove the records with blank text

In [11]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
# Fetch the NLTK resources used by the cleaning cell below.
nltk.download('stopwords')  # word lists behind stopwords.words('english')
nltk.download('wordnet')  # WordNet data — presumably what WordNetLemmatizer looks words up in
nltk.download('omw-1.4')  # Open Multilingual Wordnet — presumably needed by the WordNet reader; confirm for the installed NLTK version

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [12]:
# NLTK's English stop-word list; clean_text drops every token found in it.
eng_stopwords = stopwords.words('english')
# NOTE: despite the variable name, this is a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()

# Dataset-specific artefacts to strip before tokenising: the '##padding##' filler
# token and the 'ti-' / 'ti -' prefix (presumably a title-field marker — confirm
# against the raw data).
WORDS_TO_REMOVE = ['##padding##', 'ti-', 'ti -']

# Alternation pattern that clean_text hands to re.sub.
#  * Every entry is escaped, so it is always matched literally.
#  * Entries that start with a letter or digit get a negative lookbehind, so they only
#    match at the start of a word. Without it, 'ti-' also fires inside 'anti-',
#    'multi-', ... and silently mangles terms such as 'anti-inflammatory'.
joined_words_to_remove = '|'.join(
    (r'(?<!\w)' if word[:1].isalnum() else '') + re.escape(word)
    for word in WORDS_TO_REMOVE
)


def clean_text(x):
  """Normalise one raw abstract into a space-separated string of lemmatised tokens.

  Steps, in order: lower-case; turn line breaks and tabs into spaces; blank out
  the dataset-specific patterns in ``joined_words_to_remove``; replace every run
  of characters other than ASCII letters, digits and spaces with a space; drop
  tokens found in ``eng_stopwords``; lemmatise what is left.

  Args:
    x: Raw text. Anything that is not a ``str`` (for example ``None`` or the
      float NaN that pandas uses for a missing cell) is treated as having no text.

  Returns:
    The cleaned text with tokens joined by single spaces, or ``''`` when the
    input has no usable text.
  """
  # Missing values would otherwise fail on .lower(); returning '' lets callers
  # drop such records with a simple "non-empty" filter.
  if not isinstance(x, str):
    return ''

  # Lower-case first: the removal patterns are written in lower case
  text = x.lower()

  # Line breaks and tabs become plain spaces
  text = re.sub(r"[\n\r\t]", " ", text)

  # Blank out the dataset-specific artefacts
  text = re.sub(joined_words_to_remove, " ", text)

  # Any run of characters other than ASCII letters, digits or spaces becomes one space
  text = re.sub('[^0-9a-zA-Z ]+', ' ', text)

  # split() swallows repeated whitespace, so the join yields single-spaced output
  kept_tokens = [stemmer.lemmatize(word) for word in text.split() if word not in eng_stopwords]

  return ' '.join(kept_tokens)

# Run every abstract through clean_text and keep the result in a new 'clean_text' column
df['clean_text'] = df['abstract'].apply(clean_text)

In [31]:
# Keep only the documents that still contain text after cleaning, as a NumPy array
has_text = df['clean_text'].str.len() != 0
cleaned_text = df.loc[has_text, 'clean_text'].values

In [34]:
# Vectorizer for the second experiment: 1- and 2-grams, English stop words removed.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# Second BERTopic instance with the same settings as the first one;
# presumably meant to be fitted on the cleaned corpus.
model_exp2 = BERTopic(
    language='english',
    vectorizer_model=vectorizer_model,
    verbose=True,
    calculate_probabilities=True,
)

In [None]:
# Fit BERTopic model on cleaned data 
topics, probs = model.fit_transform(cleaned_text)

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

In [None]:
freq = model.get_topic_info()
freq.head(10)

In [None]:
model.visualize_barchart()

In [None]:
model.visualize_topics()