Step 1: Heuristic-Based Filtering with Regular Expressions

In [3]:
import pandas as pd
import re

# Load data
data = pd.read_csv(r"C:\Users\varun\Downloads\collection_with_abstracts.csv")

# Define keywords
deep_learning_keywords = r"\b(deep learning|neural network|cnn|rnn|lstm|transformer|bert|mlp|gan|autoencoder)\b"
virology_keywords = r"\b(virology|virus|epidemiology|pathogen|covid|sars|influenza|outbreak)\b"

# Filter data
filtered_data = data[
    data['Abstract'].fillna("").str.contains(deep_learning_keywords, flags=re.IGNORECASE) &
    data['Abstract'].fillna("").str.contains(virology_keywords, flags=re.IGNORECASE)
]

# Display filtered count
print(f"Filtered relevant papers: {filtered_data.shape[0]}")


  data['Abstract'].fillna("").str.contains(deep_learning_keywords, flags=re.IGNORECASE) &
  data['Abstract'].fillna("").str.contains(virology_keywords, flags=re.IGNORECASE)


Filtered relevant papers: 1055


Step 2: Classification Based on Heuristic Matching

In [13]:
# Define method keywords
text_mining_keywords = r"\b(nlp|natural language processing|text analysis|bert|token)\b"
computer_vision_keywords = r"\b(image processing|cnn|segmentation|CT|MRI|X-ray)\b"

# Classification function
def classify_method(abstract):
    if re.search(text_mining_keywords, abstract, flags=re.IGNORECASE) and re.search(computer_vision_keywords, abstract, flags=re.IGNORECASE):
        return "both"
    elif re.search(text_mining_keywords, abstract, flags=re.IGNORECASE):
        return "text mining"
    elif re.search(computer_vision_keywords, abstract, flags=re.IGNORECASE):
        return "computer vision"
    else:
        return "other"

# Apply classification
filtered_data['Method Classification'] = filtered_data['Abstract'].apply(classify_method)
print(filtered_data[['Title', 'Method Classification']].head())


                                                Title Method Classification
31  MEFFGRN: Matrix enhancement and feature fusion...       computer vision
42  Automated cooling tower detection through deep...                 other
43  Multi-detector fusion and Bayesian smoothing f...                 other
52  Automatic mapping of high-risk urban areas for...                 other
67  Chest CT-based automated vertebral fracture as...       computer vision


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Method Classification'] = filtered_data['Abstract'].apply(classify_method)


Step 3: Extract Deep Learning Methods with Regular Expressions

In [16]:
# Define common method names
methods_keywords = [
    "cnn", "convolutional neural network", "rnn", "lstm", "transformer", "bert",
    "gpt", "mlp", "gan", "autoencoder", "unet", "resnet", "vgg"
]

# Compile regex
methods_pattern = r"\b(" + "|".join(methods_keywords) + r")\b"

# Extraction function
def extract_methods(abstract):
    return re.findall(methods_pattern, abstract, flags=re.IGNORECASE)

# Apply method extraction
filtered_data['Extracted Methods'] = filtered_data['Abstract'].apply(extract_methods)
print(filtered_data[['Title', 'Extracted Methods']].head())


                                                Title  \
31  MEFFGRN: Matrix enhancement and feature fusion...   
42  Automated cooling tower detection through deep...   
43  Multi-detector fusion and Bayesian smoothing f...   
52  Automatic mapping of high-risk urban areas for...   
67  Chest CT-based automated vertebral fracture as...   

                      Extracted Methods  
31  [CNN, Convolutional Neural Network]  
42                                   []  
43                                   []  
52                                   []  
67                                   []  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Extracted Methods'] = filtered_data['Abstract'].apply(extract_methods)
