In [None]:
import re

import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, pipeline

# Load dataset
def load_data(file_path):
    """Read the CSV at ``file_path`` and hand back its contents.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``pd.read_csv`` defaults.
    """
    dataset = pd.read_csv(file_path)
    return dataset

# Text Preprocessing
def preprocess_text(text):
    """Lower-case ``text`` and drop every character that is not an ASCII
    letter, a digit or whitespace.

    Anything that is not a ``str`` (``None``, NaN, numbers, ...) yields the
    empty string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z0-9\s]', '', lowered)

# Initialize the tokenizer and encoder used for embedding generation.
# Both are loaded from the same checkpoint name so they stay in sync.
_EMBEDDING_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(_EMBEDDING_CHECKPOINT)
model = AutoModel.from_pretrained(_EMBEDDING_CHECKPOINT)

# Generate embedding for a single sentence
def get_embedding(text):
    """Embed ``text`` by mean-pooling the encoder's last hidden state.

    Parameters
    ----------
    text : str
        Sentence to embed. It is tokenized with ``truncation=True``.

    Returns
    -------
    torch.Tensor
        ``last_hidden_state`` averaged over dimension 1, detached from the
        autograd graph. Presumably shaped ``(1, hidden_size)`` for a single
        string -- TODO confirm against the loaded model.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: without no_grad() every call builds an autograd graph
    # that is thrown away immediately, wasting time and memory per paper.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach()

# Filter relevant papers using embeddings and cosine similarity
def filter_relevant_papers(df, threshold=0.7):
    """Keep the rows of ``df`` whose text is close to the target topic.

    The topic vector is the mean of ``get_embedding`` over four fixed target
    phrases. For every row, ``Title`` and ``Abstract`` are cleaned with
    ``preprocess_text``, embedded separately, averaged, and compared with the
    topic vector by cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"Title"`` and ``"Abstract"`` columns.
    threshold : float, default 0.7
        Minimum cosine similarity (inclusive) for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows with their original index, columns and
        dtypes. When nothing matches the result is empty but still carries
        the columns of ``df``.
    """
    target_phrases = ["deep learning in virology ",
    "neural networks for epidemiology",
    "deep learning in epidemiology",
    "neural networks in virology",]
    target_embedding = sum(get_embedding(phrase) for phrase in target_phrases) / len(target_phrases)

    kept_positions = []
    for position, (title, abstract) in enumerate(zip(df["Title"], df["Abstract"])):
        title_embedding = get_embedding(preprocess_text(title))
        abstract_embedding = get_embedding(preprocess_text(abstract))
        combined_embedding = (title_embedding + abstract_embedding) / 2
        # Reduce to a plain Python float so the comparison below does not
        # depend on the truthiness of a one-element array.
        # Assumes each embedding is a single row (1, hidden) -- TODO confirm.
        similarity = torch.nn.functional.cosine_similarity(
            combined_embedding, target_embedding, dim=1
        ).item()
        if similarity >= threshold:
            kept_positions.append(position)

    # Positional selection keeps columns and dtypes even when no row matches;
    # rebuilding a frame from an empty list of rows would drop every column.
    return df.iloc[kept_positions].copy()


## Paper Classification

In [None]:
# Classify papers into text mining, computer vision, both, or other
def classify_papers(df, cv_threshold=0.5, nlp_threshold=0.4):
    """Label each paper as 'Computer Vision', 'Text Mining', 'both' or 'others'.

    Title and abstract are joined into a ``content`` column, scored twice by
    a zero-shot classification pipeline (once per label) through
    ``predict_labels``, and the two scores are thresholded independently.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"Title"`` and ``"Abstract"`` columns. It is modified
        in place: ``content``, ``cv``, ``nlp`` and ``Category`` are added.
    cv_threshold : float, default 0.5
        A paper counts as computer vision when its ``cv`` score is strictly
        greater than this value.
    nlp_threshold : float, default 0.4
        A paper counts as text mining when its ``nlp`` score is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns.

    Notes
    -----
    ``predict_labels`` is defined outside this cell; it presumably returns
    one score per input text -- TODO confirm.
    """
    # put title and abstract together; a missing title or abstract is treated
    # as empty text so the combined content never becomes NaN
    df['content'] = df['Title'].fillna('') + '\n' + df['Abstract'].fillna('')

    # zero shot classification pipeline
    zs_pipeline = pipeline("zero-shot-classification", model="tasksource/deberta-small-long-nli")

    # separate independent predictions for computer vision and text mining.
    df['cv'] = predict_labels(zs_pipeline, ['Computer Vision'], list(df['content']))
    df['nlp'] = predict_labels(zs_pipeline, ['Text Mining'], list(df['content']))

    # infer 'others' and 'both' based on computer vision and text mining labels
    is_cv = df['cv'] > cv_threshold
    is_nlp = df['nlp'] > nlp_threshold
    df['Category'] = 'others'
    df.loc[is_cv, 'Category'] = 'Computer Vision'
    df.loc[is_nlp, 'Category'] = 'Text Mining'
    df.loc[is_cv & is_nlp, 'Category'] = 'both'

    return df

## Method Extraction

In [None]:
# Extract deep learning methods from abstracts and title.
# Blank out missing titles/abstracts first so both columns hold text only.
df[['Title', 'Abstract']] = df[['Title', 'Abstract']].fillna('')

# List of known deep learning methods (abbreviation followed by long form).
# Every entry is a plain label that is compared by substring, so each name
# needs its own comma-separated item and must not use regex syntax.
deep_learning_methods = ['CNN', 'Convolutional Neural Networks', 'RNN', 'Recurrent Neural Networks',
                         'LSTM', 'Long Short-Term Memory', 'GAN', 'Generative Adversarial Networks',
                         'DQN', 'Deep Q-Networks', 'Reinforcement Learning',
                         'Transformer', 'Attention-based model']

# Function to identify deep learning method from text (title + abstract)
def identify_dl_method(title, abstract):
    """Return the known deep learning methods suggested by a paper's keywords.

    The title and abstract are joined with a space, the top 5 keywords are
    requested from ``kw_model.extract_keywords``, and every entry of
    ``deep_learning_methods`` that contains at least one keyword
    (case-insensitive substring test) is reported.

    Parameters
    ----------
    title, abstract : str
        Paper title and abstract. Non-string values (``None``, NaN) are
        treated as empty text instead of raising on concatenation.

    Returns
    -------
    list of str
        Matching method names in the order they appear in
        ``deep_learning_methods``, or ``['Unknown Method']`` when none match.

    Notes
    -----
    ``kw_model`` is defined outside this cell; the code only relies on
    ``extract_keywords`` returning ``(keyword, score)`` pairs.
    NOTE(review): the test is "keyword is a substring of the method name", so
    a generic keyword such as "neural" matches several methods -- confirm
    this is the intended direction.
    """
    title = title if isinstance(title, str) else ""
    abstract = abstract if isinstance(abstract, str) else ""

    # Combine title and abstract for keyword extraction
    text = title + " " + abstract
    if not text.strip():
        # Nothing to extract keywords from
        return ['Unknown Method']

    keywords = kw_model.extract_keywords(text, top_n=5)  # Extract top 5 keywords
    # Blank keywords are dropped: "" is a substring of every method name and
    # would otherwise flag all of them.
    keywords_list = [keyword.lower() for keyword, _ in keywords if keyword.strip()]

    # Search for deep learning methods in the keywords list
    identified_methods = [
        method for method in deep_learning_methods
        if any(keyword in method.lower() for keyword in keywords_list)
    ]

    return identified_methods if identified_methods else ['Unknown Method']
    
# Apply the method identification function to every (title, abstract) pair.
# A plain comprehension always yields one list per row, including for an
# empty frame, which row-wise apply does not guarantee.
df['Identified_DL_Method'] = [
    identify_dl_method(title, abstract)
    for title, abstract in zip(df['Title'], df['Abstract'])
]

# Display the DataFrame with identified deep learning methods
#print(df[['Title', 'Identified_DL_Method']])

# Each cell holds a list of method names; lists are unhashable, so they are
# exploded to one method per row before counting.
methods = subset_df['Identified_DL_Method'].explode().value_counts()


# Function to categorize based on the threshold of 0.5
def categorize(row):
    """Map a row's 'Computer Vision' and 'Text Mining' scores to a category.

    A score counts as positive when it is >= 0.5. Two positives give 'both',
    two scores below 0.5 give 'others', a positive 'Computer Vision' score
    with a 'Text Mining' score below 0.5 gives 'Computer Vision', and every
    remaining combination (including scores that are neither >= 0.5 nor
    < 0.5, such as NaN) gives 'Text Mining'.
    """
    cv_score = row['Computer Vision']
    if cv_score >= 0.5:
        tm_score = row['Text Mining']
        if tm_score >= 0.5:
            return 'both'
        return 'Computer Vision' if tm_score < 0.5 else 'Text Mining'
    if cv_score < 0.5 and row['Text Mining'] < 0.5:
        return 'others'
    return 'Text Mining'

# Label every row of df1 with categorize and store the result in 'Category'
df1['Category'] = df1.apply(categorize, axis=1)

# Count how many rows ended up in each category
category_stats = df1['Category'].value_counts()

# Report the counts under a heading
print("Category Statistics:", category_stats, sep="\n")
