In [1]:
import pandas as pd

# Read the input CSV (path below) into a DataFrame.
file_path = './updated_filtered_scopus_modified.csv'  # edit this to point at a different input file
articles_df = pd.read_csv(filepath_or_buffer=file_path)

# Terms whose presence (case-insensitive substring match) marks a text as anesthesiology-related.
keywords = ["Anesthesiology", "Anesthesia", "Pain Management", "Perioperative Care", "Sedation", "Anesthetic Monitoring"]


def contains_keyword(text, keyword_list=None):
    """Return True when ``text`` contains at least one keyword, ignoring case.

    Parameters
    ----------
    text : object
        Value taken from a DataFrame cell. Anything that is not a ``str``
        (NaN, None, numbers, ...) is treated as "no match".
    keyword_list : iterable of str, optional
        Terms to look for. Defaults to the module-level ``keywords`` list.

    Returns
    -------
    bool
        True if any term occurs as a substring of ``text``; False otherwise.
    """
    # Missing cells arrive as float NaN / None; guarding on the type also keeps
    # other non-string cells from failing on ``.lower()``.
    if not isinstance(text, str):
        return False
    if keyword_list is None:
        keyword_list = keywords
    haystack = text.lower()  # lower-case once instead of once per keyword
    return any(term.lower() in haystack for term in keyword_list)

# Flag an article as anesthesiology-related when its title or its abstract mentions any keyword.
title_hits = articles_df['Title'].map(contains_keyword)
abstract_hits = articles_df['Abstract'].map(contains_keyword)
articles_df['Related to Anesthesiology'] = title_hits | abstract_hits

# Save the updated dataframe to a new CSV file
# updated_file_path = 'updated_scopus_with_relevance.csv'  # You can specify a different path or file name here
# articles_df.to_csv(updated_file_path, index=False)


In [3]:
# Show the full abstract stored in the row labelled 1.
articles_df.at[1, "Abstract"]

'Background: The application of data-driven methods is expected to play an increasingly important role in healthcare. However, a lack of personnel with the necessary skills to develop these models and interpret its output is preventing a wider adoption of these methods. To address this gap, we introduce and describe ORIENTATE, a software for automated application of machine learning classification algorithms by clinical practitioners lacking specific technical skills. ORIENTATE allows the selection of features and the target variable, then automatically generates a number of classification models and cross-validates them, finding the best model and evaluating it. It also implements a custom feature selection algorithm for systematic searches of the best combination of predictors for a given target variable. Finally, it outputs a comprehensive report with graphs that facilitates the explanation of the classification model results, using global interpretation methods, and an interface fo

In [5]:
# Total number of articles loaded.
articles_df.shape[0]

1504

In [9]:
# Number of articles flagged by the title/abstract keyword check.
int(articles_df['Related to Anesthesiology'].sum())

995

In [10]:
# List every column label currently on the frame.
articles_df.columns

Index(['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year',
       'Source title', 'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end',
       'Page count', 'Cited by', 'DOI', 'Link', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Author Keywords',
       'Index Keywords', 'Molecular Sequence Numbers', 'Chemicals/CAS',
       'Tradenames', 'Manufacturers', 'Funding Details', 'Funding Texts',
       'References', 'Correspondence Address', 'Editors', 'Publisher',
       'Sponsors', 'Conference name', 'Conference date', 'Conference location',
       'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID',
       'Language of Original Document', 'Abbreviated Source Title',
       'Document Type', 'Publication Stage', 'Open Access', 'Source', 'EID',
       'Related to Anesthesiology'],
      dtype='object')

In [11]:
# Preview the raw 'Author Keywords' column.
articles_df['Author Keywords']

0                                                     NaN
1       Classification; Deep sedation; Machine learnin...
2                                                     NaN
3       Bariatric surgery; Cost analysis; Cost predict...
4       channel-spatial attention; deep learning; gene...
                              ...                        
1499           Cecal intubation time; Colonoscopy; Magnet
1500    Apoptosis; Ischemia reperfusion injury; NEP1-4...
1501    anesthesiology; closed-loop; fuzzy system; hea...
1502    Adrenal function; Adrenal insufficiency; Cardi...
1503    ANZCA; Examinations; Learning approaches; Spec...
Name: Author Keywords, Length: 1504, dtype: object

In [12]:
# Count the missing values of both keyword columns in one pass.
keyword_na_counts = articles_df[['Author Keywords', 'Index Keywords']].isna().sum()
missing_author_keywords = keyword_na_counts['Author Keywords']
missing_index_keywords = keyword_na_counts['Index Keywords']

# Print the results.
print(f"'Author Keywords' column missing values: {missing_author_keywords}")
print(f"'Index Keywords' column missing values: {missing_index_keywords}")

'Author Keywords' column missing values: 338
'Index Keywords' column missing values: 140


In [13]:
# Count the rows where 'Author Keywords' and 'Index Keywords' are both missing.
both_missing_mask = articles_df[['Author Keywords', 'Index Keywords']].isna().all(axis=1)
missing_both = int(both_missing_mask.sum())

# Print the result.
print(f"Rows with missing values in both 'Author Keywords' and 'Index Keywords': {missing_both}")


Rows with missing values in both 'Author Keywords' and 'Index Keywords': 22


In [14]:
# Terms whose presence (case-insensitive substring match) marks a text as anesthesiology-related.
keywords = ["Anesthesiology", "Anesthesia", "Pain Management", "Perioperative Care", "Sedation", "Anesthetic Monitoring"]


def contains_keyword(text, keyword_list=None):
    """Return True when ``text`` contains at least one keyword, ignoring case.

    Parameters
    ----------
    text : object
        Value taken from a DataFrame cell. Anything that is not a ``str``
        (NaN, None, numbers, ...) is treated as "no match".
    keyword_list : iterable of str, optional
        Terms to look for. Defaults to the module-level ``keywords`` list.

    Returns
    -------
    bool
        True if any term occurs as a substring of ``text``; False otherwise.
    """
    # Missing cells arrive as float NaN / None; guarding on the type also keeps
    # other non-string cells from failing on ``.lower()``.
    if not isinstance(text, str):
        return False
    if keyword_list is None:
        keyword_list = keywords
    haystack = text.lower()  # lower-case once instead of once per keyword
    return any(term.lower() in haystack for term in keyword_list)

# Flag an article when its title, author keywords or index keywords mention any keyword.
articles_df['Related to Anesthesiology2'] = (
    articles_df['Title'].map(contains_keyword)
    | articles_df['Author Keywords'].map(contains_keyword)
    | articles_df['Index Keywords'].map(contains_keyword)
)


In [15]:
# Number of articles flagged by the title/keyword-column check.
int(articles_df['Related to Anesthesiology2'].sum())

1105

In [16]:
# Number of rows whose 'Abstract' cell is missing.
abstract_is_missing = articles_df['Abstract'].isnull()
missing_abstract_keywords = abstract_is_missing.sum()
print(missing_abstract_keywords)

0


In [20]:
# An article passes the combined filter only when both earlier flags are True.
articles_df['Related to Anesthesiology3'] = articles_df[
    ['Related to Anesthesiology', 'Related to Anesthesiology2']
].all(axis=1)

In [21]:
# Number of articles that pass the combined filter.
int(articles_df['Related to Anesthesiology3'].sum())

646

In [22]:
# Write the frame, including the three relevance flags, to disk without the index column.
updated_file_path = "./3level_filter.csv"
articles_df.to_csv(path_or_buf=updated_file_path, index=False)

In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string


In [29]:
# `df` is an alias, not a copy: columns added to `df` below are also added to `articles_df`.
df = articles_df
_ENGLISH_STOPWORDS = None  # filled on first use by preprocess_text
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)  # deletes every punctuation character


def preprocess_text(text):
    """Normalize free text: lower-case, strip punctuation, drop English stopwords.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is tokenized with NLTK's ``word_tokenize`` and tokens
    found in NLTK's English stopword list are discarded.

    NOTE(review): this relies on NLTK data being installed; the notebook
    downloads 'punkt' and 'stopwords' in cells that appear after this one, so
    run those first on a fresh environment.

    Parameters
    ----------
    text : object
        Raw text. Non-string values (e.g. NaN from an empty CSV cell) yield ''.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    global _ENGLISH_STOPWORDS
    if not isinstance(text, str):
        return ''
    if _ENGLISH_STOPWORDS is None:
        # Load the list once and keep it as a set; fetching it for every
        # token makes preprocessing a large corpus needlessly slow.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(token for token in word_tokenize(cleaned) if token not in _ENGLISH_STOPWORDS)

# Apply the text preprocessing to the abstract, author-keyword and title columns.
df['Processed_Abstract'] = df['Abstract'].map(preprocess_text)
df['Processed_Keywords'] = df['Author Keywords'].map(
    lambda raw: preprocess_text(raw) if isinstance(raw, str) else ''  # missing keywords become ''
)
df['Processed_Title'] = df['Title'].map(preprocess_text)

In [26]:
import nltk
# Fetch NLTK's 'punkt' tokenizer package.
# NOTE(review): `word_tokenize` is called in a cell that appears earlier in this file and
# presumably needs this data; run this download first on a fresh environment.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/lxk/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [28]:
# Fetch NLTK's 'stopwords' corpus.
# NOTE(review): `stopwords.words('english')` is called in a cell that appears earlier in this
# file and presumably needs this data; run this download first on a fresh environment.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/lxk/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [30]:
# Preview the preprocessed abstracts.
df['Processed_Abstract']

0       problem implementation competencybased medical...
1       background application datadriven methods expe...
2       preoperative knowledge expected postoperative ...
3       hospitals facing difficulties predicting evalu...
4       vast amounts monitoring data obtained various ...
                              ...                        
1499    background colonoscopy considered effective me...
1500    background retina ischemia reperfusion injury ...
1501    introduction measuring ensuring adequate level...
1502    although evidence suggests benefit steroid sup...
1503    evaluating assessments impact learning often o...
Name: Processed_Abstract, Length: 1504, dtype: object

In [31]:
# Preview the preprocessed author keywords (rows without keywords hold an empty string).
df['Processed_Keywords']

0                                                        
1       classification deep sedation machine learning ...
2                                                        
3       bariatric surgery cost analysis cost predictio...
4       channelspatial attention deep learning general...
                              ...                        
1499             cecal intubation time colonoscopy magnet
1500    apoptosis ischemia reperfusion injury nep140 n...
1501    anesthesiology closedloop fuzzy system heart r...
1502    adrenal function adrenal insufficiency cardiac...
1503    anzca examinations learning approaches special...
Name: Processed_Keywords, Length: 1504, dtype: object

In [32]:
# Preview the preprocessed titles.
df['Processed_Title']

0       deep learning model automated trainee assessme...
1       orientate automated machine learning classifie...
2       development prospective validation postoperati...
3       development validation predictive model hospit...
4       multiscene mask detection based multiscale res...
                              ...                        
1499    colonoscopy magnetic control system navigate f...
1500    protection nep140 retinal cells following reti...
1501    design implementation control system reflectin...
1502    adrenal insufficiency children undergoing hear...
1503    approaches learning anzca final examination va...
Name: Processed_Title, Length: 1504, dtype: object

In [33]:
# Write the frame, now including the Processed_* columns, to disk without the index column.
df.to_csv(path_or_buf='keywords_expansion.csv', index=False)