<a href="https://colab.research.google.com/github/toxresearch/lit_search/blob/main/pdf_key_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install PyPDF2 textract PdfReader nltk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
#GPT-3 revision
import pandas as pd
import numpy as np
import PyPDF2
import textract
import re

filename = 'Abdallah et al. 2015.pdf'

# Open the PDF file and pull the text layer out of every page.
with open(filename, 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfReader(pdfFileObj)
    num_pages = len(pdfReader.pages)

    # Collect the per-page text and join once, instead of growing one string page by page.
    page_texts = []
    for pageObj in pdfReader.pages:
        # extract_text() is the PageObject text-extraction method used with PdfReader.
        page_text = pageObj.extract_text()
        # Skip pages for which nothing was extracted (empty/None result).
        if page_text:
            page_texts.append(page_text)
    text = "".join(page_texts)

# If PyPDF2 fails to extract text, use textract as a fallback
if not text.strip():
    text = textract.process(filename, method='tesseract', language='eng').decode('utf-8')

# Clean and prepare the text
text = text.lower()  # Convert to lowercase
keywords = re.findall(r'\b\w+\b', text)  # Extract words using regex

# Create a DataFrame with unique keywords.
# sorted() makes the row order (and therefore the index labels and the order of ties
# after sorting) identical on every run; plain set iteration order is not stable
# across interpreter sessions.
df = pd.DataFrame(sorted(set(keywords)), columns=['keywords'])

# Define the weightage function
def weightage(word, text, number_of_documents=1):
    """Compute occurrence-based weights for ``word`` inside ``text``.

    Args:
        word: The keyword to look for; matched as a whole word (``\\b`` on both sides).
        text: The text to search. Callers in this notebook pass lower-cased text.
        number_of_documents: Numerator of the "idf" term (default 1).

    Returns:
        A 4-tuple ``(count, tf, idf, tf_idf)`` where
        ``count`` is the number of whole-word matches,
        ``tf`` is ``count / len(text.split())``,
        ``idf`` is ``log(number_of_documents / count)`` and
        ``tf_idf`` is ``tf * idf``.
        If the word does not occur (or the text has no words) the result is
        ``(0, 0.0, 0.0, 0.0)``.

    Note:
        The "idf" term divides by the occurrence count in *this* text, not by a
        number of documents containing the word, so with the default
        ``number_of_documents=1`` it is always <= 0 and frequent words get the
        most negative ``tf_idf``.
    """
    occurrences = re.findall(r'\b' + re.escape(word) + r'\b', text)
    number_of_times_word_appeared = len(occurrences)
    total_words = len(text.split())

    # Without this guard a word with zero matches divides by zero below.
    if number_of_times_word_appeared == 0 or total_words == 0:
        return 0, 0.0, 0.0, 0.0

    tf = number_of_times_word_appeared / float(total_words)
    idf = np.log(number_of_documents / float(number_of_times_word_appeared))
    tf_idf = tf * idf
    return number_of_times_word_appeared, tf, idf, tf_idf

# Calculate tf-idf and other metrics for each keyword.
# weightage() scans the whole text, so it is evaluated once per keyword and its
# 4-tuple is fanned out into the four columns (instead of one full scan per column).
metric_columns = ['number_of_times_word_appeared', 'tf', 'idf', 'tf_idf']
metrics = [weightage(keyword, text) for keyword in df['keywords']]
# Passing `columns=` keeps the assignment valid even when there are no keywords at all.
df[metric_columns] = pd.DataFrame(metrics, index=df.index, columns=metric_columns)

# Sort the DataFrame by tf-idf (ascending: the most negative values come first)
df = df.sort_values('tf_idf', ascending=True)

# Save the DataFrame to a CSV file
df.to_csv('out_put.csv', index=False)
print(df.head(100))


     keywords  number_of_times_word_appeared        tf       idf    tf_idf
750       the                            288  0.046340 -5.662960 -0.262419
1502       of                            223  0.035881 -5.407172 -0.194014
1628      and                            159  0.025583 -5.068904 -0.129679
697        in                            128  0.020595 -4.852030 -0.099929
712         a                            127  0.020434 -4.844187 -0.098988
...       ...                            ...       ...       ...       ...
819      each                             11  0.001770 -2.397895 -0.004244
1564   tested                             11  0.001770 -2.397895 -0.004244
416   tissues                             11  0.001770 -2.397895 -0.004244
1385  acetone                             11  0.001770 -2.397895 -0.004244
477    indoor                             11  0.001770 -2.397895 -0.004244

[100 rows x 5 columns]


In [None]:
# run batch
import os
import pandas as pd
import numpy as np
import PyPDF2
import textract
import re

# Directory containing the PDF files; process_pdf() also writes each '<name>_output.csv' here.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
folder_path = 'C:/Users/Miao.Li/python_learning/pdf scraping'

# Function to process a single PDF file
def process_pdf(filename, output_dir=None):
    """Extract keyword statistics from one PDF and save them as ``<pdf name>_output.csv``.

    The text of every page is read with PyPDF2; if that yields nothing but
    whitespace, textract (tesseract method) is used instead. The lower-cased text is
    split into ``\\w+`` tokens and, for each unique token, the occurrence count,
    ``tf``, ``idf`` and ``tf_idf`` are computed and written to CSV, sorted by
    ``tf_idf`` ascending.

    Args:
        filename: Path of the PDF file to process.
        output_dir: Directory for the CSV. Defaults to the module-level
            ``folder_path`` (the behaviour before this parameter existed).

    Returns:
        None. The result is written to disk and a one-line summary is printed.
    """
    with open(filename, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfReader(pdfFileObj)

        # Collect the per-page text and join once, instead of growing one string.
        page_texts = []
        for pageObj in pdfReader.pages:
            page_text = pageObj.extract_text()
            # Skip pages for which nothing was extracted (empty/None result).
            if page_text:
                page_texts.append(page_text)
        text = "".join(page_texts)

    # If PyPDF2 fails to extract text, use textract as a fallback
    if not text.strip():
        text = textract.process(filename, method='tesseract', language='eng').decode('utf-8')

    # Clean and prepare the text
    text = text.lower()
    keywords = re.findall(r'\b\w+\b', text)

    # Unique keywords; sorted so that row order does not depend on set iteration order.
    df = pd.DataFrame(sorted(set(keywords)), columns=['keywords'])

    def weightage(word, text, number_of_documents=1):
        """Return ``(count, tf, idf, tf_idf)`` for ``word`` in ``text``.

        ``idf`` is ``log(number_of_documents / count)``; a word with no
        occurrences yields all zeros instead of dividing by zero.
        """
        word_list = re.findall(r'\b' + re.escape(word) + r'\b', text)
        number_of_times_word_appeared = len(word_list)
        total_words = len(text.split())
        if number_of_times_word_appeared == 0 or total_words == 0:
            return 0, 0.0, 0.0, 0.0
        tf = number_of_times_word_appeared / float(total_words)
        idf = np.log(number_of_documents / float(number_of_times_word_appeared))
        tf_idf = tf * idf
        return number_of_times_word_appeared, tf, idf, tf_idf

    # Each weightage() call scans the whole text, so evaluate it once per keyword and
    # spread the 4-tuple over the four columns.
    metric_columns = ['number_of_times_word_appeared', 'tf', 'idf', 'tf_idf']
    metrics = [weightage(keyword, text) for keyword in df['keywords']]
    df[metric_columns] = pd.DataFrame(metrics, index=df.index, columns=metric_columns)

    # Sort the DataFrame by tf-idf
    df = df.sort_values('tf_idf', ascending=True)

    # Output CSV file name
    output_filename = os.path.splitext(os.path.basename(filename))[0] + '_output.csv'

    # Save the DataFrame to a CSV file
    if output_dir is None:
        output_dir = folder_path
    df.to_csv(os.path.join(output_dir, output_filename), index=False)
    print(f"Processed {filename} and saved to {output_filename}")

# Iterate over all PDF files in the folder.
# The extension test is case-insensitive so files named '*.PDF' are not skipped, and
# the listing is sorted so the processing order is the same on every run.
for file in sorted(os.listdir(folder_path)):
    if file.lower().endswith('.pdf'):
        process_pdf(os.path.join(folder_path, file))


Processed C:/Users/Miao.Li/python_learning/pdf scraping\Abdallah et al. 2015.pdf and saved to Abdallah et al. 2015_output.csv
Processed C:/Users/Miao.Li/python_learning/pdf scraping\Abrego et al. 2016.pdf and saved to Abrego et al. 2016_output.csv
Processed C:/Users/Miao.Li/python_learning/pdf scraping\Alsaab et al. 2015.pdf and saved to Alsaab et al. 2015_output.csv
Processed C:/Users/Miao.Li/python_learning/pdf scraping\Berthet et al. 2020.pdf and saved to Berthet et al. 2020_output.csv
Processed C:/Users/Miao.Li/python_learning/pdf scraping\Boonen et al. 2012.pdf and saved to Boonen et al. 2012_output.csv
Processed C:/Users/Miao.Li/python_learning/pdf scraping\Bányiová et al. 2016.pdf and saved to Bányiová et al. 2016_output.csv
Processed C:/Users/Miao.Li/python_learning/pdf scraping\Champmartin et al. 2020.pdf and saved to Champmartin et al. 2020_output.csv
Processed C:/Users/Miao.Li/python_learning/pdf scraping\De Spiegeleer et al. 2013.pdf and saved to De Spiegeleer et al. 2013_o

In [None]:
#comparison of the csv files
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Folder holding the per-PDF '*_output.csv' files
folder_path = 'C:/Users/Miao.Li/python_learning/pdf scraping'

# Running totals: keyword -> summed frequency
keyword_data = dict()

# Fetch the NLTK stopword corpus if needed, then load the English list as a set
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to merge keyword data from a CSV file into the cumulative dictionary
def merge_keywords(csv_file):
    """Add the keyword counts of one ``*_output.csv`` file to ``keyword_data``.

    Args:
        csv_file: Path of a CSV with the columns ``keywords`` and
            ``number_of_times_word_appeared``.

    Side effects:
        Updates the module-level ``keyword_data`` dict in place; keywords found in
        ``stop_words`` are skipped.
    """
    # By default read_csv turns cells such as "nan" or "null" into float NaN, which
    # would make those real words show up as NaN dictionary keys. Disable that and
    # read the keyword column strictly as text.
    df = pd.read_csv(csv_file, keep_default_na=False, dtype={'keywords': str})
    for keyword, frequency in zip(df['keywords'], df['number_of_times_word_appeared']):
        # Remove stopwords
        if keyword not in stop_words:
            keyword_data[keyword] = keyword_data.get(keyword, 0) + frequency

# Fold every per-PDF '*_output.csv' found in the folder into keyword_data
for file in os.listdir(folder_path):
    if not file.endswith('_output.csv'):
        continue
    merge_keywords(os.path.join(folder_path, file))

# One row per keyword, ordered from the highest total frequency down
cumulative_df = pd.DataFrame(
    [(word, total) for word, total in keyword_data.items()],
    columns=['keywords', 'total_frequency'],
)
cumulative_df = cumulative_df.sort_values(by='total_frequency', ascending=False)

# Show the 25 most frequent keywords
print(cumulative_df.head(25))

# Write the complete ranking into the same folder
cumulative_df.to_csv(
    os.path.join(folder_path, 'cumulative_keyword_frequency.csv'),
    index=False,
)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Miao.Li\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


         keywords  total_frequency
0            skin             3540
1               0             3243
5               1             2559
6               2             2378
11              h             1870
8               3             1638
14              5             1358
16              4             1347
17              j             1337
3              al             1287
4              et             1256
23             10             1141
35              6              978
2           human              940
1206         drug              884
12     permeation              880
20              c              840
159             8              730
64    penetration              729
9               e              706
143             n              676
163             p              656
42          using              654
104             7              639
295            ml              635


In [None]:
# Exclude numbers and words shorter than 3 characters

import os
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Location of the per-PDF '*_output.csv' files
folder_path = 'C:/Users/Miao.Li/python_learning/pdf scraping'

# Accumulator filled by merge_keywords(): keyword -> summed frequency
keyword_data = dict()

# Ensure the stopword corpus is present, then build the English stopword set
nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Function to determine if a keyword should be filtered out
def should_exclude(keyword):
    """Tell whether ``keyword`` must be left out of the cumulative counts.

    Returns True for anything that is not a ``str``, for entries of ``stop_words``,
    for strings shorter than three characters and for all-digit strings;
    otherwise False.
    """
    return (
        not isinstance(keyword, str)
        or keyword in stop_words
        or len(keyword) < 3
        or keyword.isdigit()
    )

# Function to merge keyword data from a CSV file into the cumulative dictionary
def merge_keywords(csv_file):
    """Add the keyword counts of one ``*_output.csv`` file to ``keyword_data``.

    Args:
        csv_file: Path of a CSV with the columns ``keywords`` and
            ``number_of_times_word_appeared``.

    Side effects:
        Updates the module-level ``keyword_data`` dict in place; keywords rejected
        by ``should_exclude`` are skipped.
    """
    # By default read_csv turns cells such as "nan" or "null" into float NaN; those
    # rows would then be discarded as non-strings. Disable that and read the keyword
    # column strictly as text so such words are counted like any other.
    df = pd.read_csv(csv_file, keep_default_na=False, dtype={'keywords': str})
    for keyword, frequency in zip(df['keywords'], df['number_of_times_word_appeared']):
        # Exclude unwanted keywords
        if should_exclude(keyword):
            continue
        keyword_data[keyword] = keyword_data.get(keyword, 0) + frequency

# Merge each '*_output.csv' in the folder into the cumulative dictionary
for file in os.listdir(folder_path):
    is_keyword_csv = file.endswith('_output.csv')
    if is_keyword_csv:
        merge_keywords(os.path.join(folder_path, file))

# Turn the keyword -> total mapping into a two-column table
cumulative_df = pd.DataFrame(
    list(zip(keyword_data.keys(), keyword_data.values())),
    columns=['keywords', 'total_frequency'],
)

# Most frequent keywords first
cumulative_df = cumulative_df.sort_values(by='total_frequency', ascending=False)

# Show the top 25 rows
print(cumulative_df.head(25))

# Store the whole table as CSV in the same folder
cumulative_df.to_csv(
    os.path.join(folder_path, 'cumulative_keyword_frequency.csv'),
    index=False,
)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Miao.Li\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


           keywords  total_frequency
0              skin             3540
1             human              940
982            drug              884
4        permeation              880
37      penetration              729
23            using              654
14             dose              559
25             used              549
7          exposure              543
9             vitro              514
21             time              505
10       absorption              502
40            water              496
24             data              454
27            study              428
26            table              427
1526    transdermal              426
46    concentration              418
776            acid              403
97          applied              401
1394       delivery              397
64          studies              394
3            dermal              390
1447            gel              384
5          receptor              358


In [None]:
# Exclude numbers and words shorter than 3 characters; also count how many files each keyword appears in

import os
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Folder with the per-PDF '*_output.csv' files
folder_path = 'C:/Users/Miao.Li/python_learning/pdf scraping'

# Accumulator filled by merge_keywords(): keyword -> [total_frequency, file_count]
keyword_data = dict()

# Make the NLTK stopword corpus available, then keep the English words as a set
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to determine if a keyword should be filtered out
def should_exclude(keyword):
    """Decide whether ``keyword`` is filtered out before aggregation.

    Non-string values are always excluded. A string is excluded when it is in
    ``stop_words``, is shorter than three characters, or consists only of digits.
    """
    if isinstance(keyword, str):
        too_short = len(keyword) < 3
        all_digits = keyword.isdigit()
        return keyword in stop_words or too_short or all_digits
    # Anything that is not text cannot be a usable keyword.
    return True

# Function to merge keyword data from a CSV file into the cumulative dictionary
def merge_keywords(csv_file):
    """Add one ``*_output.csv`` file's keyword counts to ``keyword_data``.

    Args:
        csv_file: Path of a CSV with the columns ``keywords`` and
            ``number_of_times_word_appeared``.

    Side effects:
        Updates the module-level ``keyword_data`` dict in place. Each value is a
        two-item list ``[total_frequency, file_count]``; ``file_count`` goes up by
        at most one per call for a given keyword. Keywords rejected by
        ``should_exclude`` are skipped.
    """
    # By default read_csv turns cells such as "nan" or "null" into float NaN; those
    # rows would then be discarded as non-strings. Disable that and read the keyword
    # column strictly as text so such words are counted like any other.
    df = pd.read_csv(csv_file, keep_default_na=False, dtype={'keywords': str})
    file_keywords = set()  # Keywords already counted for this file
    for keyword, frequency in zip(df['keywords'], df['number_of_times_word_appeared']):
        # Exclude unwanted keywords
        if should_exclude(keyword):
            continue
        entry = keyword_data.setdefault(keyword, [0, 0])
        entry[0] += frequency
        # Count the file only the first time this keyword is seen in it.
        if keyword not in file_keywords:
            entry[1] += 1
            file_keywords.add(keyword)

# Iterate over all output CSV files and merge their data
for file in os.listdir(folder_path):
    if file.endswith('_output.csv'):
        merge_keywords(os.path.join(folder_path, file))

# Flatten keyword -> [total_frequency, file_count] into one row per keyword.
# Building the three columns directly also works when keyword_data is empty; expanding
# a packed list column afterwards fails in that case with a column-count mismatch.
cumulative_df = pd.DataFrame(
    [(keyword, totals[0], totals[1]) for keyword, totals in keyword_data.items()],
    columns=['keywords', 'total_frequency', 'file_count'],
)

# Sort the DataFrame by total frequency in descending order
cumulative_df = cumulative_df.sort_values('total_frequency', ascending=False)

# Display the top 25 keywords with the highest total frequency
print(cumulative_df.head(25))

# Save the cumulative DataFrame to a CSV file
cumulative_df.to_csv(os.path.join(folder_path, 'cumulative_keyword_frequency.csv'), index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Miao.Li\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


           keywords  total_frequency  file_count
0              skin             3540          36
1             human              940          36
982            drug              884          33
4        permeation              880          34
37      penetration              729          35
23            using              654          35
14             dose              559          27
25             used              549          35
7          exposure              543          19
9             vitro              514          36
21             time              505          35
10       absorption              502          33
40            water              496          35
24             data              454          35
27            study              428          35
26            table              427          36
1526    transdermal              426          30
46    concentration              418          34
776            acid              403          28
97          applied 