In [14]:
!pip install pdfminer

import os
import pandas as pd
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage

# This code will loop over pdfs within a directory, within each pdf it will look for an assigned string, and place 
# the next string into a dataframe 

def extract_text_from_pdf(pdf_path):
    # Open the pdf file
    with open(pdf_path, 'rb') as fh:
        # Create a PDF resource manager object that stores shared resources
        resource_manager = PDFResourceManager()

        # Create a string buffer
        output = StringIO()

        # Create a text converter object
        converter = TextConverter(
            resource_manager, output, laparams=LAParams()
        )

        # Create a PDF interpreter object
        interpreter = PDFPageInterpreter(resource_manager, converter)

        # Process each page contained in the pdf file
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            interpreter.process_page(page)

        # Close the converter
        converter.close()

        # Get the text from the buffer
        text = output.getvalue()

        # Close the buffer
        output.close()

        # Return the text
        return text

        
def NeuromabScan(
    text: str   
) -> str:

    # Search for the string "NeuroMab clone" in the text
    if "NeuroMab clone" in text:
        
        # Get the index of the first occurrence of the string
        index = text.index("NeuroMab clone")

        # Get the string that comes immediately after the string "NeuroMab clone"
        #global NeuroMab
        NeuroMab = text[index + len("NeuroMab clone"):].split()[0]
            
        # Remove any whitespace or parenthesis from the value
        NeuroMab = NeuroMab.strip()
        NeuroMab = NeuroMab.strip("()")

        
    else:
        NeuroMab = ""
    
    return NeuroMab

def OrganismScan(text):
    
    global Organism

    # Search for the string in the text
    if "rat" in text:
        
        Organism = "Rat"
    
    elif "human" in text:

        Organism = "Human"
    
    elif "mouse" in text:

        Organism = "Mouse"
        
    elif "goldfish" in text:
        
        Organism = "Goldfish"
        
    elif "rat brain" in text:

        Organism = "Human"
        
    elif "jellyfish" in text:
        
        Organism = "Jellyfish"
    
#     elif "of\nrat" in text:
        
#         Organism = "Rat"
        
#     elif "of\nhuman" in text:
        
#         Organism = "Human"
        
#     elif "of\nmouse" in text:
        
#         Organism = "Mouse"
        
    else:
        Organism = ""
        
    return Organism
        
def AminoScan(text):

    # Search for the string "acids" in the text
    if "acids" in text:
        
        # Get the index of the first occurrence of the string
        index = text.index("acids")

        # Get the string that comes immediately after the string "acids"
        global Amino
        Amino = text[index + len("acids"):].split()[0]
        
        # Remove any whitespace or parenthesis from the value
        Amino = Amino.strip()
        Amino = Amino.strip("()")
        Amino = Amino.strip(",")
        
        # Files with peptide or protein are special circumstance :
        if Amino == 'identical':
            
            if "peptide" in text:
                myid = "peptide" 
            elif "protein" in text:
                myid = "protein"
            index = text.index(myid)
            Amino = text[index + len(myid):].split()[0]
            Amino = Amino.strip()
            Amino = Amino.strip("()")
            Amino = Amino.strip(",")
                
            if Amino == 'target(s':
                Amino = ''
                
            else:
                Amino = ""
                
        elif Amino == 'of':
            Amino = ''
                                 

    else:
        
        if "peptide" in text:
            index = text.index("peptide")
            Amino = text[index + len("peptide"):].split()[0]
            Amino = Amino.strip()
            Amino = Amino.strip("()")
            Amino = Amino.strip(",")
            
        elif "protein" in text:
            index = text.index("protein")
            Amino = text[index + len("protein"):].split()[0]
            Amino = Amino.strip()
            Amino = Amino.strip("()")
            Amino = Amino.strip(",")
                
        else:
            Amino = ""

def AccessionScan(text):

    # Search for the string "accession number" in the text
    if "number" in text:
        # Get the index of the first occurrence of the string
        index = text.index("number")

        # Get the string that comes immediately after the string "accession number"
        value = text[index + len("number"):].split()[0]
            
        # Remove any whitespace or parenthesis from the value
        value = value.strip()
        value = value.strip("()")

        # Add a row to the dataframe with the accession number
        global df
        global pdf_name
        global NeuroMab
        global Amino
        global Organism
        df = df.append(
            {
                "DataSheetFileName": pdf_name,
                "NeuroMab clone": NeuroMabScan(text=text),
                "Accession Number": value,
                "Amino Acid Range" : "(" + Amino + ")",
                "Organism" : Organism
            },
            ignore_index=True
            )
    else:
        value = ""
        df = df.append(
            {
                "DataSheetFileName": pdf_name,
                "NeuroMab clone": NeuroMabScan(text=text),
                "Accession Number": value,
                "Amino Acid Range" : "(" + Amino + ")",
                "Organism" : Organism
                    
            },
            ignore_index=True
        )

    # Print the dataframe
    #print(df)



    
def main():

    # Set the directory containing the pdf files
    pdf_dir = './Desktop/TXT_ANTIBODY/PDFdatasheets'
    # Create an empty dataframe
    global df
    df = pd.DataFrame()

    # Iterate over all pdf files in the directory
    for pdf_file in os.listdir(pdf_dir):
        
        # Get the file path
        pdf_path = os.path.join(pdf_dir, pdf_file)
        
        # Set global variable for file name
        global pdf_name
        pdf_name = os.path.basename(pdf_path)

        # Extract the text from the pdf file
        text = extract_text_from_pdf(pdf_path)
        
        # Extract Neuromab ID
        NeuromabScan(text)
        
        # Extract Organism 
        OrganismScan(text)
        
        # Extract Amino Acid Range 
        AminoScan(text)
        
        # Extract Accession Number 
        AccessionScan(text)
        


main()

# Create the dataframe to a csv so it can be opened with Excel 

df.to_csv('Neuromab.csv')




  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.append(
  df = df.

In [4]:
import os 
path = f"https://neuromab.ucdavis.edu/datasheet/{neuromab}"
os.system(f"wget {path}")

0

In [5]:
os.system("mkdir pdfs/")

0

In [6]:
os.listdir()

['wipe_status_data.py',
 'DuplicatedLC_ASVs.tsv',
 'wipe_db.py',
 'TestingDeepAb.ipynb',
 'Untitled1.ipynb',
 'updated_metadata.tsv',
 '.DS_Store',
 'run_metadata_update.py',
 'oldduplicates',
 'paswd_regex.py',
 'environment.yml',
 'generate_blat.py',
 'Untitled.ipynb',
 'metadata_difference.tsv',
 'all_meta_master.tsv',
 'DropoutandDups.ipynb',
 'cat2sum.csv',
 'sequence_db',
 'final_stars.csv',
 'mydatabase 2',
 'management',
 'trimmer',
 '__init__.py',
 'DatabaseNotebook.ipynb',
 'Untitled2.ipynb',
 'dropout_groupings.tsv',
 '2021-11-02-Aggregated_Sequences.tsv',
 'Diagnostics Aggregated vs Metadata.ipynb',
 'run_status_update.py',
 'Samples_with_few_reads.tsv',
 'Pdf_miner-Copy1.ipynb',
 'N144_14.pdf.1',
 'N144_14.pdf',
 'Samples_without_overlaps.tsv',
 'cat123_cooborating.xlsx',
 'DuplicatedHC_ASVs.tsv',
 'DeepAb',
 'heavy_chain_cat.tsv',
 'pdfs',
 'N358_68.pdf',
 'manage.py',
 '.ipynb_checkpoints',
 'old_meta',
 'mydatabase_01_15_2022',
 'mydatabase',
 'N358_68.pdf.1',
 'light_c

In [4]:
# Merge the created dataframe and the Neuromab dataframe for cross checking 

import pandas as pd

# load the first dataframe
df1 = pd.read_csv('Neuromab.csv')

# load the second dataframe
df2 = pd.read_csv('/Users/HMans_MacBook_Pro/Desktop/TXT_ANTIBODY/NeuroMab-UCDavis.csv')

# merge the dataframes on the 'ID' column
df_merged = pd.merge(df1, df2, on='DataSheetFileName')

# Move the Accession number columns next to each other to make cross reference easier

df_merged=df_merged[['DataSheetFileName', 'NeuroMab clone', 'Clone', 'Accession Number','AccessionNum','HumanGeneName','TargetType','Target','Amino Acid Range']]


df_merged.to_csv('Neuromab_merged.csv')
# From this point I manually changed the accession numbers to match the neuromab dataframe
# There was also 2 errors - 1 column said Human, and 1 said Epitope (both were changed to accession from Neuromab) - stored in Neuromab3

In [11]:
print("rat\n\tof")

rat
	of


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# Load the dataset into a pandas DataFrame
df = pd.read_csv('Neuromab_merged_corrected.csv')

# Set the shared path for the URLs
path = 'https://rest.uniprot.org/uniprotkb/'
json = '.json'

# Create an empty list to store the Amino Acid sequence we retrieve from the URLs
Sequence_list = []


# Loop through the URLs in the 'url' column of the DataFrame
for index, row in df.iterrows():
    url_info = row['Accession Number']
    url_info = str(url_info)
    
    
    # Only retrieve information for URLs with 6-letter 'url info' strings
    if len(url_info) == 6:
        # Combine the shared path and the URL information to create the full URL
        url = path + url_info + json
        
        # Send a request to the URL and retrieve the HTML content
        response = requests.get(url)
        
        if response.json:
            
            # Parse the JSON data from the response
            data = response.json()

            # Search for the specific key-value in the data
            key = 'sequence'
    
            if key in data:
                value = data[key]
                Sequence_list.append(f'{value}')
            else:
                Sequence_list.append('')
    else:
        Sequence_list.append('')

#print(Sequence_list)


["{'value': 'MMMMMMMKKMQHQRQQQEDHANEANYARGTRLPISGEGPTSQPNSSKQTVLSWQAAIDAARQAKAAQTMSTSAPPPVGSLSQRKRQQYAKSKKQGNSSNSRPARALFCLSLNNPIRRACISIVDWKPFDIFILLAIFANCVALAIYIPFPEDDSNSTNHNLEKVEYAFLIIFTVETFLKIIASGLLLHPNASVRNGWNLLDFVIVIVGLFSVILEQLTKETEGGNHSSGKSGGFDVKALRAFRVLRPLRLVSGVPSLQVVLNSIIKAMVPLLHIALLVLFVIIIYAIIGLELFIGKMHKTCFFADSDIVAEEDPAPCAFSGNGRQCAANGTECRSGWVGPNGGITNFDNFAFAMLTVFQCITMEGWTDVLYWVNDAIGWEWPWVYFVSLIILGSFFVLNLVLGVLSGEFSKEREKAKARGDFQKLREKQQLEEDLKGYLDWITQAEDIDPENEEEGGEEGKRNTSMPTSETESVNTENVSGEGETQGCCGSLWCWWKRRGAAKTGPSGCRRWGQAISKSKLRSHGAREALCVCRCSLESLVKLWTSRFSAHLQAAYVRPYSRRWRRWNRFNRRRCRAAVKSVTFYWLVIVLVFLNTLTISSEHYNQPDWLTQIQDIANKVLLALFTCEMLVKMYSLGLQAYFVSLFNRFDCFVVCGGITETILVELELMSPLGVSVFRCVRLLRIFKVTRHWTSLSNLVASLLNSMKSIASLLLLLFLFIIIFSLLGMQLFGGKFNFDETQTKRSTFDNFPQALLTVFQILTGEDWNAVMYDGIMAYGGPSSSGMIVCIYFIILFICGNYILLKLFLAIAVDNLADAESLNTAQKEEAEEKERKKIARKESLENKKNNKPEVNQIANSDNKVTIDDYQEEAEDKDPYPPCDVPVGEEEEEEEEDEPEVPAGPRPRRISELNMKEKIAPIPEGSAFFILSKTNPIRVGCHKLINHHIFTNLILVFIMLSSAALAAEDPIRSHSFRNTILGYFDYAFTA

In [24]:
df2 = pd.DataFrame(list(zip(df['DataSheetFileName'],df['Accession Number'], Sequence_list)),
               columns =['DataSheetFileName','Accession Number_check', 'AminoSeq'])

df_merged = pd.merge(df, df2, on='DataSheetFileName')
df_merged=df_merged[['DataSheetFileName', 'NeuroMab clone', 'Clone', 'Accession Number','AccessionNum','Accession Number_check','HumanGeneName','TargetType','Target','Amino Acid Range','AminoSeq']]
df_merged.to_csv('12-20-22_Neuromab.csv')

In [23]:
df_merged

Unnamed: 0,DataSheetFileName,NeuroMab clone,Clone,Accession Number,AccessionNum,HumanGeneName,TargetType,Target,Amino Acid Range,AminoSeq
0,L48A_9.pdf,L48A/9,L48A/9,P27732,P27732,CACNA1D,Ca2+ channels,Cav1.3 Ca2+ channel,(859-875),{'value': 'MMMMMMMKKMQHQRQQQEDHANEANYARGTRLPIS...
1,N195A_16.pdf,N195A/16,N195A/16,Q96PU,Q96PU8,QKI,Other,QKI-5,(315-331),
2,N167_7.pdf,N167/7,N167/7,Q8R418,Q8R418,DICER1,Epigenetics,Dicer,(1638-1899),{'value': 'MKSPALQPLSMAGLQLMTPASSPMGPFFGLPWQQE...
3,N297_59.pdf,N297/59,N297/59,,,----,Epigenetics,5-hydroxymethylcytidine,(),
4,N228A_16.pdf,N228A/16,N228A/16,Q9R1R0,Q9R1R0,LHX6,Other,Lhx6.1,(347-363),{'value': 'MAQPGSGCKATTRCLEGTAPPAMAQSDAEALAGAL...
...,...,...,...,...,...,...,...,...,...,...
470,N161_20.pdf,N161/20,N161/20,P55011,P55011,SLC12A2,Transporters,NKCC1,(208-223),{'value': 'MEPRPTAPSSGAPGLAGVGETPSAAALAAARVELP...
471,N291C_22.pdf,N291C/22,N291C/22,Q8C437,Q8C437,PEX5L,Ion channels,TRIP8b (exon 1a/5),(1-12),{'value': 'MYQGHMQLVNEQQESRPLLSPSIDDFLCETKSEAI...
472,N454_91.pdf,N454/91,N454/91,A2AHL1,A2AHL1,ANO3,Ion channels,ANO3/TMEM16C,(163-257),{'value': 'MVHHSGSIQSFKQQKGMNISKSEITTEASLKPSRR...
473,K96_7.pdf,K96/7,K96/7,Q99784,Q99784,OLFM1,Other,Pancortin,(131-150),{'value': 'MSVPLLKIGVVLSTMAMITNWMSQTLPSLVGLNTT...
