<a href="https://colab.research.google.com/github/tpipernob/ConvertCsvSpringerToBib/blob/main/ConvertCsvSpringerToBib.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1**: Upload the CSV file to Google Colab with the name "springer.csv".

**Step 2**: Run all the code cells in the order they are arranged.

**Result**: A file named "springer.bib" containing the converted entries will be generated.

If it is necessary to generate another bib file, just rerun the "Generating the bib" section.

# Installing and Importing Libraries

In [None]:
!pip install furl
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

import os


import pandas as pd
import bs4
from bs4 import BeautifulSoup
import sys

sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from lxml.html.soupparser import fromstring
from sys import argv, exit
from math import isnan

# Capturing source code

In [None]:
def captureSourceCode(url):
    """Load *url* in headless Chrome and return the rendered page as parsed HTML.

    Args:
        url: Address of the page to open.

    Returns:
        A ``BeautifulSoup`` tree built from the browser's ``page_source`` with
        the ``html.parser`` backend.

    Raises:
        Whatever Selenium raises when the driver cannot start or the page
        cannot be loaded; the browser is shut down in either case.
    """
    chrome_options = webdriver.ChromeOptions()
    # Flags commonly needed to run Chrome without a display inside a container.
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)

    # The driver binary is resolved through PATH (the setup cell copies
    # chromedriver to /usr/bin). Passing only ``options`` works on old and new
    # Selenium releases, whereas a positional driver path is rejected by
    # recent ones.
    browser = webdriver.Chrome(options=chrome_options)
    try:
        browser.get(url)
        return BeautifulSoup(browser.page_source, "html.parser")
    finally:
        # This function runs once per CSV row; without quit() every call would
        # leave a Chrome process behind.
        browser.quit()

# Finding the Abstract

In [None]:
def findAbstract(sourceCode):
    """Extract the abstract text from a parsed page.

    Lookup order:
      1. the element whose id is ``Par1`` (its whole text is used);
      2. the first ``<p>`` inside ``<div id="Abs1-content">``;
      3. the first ``<p>`` inside ``<section id="Abs1">``.

    A container that exists but holds no ``<p>`` is skipped instead of raising
    ``IndexError``, so the next candidate is still tried.

    Args:
        sourceCode: Parsed page exposing ``find`` (elements expose ``text``
            and ``select``), as returned by ``captureSourceCode``.

    Returns:
        The abstract text, or a single space ``" "`` when none is found.
    """
    firstParagraph = sourceCode.find(id='Par1')
    if firstParagraph is not None:
        return firstParagraph.text

    for tagName, tagId in (("div", "Abs1-content"), ("section", "Abs1")):
        container = sourceCode.find(tagName, {"id": tagId})
        if container is None:
            continue
        paragraphs = container.select("p")
        if paragraphs:
            return paragraphs[0].text

    # Same placeholder the original returned when no abstract was present.
    return " "

# Finding the keywords

In [None]:
def findKeywords(sourceCode, contentType):
    """Collect the keyword strings found on a parsed page.

    Args:
        sourceCode: Parsed page exposing ``find_all``.
        contentType: Content type of the record; ``"Article"`` pages are
            searched for ``<span itemprop="about">``, every other type for
            ``<span class="Keyword">``.

    Returns:
        A list with the whitespace-stripped text of each matching span, in
        document order (empty when nothing matches).
    """
    if contentType == "Article":
        spanAttrs = {"itemprop": "about"}
    else:
        spanAttrs = {"class": "Keyword"}

    return [span.get_text(strip=True) for span in sourceCode.find_all("span", spanAttrs)]

# Generating the bib

In [None]:
'''
Step 1: Upload the csv file to Google Colab with the name "springer.csv"
Step 2: Run all the code cells in the order they are arranged
A file named springer.bib will be generated with the result
'''

class BibEntry(object):
    """One bibliography record that can be rendered as a BibTeX entry.

    The attributes mirror the CSV columns assigned by the caller, plus the two
    scraped fields (``Keywords`` and ``Abstract``). All of them start as
    ``None`` and are meant to be filled in before ``generateBib`` is called.
    """

    # Characters stripped from the citation key. The original code removed only
    # commas; braces, parentheses, quotes and similar characters coming from a
    # title would otherwise end up inside the key and corrupt the entry.
    _KEY_FORBIDDEN = str.maketrans("", "", "\"#%'(),={}\\~")

    # Content types emitted as @book; every other type is emitted as @article.
    _BOOK_TYPES = ("Book", "ReferenceWork")

    def __init__(self):
        self.Item_Title = None
        self.Publication_Title = None
        self.Book_Series_Title = None  # stored but not written to the entry
        self.Journal_Volume = None
        self.Journal_Issue = None
        self.Item_DOI = None
        self.Authors = None
        self.Publication_Year = None
        self.URL = None
        self.Content_Type = None
        self.Keywords = None
        self.Abstract = None

    @staticmethod
    def _fieldText(value) -> str:
        """Return the text written between the braces of a BibTeX field."""
        if value is None:
            # An unset field is left empty rather than printed as "None".
            return ""
        if isinstance(value, float):
            if value != value:
                # NaN (a missing numeric cell) carries no information.
                return ""
            if value.is_integer():
                # pandas promotes integer columns with gaps to float;
                # print 12.0 as "12".
                return str(int(value))
        return str(value)

    def generateBib(self, header_id: str) -> str:
        """Render this record as a BibTeX entry.

        Args:
            header_id: Prefix of the citation key. The key is
                ``<header_id>_<first two title words joined>``.

        Returns:
            The entry text, terminated by a blank line. ``Book`` and
            ``ReferenceWork`` records become ``@book``; all others become
            ``@article``.

        Raises:
            AttributeError: If ``Item_Title`` has not been set to a string.
        """
        # Citation key suffix: the first two words of the title, joined, with
        # characters that are unsafe in a key removed.
        titleStart = "".join(self.Item_Title.split()[:2]).translate(self._KEY_FORBIDDEN)
        entryType = "book" if self.Content_Type in self._BOOK_TYPES else "article"

        # Field order is part of the output format and is kept as-is.
        fields = (
            ("author", self.Authors),
            ("title", self.Item_Title),
            ("journal", self.Publication_Title),
            ("volume", self.Journal_Volume),
            ("number", self.Journal_Issue),
            ("year", self.Publication_Year),
            ("url", self.URL),
            ("keywords", self.Keywords),       # scraped from the page
            ("document_type", self.Content_Type),
            ("doi", self.Item_DOI),
            ("abstract", self.Abstract),       # scraped from the page
        )
        body = ",\n\t".join(
            "{0} = {{{1}}}".format(name, self._fieldText(value)) for name, value in fields
        )
        return "@" + entryType + "{" + header_id + "_" + titleStart + ",\n\t" + body + ",\n}\n\n"


def _getBibEntries(filepath: str) -> list:
    """Build one BibTeX entry string per row of the CSV at *filepath*.

    For every row the page at the row's ``URL`` is fetched and scraped for the
    abstract and keywords, which are added to the data taken from the CSV
    columns.

    Args:
        filepath: Path of the CSV file. It must contain the columns
            ``URL``, ``Content Type``, ``Authors``, ``Book Series Title``,
            ``Item DOI``, ``Item Title``, ``Journal Issue``,
            ``Journal Volume``, ``Publication Title`` and ``Publication Year``.

    Returns:
        A list of BibTeX entry strings in row order; an empty list (after
        printing a message) when the file does not exist.
    """
    if not os.path.exists(filepath):
        print("{0} not found!".format(filepath))
        return []

    # Read every cell as text: a numeric column with gaps would otherwise be
    # promoted to float and a volume of 12 would be written as "12.0".
    springer_csv = pd.read_csv(filepath, dtype=str).fillna('')

    bibEntries = []
    for index, row in springer_csv.iterrows():
        url = row['URL']
        contentType = row['Content Type']

        # Defaults match what the scrapers yield for a page with no abstract
        # and no keywords.
        abstractScrapping = " "
        stringKeywords = ""
        if url.strip():
            try:
                # Fetch the complete source of the record's page.
                sourceCode = captureSourceCode(url)
                abstractScrapping = findAbstract(sourceCode)
                stringKeywords = ', '.join(findKeywords(sourceCode, contentType))
            except Exception as error:
                # One unreachable or malformed page must not discard the whole
                # run; the entry is still emitted from the CSV data alone.
                print("Could not scrape {0}: {1}".format(url, error))

        bibEntry = BibEntry()
        bibEntry.Authors = row['Authors']
        bibEntry.Book_Series_Title = row['Book Series Title']
        bibEntry.Content_Type = contentType
        bibEntry.Item_DOI = row['Item DOI']
        bibEntry.Item_Title = row['Item Title']
        bibEntry.Journal_Issue = row['Journal Issue']
        bibEntry.Journal_Volume = row['Journal Volume']
        bibEntry.Publication_Title = row['Publication Title']
        bibEntry.Publication_Year = row['Publication Year']
        bibEntry.URL = url

        bibEntry.Keywords = stringKeywords
        bibEntry.Abstract = abstractScrapping

        # The row index becomes the prefix of the citation key.
        bibEntries.append(bibEntry.generateBib(str(index)))

    return bibEntries


def convertCsvToBib(filepath: str) -> bool:
    """Convert the CSV at *filepath* into a ``.bib`` file next to it.

    The output path is *filepath* with its extension replaced by ``.bib``.
    The file is rewritten on every call, so running the conversion again does
    not append a second copy of the entries.

    Args:
        filepath: Path of the CSV file to convert.

    Returns:
        ``True`` when a bib file was written, ``False`` when there were no
        entries to write (in which case no file is created or modified).
    """
    bibEntries = _getBibEntries(filepath)
    if not bibEntries:
        return False

    # splitext only removes the final extension, so paths such as
    # "./data/springer.csv" or "my.export.csv" keep their directory and stem.
    bibFile = os.path.splitext(filepath)[0] + ".bib"

    with open(bibFile, 'w', encoding="utf-8") as f:
        f.writelines(bibEntries)
    return True


# Input file name expected by the usage steps at the top of the notebook.
# (The former try/except around this assignment could never trigger.)
filepath = 'springer.csv'
convertCsvToBib(filepath)