<a href="https://colab.research.google.com/github/kohlisaurabhsagar/miscellaenous/blob/main/google_scholar_web_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Web Scraping using BeautifulSoup**

**Importing Libraries**

In [48]:
import requests
import time
from bs4 import BeautifulSoup
from IPython.display import Image, display
import sqlite3

**Establishing a connection with the website**

In [67]:
# Profile page to scrape.
url = 'https://scholar.google.com/citations?user=9BJP_xEAAAAJ&hl=en&oi=ao'

# A browser-like User-Agent is sent instead of the requests default
# (presumably so the request is not rejected as automated traffic -- TODO confirm).
# The explicit timeout matters: without one, requests can wait indefinitely
# on an unresponsive server and stall the whole notebook run.
response = requests.get(
    url,
    headers={'User-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'},
    timeout=10,
)

if response.status_code == 200:
    html_content = response.text
else:
    # Reset explicitly so a failed fetch cannot leave HTML from a previous
    # run of this cell lingering in `html_content`.
    html_content = None
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

**Extracting information**

In [68]:
# Extract profile information from the page fetched in the previous cell.
# Optional page elements (photo, affiliation, homepage link, metrics table)
# fall back to None / empty values instead of crashing on a missing tag.
soup = BeautifulSoup(response.text, 'html.parser')

# The name is the one field treated as mandatory: the database cell stores it
# in a NOT NULL column, so fail early with a readable message.
name_tag = soup.find(id="gsc_prf_in")
if name_tag is None:
    raise ValueError(
        "Could not find the profile name (id='gsc_prf_in') in the response "
        f"(HTTP status {response.status_code}); the page is not a readable profile."
    )
name = name_tag.text.strip()

image_container = soup.find('div', class_='gs_rimg')
image_tag = image_container.find('img') if image_container is not None else None
user_image = image_tag.get('src') if image_tag is not None else None

affiliation_tag = soup.find(class_="gsc_prf_il")
affiliation = affiliation_tag.text.strip() if affiliation_tag is not None else None

# NOTE: despite its name, `email` holds the href of the profile's "Homepage"
# link, not an e-mail address. The name is kept because the database cell
# reads it.
homepage_tag = soup.find('a', class_='gsc_prf_ila', string='Homepage')
email = homepage_tag.get('href') if homepage_tag is not None else None

research_interests = [a.text.strip() for a in soup.find_all(class_="gsc_prf_inta")]

# Citation metrics table: one row per metric, with three cells read as
# (metric name, all-time value, recent-period value).
stats_table = soup.find('table', {'id': 'gsc_rsb_st'})
table = stats_table.find_all('tr') if stats_table is not None else []
metrics_dict = {}
for row in table[1:]:  # the first row is skipped (presumably the header row)
    columns = row.find_all('td')
    if len(columns) < 3:
        # Skip rows that do not have the expected three cells.
        continue
    metric_name = columns[0].text.strip()
    all_value = columns[1].text.strip()
    since_2018_value = columns[2].text.strip()
    # NOTE(review): the 'Since 2018' label is hard-coded; the period shown on
    # the page may differ from 2018 -- confirm against the table header.
    metrics_dict[metric_name] = {'All': all_value, 'Since 2018': since_2018_value}


# print the extracted information
print("Name:", name)
if user_image:
    # Only render the photo when an image URL was actually found.
    display(Image(url=user_image))
print("Affiliation:", affiliation)
print("Email:", email)
print("Research Interests:", research_interests)
for metric_name, values in metrics_dict.items():
    print(f'{metric_name}: {values}')

Name: Neelima Gupta


Affiliation: Department of Computer Science, University of Delhi
Email: http://people.du.ac.in/~ngupta/
Research Interests: ['Algorithms', 'Approximation Algorithms', 'Computer Networks', 'Data Mining', 'Bio-informatics']
Citations: {'All': '665', 'Since 2018': '229'}
h-index: {'All': '13', 'Since 2018': '7'}
i10-index: {'All': '20', 'Since 2018': '6'}


**Establishing connection with database**

In [69]:
# Path of the SQLite database file. The /content directory suggests a Colab
# runtime -- TODO confirm before running elsewhere.
db_path = '/content/sample.db'
# Open a connection to the database at db_path; the next cell closes it.
connection = sqlite3.connect(db_path)
# Cursor used by the next cell to execute SQL statements.
cursor = connection.cursor()

**Creating the table and storing the information**

In [70]:
# Create the `scholars` table if needed, insert the scraped profile, print
# every stored row, and close the connection opened in the previous cell.
# The connection is closed in a `finally` block so that a failing statement
# cannot leave it open.
create_table_query = '''
CREATE TABLE IF NOT EXISTS scholars (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL,
    affiliation TEXT,
    email TEXT,
    research_interests TEXT,
    citations INTEGER,
    h_index INTEGER,
    i10_index INTEGER
);
'''

insert_query = '''
INSERT INTO scholars (name, affiliation, email, research_interests, citations, h_index, i10_index)
VALUES (?, ?, ?, ?, ?, ?, ?);
'''

select_query = 'SELECT * FROM scholars;'

try:
    # `with connection` commits when the block succeeds and rolls back if it raises.
    with connection:
        cursor.execute(create_table_query)

    # One row per scraped profile. Research interests are flattened into a
    # comma-separated string; metrics missing from the page default to 0.
    # NOTE: this cell inserts a new row on every run, so re-running it for the
    # same profile stores that profile again.
    sample_data = (
        name,
        affiliation,
        email,
        ', '.join(research_interests),
        metrics_dict.get('Citations', {}).get('All', 0),
        metrics_dict.get('h-index', {}).get('All', 0),
        metrics_dict.get('i10-index', {}).get('All', 0),
    )

    with connection:
        cursor.execute(insert_query, sample_data)

    cursor.execute(select_query)
    result = cursor.fetchall()

    for row in result:
        print(row)
finally:
    # Always release the database handle, even when a statement above fails.
    connection.close()

(1, 'Dr Punam Bedi', 'Senior Professor, Department of Computer Science, University of Delhi, India', 'http://people.du.ac.in/~pbedi/', 'Artificial Intelligence, Recommender Systems, Intrusion Detection, AI for Healthcare, AI for Agriculture', 4557, 31, 100)
(2, 'Dilip Senapati', 'Associate Professor, Department of Computer Science, University of Delhi, New Delhi', 'https://ravenshawuniversity.ac.in/?page_id=31519', 'Dynamical Systems Modeling and Simulation', 281, 9, 9)
(3, 'Neelima Gupta', 'Department of Computer Science, University of Delhi', 'http://people.du.ac.in/~ngupta/', 'Algorithms, Approximation Algorithms, Computer Networks, Data Mining, Bio-informatics', 665, 13, 20)
