In [11]:
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np 
from pathlib import Path
from bs4 import BeautifulSoup
import re, time
import requests
tqdm.pandas()
import tldextract
from selenium import webdriver

path = 'google_scholar'

root_folder = Path(path)
root_folder.mkdir(parents=True, exist_ok=True)

In [12]:
files = list(root_folder.rglob("*"))  
files = [f for f in files if f.is_file() and f.suffix == ".html"]

df_searchterms = pd.read_csv(path+'/google_scholar_search_base.csv',sep =';')
lst_name_files = [file.name for file in files]
df_searchterms = df_searchterms[df_searchterms['file'].isin(lst_name_files)].copy()
df_searchterms['path_file'] = path +'/raw_data/'+ df_searchterms['folder']+'/'+df_searchterms['file']

In [13]:


def process_file(path_file):
    f = open(path_file,"r", encoding="utf-8")
    soup = BeautifulSoup(f.read(), "html.parser")
    f.close()

    lst_divs_resultado_busca = soup.find_all("div", attrs={"data-rp": True})

    lst_records = []

    for bloco_resultado in lst_divs_resultado_busca:

        main_div = bloco_resultado.find('div',{'class':'gs_ri'})
        record = {}

        record['href'] = None
        href_div = main_div.find('a', href = True)
        if href_div:
            record['href'] = href_div['href']

        record['title'] = None
        title_div =  main_div.find('h3')
        if title_div:
            record['title'] = title_div.text

        record['author_div'] = None
        author_div = main_div.find('div',{'class':'gs_a'})
        if author_div:
            record['author_div'] = author_div.text

        record['description'] = None
        description_div = main_div.find('div',{'class':'gs_rs'})
        if description_div:
            record['description'] = description_div.text


        record['citations'] = 0
        citations_div = main_div.find('div',{'class':'gs_fl gs_flb'})
        if citations_div:
            match = re.search(r"citado por (\d+)", citations_div.text, re.IGNORECASE)
            if match:
                record['citations'] = int(match.group(1))

        secondary_div = bloco_resultado.find('div',{'class':'gs_ggs gs_fl'})

        record['direct_label'] = None 
        record['direct_href'] = None 
        if secondary_div:
            record['direct_label'] = secondary_div.text
            direct_href_div = secondary_div.find('a',href = True)

            if direct_href_div :
                record['direct_href'] = direct_href_div['href']

        lst_records.append(record)

    return lst_records #pd.DataFrame(lst_records)

df_searchterms['df'] = df_searchterms['path_file'].progress_map(lambda x : process_file(x))
df_searchterms = df_searchterms.explode('df')
for key in ['href', 'title', 'author_div', 'description', 'citations','direct_label','direct_href']:
    df_searchterms[key] = df_searchterms['df'].map(lambda x : x[key])
df_searchterms.drop(columns = 'df', inplace=True)

  0%|          | 0/440 [00:00<?, ?it/s]

In [14]:
df_searchterms['PDF'] = df_searchterms['direct_label'].str.contains('PDF', regex=False).astype('bool')
df_searchterms['target_url'] = df_searchterms['direct_href'].fillna(df_searchterms['href'])


ix = df_searchterms['target_url'].str.startswith('https://arxiv.org')
df_searchterms.loc[ix,'PDF'] = True 
df_searchterms['target_url'] = df_searchterms['target_url'].str.replace('abs','pdf')

ix = df_searchterms['target_url'].str.endswith('.pdf')
df_searchterms.loc[ix,'PDF'] = True 

ix = df_searchterms['target_url'].str.startswith('https://dl.acm.org/doi/pdf/')
df_searchterms.loc[ix,'target_url'] = df_searchterms.loc[ix,'target_url'].str.replace("pdf/",'', regex = False)
df_searchterms.loc[ix,'PDF'] = False 


df_searchterms['article_file'] = (
    df_searchterms['target_url'].str.slice(0,100)
    .str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)  # substitui caracteres especiais por _
    .str.replace(r'_+', '_', regex=True)             # múltiplos _ por 1 _
    .str.strip('_')                         # remove _ no começo/fim
)

ix = df_searchterms['PDF'] 
df_searchterms.loc[ix,'article_file']+='.pdf'
df_searchterms.loc[~ix,'article_file']+='.html'




In [15]:
ix = df_searchterms['target_url'].notnull()
df_searchterms.loc[ix,'domain'] = df_searchterms.loc[ix,'target_url'].map(lambda x:tldextract.extract(x).domain ) 

df_searchterms['domain_count'] = df_searchterms.groupby('domain')['domain'].transform('count')

df_searchterms.sort_values(by = ['domain_count','domain','folder'],\
                            ascending=[False,False,False], ignore_index=True, inplace=True)

In [16]:
df_searchterms.to_csv(path+'/google_scholar_search_base_article_detail.csv', sep = ';', index = False)

In [6]:
df_searchterms = df_searchterms[df_searchterms['domain'] != 'sciencedirect']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'acm']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'researchgate']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'wiley']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'tandfonline']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'iop']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'oup']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'onepetro']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'acs']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'proquest']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'arvojournals']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'jstor']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'jacc']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'informit']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'asme']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'ascopubs']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'ascelibrary']
df_searchterms = df_searchterms[df_searchterms['domain'] != 'cell']

In [14]:
records = df_searchterms.to_dict(orient = 'records')

In [15]:
# Inicializa o navegador
driver = webdriver.Chrome()

In [16]:
url = df_searchterms['target_url'].values[0]
driver.get(url)

In [43]:
driver.get(url)

In [None]:
for record in tqdm(records):


    full_path_folder =  root_folder / 'articles' / record['folder']
    full_path_folder.mkdir(parents=True, exist_ok=True)
    full_path = full_path_folder / record['article_file']

    if full_path.exists() and full_path.is_file():  
        continue
    
    url = record['target_url']
    if not isinstance(url, str):
        continue 

    if record['PDF']:
        try:
            response = requests.get(url,  timeout=20)

            if response.status_code == 200:
                with open(full_path, "wb") as f:
                    f.write(response.content)
                    
        except Exception as e:
            print('erro em ',url)
            print(e)
            with open(full_path, "w", encoding="utf-8") as f:
                f.write('')

    else:
        try:
            driver.get(url)
            time.sleep(3)
            alpha, beta = 2,5
            val = np.random.beta(alpha, beta)
            wait_time = 1 + val * (3 - 1)
            time.sleep(wait_time) 

            with open(full_path, "w", encoding="utf-8") as f:
                f.write(driver.page_source)
        except Exception as e:
            print('erro em ',url)
            print(e)

            with open(full_path, "w", encoding="utf-8") as f:
                f.write('')

    

        


  0%|          | 0/3186 [00:00<?, ?it/s]

erro em  https://koreascience.kr/article/JAKO201506566439705.pdf
HTTPSConnectionPool(host='koreascience.kr', port=443): Max retries exceeded with url: /article/JAKO201506566439705.pdf (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
erro em  https://koreascience.kr/article/JAKO200631670573650.pdf
HTTPSConnectionPool(host='koreascience.kr', port=443): Read timed out. (read timeout=20)
erro em  https://ui.adspdf.harvard.edu/pdf/2023arXiv230200569P/pdftract
Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=132.0.6834.160)
Stacktrace:
	GetHandleVerifier [0x00007FF7A21202F5+28725]
	(No symbol) [0x00007FF7A2082AE0]
	(No symbol) [0x00007FF7A1F1510A]
	(No symbol) [0x00007FF7A1F12861]
	(No symbol) [0x00007FF7A1F03559]
	(No symbol) [0x00007FF7A1F052AF]
	(No symbol) [0x00007FF7A1F0381F]
	(No symbol) [0x00007FF7A1F032ED]
	(No symbol) [0x00007FF7A1F02FBA]
	(No symbol) [0x00007FF7A1F00DD1]
	(No sy

In [74]:
record['target_url']

'https://dl.acm.org/doi/pdf/10.1145/3678698.3678702'

In [None]:
https_dl_acm_org_doi_pdf_10_1145_3290605_3300358.pdf

In [None]:
#TODO : apagar todos os arquivos vazios

In [None]:
i = 0
for record in tqdm(records):
    if i>1000 and i<1050:
        full_path_folder =  root_folder / 'articles' / record['folder']
        full_path_folder.mkdir(parents=True, exist_ok=True)
        full_path = full_path_folder / record['article_file']

        if full_path.exists() and full_path.is_file():  
            continue
        
        url = record['target_url']
        if not isinstance(url, str):
            continue 

        if record['PDF']:
            pass

        else:
            print(full_path)

    i+=1


  0%|          | 0/2338 [00:00<?, ?it/s]

google_scholar\articles\ai_monitor\javascript_void_0.html
google_scholar\articles\ai_monitor\javascript_void_0.html
google_scholar\articles\ai_monitor\javascript_void_0.html
google_scholar\articles\ai_monitor\javascript_void_0.html
google_scholar\articles\ai_logging\javascript_void_0.html
google_scholar\articles\ai_logging\javascript_void_0.html
google_scholar\articles\ai_logging\javascript_void_0.html
google_scholar\articles\ai_logging\javascript_void_0.html
google_scholar\articles\ai_logging\javascript_void_0.html
google_scholar\articles\ai_logging\javascript_void_0.html
google_scholar\articles\ai_logging\javascript_void_0.html
google_scholar\articles\ai_logging\javascript_void_0.html
google_scholar\articles\ai_logging\javascript_void_0.html
google_scholar\articles\ai_logging\javascript_void_0.html
google_scholar\articles\model_observability\https_agupubs_onlinelibrary_wiley_com_doi_pdf_10_1029_WR019i001p00260.html
google_scholar\articles\gui_machine_learning\https_advanced_onlinelib