# In [None]:
import re
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Ask the user for the term to search for on China PubMed.
search_keyword = input("Please enter search: ")
# Lower-cased copy of the raw keyword; used to build the keyword column in Function F
# and to name the output CSV file.
final_keyword = search_keyword.lower()
# URL-encode the keyword for the query string: spaces still become '+', and characters
# such as '&', '#', '%' or '+' are percent-escaped so they cannot truncate or corrupt
# the query.
search_keyword = quote_plus(search_keyword)
# Build the search URL from the user's input; per the original note, 'sort=year'
# corresponds to choosing the "publication date" ordering on the site.
search_url = 'http://www.chinapubmed.net/search/?q=' + search_keyword + '&sort=year'

# Extraction

# In [None]:
# Current results-page number, kept as a global so findnextpage() can advance it.
current_page = 1
# Fetch the first results page. The timeout keeps the script from hanging forever on an
# unresponsive server, and raise_for_status() fails fast on an HTTP error instead of
# parsing an error page as if it were search results.
result = requests.get(search_url, timeout=30)
result.raise_for_status()
# Parse the page; the extraction functions below all read this global soup object.
soup = BeautifulSoup(result.content, 'lxml')  # use lxml, much faster than html.parser
#FUNCTION A
#THIS FUNCTION RETURNS IMPACT FACTORS ON GIVEN WEBPAGE AS A SERIES
def impact_factor():
    """Collect the impact factors shown on the current page.

    Scans every <span> of the global ``soup`` and keeps those whose text starts and
    ends with a digit, contains a '.', and can be parsed as a float.

    Returns:
        pd.Series: the parsed values as floats, in the order ``find_all`` yields them.
    """
    factors_list = []
    for span in soup.find_all('span'):
        text = span.text
        # Cheap shape check first: at least three characters, a digit at both ends and
        # a '.' somewhere in the text. The length test also protects text[0] on
        # empty strings.
        if len(text) <= 2 or not (text[0].isdigit() and text[-1].isdigit() and '.' in text):
            continue
        try:
            factors_list.append(float(text))
        except ValueError:
            # Text such as "2020.01.15" or "1,024.5" passes the shape check but is not
            # a number; skip it instead of aborting the whole scrape.
            continue
    return pd.Series(factors_list)

#FUNCTION B
#THIS FUNCTION RETURNS THE RELEVANT ORIGINAL ARTICLE URLS ON GIVEN WEBPAGE AS A SERIES
def article_links():
    """Collect the absolute article URLs linked from the current page.

    An anchor of the global ``soup`` counts as an article link when its href contains
    at least one digit and contains neither 'search' nor 'javascript'.

    Returns:
        pd.Series: de-duplicated URLs (first occurrence kept, order preserved), each
        prefixed with the site root.
    """
    final_urls = []
    for link in soup.find_all('a'):
        # Anchors without an href (e.g. named anchors) used to raise KeyError here.
        href = link.attrs.get('href')
        if not href:
            continue
        if (any(char.isdigit() for char in href)
                and 'search' not in href
                and 'javascript' not in href):
            final_urls.append('http://www.chinapubmed.net' + href)
    # dict.fromkeys drops duplicates while keeping the original order.
    return pd.Series(list(dict.fromkeys(final_urls)))

#FUNCTION C
#THIS FUNCTION RETURNS ALL ENGLISH TITLES ON GIVEN WEBPAGE AS A SERIES (ONLY WORKS FOR EXTRACTING ENGLISH TITLE)
def article_titles():
    """Collect the English article titles shown on the current page.

    Reads every <div class="paper-list-title"> of the global ``soup``. The text of each
    div must split on newlines into exactly four pieces; the third piece is kept as the
    title. Any other number of pieces raises ValueError from the unpacking.

    Returns:
        pd.Series: one title string per title div.
    """
    # Gather the raw text of every title div before any splitting takes place.
    raw_texts = [div.text for div in soup.find_all('div', class_='paper-list-title')]
    english_titles = []
    for raw in raw_texts:
        _first, _second, english, _last = raw.split('\n')
        english_titles.append(english)
    return pd.Series(english_titles)


#FUNCTION D
#THIS FUNCTION RETURNS ALL PMID VALUES ON GIVEN WEBPAGE AS A SERIES
def get_pmid():
    """Collect the PMID values of the articles linked from the current page.

    Uses the same anchor filter as article_links(): the href must contain a digit and
    must contain neither 'search' nor 'javascript'. The PMID is taken to be the href
    without its first character (presumably the leading '/' -- TODO confirm).

    Returns:
        pd.Series: de-duplicated PMID strings (first occurrence kept, order preserved).
    """
    pmids = []
    for link in soup.find_all('a'):
        # Anchors without an href (e.g. named anchors) used to raise KeyError here.
        href = link.attrs.get('href')
        if not href:
            continue
        if (any(char.isdigit() for char in href)
                and 'search' not in href
                and 'javascript' not in href):
            pmids.append(href[1:])
    # dict.fromkeys drops duplicates while keeping the original order.
    return pd.Series(list(dict.fromkeys(pmids)))

#FUNCTION E
#THIS FUNCTION RETURNS ALL JOURNAL (PERIODICAL) VALUES ON WEBPAGE AS A SERIES
def get_periodical():
    """Collect the journal names shown on the current page.

    Returns:
        pd.Series: the ``.string`` of every <span class="journal_fifi"> in the global
        ``soup``, in the order ``find_all`` yields them.
    """
    journal_spans = soup.find_all('span', class_='journal_fifi')
    return pd.Series([span.string for span in journal_spans])

#FUNCTION F
#THIS FUNCTION CREATES A COLUMN FOR THE SEARCH KEYWORD
def keyword_series(final_keyword, repeat_count=40):
    """Build a column that repeats the search keyword.

    Args:
        final_keyword: the keyword to place in every row.
        repeat_count: number of rows to produce. Defaults to 40, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        pd.Series: ``repeat_count`` copies of ``final_keyword`` on a fresh 0..n-1 index.
    """
    repeated = pd.Series(final_keyword).repeat(repeat_count)
    # repeat() copies the original index label onto every row; renumber from zero so
    # the column lines up with the other per-page series.
    return repeated.reset_index(drop=True)
    
#FUNCTION G
#THIS FUNCTION RETURNS A DATAFRAME FOR GIVEN WEBPAGE
def firstpage_dataframe(final_keyword):
    """Build the filtered table of articles for the page currently held in ``soup``.

    The per-page series from functions A-F are aligned by position into one frame,
    then only rows meeting all of the following are kept:
      * impact factor strictly greater than 3,
      * title contains ``final_keyword`` (case-insensitive, literal match),
      * no missing value in any column.

    Args:
        final_keyword: the lower-cased search keyword.

    Returns:
        pd.DataFrame: the surviving rows, with their original positional index labels.
    """
    data = {'Keyword': keyword_series(final_keyword),
            '原始url': article_links(),
            '影响因子': impact_factor(),
            '英文标题': article_titles(),
            'pmid': get_pmid(),
            '期刊': get_periodical()}
    df = pd.DataFrame(data)
    # Keep only rows whose impact factor is above 3; rows with a missing impact factor
    # compare as False and are dropped here as well.
    df = df[df['影响因子'] > 3.000]
    #THIS FILTERING WORKS ONLY IF SEARCH KEYWORD IS IN ENGLISH
    # The keyword is escaped so it is matched literally: user input such as "c++" or
    # "(il-6" is not a valid regular expression and used to raise re.error, and a '.'
    # in the keyword used to match any character.
    contains_keyword = df['英文标题'].str.contains(
        re.escape(final_keyword), flags=re.IGNORECASE, regex=True, na=False)
    df = df[contains_keyword]
    # Series of different lengths leave NaN where a page had fewer values for one
    # column; such incomplete rows are discarded.
    return df.dropna()

#BUILDS THE DATAFRAME FOR THE FIRST RESULTS PAGE AND STORES HOW MANY ROWS IT HOLDS
initial_df = firstpage_dataframe(final_keyword)
length_df = initial_df.shape[0]

#FUNCTION H
#THIS FUNCTION FINDS THE NEXT PAGE ON CHINESE PUB MED AND UPDATES current_page and search_url
def findnextpage():
    """Advance to the next results page and re-parse it.

    Updates the globals ``current_page``, ``search_url``, ``result`` and ``soup`` so
    that the extraction functions operate on the new page.
    """
    global current_page, search_url, result, soup
    current_page += 1
    # Drop any page parameter left by an earlier call before adding the new one.
    # Appending blindly produced '...&current_page=2&current_page=3...', a URL that
    # grew on every call and carried conflicting page numbers.
    base_url = search_url.split('&current_page=')[0]
    #Note: The sort by year is maintained as it is part of the base search url
    search_url = base_url + '&current_page=' + str(current_page)
    # The timeout keeps the script from hanging forever on an unresponsive server.
    result = requests.get(search_url, timeout=30)
    soup = BeautifulSoup(result.content, 'lxml')

#FUNCTION I
#THIS FUNCTION ADDS TO THE FIRST DATAFRAME AS MORE INFORMATION IS EXTRACTED
def append_dataframe():
    """Fetch the next results page and add its matching rows to ``initial_df``.

    Updates the globals ``initial_df`` and ``length_df``.

    Returns:
        pd.DataFrame: the accumulated frame (the new value of ``initial_df``).
    """
    global initial_df, length_df
    findnextpage()
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and, like append, keeps each frame's own index labels.
    initial_df = pd.concat([initial_df, firstpage_dataframe(final_keyword)])
    length_df = len(initial_df)
    return initial_df

#CONTINUES TO EXTRACT INFORMATION MATCHING CRITERIA UNTIL FIRST 40 ARTICLES HAVE BEEN FOUND
# Upper bound on the pages visited, so a search with fewer than 40 matching articles
# cannot loop forever. NOTE(review): 500 is an arbitrary safety cap -- adjust if needed.
MAX_PAGES = 500
while length_df < 40 and current_page < MAX_PAGES:
    append_dataframe()
    # A page without a single article link is taken to mean the results are exhausted
    # (assumes the site serves such a page past the last result -- TODO confirm).
    if article_links().empty:
        break

if length_df < 40:
    print("Only " + str(length_df) + " matching articles were found.")

#RESETS INDEX OF FINAL DATAFRAME AND EXPORTS AS CSV FILE
# NOTE(review): the frame can hold more than 40 rows when the last page fetched
# contributed more rows than were still needed; it is exported untrimmed, as before.
initial_df = initial_df.reset_index(drop=True)
initial_df.to_csv(final_keyword+'_first_40.csv',index=False, encoding='utf_8_sig')
print("Success!")