In [1]:
import pandas as pd 
import tensorflow
from bs4 import BeautifulSoup
import re
import timeit
import requests
import wikipedia

In [18]:
def _toc_entry_count(tocNumber):
    """ Convert the number of the last table-of-contents entry into a heading count.

        The number is read as text ("39" or "27.58") and its integer components are added,
        e.g. "27.58" -> 27 + 58 = 85. Working on the text instead of a float keeps trailing
        zeros ("27.50" -> 77, not 32) and multi-digit sub-numbers intact.

        Args:
            tocNumber: the leading token of the last table-of-contents entry

        Returns:
            the number of headings to read, as an int

        Raises:
            ValueError: if the token is not of the form "N" or "N.M"
    """
    parts = tocNumber.strip().split(".")
    if len(parts) > 2:
        raise ValueError("unexpected table-of-contents number: %r" % tocNumber)
    return sum(int(part) for part in parts)


def crawl_year(year, yearContent, df, timeout=30):
    """ Crawl the different years of the wikipedia's archived deletion discussions page and store the content
        in a Data Frame.

        Walks the siblings that follow the year heading: an <h3> sets the current month, a <ul> lists
        the day pages of that month. Every day page is fetched and each article heading on it is handed
        to crawl_article. Day pages and articles that cannot be processed are skipped with a printed
        notice so that one bad page does not abort the whole crawl.

        Args:
            year: the year in which the archived articles were flagged for deletion
            yearContent: the html content containing the year (a h2 tag)
            df: the data frame where the data are stored in the format year | month | title | Id | Gender
            timeout: seconds to wait for each day page request before giving up on it

        Returns:
            a data frame
    """
    # No month is known until the first <h3> has been seen.
    month = None
    for monthContent in yearContent.find_next_siblings(limit=24):
        if monthContent.name == "h2":
            # Crawl only this year. If the year doesn't yet have 12 months(e.g. 2019), don't go for more.
            break
        elif monthContent.name == "h3":
            month = monthContent.get_text().split(str(year)+" ")[1].split("[")[0]
            print("Month",month)
        elif monthContent.name == "ul":
            if month is None:
                # A list that precedes the first month heading cannot be attributed to a month.
                continue
            # Go through the list of days (only anchors that actually carry a link)
            for dayRelative in monthContent.find_all("a", href=True):
                print(dayRelative['href'])
                # The href already starts with "/", so the host must not end with one.
                dayPageLink = "https://en.wikipedia.org"+dayRelative['href']
                try:
                    dayPage = requests.get(dayPageLink, timeout=timeout)
                except requests.exceptions.RequestException as e:
                    print("Skipping day page, request failed:", e)
                    continue
                if dayPage.status_code != 200:
                    continue
                soupPage = BeautifulSoup(dayPage.content, "html.parser")

                # Get the number of articles in a particular day from the last entry of the
                # third <ul> of the page.
                try:
                    tocNumber = soupPage.find_all("ul")[2].find_all("li")[-1].get_text().split(" ")[0]
                    articlesLength = _toc_entry_count(tocNumber)
                except (IndexError, ValueError) as e:
                    print("Skipping day page, cannot read its article count:", e)
                    continue
                print("Articles to be crawled in this page: ",articlesLength)

                # Every article is located in an <h3> tag
                for article in soupPage.find_all("h3", limit = articlesLength):
                    articleLink = article.find("a")
                    if articleLink is None:
                        continue
                    linkTitle = articleLink.get("title")
                    articleHref = articleLink.get("href")
                    if linkTitle is None or articleHref is None:
                        continue
                    # Don't read deleted articles
                    if "(page does" in linkTitle:
                        continue
                    articleTitle = article.get_text().split("[")[0]
                    pageLink = "https://en.wikipedia.org"+articleHref
                    try:
                        df = crawl_article(year, month, articleTitle, pageLink, df)
                    except Exception as e:
                        # Best effort: a single article that cannot be resolved must not stop the crawl.
                        print("Skipping article", articleTitle, "-", e)
                        continue
    return df


In [19]:
def crawl_article(year, month, title, pageLink, df, timeout=30):
    """ Crawl the content of the corresponding dbpedia page of a wikipedia article in order to get its id and gender.
        Store an entry in the dataframe.

        The data frame is returned unchanged when the link has no "/wiki/" part, the request fails,
        the dbpedia page has no foaf:gender property (not a person) or it has no dbo:wikiPageID value.

        Args:
            year: the year in which the current article was flagged for deletion
            month: the month in which the current article was flagged for deletion
            title: the title of the article flagged for deletion
            pageLink: the wikipedia link of the article
            df: the data frame where the data are stored in the format year | month | title | Id | gender
            timeout: seconds to wait for the dbpedia request before giving up on the article

        Returns:
            A data frame
    """
    # Everything after the first "/wiki/" is the resource name used by dbpedia.
    linkParts = pageLink.split("/wiki/", 1)
    if len(linkParts) < 2:
        # Not an article link; there is no dbpedia page to look up.
        return df
    url = "http://dbpedia.org/page/"+linkParts[1]
    try:
        dbpediaPage = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        return df
    soup = BeautifulSoup(dbpediaPage.content, "html.parser")
    wikiIdTag = soup.find("span", {"property":"dbo:wikiPageID"})
    genderTag = soup.find("span", {"property":"foaf:gender"})
    if genderTag is None:
        # Not a person
        return df
    if wikiIdTag is None or not wikiIdTag.contents or not genderTag.contents:
        # A row without an id or gender value cannot be built.
        return df
    # NOTE(review): the column key is spelled "Tile" (probably meant "Title"); it is kept
    # as is because previously saved data uses this name.
    dic = {"Year":year, "Month":month, "Tile":title, "Id": wikiIdTag.contents[0]
           , "Gender":genderTag.contents[0]}
    df_temp = pd.DataFrame(data=dic, index=[0])
    if df.empty:
        return df_temp
    return pd.concat([df, df_temp])

In [20]:
startTime = timeit.default_timer()

seedURL = "https://en.wikipedia.org/wiki/Wikipedia:Archived_deletion_discussions#Deletion_discussions/"
# A timeout keeps the cell from hanging on a stalled connection; raise_for_status()
# stops here with a clear HTTP error instead of failing later on an empty heading list.
archivePage = requests.get(seedURL, timeout=30)
archivePage.raise_for_status()
soup = BeautifulSoup(archivePage.content, "html.parser")

# Collect every <h2> heading of the archive page together with its label
years = []
yearContents = []
for yearContent in soup.find_all("h2"):
    year = yearContent.get_text().split("[")[0]
    if year == "Contents":
        continue
    years.append(year)
    yearContents.append(yearContent)

# Select the year by its label. A fixed position in the list changes whenever the
# archive page gains or loses a heading.
TARGET_YEAR = "2014"
matchingIndexes = [i for i, label in enumerate(years) if label.strip() == TARGET_YEAR]
if not matchingIndexes:
    raise ValueError("Year %s not found among the headings: %r" % (TARGET_YEAR, years))
yearIndex = matchingIndexes[0]

df = pd.DataFrame()
df = crawl_year(years[yearIndex], yearContents[yearIndex], df)

elapsedTime = timeit.default_timer() - startTime
print("Crawl time ", elapsedTime)

Month December
/wiki/Wikipedia:Articles_for_deletion/Log/2014_December_31
Articles to be crawled in this page:  39.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_December_30
Articles to be crawled in this page:  85.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_December_29
Articles to be crawled in this page:  37.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_December_28
Articles to be crawled in this page:  41.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_December_27
Articles to be crawled in this page:  65.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_December_26
Articles to be crawled in this page:  49.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_December_25
Articles to be crawled in this page:  59.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_December_24
Articles to be crawled in this page:  34.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_December_23
Articles to be crawled in this page:  96.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_December_22
Articles

Articles to be crawled in this page:  24.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_October_11
Articles to be crawled in this page:  51.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_October_10
Articles to be crawled in this page:  57.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_October_9
Articles to be crawled in this page:  63.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_October_8
Articles to be crawled in this page:  50.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_October_7
Articles to be crawled in this page:  57.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_October_6
Articles to be crawled in this page:  48.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_October_5
Articles to be crawled in this page:  56.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_October_4
Articles to be crawled in this page:  63.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_October_3
Articles to be crawled in this page:  46.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_October_

Articles to be crawled in this page:  98.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_July_22
Articles to be crawled in this page:  52.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_July_21
Articles to be crawled in this page:  40.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_July_20
Articles to be crawled in this page:  78.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_July_19
Articles to be crawled in this page:  74.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_July_18
Articles to be crawled in this page:  51.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_July_17
Articles to be crawled in this page:  57.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_July_16
Articles to be crawled in this page:  67.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_July_15
Articles to be crawled in this page:  69.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_July_14
Articles to be crawled in this page:  53.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_July_13
Articles to be crawl

Articles to be crawled in this page:  48.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_April_29
Articles to be crawled in this page:  52.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_April_28
Articles to be crawled in this page:  37.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_April_27
Articles to be crawled in this page:  25.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_April_26
Articles to be crawled in this page:  32.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_April_25
Articles to be crawled in this page:  41.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_April_24
Articles to be crawled in this page:  50.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_April_23
Articles to be crawled in this page:  72.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_April_22
Articles to be crawled in this page:  81.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_April_21
Articles to be crawled in this page:  52.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_April_20
Articles t

/wiki/Wikipedia:Articles_for_deletion/Log/2014_February_6
Articles to be crawled in this page:  103.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_February_5
Articles to be crawled in this page:  41.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_February_4
Articles to be crawled in this page:  45.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_February_3
Articles to be crawled in this page:  60.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_February_2
Articles to be crawled in this page:  44.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_February_1
Articles to be crawled in this page:  52.0
Month January
/wiki/Wikipedia:Articles_for_deletion/Log/2014_January_31
Articles to be crawled in this page:  46.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_January_30
Articles to be crawled in this page:  57.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_January_29
Articles to be crawled in this page:  39.0
/wiki/Wikipedia:Articles_for_deletion/Log/2014_January_28
Articles to be cra

In [22]:
# Show the data frame produced by the crawl.
df

Unnamed: 0,Year,Month,Tile,Id,Gender
0,2014,December,Dr. Carl Marci,44731082,male
0,2014,December,Leelah Alcorn,44889641,female
0,2014,December,Kogan Plan,44576988,male
0,2014,December,Nguyễn Công Phượng,45421322,male
0,2014,December,Andrew Schwab,44971775,male
0,2014,December,Milancy Khongstia,44871200,male
0,2014,December,Brian Solis,17912166,male
0,2014,December,Lele Pons,43774211,female
0,2014,December,Sennacherib's campaign in Judah,28334,male
0,2014,December,Charles Lucas Anthony,20752624,male


In [23]:
# Write the crawl result next to the notebook (path relative to the working directory)
# so the notebook does not depend on one user's local folder layout. The row index is
# not written. to_csv returns None when a path is given, so export_csv is None.
export_csv = df.to_csv('2014.csv', index=False, header=True)

In [24]:
# Load '2014.csv' from the current working directory into a second frame
# (presumably the file written by the export cell, read back as a round-trip check).
df2 = pd.read_csv('2014.csv')

In [25]:
# Print the row counts of the frame read from disk and of the in-memory frame
# so they can be compared.
print(len(df2))
print(len(df))

3231
3231


In [15]:
# Manual check of the dbpedia lookup for a single article link.
pageLink = "/wiki/Patriotic_Nigras"
url = "http://dbpedia.org/page/"+pageLink.split("/wiki/")[1]
try:
    dbpediaPage = requests.get(url)
except requests.exceptions.RequestException as e:
    print(e)
else:
    # Only parse when the request succeeded; otherwise dbpediaPage is undefined
    # (or left over from an earlier run of this cell).
    soup = BeautifulSoup(dbpediaPage.content, "html.parser")
    wikiIdTag = soup.find("span", {"property":"dbo:wikiPageID"})
    genderTag = soup.find("span", {"property":"foaf:gender"})
    print("Gender attr:",genderTag)
    print("WikiId attr:",wikiIdTag)
    if genderTag is None:
        print("Not a person")

Gender attr: None
WikiId attr: <span property="dbo:wikiPageID" xmlns:dbo="http://dbpedia.org/ontology/">18799258</span>
Not a person


In [13]:
# Print the current value of the global `url` next to the expected dbpedia address
# so the two can be compared by eye.
print(url)
print("http://dbpedia.org/page/Carl_Marci")

http://dbpedia.org/page/Carl_Marci
http://dbpedia.org/page/Carl_Marci


In [12]:
# Check which part of a wiki link remains after splitting on "/wiki/".
pageLink = "/wiki/Carl_Marci"
pageLink.split("/wiki/")[1]

'Carl_Marci'

In [153]:
# Scratch values for experimenting with splitting a number into its parts.
number = 27.58
# Modulo 100 leaves a value below 100 as it is, so this is not an integer part.
number_int = number % 100
# Modulo 1 keeps only the fractional part (about 0.58 here).
number_dec = number % 1

In [154]:
# Turn the digits after the decimal point into a whole number and add it to the
# integer part: shift one digit left, and if a fraction remains shift one more.
# For 27.58 this gives 58.0 and prints 27 + 58 = 85.0.
number_dec = round((number % 1 * 10), 2)
if int(number_dec) != number_dec:
    number_dec *= 10
    
print(int(number) + number_dec)
print(number_dec)

85.0
58.0


In [149]:
# Echo the current value of `number`.
number

27.8