In [1]:
%%time
from operator import itemgetter
import networkx as nx
from mediawiki import MediaWiki
import matplotlib.pyplot as plt
# Shared MediaWiki API client; the crawl loop below fetches each article
# through wikipedia.page(...).
# NOTE(review): built with the library's default settings — presumably these
# point at English Wikipedia; confirm against the pymediawiki docs.
wikipedia = MediaWiki()


# --- Crawl configuration -----------------------------------------------------
# Seed article. .title() is applied because every link discovered by the crawl
# loop is normalized the same way before being compared or stored.
Topic = "Bubble Tea".title()

# Maximum crawl depth (layers of links followed from Topic). The same value is
# reused further down as the minimum node degree when filtering the graph.
degree = 2

# Titles of non-article pages (citation/identifier targets) that must not
# become graph nodes. Entries are written in .title() casing because links are
# compared against this tuple *after* link.title() has been applied.
IgnoreLinksWith = (
    "International Standard Serial Number",
    "International Standard Book Number",
    "National Diet Library",
    "International Standard Name Identifier",
    "International Standard Book Number (Identifier)",
    "Pubmed Identifier",
    "Pubmed Central",
    "Digital Object Identifier",
    "Arxiv",
    "Proc Natl Acad Sci Usa",
    "Bibcode",
    "Library Of Congress Control Number",
    "Jstor",
    "Doi (Identifier)",
    "Isbn (Identifier)",
    "Pmid (Identifier)",
    "Arxiv (Identifier)",
    "Bibcode (Identifier)",
    "Issn (Identifier)",
    # Was "hdl (identifier)", which could never equal a title-cased link.
    "Hdl (Identifier)",
)

# --- Crawl state -------------------------------------------------------------
# FIFO queue of (layer, title) pairs still to be fetched, seeded with the topic.
ToDoList = [(0, Topic)]
# Titles that have ever been queued. Must be a one-element set holding the
# title itself: set(Topic) would iterate the string and store its characters.
ToDoSet = {Topic}
# Titles that have already been taken off the queue and processed.
FinishedSet = set()
# Crawl cursor: depth and title of the entry currently at the head of the queue.
layer, page = ToDoList[0]
# Directed link graph; the crawl loop adds an edge page -> link for every
# accepted link found on a page.
g = nx.DiGraph()

Wall time: 2.78 s


In [2]:
%%time

# Breadth-first crawl: process queued pages until the page at the head of the
# queue lies at the depth limit (`degree`) or the queue runs dry.
while layer < degree:
    # Take the current page off the front of the queue and record it as
    # finished so later links back to it are not queued again.
    del ToDoList[0]
    FinishedSet.add(page)

    # Progress log; the very first line printed should be the Topic page.
    print(layer, page)

    # Best-effort fetch: a page that cannot be loaded is reported and skipped.
    # `except Exception` (rather than a bare except) keeps Ctrl-C / kernel
    # interrupt working during a long crawl.
    try:
        wiki = wikipedia.page(page)
    except Exception:
        # Report the page that actually failed *before* advancing the cursor.
        print(page, "did not work")
        if not ToDoList:
            break  # nothing left to crawl
        layer, page = ToDoList[0]
        continue

    for link in wiki.links:
        # Normalize casing so titles compare consistently with Topic and
        # IgnoreLinksWith. (str.title() also upper-cases the letter after an
        # apostrophe, e.g. "Builder'S Tea"; kept as-is for consistency.)
        link = link.title()
        # Skip identifier/citation pages and "List Of ..." index pages.
        if link not in IgnoreLinksWith and not link.startswith("List Of") and "(Identifier)" not in link:
            # Queue each new title exactly once, one layer deeper.
            if link not in ToDoSet and link not in FinishedSet:
                ToDoList.append((layer + 1, link))
                ToDoSet.add(link)
            # Record the link even when the target was already known.
            g.add_edge(page, link)

    # Advance to the next queued page; stop cleanly if the queue is exhausted
    # instead of raising IndexError on ToDoList[0].
    if not ToDoList:
        break
    layer, page = ToDoList[0]

0 Bubble Tea
1 2021 Suez Canal Obstruction
1 A-Gei
1 Adzuki Bean
1 Agriculture In Taiwan
1 Aiyu Jelly
1 Al Jazeera
1 Albuquerque
1 Almond Milk
1 Aloe
1 Aloe Vera
1 American Tea Culture
1 André Chiang
1 Anji Bai Cha
1 Aquaculture In Taiwan
1 Arabic Tea
1 Aracha
1 Arcadia, California
1 Argentine Tea Culture
1 Arnold Palmer (Drink)
1 Asian Boss
1 Assam Tea
1 Azerbaijani Tea Culture
1 Azuki Bean
1 Ba-Wan
1 Bai Jiguan Tea
1 Baihao Yinzhen
1 Baimao Hou
1 Baimudan Tea
1 Bakkwa
1 Ban Tian Yao Tea
1 Bancha
1 Barista
1 Beer In Taiwan
1 Bento
1 Biluochun
1 Black Bean Paste
1 Black Tea
1 Boba Tea Company
1 Boba Liberal
1 Brazilian Tea Culture
1 Bu Zhi Chun Tea
1 Builder'S Tea
1 Burmese Milk Tea
1 Butter Tea
1 Caffeine
1 Camellia Japonica
1 Camellia Sasanqua
1 Camellia Sinensis
1 Camellia Taliensis
1 Cannabis Tea
1 Cassava
1 Catechin
1 Cellophane
1 Cendol
1 Ceylon Tea
1 Cha Chaan Teng
1 Chamomile
1 Chashitsu
1 Chatime
1 Chhau-A-Koe
1 Chifir
1 Chinese Language
1 Chinese Sausage
1 Chinese Tea
1 Chine

In [3]:
%%time
# Drop edges that point from a page to itself.
g.remove_edges_from(nx.selfloop_edges(g))


def _contract_pairs(graph, pairs):
    """Merge each (keep, drop) pair of nodes in `graph`, in place.

    Parameters:
        graph: the networkx graph to modify.
        pairs: iterable of (keep, drop) node pairs; `drop` is folded into
            `keep` and removed from the graph.

    Returns:
        The same `graph` object, after all contractions.

    Pairs whose nodes are no longer both present (because an earlier
    contraction already removed one of them) are skipped instead of raising.
    copy=False avoids duplicating the whole graph once per pair.
    """
    for keep, drop in pairs:
        if keep not in graph or drop not in graph:
            continue
        nx.contracted_nodes(graph, keep, drop, self_loops=False, copy=False)
    return graph


# Pass 1: treat a title and the same title with a trailing "s" as one page,
# e.g. "Apple" and "Apples".
duplicates = [(node, node + "s")
              for node in g if node + "s" in g
             ]
g = _contract_pairs(g, duplicates)

print(duplicates)

# Pass 2: treat "x-y" and "x y" as the same page; the hyphenated title is kept.
duplicates = [(node, node.replace("-", " "))
              for node in g
              if "-" in node and node.replace("-", " ") in g]
print(duplicates)

g = _contract_pairs(g, duplicates)

# nx.contracted_nodes stores a dictionary under a node/edge attribute called
# "contraction". Per the original note, GraphML does not support dictionary
# attributes, so the attribute is overwritten with a plain 0 on every node and
# edge before the graph is exported.
nx.set_node_attributes(g, 0,"contraction")
nx.set_edge_attributes(g, 0,"contraction")

[('Azuki Bean', 'Azuki Beans'), ('Catechin', 'Catechins'), ('Drink', 'Drinks'), ('Herbal Tea', 'Herbal Teas'), ('Mung Bean', 'Mung Beans'), ('Shopping Mall', 'Shopping Malls'), ('Spring Roll', 'Spring Rolls'), ('Tea Bag', 'Tea Bags'), ('Mobile App', 'Mobile Apps'), ('Carrot', 'Carrots'), ('B Vitamin', 'B Vitamins'), ('Banana', 'Bananas'), ('Bean', 'Beans'), ('Carbohydrate', 'Carbohydrates'), ('Cowpea', 'Cowpeas'), ('Cultivar', 'Cultivars'), ('Flower', 'Flowers'), ('Loanword', 'Loanwords'), ('Milligram', 'Milligrams'), ('Mooncake', 'Mooncakes'), ('Muffin', 'Muffins'), ('Sichuan Pepper', 'Sichuan Peppers'), ('Vitamin', 'Vitamins'), ('Grape', 'Grapes'), ('Pineapple', 'Pineapples'), ('Scallion', 'Scallions'), ('Vegetable', 'Vegetables'), ('Egyptian American', 'Egyptian Americans'), ('African American', 'African Americans'), ('Asian American', 'Asian Americans'), ('Christian', 'Christians'), ('Conquistador', 'Conquistadors'), ('County Seat', 'County Seats'), ('Federal Information Processing

In [4]:
%%time
# Keep only nodes whose total degree reaches the threshold. Note that the
# threshold reuses `degree`, the crawl-depth setting from the first cell.
core = [name for name, node_degree in g.degree() if node_degree >= degree]

# View of g restricted to the retained nodes and the edges between them.
gsub = g.subgraph(core)

# Report the size of the filtered graph.
node_count = len(gsub)
edge_count = gsub.number_of_edges()
print("{} nodes, {} edges".format(node_count, edge_count))

# Export in GEXF format so the graph can be opened in Gephi.
nx.write_gexf(gsub, "wikipedia.gexf")

16297 nodes, 112879 edges
Wall time: 3.94 s
