Skip to content

Commit

Permalink
Drafttopic: Add bootstrap code and WikiProjects parsing script
Browse files Browse the repository at this point in the history
  • Loading branch information
codez266 committed Sep 5, 2017
1 parent 81e1bef commit cfee782
Show file tree
Hide file tree
Showing 8 changed files with 340 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# drafttopic

Predicting topics for new drafts based on WikiProjects on English Wikipedia
5 changes: 5 additions & 0 deletions drafttopic/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Expose the package metadata defined in ``drafttopic.about``."""
from .about import (__author__, __author_email__, __description__, __name__,
                    __url__, __version__)

# ``__all__`` must contain attribute *names* (strings).  Listing the values
# themselves (a version string, lists of authors, ...) makes
# ``from drafttopic import *`` fail, because the import machinery looks each
# entry up as an attribute of this module.
# NOTE(review): exporting "__name__" means a star-import rebinds the
# importer's own ``__name__``; kept because the original list included it.
__all__ = ["__name__", "__version__", "__author__", "__author_email__",
           "__description__", "__url__"]
8 changes: 8 additions & 0 deletions drafttopic/about.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Package metadata.  setup.py executes this file to obtain these values and
# drafttopic/__init__.py re-exports most of them.
__name__ = "drafttopic"
__version__ = "0.4.2"
__license__ = "MIT"
__url__ = "https://github.com/wiki-ai/drafttopic"
__description__ = (
    "A library for automatic detection of topics of new drafts on Wikipedia"
    " based on WikiProjects."
)
__author__ = ["Aaron Halfaker", "Sumit Asthana"]
__author_email__ = ["ahalfaker@wikimedia.org", "asthana.sumit23@gmail.com"]
42 changes: 42 additions & 0 deletions drafttopic/drafttopic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
This script provides access to a set of utilities for extracting features and
building draft topic predictors.
Usage:
drafttopic (-h | --help)
drafttopic <utility> [-h | --help]
Options:
-h | --help Prints this documentation
<utility> The name of the utility to run
"""
import sys
import traceback
from importlib import import_module


# Short usage banner; main() writes it to stderr when no utility is named or
# when the first argument is an unrecognised option.
USAGE = (
    "Usage:\n"
    "drafttopic (-h | --help)\n"
    "drafttopic <utility> [-h | --help]\n"
)


def main():
    """Dispatch the command line to ``drafttopic.utilities.<utility>``.

    Reads ``sys.argv``.  With no argument, or with a first argument that
    starts with ``-`` but is not a help flag, the short usage banner is
    written to stderr.  With ``-h``/``--help`` the module documentation is
    written to stderr.  In each of those cases, and when the utility module
    cannot be imported, the process exits with status 1.  Otherwise the
    utility's ``main`` is called with the remaining arguments.
    """
    cli_args = sys.argv[1:]

    # Nothing to dispatch on.
    if not cli_args:
        sys.stderr.write(USAGE)
        sys.exit(1)

    utility = cli_args[0]
    if utility in ("-h", "--help"):
        sys.stderr.write(__doc__ + "\n")
        sys.exit(1)
    if utility.startswith("-"):
        # An option where a utility name was expected.
        sys.stderr.write(USAGE)
        sys.exit(1)

    try:
        module = import_module(".utilities." + utility, package="drafttopic")
    except ImportError:
        # Show the full traceback: the failure may come from an import
        # inside the utility rather than from a misspelled name.
        sys.stderr.write(traceback.format_exc())
        sys.stderr.write("Could not load utility {0}.\n".format(utility))
        sys.exit(1)

    module.main(cli_args[1:])
1 change: 1 addition & 0 deletions drafttopic/utilities/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

235 changes: 235 additions & 0 deletions drafttopic/utilities/fetch_wikiprojects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
"""
Generates a machine readable WikiProjects directory as:
{
'culture': {
'name': 'Culture',
'url':
'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Council/Directory/Culture',
'root_url': <root_url>
'index': <index>
'topics': {
'arts': {'name':..., 'url': culture_url+'#arts',
topics:{
'Architecture': {'name':
'Wikipedia:WikiProject_Architecture','url':...}
}
}
}
}
}
Here:
* root_url: Url of page from which this entry was parsed
* index: index of the section to which this entry belongs
* name: name of entry
None of the fields mentioned above are present in a leaf entry, which
describes an actual WikiProject and has exactly three fields:
name, shortname, active
Usage:
fetch_wikiprojects [--output=<path>] [--debug]
Options:
--output=<path>  Path to a file to write output to
[default: <stdout>]
--debug Print debug logging
"""
import mwapi
import json
import re
import logging
import docopt
import pdb
import sys

def main(argv=None):
    """Command-line entry point for the ``fetch_wikiprojects`` utility.

    Parses the options described in the module docstring with docopt,
    configures logging (DEBUG when ``--debug`` is given, WARNING otherwise)
    and writes the parsed directory either to stdout or to ``--output``.

    Args:
        argv: Argument list handed to ``docopt.docopt``.
    """
    args = docopt.docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    if args['--output'] == "<stdout>":
        run(sys.stdout)
    else:
        # The context manager flushes and closes the file even when run()
        # raises; the handle used to be leaked.
        with open(args['--output'], "w") as output_f:
            run(output_f)

# Title of the top-level WikiProject directory page.
WPDPage = 'Wikipedia:WikiProject_Council/Directory'

# Section heading that links to a directory sub-page:
# group 1 is the sub-page name, group 2 the link label.
WPMainHeadingRegex = (
    r'\[\[Wikipedia:WikiProject Council/Directory/([A-Za-z_, ]+)\|([A-Za-z_, ]+)\]\]='
)

# "See the full listing ..." pointer to another directory sub-page (group 1).
WPListingRegex = (
    r'See the full listing \[\[Wikipedia:WikiProject Council/Directory/([A-Za-z_,/ ]+)'
)

# Links to directory sub-pages, with and without a second path component.
# NOTE(review): the second group of WPMainLinksRegex1 matches a single
# character only (no '+'); neither pattern is referenced by the code visible
# in this file, so it is left untouched -- confirm the intent.
WPMainLinksRegex1 = (
    r'\[\[Wikipedia:WikiProject_Council/Directory/([A-Za-z_]+)/([A-Za-z_])\|([A-Za-z]+)\]\]'
)
WPMainLinksRegex2 = (
    r'\[\[Wikipedia:WikiProject_Council/Directory/([A-Za-z_]+)\|([A-Za-z]+)\]\]'
)

# Any line that carries a wikitext heading marker ("==" or more).
WPSectionNextHeadingRegex = r'(.+)[=]{2,}'

# One WikiProject template call.  Groups: project page, shortname, active
# flag, and the remaining template parameters.
# Every fragment is a raw string: the previous mix of raw and non-raw pieces
# relied on the invalid escape sequence "\|", which newer Python versions
# warn about.  The compiled pattern matches exactly the same text.
WPSectionRegex = (
    r'{{Wikipedia:WikiProject Council/Directory/WikiProject\n'
    r'\|project = ([a-zA-Z_: ]+)\n'
    r'\|shortname = ([a-zA-Z ]+)\n'
    r'\|active = (yes|no)\n([^}]*)}}'
)

# To check listing in other wikiprojects
WPSectionRegexListed = r'listed-in = ([A-Za-z#/:_ ]+)'

def run(output):
    """Parse the WikiProjects directory and write it to *output* as JSON.

    Args:
        output: Object with a ``write`` method that accepts text; it
            receives the directory serialised with a four-space indent.
    """
    directory = WikiProjectsParser(
        WPDPage, logging.getLogger(__name__)).parseWpDirectory()
    serialized = json.dumps(directory, indent=4)
    output.write(serialized)

class WikiProjectsParser:
    """Builds a nested dictionary of WikiProjects from the on-wiki directory.

    Section entries are dictionaries with the keys ``name``, ``index``,
    ``root_url`` and ``topics``; ``topics`` maps child section titles or
    WikiProject shortnames to further entries.  Leaf WikiProject entries
    carry ``name``, ``shortname`` and ``active``.
    """

    def __init__(self, WPDPage, logger):
        """
        Args:
            WPDPage: Title of the root directory page.
            logger: Logger used for progress messages.
        """
        self.rootDir = WPDPage
        self.logger = logger
        self.session = mwapi.Session('https://en.wikipedia.org',
                                     user_agent='WP-dev')

    def parseWpDirectory(self):
        """
        Parses the top level WikiProjects directory
        Entry point for WikiProjects parsing

        Returns:
            dict mapping each processed section heading to its entry.
        """
        dirname = self.rootDir
        self.logger.info("Starting WikiProjects parsing")
        wp = {}
        projectsStarted = False
        for sec in self.getSections(self.session, dirname):
            # Level-1 sections are not project sections: skip the leading
            # ones and stop at the first one that follows the projects.
            if sec['toclevel'] == 1:
                if projectsStarted:
                    break
                continue
            projectsStarted = True
            name = sec['line'].replace('&nbsp;', '')
            # NOTE(review): the key keeps the raw heading while 'name' has
            # '&nbsp;' removed -- confirm whether the key should match.
            entry = {'name': name,
                     'root_url': sec['fromtitle'],
                     'index': sec['index']}
            wp[sec['line']] = entry
            # Get entries in this section
            self.logger.info("Fetching entries for section:{}".format(name))
            section = self.getSectionText(self.session, dirname, sec['index'])
            mainHeading = re.search(WPMainHeadingRegex, section)
            if mainHeading:
                # Build the sub-page title from this parser's own root
                # rather than from the module-level constant.
                entry['url'] = dirname + '/' + mainHeading.group(1)
                entry['topics'] = self.getSubCategories(self.session,
                                                        entry['url'])

        self.logger.info("Ended WikiProjects parsing")
        return wp

    def getSubCategories(self, session, page):
        """
        Parses a one level down-sub directory page, looking for sections
        and table listings

        Every section becomes an entry nested under the closest preceding
        section with a smaller ``toclevel``.  Sections at the same level are
        registered as siblings, and a drop of several levels unwinds the
        whole way; previously siblings were never registered and a drop
        always unwound exactly one level.

        Returns:
            dict mapping the page's outermost section titles to their
            entries; empty when the page has no sections.
        """
        self.logger.info("Starting subdirectory {} parsing".format(page))
        sections = self.getSections(session, page)
        root = {'topics': {}}
        if not sections:
            return root['topics']

        # Depth is measured relative to the first section of the page.
        baseLevel = sections[0]['toclevel']
        titlestack = []
        for sec in sections:
            # Keep only the ancestors of this section on the stack.
            depth = max(sec['toclevel'] - baseLevel, 0)
            del titlestack[depth:]

            parent = root
            for title in titlestack:
                parent = parent['topics'][title]
            parent['topics'][sec['line']] = {'name': sec['line'],
                                             'index': sec['index'],
                                             'root_url': sec['fromtitle'],
                                             'topics': {}}
            titlestack.append(sec['line'])

            introProjects = self.getSectionIntro(session, page, sec['index'])
            if introProjects:
                self.setWikiProjects(root, titlestack, introProjects)
        return root['topics']

    def setWikiProjects(self, wp, titlestack, projects, key=None):
        """
        Merges ``projects`` into the entry reached by following
        ``titlestack`` through the nested ``topics`` of ``wp``

        Without ``key`` the entry's ``topics`` is replaced by the merge of
        its current content and ``projects``.  With ``key`` that merged
        mapping is stored under ``topics[key]`` instead.
        """
        tmpwp = wp
        for title in titlestack:
            tmpwp = tmpwp['topics'][title]
        merged = {**tmpwp['topics'], **projects}
        if key:
            # NOTE(review): this nests a copy of all sibling topics under
            # `key`; no caller visible here passes `key` -- confirm intent.
            tmpwp['topics'][key] = merged
        else:
            tmpwp['topics'] = merged

    def getSectionIntro(self, session, page, index):
        """
        Only gets wikiprojects in intro part of sections, or if this is the leaf
        section, subsections handled recursively

        Returns:
            dict of WikiProjects found before the first sub-heading.  When
            there are none and the text points to a full listing on another
            sub-page, the parsed content of that sub-page is returned.
        """
        lines = self.getSectionText(session, page, index).split('\n')
        # remove first heading
        wikitext = '\n'.join(lines[1:])
        nextHeading = re.search(WPSectionNextHeadingRegex, wikitext,
                                re.MULTILINE)
        if nextHeading:
            # Keep only the text that precedes the first sub-heading.
            wikitext = wikitext[:nextHeading.start()]
        wp = self.getWpSection(wikitext)
        if not wp:
            # Try to match a 'See full listing here' entry
            listing = re.search(WPListingRegex, wikitext)
            if listing:
                wp = self.getSubCategories(
                    session, self.rootDir + '/' + listing.group(1))
        return wp

    def getSectionText(self, session, page, section):
        """
        Takes an api session, a page title and a section index and returns
        the wikitext of that section
        """
        self.logger.info(
            "Fetching section {} from page {}".format(section, page))
        response = session.get(action='parse', page=page, prop='wikitext',
                               section=section)
        return response['parse']['wikitext']['*']

    def getSections(self, session, page):
        """
        Takes an api session and a page title and returns the sections on a page
        """
        self.logger.info("Fetching sections of {}".format(page))
        response = session.get(action='parse', page=page, prop='sections')
        return response['parse']['sections']

    def getWpSection(self, wikitext):
        """
        Takes a WikiProject section, and returns individual WikiProjects

        Returns:
            dict keyed by shortname; each value has ``name``, ``shortname``
            and ``active``.
        """
        wp = {}
        for project, shortname, active, remaining in re.findall(
                WPSectionRegex, wikitext):
            # Listed somewhere else, so skip
            if re.search(WPSectionRegexListed, remaining):
                continue
            wp[shortname] = {'name': project, 'shortname': shortname,
                             'active': active}
        return wp

44 changes: 44 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os

from setuptools import find_packages, setup

# Execute about.py in this namespace so that the metadata names used in the
# setup() call below (__version__, __author__, ...) are defined without
# importing the package itself.
about_path = os.path.join(os.path.dirname(__file__), "drafttopic/about.py")
# Read through a context manager so the file handle is closed promptly.
with open(about_path) as about_file:
    exec(compile(about_file.read(), about_path, "exec"))


def read(fname):
    """Return the text of *fname*, resolved relative to this file's directory.

    The file is read inside a ``with`` block so the handle is closed instead
    of being left to the garbage collector.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    with open(path) as source:
        return source.read()


def requirements(fname):
    """Yield the requirement specifiers listed in *fname*.

    *fname* is resolved relative to this file's directory.  Each line is
    stripped of surrounding whitespace; blank lines and ``#`` comment lines
    are skipped because they are not requirement specifiers.  The file is
    closed once the generator is exhausted or closed.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    with open(path) as req_file:
        for line in req_file:
            requirement = line.strip()
            if requirement and not requirement.startswith("#"):
                yield requirement

# The metadata names below are defined by executing drafttopic/about.py
# above, hence the ``noqa`` markers.
setup(
    name=__name__,  # noqa
    version=__version__,  # noqa
    # about.py stores authors and e-mail addresses as lists, while the
    # package metadata fields are single strings; join them so the generated
    # metadata does not contain a Python list repr.
    author=", ".join(__author__),  # noqa
    author_email=", ".join(__author_email__),  # noqa
    description=__description__,  # noqa
    url=__url__,  # noqa
    license=__license__,  # noqa
    packages=find_packages(),
    entry_points={
        'console_scripts': [
            'drafttopic=drafttopic.drafttopic:main'
        ],
    },
    long_description=read('README.md'),
    # Materialise the generator so setup() receives a concrete list.
    install_requires=list(requirements('requirements.txt')),
    classifiers=[
        "Development Status :: 4 - Beta",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Utilities",
        "Topic :: Scientific/Engineering"
    ],
)
4 changes: 4 additions & 0 deletions utility
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env python
"""Development launcher: runs the drafttopic command-line dispatcher."""
from drafttopic import drafttopic

# Guard the entry point so the dispatcher only runs when this file is
# executed as a script, not if it is ever imported or loaded by a tool.
if __name__ == "__main__":
    drafttopic.main()

0 comments on commit cfee782

Please sign in to comment.