From cfee78254c68bbeb2b4da9fff186ed513b9673f6 Mon Sep 17 00:00:00 2001
From: Sumit Asthana
Date: Tue, 5 Sep 2017 20:41:47 +0530
Subject: [PATCH] Drafttopic: Add bootstrap code and WikiProjects parsing script

---
 README.md                                  |   1 +
 drafttopic/__init__.py                     |   7 +
 drafttopic/about.py                        |   8 +
 drafttopic/drafttopic.py                   |  42 ++++
 drafttopic/utilities/__init__.py           |   1 +
 drafttopic/utilities/fetch_wikiprojects.py | 236 +++++++++++++++++++++
 setup.py                                   |  44 ++++
 utility                                    |   4 +
 8 files changed, 343 insertions(+)
 create mode 100644 drafttopic/__init__.py
 create mode 100644 drafttopic/about.py
 create mode 100644 drafttopic/drafttopic.py
 create mode 100644 drafttopic/utilities/__init__.py
 create mode 100644 drafttopic/utilities/fetch_wikiprojects.py
 create mode 100644 setup.py
 create mode 100755 utility

diff --git a/README.md b/README.md
index 04e499e..db939ff 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,3 @@
 # drafttopic
+
 Predicting topics to new drafts based on Wikiprojects on English Wikipedia
diff --git a/drafttopic/__init__.py b/drafttopic/__init__.py
new file mode 100644
index 0000000..1789d54
--- /dev/null
+++ b/drafttopic/__init__.py
@@ -0,0 +1,7 @@
+from .about import (__author__, __author_email__, __description__, __name__,
+                    __url__, __version__)
+
+# __all__ must list attribute *names*; __name__ is left out so a star-import
+# cannot overwrite the importing module's own __name__.
+__all__ = ["__version__", "__author__", "__author_email__",
+           "__description__", "__url__"]
diff --git a/drafttopic/about.py b/drafttopic/about.py
new file mode 100644
index 0000000..a9e2512
--- /dev/null
+++ b/drafttopic/about.py
@@ -0,0 +1,8 @@
+__name__ = "drafttopic"
+__version__ = "0.4.2"
+__author__ = ["Aaron Halfaker", "Sumit Asthana"]
+__author_email__ = ["ahalfaker@wikimedia.org", "asthana.sumit23@gmail.com"]
+__description__ = "A library for automatic detection of topics of new drafts on Wikipedia" + \
+    " based on WikiProjects."
+__url__ = "https://github.com/wiki-ai/drafttopic"
+__license__ = "MIT"
diff --git a/drafttopic/drafttopic.py b/drafttopic/drafttopic.py
new file mode 100644
index 0000000..d7533f2
--- /dev/null
+++ b/drafttopic/drafttopic.py
@@ -0,0 +1,42 @@
+"""
+This script provides access to a set of utilities for extracting features and
+building draft topic predictors.
+Usage:
+    drafttopic (-h | --help)
+    drafttopic <utility> [-h | --help]
+Options:
+    -h | --help  Prints this documentation
+    <utility>    The name of the utility to run
+"""
+import sys
+import traceback
+from importlib import import_module
+
+
+USAGE = """Usage:
+    drafttopic (-h | --help)
+    drafttopic <utility> [-h | --help]\n"""
+
+
+def main():
+    """Loads the utility module named by the first argument and runs it."""
+    if len(sys.argv) < 2:
+        sys.stderr.write(USAGE)
+        sys.exit(1)
+    elif sys.argv[1] in ("-h", "--help"):
+        sys.stderr.write(__doc__ + "\n")
+        sys.exit(1)
+    elif sys.argv[1][:1] == "-":
+        sys.stderr.write(USAGE)
+        sys.exit(1)
+
+    module_name = sys.argv[1]
+    try:
+        module = import_module(".utilities." + module_name,
+                               package="drafttopic")
+    except ImportError:
+        sys.stderr.write(traceback.format_exc())
+        sys.stderr.write("Could not load utility {0}.\n".format(module_name))
+        sys.exit(1)
+
+    module.main(sys.argv[2:])
diff --git a/drafttopic/utilities/__init__.py b/drafttopic/utilities/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/drafttopic/utilities/__init__.py
@@ -0,0 +1 @@
+
diff --git a/drafttopic/utilities/fetch_wikiprojects.py b/drafttopic/utilities/fetch_wikiprojects.py
new file mode 100644
index 0000000..98a7cf8
--- /dev/null
+++ b/drafttopic/utilities/fetch_wikiprojects.py
@@ -0,0 +1,236 @@
+"""
+Generates a machine readable WikiProjects directory as:
+{
+'culture': {
+    'name': 'Culture',
+    'url':
+    'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Council/Directory/Culture',
+    'root_url': ...,
+    'index': ...,
+    'topics': {
+        'arts': {'name':..., 'url': culture_url+'#arts',
+            topics:{
+                'Architecture': {'name':
+                'Wikipedia:WikiProject_Architecture','url':...}
+            }
+        }
+    }
+}
+}
+Here:
+* root_url: Url of page from which this entry was parsed
+* index: sections index to which this entry belongs to
+* name: name of entry
+All the above mentioned fields will be absent from the base entry
+which contain actual WikiProjects name and has only three fields:
+    name, shortname, active
+
+Usage:
+    fetch_wikiprojects [--output=<path>] [--debug]
+
+Options:
+    --output=<path>  Path to a file to write output to
+                     [default: <stdout>]
+    --debug          Print debug logging
+"""
+import mwapi
+import json
+import re
+import logging
+import docopt
+import pdb
+import sys
+
+def main(argv=None):
+    """
+    Parses the command line, sets up logging and writes the directory as
+    JSON to the chosen output
+    """
+    args = docopt.docopt(__doc__, argv=argv)
+
+    logging.basicConfig(
+        level=logging.DEBUG if args['--debug'] else logging.WARNING,
+        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
+    )
+
+    if args['--output'] == "<stdout>":
+        run(sys.stdout)
+    else:
+        # Close the file even when parsing fails part-way through
+        with open(args['--output'], "w") as output_f:
+            run(output_f)
+
+WPDPage = 'Wikipedia:WikiProject_Council/Directory'
+WPMainHeadingRegex =\
+    r'\[\[Wikipedia:WikiProject Council/Directory/([A-Za-z_, ]+)\|([A-Za-z_, ]+)\]\]='
+WPListingRegex =\
+    r'See the full listing \[\[Wikipedia:WikiProject Council/Directory/([A-Za-z_,/ ]+)'
+WPMainLinksRegex1 =\
+    r'\[\[Wikipedia:WikiProject_Council/Directory/([A-Za-z_]+)/([A-Za-z_])\|([A-Za-z]+)\]\]'
+WPMainLinksRegex2 =\
+    r'\[\[Wikipedia:WikiProject_Council/Directory/([A-Za-z_]+)\|([A-Za-z]+)\]\]'
+
+WPSectionNextHeadingRegex = r'(.+)[=]{2,}'
+
+WPSectionRegex =\
+    r'{{Wikipedia:WikiProject Council/Directory/WikiProject\n'\
+    r'\|project = ([a-zA-Z_: ]+)\n'\
+    r'\|shortname = ([a-zA-Z ]+)\n'\
+    r'\|active = (yes|no)\n([^}]*)}}'
+# To check listing in other wikiprojects
+WPSectionRegexListed =\
+    r'listed-in = ([A-Za-z#/:_ ]+)'
+
+def run(output):
+    """
+    Parses the whole WikiProjects directory and writes it to output as JSON
+    """
+    logger = logging.getLogger(__name__)
+    parser = WikiProjectsParser(WPDPage, logger)
+    wps = parser.parseWpDirectory()
+    output.write(json.dumps(wps, indent=4))
+
+class WikiProjectsParser:
+    """
+    Reads the WikiProject Council directory pages through an mwapi session
+    """
+    def __init__(self, WPDPage, logger):
+        self.rootDir = WPDPage
+        self.logger = logger
+        self.session = mwapi.Session('https://en.wikipedia.org', user_agent='WP-dev')
+
+    def parseWpDirectory(self):
+        """
+        Parses the top level WikiProjects directory
+        Entry point for WikiProjects parsing
+        """
+        dirname = self.rootDir
+        self.logger.info("Starting WikiProjects parsing")
+        wp = {}
+        topSections = self.getSections(self.session, dirname)
+        projectsStarted = False
+        for sec in topSections:
+            # Ignore starting sections
+            if sec['toclevel'] == 1:
+                if projectsStarted:
+                    break
+                else:
+                    continue
+            projectsStarted = True
+            name = sec['line'].replace(' ', '')
+            wp[sec['line']] = {'name': name,
+                               'root_url': sec['fromtitle'],
+                               'index': sec['index']}
+            # Get entries in this section
+            self.logger.info("Fetching entries for section:{}".format(name))
+            section = self.getSectionText(self.session, dirname, sec['index'])
+            mainHeading = re.search(WPMainHeadingRegex, section)
+            if mainHeading:
+                wp[sec['line']]['url'] = WPDPage + '/' + mainHeading.group(1)
+                wp[sec['line']]['topics'] = self.getSubCategories(self.session,
+                        wp[sec['line']]['url'])
+
+        self.logger.info("Ended WikiProjects parsing")
+        return wp
+
+    def getSubCategories(self, session, page):
+        """
+        Parses a one level down-sub directory page, looking for sections
+        and table listings
+        Returns the nested topics of the page, keyed by section title
+        """
+        self.logger.info("Starting subdirectory {} parsing".format(page))
+        sections = self.getSections(session, page)
+        if not sections:
+            # A page without sections has nothing to nest
+            return {}
+        wpS = {'topics': {}}
+        # Titles and toc levels of the sections enclosing the current one
+        titlestack = []
+        levelstack = []
+        for sec in sections:
+            # Unwind to the parent of this section: a sibling replaces the
+            # previous entry, a jump of several levels pops all of them
+            while levelstack and levelstack[-1] >= sec['toclevel']:
+                titlestack.pop()
+                levelstack.pop()
+            tmpwp = wpS
+            for title in titlestack:
+                tmpwp = tmpwp['topics'][title]
+            tmpwp['topics'][sec['line']] = {'name': sec['line'],
+                                            'index': sec['index'],
+                                            'root_url': sec['fromtitle'],
+                                            'topics': {}}
+            titlestack.append(sec['line'])
+            levelstack.append(sec['toclevel'])
+            introProjects = self.getSectionIntro(session, page, sec['index'])
+            if introProjects:
+                self.setWikiProjects(wpS, titlestack, introProjects)
+        return wpS['topics']
+
+    def setWikiProjects(self, wp, titlestack, projects, key = None):
+        """
+        Merges projects into the 'topics' of the entry reached by following
+        titlestack down from wp; with a key, the merged dict is stored under
+        that key of 'topics' instead
+        """
+        tmpwp = wp
+        for title in titlestack:
+            tmpwp = tmpwp['topics'][title]
+        if key:
+            tmpwp['topics'][key] = {**tmpwp['topics'], **projects}
+        else:
+            tmpwp['topics'] = {**tmpwp['topics'], **projects}
+
+    def getSectionIntro(self, session, page, index):
+        """
+        Only gets wikiprojects in intro part of sections, or if this is the leaf
+        section, subsections handled recursively
+        """
+        wikitext = self.getSectionText(session, page, index).split('\n')
+        # remove first heading
+        wikitext = '\n'.join(wikitext[1:])
+        match = re.search(WPSectionNextHeadingRegex, wikitext, re.MULTILINE)
+        if match:
+            # Keep only the text before the next heading
+            wikitext = wikitext[:match.start()]
+        wp = self.getWpSection(wikitext)
+        if not wp:
+            # Try to match a 'See full listing here' entry
+            match = re.search(WPListingRegex, wikitext)
+            if match:
+                wp = self.getSubCategories(session, WPDPage + '/' + match.group(1))
+        return wp
+
+    def getSectionText(self, session, page, section):
+        """
+        Returns the wikitext of the given section of a page
+        """
+        self.logger.info("Fetching section {} from page {}".format(section, page))
+        parsed = session.get(action='parse', page=page, prop='wikitext',
+                             section=section)
+        return parsed['parse']['wikitext']['*']
+
+    def getSections(self, session, page):
+        """
+        Takes an api session and a page title and returns the sections on a page
+        """
+        self.logger.info("Fetching sections of {}".format(page))
+        sections = session.get(action='parse', page=page, prop='sections')
+        return sections['parse']['sections']
+
+    def getWpSection(self,wikitext):
+        """
+        Takes a WikiProject section, and returns individual WikiProjects
+        """
+        wp = {}
+        matches = re.findall(WPSectionRegex, wikitext)
+        for match in matches:
+            remaining = match[3]
+            listedIn = re.search(WPSectionRegexListed, remaining)
+            # Listed somewhere else, so skip
+            if listedIn:
+                continue
+            wp[match[1]] = {'name': match[0], 'shortname': match[1], 'active':
+                            match[2]}
+        return wp
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..0315f44
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,44 @@
+import os
+
+from setuptools import find_packages, setup
+
+about_path = os.path.join(os.path.dirname(__file__), "drafttopic/about.py")
+exec(compile(open(about_path).read(), about_path, "exec"))
+
+
+def read(fname):
+    return open(os.path.join(os.path.dirname(__file__), fname)).read()
+
+
+def requirements(fname):
+    for line in open(os.path.join(os.path.dirname(__file__), fname)):
+        yield line.strip()
+
+setup(
+    name=__name__,  # noqa
+    version=__version__,  # noqa
+    author=__author__,  # noqa
+    author_email=__author_email__,  # noqa
+    description=__description__,  # noqa
+    url=__url__,  # noqa
+    license=__license__,  # noqa
+    packages=find_packages(),
+    entry_points={
+        'console_scripts': [
+            'drafttopic=drafttopic.drafttopic:main'
+        ],
+    },
+    long_description=read('README.md'),
+    install_requires=requirements('requirements.txt'),
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+        "Topic :: Utilities",
+        "Topic :: Scientific/Engineering"
+    ],
+)
diff --git a/utility b/utility
new file mode 100755
index 0000000..32c5a69
--- /dev/null
+++ b/utility
@@ -0,0 +1,4 @@
+#!/usr/bin/env python
+from drafttopic import drafttopic
+
+drafttopic.main()