
Merge pull request #219 from wiki-ai/tune
Adds hyperparameter tuning utility
Ladsgroup committed Dec 4, 2015
2 parents fc03250 + 5840b91 commit 3d16a58
Showing 21 changed files with 551 additions and 274 deletions.
7 changes: 7 additions & 0 deletions config/gradient_boost.params.yaml
@@ -0,0 +1,7 @@
GradientBoostingClassifier:
  class: sklearn.ensemble.GradientBoostingClassifier
  params:
    n_estimators: [100, 300, 500, 700]
    max_depth: [1, 3, 5, 7]
    max_features: ["log2"]
    learning_rate: [0.01, 0.1, 0.5, 1]
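
Each of the new *.params.yaml files maps a classifier class to a grid of candidate hyperparameter values. As a rough sketch of how such a file can be consumed (this is not the revscoring tune utility itself, and the load_param_grids helper and file path are assumptions), a grid like the one above plugs straight into scikit-learn's grid search, which lived under sklearn.grid_search in 2015-era releases:

import yaml
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search in older releases

def load_param_grids(path):
    # Parse a *.params.yaml file into {model_name: param_grid}.
    with open(path) as f:
        config = yaml.safe_load(f)
    return {name: spec["params"] for name, spec in config.items()}

grids = load_param_grids("config/gradient_boost.params.yaml")
search = GridSearchCV(GradientBoostingClassifier(),
                      param_grid=grids["GradientBoostingClassifier"],
                      cv=5, scoring="roc_auc")
# search.fit(feature_values, labels)  # feature matrix and labels from extracted observations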
6 changes: 6 additions & 0 deletions config/logistic_regression.params.yaml
@@ -0,0 +1,6 @@

LogisticRegression:
  class: sklearn.linear_model.LogisticRegression
  params:
    penalty: ["l1", "l2"]
    C: [0.1, 1, 10]
10 changes: 10 additions & 0 deletions config/naive_bayes.params.yaml
@@ -0,0 +1,10 @@
GaussianNB:
  class: sklearn.naive_bayes.GaussianNB
  params: {}
BernoulliNB:
  class: sklearn.naive_bayes.BernoulliNB
  params: {}
MultinomialNB:
  class: sklearn.naive_bayes.MultinomialNB
  params:
    alpha: [0.1, 1, 10]
4 changes: 0 additions & 4 deletions config/ptwiki_api.yaml

This file was deleted.

4 changes: 0 additions & 4 deletions config/ptwiki_svc.yaml

This file was deleted.

7 changes: 7 additions & 0 deletions config/random_forest.params.yaml
@@ -0,0 +1,7 @@
RandomForestClassifier:
  class: sklearn.ensemble.RandomForestClassifier
  params:
    n_estimators: [10, 20, 40, 80, 160, 320, 640]
    min_samples_leaf: [1, 3, 5, 7, 13]
    max_features: ["log2"]
    criterion: ["gini", "entropy"]
14 changes: 14 additions & 0 deletions config/svc.params.yaml
@@ -0,0 +1,14 @@
SVC:
  class: sklearn.svm.SVC
  params:
    -
      kernel: ["rbf"]
      probability: [true]
      gamma: [0.0, 0.001, 0.0001]
      cache_size: [1000]
      C: [0.1, 1, 10]
    -
      kernel: ["linear"]
      probability: [true]
      cache_size: [1000]
      C: [0.1, 1, 10]
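
Unlike the flat grids above, params here is a list of two separate grids, so gamma is only searched for the RBF kernel. scikit-learn's GridSearchCV accepts exactly that shape, a list of parameter dicts; the snippet below is illustrative rather than revscoring code, and gamma: 0.0 was the "use 1/n_features" default sentinel in scikit-learn releases of this era.

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Two grids, mirroring the two list entries under `params` above.
param_grid = [
    {"kernel": ["rbf"], "probability": [True], "gamma": [0.0, 0.001, 0.0001],
     "cache_size": [1000], "C": [0.1, 1, 10]},
    {"kernel": ["linear"], "probability": [True],
     "cache_size": [1000], "C": [0.1, 1, 10]},
]
search = GridSearchCV(SVC(), param_grid=param_grid, cv=5)
# search.fit(feature_values, labels)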
2 changes: 1 addition & 1 deletion revscoring/__init__.py
@@ -108,6 +108,6 @@
from .languages import Language
from .scorer_models import ScorerModel

__version__ = "0.7.3"
__version__ = "0.7.4"

__all__ = [Datasource, Dependent, Extractor, Feature, Language, ScorerModel]
10 changes: 7 additions & 3 deletions revscoring/revscoring.py
@@ -1,11 +1,15 @@
"""
Provides access to a set of utilities for working with revision scorer models.
Utilities
Utilities:
* score Scores a set of revisions
* score Scores a set of revisions using a trained model
* extract_features Extracts a list of features for a set of revisions
* train_test Trains and tests a MLScorerModel with extracted features.
* model_info Reads a model-file and reports metadata and testing
statistics
* train_test Trains and tests a MLScorerModel with extracted features
* tune Tunes a set of models against a training set to identify
the best model/configuration
Usage:
revscoring (-h | --help)
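
The new tune entry above describes cross-validating each model/parameter combination against a training set and reporting the best one. The loop below is only an illustration of that idea using scikit-learn's cross_val_score (found under sklearn.cross_validation in 2015-era releases); it is not the actual utility added by this commit.

from itertools import product
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation in older releases

def tune(estimator_class, param_grid, values, labels, folds=5):
    # Cross-validate every parameter combination; return (best_score, best_params).
    best = None
    keys = sorted(param_grid)
    for combo in product(*(param_grid[k] for k in keys)):
        params = dict(zip(keys, combo))
        scores = cross_val_score(estimator_class(**params), values, labels, cv=folds)
        if best is None or scores.mean() > best[0]:
            best = (scores.mean(), params)
    return best

# from sklearn.ensemble import GradientBoostingClassifier
# best_score, best_params = tune(GradientBoostingClassifier,
#                                {"n_estimators": [100, 300], "max_depth": [1, 3]},
#                                feature_values, labels)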
3 changes: 2 additions & 1 deletion revscoring/scorer_models/__init__.py
@@ -27,7 +27,8 @@
from .svc import SVC, SVCModel, LinearSVC, LinearSVCModel, RBFSVC, RBFSVCModel
from .nb import (NB, NBModel, GaussianNB, GaussianNBModel, MultinomialNB,
MultinomialNBModel, BernoulliNB, BernoulliNBModel)
from .scorer_model import ScorerModel, MLScorerModel, ScikitLearnClassifier
from .scorer_model import ScorerModel, MLScorerModel
from .sklearn_classifier import ScikitLearnClassifier
from .rf import RF, RFModel

__all__ = [
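
The practical effect of this refactor, read from the import lines above: the abstract model classes stay in scorer_model while the scikit-learn wrapper moves to its own module, so both import paths below should resolve after this commit.

from revscoring.scorer_models.scorer_model import ScorerModel, MLScorerModel
from revscoring.scorer_models.sklearn_classifier import ScikitLearnClassifier

Because the package __init__ re-exports all three names, `from revscoring.scorer_models import ScikitLearnClassifier` also keeps working for existing callers.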
2 changes: 1 addition & 1 deletion revscoring/scorer_models/nb.py
@@ -17,7 +17,7 @@

from sklearn import naive_bayes

from .scorer_model import ScikitLearnClassifier
from .sklearn_classifier import ScikitLearnClassifier

logger = logging.getLogger("revscoring.scorers.nb")

2 changes: 1 addition & 1 deletion revscoring/scorer_models/rf.py
@@ -9,7 +9,7 @@

from sklearn.ensemble import RandomForestClassifier

from .scorer_model import ScikitLearnClassifier
from .sklearn_classifier import ScikitLearnClassifier

logger = logging.getLogger("revscoring.scorers.rf")

184 changes: 0 additions & 184 deletions revscoring/scorer_models/scorer_model.py
@@ -184,187 +184,3 @@ def from_config(cls, config, name, section_key="scorer_models"):
            return cls.load(open(section['model_file'], 'rb'))
        else:
            return cls(**{k: v for k, v in section.items() if k != "class"})


class ScikitLearnClassifier(MLScorerModel):

    def __init__(self, features, classifier_model, version=None):
        super().__init__(features, version=version)
        self.classifier_model = classifier_model
        self.stats = None

    def __getattr__(self, attr):
        if attr == "stats":
            return None
        else:
            raise AttributeError(attr)

    def train(self, values_labels):
        """
        :Returns:
            A dictionary with the fields:
            * seconds_elapsed -- Time in seconds spent fitting the model
        """
        start = time.time()

        values, labels = zip(*values_labels)

        # Fit the classifier model
        self.classifier_model.fit(values, labels)
        self.trained = time.time()

        return {
            'seconds_elapsed': time.time() - start
        }

    def score(self, feature_values):
        """
        Generates a score for a single revision based on a set of extracted
        feature_values.
        :Parameters:
            feature_values : collection(`mixed`)
                an ordered collection of values that correspond to the
                `Feature` s provided to the constructor
        :Returns:
            A dict with the fields:
            * prediction -- The most likely class
            * probability -- A mapping of probabilities for input classes
                             corresponding to the classes the classifier was
                             trained on. Generating this probability is
                             slower than a simple prediction.
        """
        prediction = self.classifier_model.predict([feature_values])[0]
        labels = self.classifier_model.classes_
        probas = self.classifier_model.predict_proba([feature_values])[0]
        probability = {label: proba for label, proba in zip(labels, probas)}

        doc = {
            'prediction': prediction,
            'probability': probability
        }
        return normalize_json(doc)

    def test(self, values_labels):
        """
        :Returns:
            A dictionary of test statistics with the fields:
            * accuracy -- The mean accuracy of classification
            * table -- A truth table for classification
            * roc
                * auc -- The area under the ROC curve
        """
        values, labels = zip(*values_labels)

        scores = [self.score(feature_values) for feature_values in values]

        self.stats = {
            'table': self._label_table(scores, labels),
            'accuracy': self.classifier_model.score(values, labels),
            'roc': self._roc_stats(scores, labels,
                                   self.classifier_model.classes_)
        }
        return self.stats

    def info(self):
        return normalize_json({
            'type': self.__class__.__name__,
            'version': self.version,
            'trained': self.trained,
            'stats': self.stats
        })

    def format_info(self):
        info = self.info()
        formatted = io.StringIO()
        formatted.write("ScikitLearnClassifier\n")
        formatted.write(" - type: {0}\n".format(info.get('type')))
        formatted.write(" - version: {0}\n".format(info.get('version')))
        if isinstance(info['trained'], float):
            date_string = datetime.fromtimestamp(info['trained']).isoformat()
            formatted.write(" - trained: {0}\n".format(date_string))
        else:
            formatted.write(" - trained: {0}\n".format(info.get('trained')))

        formatted.write("\n")
        formatted.write(self.format_stats())
        return formatted.getvalue()

    def format_stats(self):
        if self.stats is None:
            return "No stats available"
        else:
            formatted = io.StringIO()
            predicted_actuals = self.stats['table'].keys()
            possible = list(set(actual for _, actual in predicted_actuals))
            possible.sort()

            formatted.write("Accuracy: {0}\n\n".format(self.stats['accuracy']))
            if 'auc' in self.stats['roc']:
                formatted.write("ROC-AUC: {0}\n\n"
                                .format(self.stats['roc']['auc']))
            else:
                formatted.write("ROC-AUC:\n")

                table_data = [[comparison_label,
                               self.stats['roc'][comparison_label]['auc']]
                              for comparison_label in possible]
                formatted.write(tabulate(table_data))
                formatted.write("\n\n")

            table_data = []

            for actual in possible:
                table_data.append(
                    [(str(actual))] +
                    [self.stats['table'].get((predicted, actual), 0)
                     for predicted in possible]
                )
            formatted.write(tabulate(
                table_data,
                headers=["~{0}".format(p) for p in possible]))

            return formatted.getvalue()

    @classmethod
    def _roc_stats(cls, scores, labels, possible_labels):

        if len(possible_labels) <= 2:
            # Binary classification, class choice doesn't matter.
            comparison_label = possible_labels[0]
            return cls._roc_single_class(scores, labels, comparison_label)
        else:
            roc_stats = {}
            for comparison_label in possible_labels:
                roc_stats[comparison_label] = \
                    cls._roc_single_class(scores, labels, comparison_label)

            return roc_stats

    @classmethod
    def _roc_single_class(cls, scores, labels, comparison_label):
        probabilities = [s['probability'][comparison_label]
                         for s in scores]

        true_positives = [l == comparison_label for l in labels]
        fpr, tpr, thresholds = roc_curve(true_positives, probabilities)

        return {
            'auc': auc(fpr, tpr)
        }

    @staticmethod
    def _label_table(scores, labels):

        predicteds = [s['prediction'] for s in scores]

        table = {}
        for pair in zip(labels, predicteds):
            table[pair] = table.get(pair, 0) + 1

        return table
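
For reference, below is a hypothetical usage sketch of the relocated ScikitLearnClassifier wrapper, constructed directly around a scikit-learn estimator rather than through one of its subclasses (RFModel, SVCModel, the NB models). The feature names and observation values are invented for illustration.

from sklearn.ensemble import GradientBoostingClassifier
from revscoring.scorer_models import ScikitLearnClassifier

features = ["chars_added", "badwords_ratio"]  # placeholders; real code passes Feature objects
model = ScikitLearnClassifier(
    features,
    classifier_model=GradientBoostingClassifier(n_estimators=300, max_depth=5),
    version="0.0.1")

train_set = [([450, 0.02], False), ([3, 0.41], True), ([88, 0.10], False)]
model.train(train_set)            # returns {'seconds_elapsed': ...}
print(model.score([120, 0.05]))   # {'prediction': ..., 'probability': {...}}
print(model.format_info())        # type, version, trained timestamp and stats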
