## Import required libraries

In [3]:
! sudo pip install pandas
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.cross_validation import train_test_split

[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## Load train and test data

In [4]:
df_data = pd.read_csv("./enwiki.draft_quality.50k_stratified.feature_labels.tsv", sep="\t")
df_data["OK"] = df_data['draft_quality'].apply(lambda x: x == "OK")
df_data["spam"] = df_data['draft_quality'].apply(lambda x: x == "spam")
df_data["vandalism"] = df_data['draft_quality'].apply(lambda x: x == "vandalism")
df_data["attack"] = df_data['draft_quality'].apply(lambda x: x == "attack")

In [15]:
df_data.head()

Unnamed: 0,feature.wikitext.revision.chars,feature.wikitext.revision.whitespace_chars,feature.wikitext.revision.markup_chars,feature.wikitext.revision.cjk_chars,feature.wikitext.revision.entity_chars,feature.wikitext.revision.url_chars,feature.wikitext.revision.word_chars,feature.wikitext.revision.uppercase_word_chars,feature.wikitext.revision.punctuation_chars,feature.wikitext.revision.break_chars,...,feature.enwiki.revision.cn_templates.1,"feature.(enwiki.revision.cn_templates / max(wikitext.revision.content_chars, 1)).1",feature.enwiki.main_article_templates,"feature.(enwiki.main_article_templates / max(wikitext.revision.content_chars, 1))","feature.(english.stemmed.revision.stems_length / max(wikitext.revision.content_chars, 1))",draft_quality,OK,spam,vandalism,attack
0,54,6,4,0,0,0,32,8,0,0,...,0,0.0,0,0.0,0.653061,OK,True,False,False,False
1,61,7,8,0,0,0,45,8,0,0,...,0,0.0,0,0.0,1.952381,OK,True,False,False,False
2,1914,218,106,0,0,224,1170,37,32,10,...,0,0.0,0,0.0,0.821457,OK,True,False,False,False
3,1007,110,96,0,0,0,727,31,22,8,...,0,0.0,0,0.0,0.904632,OK,True,False,False,False
4,687,82,89,0,0,39,407,0,9,7,...,0,0.0,0,0.0,1.641256,OK,True,False,False,False


In [16]:
features = df_data.columns[:-5]
len(features)
data = df_data[features]

## Select feature columns

## Select target columns

In [73]:
targets = ["spam", "OK", "vandalism", "attack"]

## Train and Run models 

### Defining models 

In [75]:
models = [dict(), dict(), dict(), dict()]
models[0]["model"] = RandomForestClassifier(n_jobs=64)
models[0]["name"] = "RandomForestClassifier"
models[1]["model"] = GradientBoostingClassifier()
models[1]["name"] = "GradientBoostingClassifier"
models[2]["model"] = SVC()
models[2]["name"] = "SVC"
models[3]["model"] = GaussianNB()
models[3]["name"] = "GaussianNB"

### Adding model parametes

In [76]:
for model in models : 
    model["params"] = model["model"].get_params()

### Training models

In [77]:
for model in models : 
    for category in targets : 
        X_train, X_test, y_train, y_test = train_test_split(df_data[features], df_data[category], test_size=0.4, random_state=0)
        
        model["model"].fit(X_train, y_train)
        y_pred = model["model"].predict(X_test)
        model["metrics_for_" + category] = precision_recall_fscore_support(y_test, y_pred, average='binary')
    mean_precision = 0.0
    mean_recall = 0.0
    for category in targets : 
        mean_precision += model["metrics_for_" + category][0]
        mean_recall += model["metrics_for_" + category][1]
    model["mean_precision"] = mean_precision / len(targets)
    model["mean_recall"] = mean_recall / len(targets)

## Evaluation

In [78]:
for model in models: 
    print ("%s\n\tPrecision: %.2f\n\tRecall: %.2f\n\n" % (model["name"],  model["mean_precision"], model["mean_recall"]))

RandomForestClassifier
	Precision: 0.69
	Recall: 0.54


GradientBoostingClassifier
	Precision: 0.72
	Recall: 0.56


SVC
	Precision: 0.67
	Recall: 0.46


GaussianNB
	Precision: 0.30
	Recall: 0.75




In [80]:
models

[{'mean_precision': 0.69314216782224825,
  'mean_recall': 0.54143947489662647,
  'metrics_for_OK': (0.95641838351822506,
   0.92234215302321143,
   0.93907123753950905,
   None),
  'metrics_for_attack': (0.36548223350253806,
   0.088452088452088448,
   0.14243323442136496,
   None),
  'metrics_for_spam': (0.84326064686082247,
   0.8130379568223508,
   0.82787356321839078,
   None),
  'metrics_for_vandalism': (0.6074074074074074,
   0.34192570128885519,
   0.43754547659471255,
   None),
  'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=64,
              oob_score=False, random_state=None, verbose=0,
              warm_start=False),
  'name': 'RandomForestClassifier',
  'params': {'bootstrap': True,
   'class_weight': None,
   'criterion': 'gini',
   'max_dept

In [83]:
from sklearn.externals import joblib

In [85]:
joblib.dump(models, 'models.pkl')
models2 = joblib.load('models.pkl')