In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import sklearn
# from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
# from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
import graphviz
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV

In [3]:
h = .02

In [4]:
names = [ "Linear SVM", "RBF SVM", "Gaussian Process",
         "Adaboost", "Random Forest", "Nearest Neighbors",
         "Decision tree", "QDA"]

In [5]:
classifiers = [
    SVC(kernel='linear', C = 0.025),
    SVC(gamma=2, C=1),
    GaussianNB(),
    AdaBoostClassifier(),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5),
    QuadraticDiscriminantAnalysis()
]

In [6]:
def get_baseline(X_train, y_train, X_test, y_test):
    dummy_clf = DummyClassifier(strategy="most_frequent")
    dummy_clf.fit(X_train, y_train)
    from sklearn.utils import validation
    checked_X_test_ = validation.check_array(X_test, accept_sparse=['csr', 'csc', 'coo'])
    classes_ = dummy_clf.classes_
    class_prior_ = dummy_clf.class_prior_
    n_samples = int(checked_X_test_.shape[0])
    predicted_y = np.tile([classes_[k][class_prior_[k].argmax()] for k in range(dummy_clf.n_outputs_)], [n_samples, 1])
    from sklearn.metrics import classification
    return classification.accuracy_score(y_test, predicted_y, sample_weight=None)

### Load dataset

In [7]:
rfc_dataset = np.genfromtxt("/Users/jane/wikum/wikum/ml_features/rfc_features_21.csv", delimiter=",")

In [8]:
# get rid of column header
rfc_dataset = rfc_dataset[1:]

In [9]:
rfc_dataset.shape

(5432, 24)

1 "participant_num"
2 "comment_num"
3 "sum_edit_counts"
4 "sum_days_since_join"
5 "sum_rev_counts_before_rfc"
6 "new_par_ratio"
7 "initiator_edit_counts"
8 "initiator_day_since_joined"
9 "initiator_rev_count_before_rfc"

In [10]:
def extract_inputs(dataset, input_idx):
    total_inputs = []
    for row in dataset:
        selected_input = [row[id] for id in input_idx]
        total_inputs.append(selected_input)
    return total_inputs

In [11]:
def extract_outputs(dataset, output_idx):
    total_outputs = []
    for row in dataset:
        selected_output = [row[id] for id in output_idx]
        total_outputs.append(selected_output)
    return total_outputs

In [12]:
def make_meshgrid(x, y, h=.02):
    """Create a mesh of points to plot in

    Parameters
    ----------
    x: data to base x-axis meshgrid on
    y: data to base y-axis meshgrid on
    h: stepsize for meshgrid, optional

    Returns
    -------
    xx, yy : ndarray
    """
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy


def plot_contours(ax, clf, xx, yy, **params):
    """Plot the decision boundaries for a classifier.

    Parameters
    ----------
    ax: matplotlib axes object
    clf: a classifier
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    params: dictionary of params to pass to contourf, optional
    """
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

#### Get inputs and outputs

In [13]:
total_inputs = extract_inputs(rfc_dataset, [1, 4,5, 8, 10,13])

In [14]:
total_outputs = extract_outputs(rfc_dataset, [22])

In [15]:
X = StandardScaler().fit_transform(total_inputs)
# X = total_inputs

In [16]:
Y = np.array(total_outputs).reshape(5432, 1)

In [17]:
# X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, Y, test_size=0.4, random_state=42)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.4, random_state=42)

In [18]:
dataset = [X, Y]

In [19]:
get_baseline(X_train, y_train, X_test, y_test)

0.49838932351587667

### Feature selection

In [22]:
dc_clf = DecisionTreeClassifier(max_depth=10)
dc_clf = dc_clf.fit(X_train, y_train)
dc_pred = dc_clf.predict(X_test)

In [23]:
import pandas as pd
pd.DataFrame(dc_clf.feature_importances_).sort_values(0)

Unnamed: 0,0
0,0.070273
2,0.07553
3,0.081455
4,0.089547
5,0.09116
1,0.592035


In [24]:
dc_model = SelectFromModel(dc_clf, prefit=True)
X_new = dc_model.transform(X)
X_new.shape

(5432, 1)

### 1. Logistic Regession LG

In [25]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
logreg.score(X_test, y_test)  

  y = column_or_1d(y, warn=True)


0.78601012425218597

In [26]:
logreg_pred = logreg.predict(X_test)

#### Accuracy

In [189]:
accuracy_score(y_test, logreg_pred)

0.78601012425218597

#### Precision

In [190]:
precision_score(y_test, logreg_pred)

0.79372623574144485

#### Recall

In [191]:
recall_score(y_test, logreg_pred)

0.77100646352723912

#### AUC

In [192]:
roc_auc_score(y_test, logreg_pred)

0.78596194735994984

#### F1 score

In [193]:
f1_score(y_test, logreg_pred)

0.78220140515222469

### 2. ADT

In [194]:
adt = AdaBoostClassifier()
adt.fit(X_train, y_train)
adt.score(X_test, y_test)  

0.79521398987574776

In [195]:
adt_pred = adt.predict(X_test)

#### Accuracy

In [196]:
accuracy_score(y_test, adt_pred)

0.79521398987574776

#### Precision

In [197]:
precision_score(y_test, adt_pred)

0.76104746317512273

#### Recall

In [198]:
recall_score(y_test, adt_pred)

0.8587257617728532

#### AUC

In [199]:
roc_auc_score(y_test, adt_pred)

0.79541792675798628

#### F1 score

In [200]:
f1_score(y_test, adt_pred)

0.80694143167028209

### 3. Random Forests

In [201]:
rf = RandomForestClassifier(max_depth=5)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)  

  


0.79429360331339161

In [202]:
rf_pred = rf.predict(X_test)

#### Accuracy

In [203]:
accuracy_score(y_test, rf_pred)

0.79429360331339161

#### Precision

In [204]:
precision_score(y_test, rf_pred)

0.7539936102236422

#### Recall

In [205]:
recall_score(y_test, rf_pred)

0.87165281625115421

#### AUC

In [206]:
roc_auc_score(y_test, rf_pred)

0.79454200445585244

#### F1 score

In [207]:
f1_score(y_test, rf_pred)

0.80856531049250535

### 4. SVM-RBF

In [219]:
C_2d_range = [1e-2, 1, 1e2]
gamma_2d_range = [1e-1, 1, 1e1]
for C in C_2d_range:
    for gamma in gamma_2d_range:
        clf = SVC(C=C, gamma=gamma)
        clf.fit(X_train, y_train)
        svm_rbf_pred = clf.predict(X_test)
        print 'c: ' + str(C) + ' , g: ' + str(gamma)
        print accuracy_score(y_test, svm_rbf_pred)
        print precision_score(y_test, svm_rbf_pred)
        print recall_score(y_test, svm_rbf_pred)
        print roc_auc_score(y_test, svm_rbf_pred)
        print f1_score(y_test, svm_rbf_pred)
        print '======================================'

c: 0.01 , g: 0.1
0.707777266452
0.7166344294
0.684210526316
0.707701593433
0.700047236656
c: 0.01 , g: 1
0.666359871146
0.615932642487
0.87811634349
0.667039823121
0.724019794442
c: 0.01 , g: 10.0
0.498389323516
0.498389323516
1.0
0.5
0.665233415233
c: 1 , g: 0.1
0.794753796595
0.779631255487
0.819944598338
0.79483468449
0.799279927993
c: 1 , g: 1
0.786470317533
0.761622992392
0.831948291782
0.786616347726
0.795233892321
c: 1 , g: 10.0
0.746893695352
0.697553743514
0.868882733149
0.747285403272
0.773848684211
c: 100.0 , g: 0.1
0.802577082375
0.773869346734
0.853185595568
0.802739586775
0.811594202899
c: 100.0 , g: 1
0.756097560976
0.751592356688
0.76269621422
0.756118749312
0.757103574702
c: 100.0 , g: 10.0
0.725724804418
0.683773584906
0.836565096953
0.726080713614
0.752491694352


#### Decision Tree

In [209]:
dc_clf = DecisionTreeClassifier(max_depth=10)
dc_clf = dc_clf.fit(X_train, y_train)
dc_pred = dc_clf.predict(X_test)

#### Accuracy

In [210]:
accuracy_score(y_test, dc_pred)

0.77404509894155549

#### Precision

In [211]:
precision_score(y_test, dc_pred)

0.74915824915824913

#### Recall

In [212]:
recall_score(y_test, dc_pred)

0.82179132040627889

#### AUC

In [213]:
roc_auc_score(y_test, dc_pred)

0.77419841249671761

#### F1 score

In [214]:
f1_score(y_test, dc_pred)

0.78379568472038741

#### Visualize

In [215]:
dc_clf_5 = DecisionTreeClassifier(max_depth=5)
dc_clf_5 = dc_clf_5.fit(X_train, y_train)
dc_pred_5 = dc_clf_5.predict(X_test)

In [216]:
class_names = ["unclosed","closed"]
# feature_names = ["participant_num","comment_num","sum_edit_counts","sum_days_since_join","sum_rev_counts_before_rfc","new_par_ratio","initiator_edit_counts","initiator_day_since_joined","initiator_rev_count_b
index = ["number of participants", "sum of users' joined days", "sum of revisions before RfC", "initiator's days since joined", "averge reply level", "positive tone"]

In [217]:
dd = sklearn.tree.export_graphviz(dc_clf_5, out_file=None, filled=True, feature_names=index, class_names=class_names, rounded=True)
graph = graphviz.Source(dd)
graph.render('filtered_decision_tree')

'filtered_decision_tree.pdf'