In [1]:
%matplotlib inline

In [2]:
import os

import graphviz
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

In [3]:
def get_baseline(X_train, y_train, X_test, y_test):
    """Return the test accuracy of a majority-class baseline.

    A ``DummyClassifier(strategy="most_frequent")`` is fitted on the training
    split and scored on the test split. The result is the accuracy a real
    model has to beat.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels used to fit the dummy classifier.
    X_test, y_test : array-like
        Held-out features and labels the baseline is scored on.

    Returns
    -------
    float
        Accuracy of the most-frequent-class prediction on the test split.
    """
    dummy_clf = DummyClassifier(strategy="most_frequent")
    dummy_clf.fit(X_train, y_train)
    # Use the public predict() API rather than rebuilding the prediction from
    # classes_/class_prior_: the layout of those attributes depends on the
    # shape of y, and the private sklearn.metrics.classification module used
    # previously no longer exists in current scikit-learn releases.
    predicted_y = dummy_clf.predict(X_test)
    return accuracy_score(y_test, predicted_y)

### Load dataset

In [4]:
# Location of the feature CSV. Set the RFC_DATASET_PATH environment variable to
# run the notebook on another machine; the default is the original local path.
RFC_DATASET_PATH = os.environ.get(
    "RFC_DATASET_PATH",
    "/Users/jane/rfc-analysis/analysis/new_input_features/no_closer_participation/after_closing_excluded/csv/with_ids/classification_features_before_closing.csv",
)
# [1:] drops the first parsed row (presumably the CSV header, which genfromtxt
# would otherwise turn into a row of NaNs).
rfc_dataset = np.genfromtxt(RFC_DATASET_PATH, delimiter=",")[1:]

In [5]:
# Sanity check: (rows, columns) of the loaded array.
rfc_dataset.shape

(6474, 39)

In [6]:
# Column layout of rfc_dataset: columns MIN_INPUT_INDEX..MAX_INPUT_INDEX
# (inclusive) are the model inputs and column OUTPUT_INDEX is the label.
# Column 0 is not used as an input (presumably a row id -- confirm against the CSV).
MIN_INPUT_INDEX = 1
MAX_INPUT_INDEX = 37
OUTPUT_INDEX = 38
# Read the dimensions from the loaded array instead of hard-coding 6474 x 39,
# so the reshape calls that use ROW_NUM stay valid if the CSV changes.
ROW_NUM, COLUMN_NUM = rfc_dataset.shape

In [7]:
def extract_inputs(dataset, input_idx):
    """Select the given columns from every row of ``dataset``.

    Parameters
    ----------
    dataset : iterable of indexable rows
        Rows to read from (e.g. a 2-D array or a list of lists).
    input_idx : iterable of int
        Positions to pick from each row, in the order they should appear.

    Returns
    -------
    list of list
        One inner list per row, holding ``row[i]`` for each ``i`` in
        ``input_idx``.
    """
    # Materialise the indices once: a one-shot iterator would otherwise be
    # exhausted after the first row and every later row would come back empty.
    columns = list(input_idx)
    return [[row[col] for col in columns] for row in dataset]

In [8]:
def extract_output(dataset, output_idx):
    """Return the value at position ``output_idx`` of every row, as a list."""
    labels = []
    for record in dataset:
        labels.append(record[output_idx])
    return labels

#### Get inputs and outputs

In [9]:
# Gather input columns MIN_INPUT_INDEX..MAX_INPUT_INDEX for every row
# (the +1 makes the upper bound inclusive).
total_inputs = extract_inputs(rfc_dataset, range(MIN_INPUT_INDEX, MAX_INPUT_INDEX+1))

In [10]:
# Label column (OUTPUT_INDEX) of every row.
total_outputs = extract_output(rfc_dataset, OUTPUT_INDEX)

In [11]:
# Standardise the inputs.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling.
X = StandardScaler().fit_transform(total_inputs)

In [12]:
# Labels as a (ROW_NUM, 1) column vector; the reshape raises if total_outputs
# does not hold exactly ROW_NUM values.
Y = np.array(total_outputs).reshape(ROW_NUM, 1)

In [13]:
# Hold out 40% of the rows for testing; the fixed random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

In [14]:
# Features and labels bundled together.
# NOTE(review): shaped_dataset is not referenced again in the visible cells.
shaped_dataset = [X, Y]

In [15]:
# Majority-class accuracy on the test split: the score the classifiers below
# have to beat.
get_baseline(X_train, y_train, X_test, y_test)

0.5667953667953668

### Feature selection

In [16]:
# Feature name -> column index in rfc_dataset. The names are listed in column
# order and numbered from 1.
features = dict(
    (feature_name, column)
    for column, feature_name in enumerate([
        'comments_count',
        'participants_count',
        'sum_participant_edit_count',
        'avg_participant_edit_count',
        'initiator_edit_count',
        'initiator_expertise_(days)',
        'weighted_reciprocity',
        'avg_reply_num',
        'avg_reply_depth',
        'article_word_count',
        'article_character_count',
        'rfc_positive',
        'rfc_negative',
        'rfc_certain',
        'rfc_tentative',
        'rfc_anger',
        'rfc_swear',
        'rfc_insight',
        'rfc_incl',
        'rfc_hostile',
        'rfc_i',
        'rfc_percept',
        'rfc_excl',
        'rfc_cogmech',
        'rfc_affect',
        'revision_count_before_rfc',
        'initiator_revision_before_rfc_count',
        'new_participant_ratio',
        'one_week_recent_rev_count',
        'two_weeks_recent_rev_count',
        'three_weeks_recent_rev_count',
        'one_month_recent_rev_count',
        'two_months_recent_rev_count',
        'avg_expertise_except_closer_(days)',
        'max_expertise_except_closer_(days)',
        'sd_expertise_except_closer_(days)',
        'sum_expertise_except_closer_(days)',
    ], 1)
)

In [17]:
# (name, column index) pairs ordered by column index, so position i lines up
# with input column i + 1. Uses items() and an indexing lambda instead of
# iteritems() / tuple-unpacking parameters, which only exist in Python 2.
feature_names = sorted(features.items(), key=lambda item: (item[1], item[0]))

In [18]:
# Sum decision-tree feature importances over 50 fits, keyed by the
# (name, column index) pairs in feature_names.
features_weight = {}

# Scaling and the split are deterministic (fixed random_state), so they are
# computed once rather than on every iteration. Only the unseeded tree fit can
# differ between runs.
X = StandardScaler().fit_transform(total_inputs)
Y = np.array(total_outputs).reshape(ROW_NUM, 1)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

for i in range(50):
    dc_clf = DecisionTreeClassifier(max_depth=10).fit(X_train, y_train)
    dctree_pd = pd.DataFrame({'feature_importance':dc_clf.feature_importances_},index=feature_names).sort_values('feature_importance', ascending=False)
    selected_feature = dctree_pd.to_dict()['feature_importance']
    for idx, val in selected_feature.items():
        features_weight[idx] = features_weight.get(idx, 0.0) + val

# Predictions of the last fitted tree. They are not used for the importances;
# computed once here so dc_pred ends up with the same value as before.
dc_pred = dc_clf.predict(X_test)

In [19]:
# Turn the summed importances into a mean over the 50 fits.
averaged_features_weight = dict(
    (feature_key, summed / 50.0)
    for feature_key, summed in features_weight.items()
)

In [58]:
# One-column table of mean importances, highest first. Rows are labelled with
# the keys of averaged_features_weight, i.e. the (name, column index) pairs.
feature_importance_pdf = pd.DataFrame({'feature_importance':averaged_features_weight},index=averaged_features_weight.keys()).sort_values('feature_importance', ascending=False)

In [59]:
# Display the ranked importances.
feature_importance_pdf

Unnamed: 0,feature_importance
"(participants_count, 2)",0.230349
"(max_expertise_except_closer_(days), 35)",0.142956
"(avg_reply_num, 8)",0.076621
"(comments_count, 1)",0.037983
"(avg_reply_depth, 9)",0.037889
"(rfc_cogmech, 24)",0.036185
"(rfc_hostile, 20)",0.031508
"(sum_expertise_except_closer_(days), 37)",0.028648
"(sd_expertise_except_closer_(days), 36)",0.026957
"(sum_participant_edit_count, 3)",0.026794


In [66]:
# Keep the 7 top-ranked rows as a {row label: mean importance} dict.
# NOTE(review): the cut-off of 7 is hard-coded; no rationale is given here.
selected_features = feature_importance_pdf[:7].to_dict()['feature_importance']

## Visualize decision tree with selected features

In [67]:
# Each key of selected_features is a (feature name, column index) pair; split
# the pairs into two parallel lists.
chosen_index = [pair[1] for pair in selected_features.keys()]
chosen_feature_names = [pair[0] for pair in selected_features.keys()]

In [68]:
# Re-extract only the chosen columns from the raw dataset.
filtered_inputs = extract_inputs(rfc_dataset, chosen_index)

In [69]:
# Label column again; same call as for total_outputs, so the values are identical.
filtered_outputs = extract_output(rfc_dataset, OUTPUT_INDEX)

In [70]:
# Standardise the reduced feature matrix (scaler fitted on all rows).
X_filtered = StandardScaler().fit_transform(filtered_inputs)

In [71]:
# Label column for the reduced feature set. Built from filtered_outputs (the
# labels extracted for this section) rather than total_outputs, and reshaped
# with -1 so the row count is not hard-coded; the result is still (n_rows, 1).
Y_filtered = np.array(filtered_outputs).reshape(-1, 1)

In [72]:
# Split the reduced feature matrix with the same test fraction and seed as the
# full-feature split.
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(X_filtered, Y_filtered, test_size=0.4, random_state=42)

In [73]:
# Reduced features and labels bundled together.
# NOTE(review): dataset is not referenced again in the visible cells.
dataset = [X_filtered, Y_filtered]

In [74]:
# Small tree for plotting: at most 5 levels and at least 150 samples per leaf.
# fit() returns the estimator itself, so construction and fitting are chained.
dc_clf_5 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=150).fit(
    X_train_filtered, y_train_filtered
)
dc_pred_5 = dc_clf_5.predict(X_test_filtered)

In [75]:
# Display names for the two classes in the exported tree.
# NOTE(review): assumes label 0 means "unclosed" and label 1 means "closed"
# (export_graphviz takes names in ascending class order) -- confirm against the CSV.
class_names = ["unclosed","closed"]

In [76]:
# Export the fitted tree as DOT text (out_file=None returns the string instead
# of writing a file), then render it with graphviz. render() writes the output
# file and returns its path.
dd = export_graphviz(dc_clf_5, out_file=None, filled=True, feature_names=chosen_feature_names, class_names=class_names, rounded=True)
graph = graphviz.Source(dd)
graph.render('decision_tree_(min-leaf 150 2)')

'decision_tree_(min-leaf 150 2).pdf'

## Test out various classifiers

In [31]:
def test_50_times(classifier, inputs, outputs, n_runs=50):
    """Fit ``classifier`` repeatedly and report its averaged test metrics.

    The inputs are standardised, split 60/40 with a fixed seed, and the
    classifier is re-fitted ``n_runs`` times on the same training split. The
    averaged accuracy, precision, recall, ROC AUC and F1 are printed and
    returned.

    Notes
    -----
    * Every run uses the same split (``random_state=42``), so the average only
      smooths the classifier's own randomness; a deterministic classifier
      scores the same on every run.
    * The scaler is fitted on all rows before splitting.
    * ROC AUC is computed from the hard ``predict`` labels, not from scores.

    Parameters
    ----------
    classifier : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is re-fitted in place.
    inputs : array-like of shape (n_samples, n_features)
        Raw feature rows.
    outputs : array-like of length n_samples
        Binary labels.
    n_runs : int, optional
        Number of fit/score repetitions (default 50).

    Returns
    -------
    dict
        Mean score per metric, keyed by ``'accuracy'``, ``'precision'``,
        ``'recall'``, ``'roc_auc'`` and ``'f1'``.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    metric_functions = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('roc_auc', roc_auc_score),
        ('f1', f1_score),
    )

    # Scaling and splitting do not change between runs, so do them once.
    X = StandardScaler().fit_transform(inputs)
    # Keep the labels 1-D: a column vector makes scikit-learn emit a
    # DataConversionWarning on fit, and a 1-D array needs no global row count.
    Y = np.asarray(outputs).ravel()
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

    scores = dict((metric, 0.0) for metric, _ in metric_functions)
    for _ in range(n_runs):
        classifier.fit(X_train, y_train)
        classifier_pred = classifier.predict(X_test)
        for metric, score_fn in metric_functions:
            scores[metric] += score_fn(y_test, classifier_pred)
    for metric, _ in metric_functions:
        scores[metric] /= float(n_runs)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Accuracy: ' + str(scores['accuracy']))
    print('Precision: ' + str(scores['precision']))
    print('Recall: ' + str(scores['recall']))
    print('Roc_auc:' + str(scores['roc_auc']))
    print('F1:' + str(scores['f1']))
    return scores

In [32]:
def show_scores(scores, name):
    """Wrap a metric dict in a one-column DataFrame whose column is ``name``."""
    column_data = {}
    column_data[name] = scores
    return pd.DataFrame(column_data)

### 1. Logistic Regression (LG)

In [33]:
# Very large C, i.e. almost no regularisation (in scikit-learn C is the inverse
# of the regularisation strength).
logreg = LogisticRegression(C=1e5)

In [34]:
# Averaged metrics for logistic regression on all 37 input columns.
logreg_averaged_scores = test_50_times(logreg, total_inputs, total_outputs)

  y = column_or_1d(y, warn=True)


Accuracy: 0.778378378378
Precision: 0.77972465582
Recall: 0.848773841962
Roc_auc:0.767524175883
F1:0.812785388128


In [35]:
# Show the averaged metrics as a one-column table.
show_scores(logreg_averaged_scores, 'logreg_scores')

Unnamed: 0,logreg_scores
accuracy,0.778378
f1,0.812785
precision,0.779725
recall,0.848774
roc_auc,0.767524


##### How about filtered ones?

In [77]:
# Repeat the logistic-regression evaluation on the reduced feature set
# (filtered_inputs); the same estimator object is re-fitted.
logreg_avg_scores_on_chosen = test_50_times(logreg, filtered_inputs, filtered_outputs)
show_scores(logreg_avg_scores_on_chosen, 'logreg_averaged_scores')

Accuracy: 0.759845559846
Precision: 0.766708701135
Recall: 0.828337874659
Roc_auc:0.749284801857
F1:0.796332678454


Unnamed: 0,logreg_averaged_scores
accuracy,0.759846
f1,0.796333
precision,0.766709
recall,0.828338
roc_auc,0.749285


### 2. AdaBoost (ADT)

In [37]:
# AdaBoost with scikit-learn's default settings.
adt = AdaBoostClassifier()  

In [38]:
# Averaged metrics for AdaBoost on all 37 input columns.
adt_averaged_scores = test_50_times(adt, total_inputs, total_outputs)

Accuracy: 0.794980694981
Precision: 0.798597833015
Recall: 0.853542234332
Roc_auc:0.785951152817
F1:0.825156404346


In [39]:
# Show the averaged metrics as a one-column table.
show_scores(adt_averaged_scores, 'adt_averaged_scores')

Unnamed: 0,adt_averaged_scores
accuracy,0.794981
f1,0.825156
precision,0.798598
recall,0.853542
roc_auc,0.785951


##### How about filtered ones?

In [78]:
# Repeat the AdaBoost evaluation on the reduced feature set (filtered_inputs).
adt_avg_scores_on_chosen = test_50_times(adt, filtered_inputs, filtered_outputs)
show_scores(adt_avg_scores_on_chosen, 'adt_averaged_scores')

Accuracy: 0.784942084942
Precision: 0.782043343653
Recall: 0.860354223433
Roc_auc:0.773314366619
F1:0.819331819656


Unnamed: 0,adt_averaged_scores
accuracy,0.784942
f1,0.819332
precision,0.782043
recall,0.860354
roc_auc,0.773314


### 3. Random Forests

In [41]:
# Random forest with every tree capped at depth 5.
# NOTE(review): no random_state is set, so the scores can vary between runs.
rf = RandomForestClassifier(max_depth=5)

In [42]:
# Averaged metrics for the random forest on all 37 input columns.
rf_averaged_scores = test_50_times(rf, total_inputs, total_outputs)

  


Accuracy: 0.773822393822
Precision: 0.763991131923
Recall: 0.869673024523
Roc_auc:0.75904328588
F1:0.813380884163


In [43]:
# Show the averaged metrics as a one-column table.
show_scores(rf_averaged_scores, 'rf_averaged_scores')

Unnamed: 0,rf_averaged_scores
accuracy,0.773822
f1,0.813381
precision,0.763991
recall,0.869673
roc_auc,0.759043


##### How about filtered ones?

In [79]:
# Repeat the random-forest evaluation on the reduced feature set (filtered_inputs).
rf_avg_scores_on_chosen = test_50_times(rf, filtered_inputs, filtered_outputs)
show_scores(rf_avg_scores_on_chosen, 'rf_averaged_scores')

  


Accuracy: 0.785034749035
Precision: 0.773146357493
Recall: 0.878801089918
Roc_auc:0.770577015547
F1:0.8225116551


Unnamed: 0,rf_averaged_scores
accuracy,0.785035
f1,0.822512
precision,0.773146
recall,0.878801
roc_auc,0.770577


### 4. SVM-RBF

In [45]:
# SVC with fixed C and gamma; no kernel is passed, so SVC's default (RBF) is used.
# NOTE(review): the two commented-out ranges below look like a grid that was
# considered but is not searched anywhere in the visible cells.
# C_2d_range = [1e-2, 1, 1e2]
# gamma_2d_range = [1e-1, 1, 1e1]
C=1
gamma=1e-1
svm_rbf = SVC(C=C, gamma=gamma)

In [46]:
# Averaged metrics for the SVM on all 37 input columns.
svm_rbf_averaged_scores = test_50_times(svm_rbf,total_inputs, total_outputs)

Accuracy: 0.760231660232
Precision: 0.770953294946
Recall: 0.820844686649
Roc_auc:0.750885801435
F1:0.795117123062


In [47]:
# Show the averaged metrics as a one-column table.
show_scores(svm_rbf_averaged_scores, 'svm_rbf_averaged_scores')

Unnamed: 0,svm_rbf_averaged_scores
accuracy,0.760232
f1,0.795117
precision,0.770953
recall,0.820845
roc_auc,0.750886


##### How about filtered ones?

In [80]:
# Repeat the SVM evaluation on the reduced feature set (filtered_inputs).
svm_rbf_avg_scores_on_chosen = test_50_times(svm_rbf, filtered_inputs, filtered_outputs)
show_scores(svm_rbf_avg_scores_on_chosen, 'svm_rbf_averaged_scores')

Accuracy: 0.776061776062
Precision: 0.764285714286
Recall: 0.874659400545
Roc_auc:0.760859112037
F1:0.815756035578


Unnamed: 0,svm_rbf_averaged_scores
accuracy,0.776062
f1,0.815756
precision,0.764286
recall,0.874659
roc_auc,0.760859


### 5. Decision Tree

In [49]:
# Fresh depth-10 tree. This rebinds dc_clf, the name the feature-selection loop
# also used for its trees.
dc_clf = DecisionTreeClassifier(max_depth=10)

In [50]:
# Averaged metrics for the decision tree on all 37 input columns.
dc_clf_averaged_scores = test_50_times(dc_clf, total_inputs, total_outputs)

Accuracy: 0.745088803089
Precision: 0.744518367713
Recall: 0.837738419619
Roc_auc:0.730803256155
F1:0.788377667175


In [51]:
# Show the averaged metrics as a one-column table.
show_scores(dc_clf_averaged_scores, 'dc_clf_averaged_scores')

Unnamed: 0,dc_clf_averaged_scores
accuracy,0.745089
f1,0.788378
precision,0.744518
recall,0.837738
roc_auc,0.730803


##### How about filtered ones?

In [81]:
# Repeat the decision-tree evaluation on the reduced feature set (filtered_inputs).
dc_clf_avg_scores_on_chosen = test_50_times(dc_clf, filtered_inputs, filtered_outputs)
show_scores(dc_clf_avg_scores_on_chosen, 'dc_clf_averaged_scores')

Accuracy: 0.748918918919
Precision: 0.754030526624
Recall: 0.826689373297
Roc_auc:0.736927574349
F1:0.788687795213


Unnamed: 0,dc_clf_averaged_scores
accuracy,0.748919
f1,0.788688
precision,0.754031
recall,0.826689
roc_auc,0.736928
