In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import export_graphviz
import graphviz
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression

In [3]:
def get_baseline(X_train, y_train, X_test, y_test):
    dummy_clf = DummyClassifier(strategy="most_frequent")
    dummy_clf.fit(X_train, y_train)
    from sklearn.utils import validation
    checked_X_test_ = validation.check_array(X_test, accept_sparse=['csr', 'csc', 'coo'])
    classes_ = dummy_clf.classes_
    class_prior_ = dummy_clf.class_prior_
    n_samples = int(checked_X_test_.shape[0])
    predicted_y = np.tile([classes_[k][class_prior_[k].argmax()] for k in range(dummy_clf.n_outputs_)], [n_samples, 1])
    from sklearn.metrics import classification
    return classification.accuracy_score(y_test, predicted_y, sample_weight=None)

### Load dataset

In [13]:
rfc_dataset = np.genfromtxt("/Users/jane/rfc-analysis/analysis/new_input_features/no_closer_participation/after_closing_excluded/csv/with_ids/classification_features_including_contentiousness_before_closing.csv", delimiter=",")[1:]

In [14]:
rfc_dataset.shape

(3257, 40)

In [15]:
MIN_INPUT_INDEX = 1
MAX_INPUT_INDEX = 38
OUTPUT_INDEX = 39
ROW_NUM = 3257
COLUMN_NUM = 40

In [16]:
def extract_inputs(dataset, input_idx):
    total_inputs = []
    for row in dataset:
        selected_input = [row[id] for id in input_idx]
        total_inputs.append(selected_input)
    return total_inputs

In [17]:
def extract_output(dataset, output_idx):
    return list(row[output_idx] for row in dataset)

#### Get inputs and outputs

In [18]:
total_inputs = extract_inputs(rfc_dataset, range(MIN_INPUT_INDEX, MAX_INPUT_INDEX+1))

In [19]:
total_outputs = extract_output(rfc_dataset, OUTPUT_INDEX)

In [20]:
X = StandardScaler().fit_transform(total_inputs)

In [21]:
Y = np.array(total_outputs).reshape(ROW_NUM, 1)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

In [23]:
shaped_dataset = [X, Y]

In [24]:
get_baseline(X_train, y_train, X_test, y_test)

0.73369148119723715

### Feature selection

In [29]:
features = {'comments_count':1, 'participants_count':2, 'sum_participant_edit_count':3, 
            'avg_participant_edit_count':4,'initiator_edit_count':5, 'initiator_expertise_(days)':6, 
            'weighted_reciprocity':7, 'avg_reply_num':8, 'avg_reply_depth':9, 
            'article_word_count':10, 'article_character_count':11, 
            'contentiousness':12,
            'rfc_positive':13,
            'rfc_negative':14, 'rfc_certain':15, 'rfc_tentative':16, 
            'rfc_anger':17, 'rfc_swear':18, 'rfc_insight':19, 
            'rfc_incl':20, 'rfc_hostile':21, 'rfc_i':22, 
            'rfc_percept':23, 'rfc_excl':24, 'rfc_cogmech':25,
            'rfc_affect':26, 'revision_count_before_rfc':27, 'initiator_revision_before_rfc_count':28, 
            'new_participant_ratio':29, 'one_week_recent_rev_count':30,'two_weeks_recent_rev_count':31,
            'three_weeks_recent_rev_count':32, 'one_month_recent_rev_count':33, 'two_months_recent_rev_count':34,
            'avg_expertise_except_closer_(days)':35, 'max_expertise_except_closer_(days)':36,
            'sd_expertise_except_closer_(days)':37, 'sum_expertise_except_closer_(days)':38
           }

In [30]:
feature_names = sorted(features.iteritems(), key=lambda (k,v): (v,k))

In [31]:
features_weight = {}
for i in range(0,50):
    X = StandardScaler().fit_transform(total_inputs)
    Y = np.array(total_outputs).reshape(ROW_NUM, 1)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)
    
    dc_clf = DecisionTreeClassifier(max_depth=10)
    dc_clf = dc_clf.fit(X_train, y_train)
    dc_pred = dc_clf.predict(X_test)
    dctree_pd = pd.DataFrame({'feature_importance':dc_clf.feature_importances_},index=feature_names).sort_values('feature_importance', ascending=False)
    selected_features = dctree_pd.to_dict()['feature_importance']
    for idx, val in selected_features.items():
        if idx in features_weight:
            features_weight[idx] += val
        else:
            features_weight[idx] = val

In [32]:
averaged_features_weight = {}
for idx, val in features_weight.items():
    averaged_features_weight[idx] = val/50.0

In [33]:
pd.DataFrame({'feature_importance':averaged_features_weight},index=averaged_features_weight.keys()).sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
"(max_expertise_except_closer_(days), 36)",0.134876
"(avg_reply_num, 8)",0.076009
"(participants_count, 2)",0.064165
"(sum_participant_edit_count, 3)",0.058309
"(avg_expertise_except_closer_(days), 35)",0.057195
"(rfc_hostile, 21)",0.047387
"(comments_count, 1)",0.043309
"(avg_participant_edit_count, 4)",0.043125
"(rfc_insight, 19)",0.039296
"(rfc_swear, 18)",0.035081


## Visualize decision tree with selected features

In [34]:
chosen_index = []
chosen_feature_names = []
for name, index in selected_features.keys():
    chosen_index.append(index)
    chosen_feature_names.append(name)

In [35]:
filtered_inputs = extract_inputs(rfc_dataset, chosen_index)

In [36]:
filtered_outputs = extract_output(rfc_dataset, OUTPUT_INDEX)

In [37]:
X_filtered = StandardScaler().fit_transform(filtered_inputs)

In [38]:
Y_filtered = np.array(total_outputs).reshape(ROW_NUM, 1)

In [39]:
# X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, Y, test_size=0.4, random_state=42)
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(X_filtered, Y_filtered, test_size=0.4, random_state=42)

In [40]:
dataset = [X_filtered, Y_filtered]

In [41]:
dc_clf_5 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=150)
dc_clf_5 = dc_clf_5.fit(X_train_filtered, y_train_filtered)
dc_pred_5 = dc_clf_5.predict(X_test_filtered)

In [42]:
class_names = ["unclosed","closed"]

In [43]:
dd = export_graphviz(dc_clf_5, out_file=None, filled=True, feature_names=chosen_feature_names, class_names=class_names, rounded=True)
graph = graphviz.Source(dd)
graph.render('decision_tree_(contentiousness_chosen)')

'decision_tree_(contentiousness_chosen).pdf'

## Test out various classifiers

In [44]:
def test_50_times(classifier, inputs, outputs):
    scores = {'accuracy':0.0, 'precision':0.0, 'recall':0.0, 'roc_auc':0.0, 'f1':0.0}
    for i in range(50):
        X = StandardScaler().fit_transform(inputs)
        Y = np.array(outputs).reshape(ROW_NUM, 1)
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

        classifier.fit(X_train, y_train)
        classifier_pred = classifier.predict(X_test)
        scores['accuracy'] += accuracy_score(y_test, classifier_pred)
        scores['precision'] += precision_score(y_test, classifier_pred)
        scores['recall'] += recall_score(y_test, classifier_pred)
        scores['roc_auc'] += roc_auc_score(y_test, classifier_pred)
        scores['f1'] += f1_score(y_test, classifier_pred)
    for score_type, val in scores.items():
        scores[score_type] /= 50
    print 'Accuracy: ' + str(scores['accuracy'])
    print 'Precision: ' + str(scores['precision'])
    print 'Recall: ' + str(scores['recall'])
    print 'Roc_auc:' + str(scores['roc_auc'])
    print 'F1:' + str(scores['f1'])
    return scores

In [45]:
def show_scores(scores, name):
    return pd.DataFrame({name:scores})

### 1. Logistic Regession LG

In [46]:
logreg = LogisticRegression(C=1e5)

In [47]:
logreg_averaged_scores = test_50_times(logreg, total_inputs, total_outputs)

  y = column_or_1d(y, warn=True)


Accuracy: 0.761320030698
Precision: 0.779705117086
Recall: 0.940376569038
Roc_auc:0.604194048208
F1:0.852536747274


In [48]:
show_scores(logreg_averaged_scores, 'logreg_scores')

Unnamed: 0,logreg_scores
accuracy,0.76132
f1,0.852537
precision,0.779705
recall,0.940377
roc_auc,0.604194


##### How about filtered ones?

In [49]:
logreg_avg_scores_on_chosen = test_50_times(logreg, filtered_inputs, filtered_outputs)
show_scores(logreg_avg_scores_on_chosen, 'logreg_averaged_scores')

Accuracy: 0.761320030698
Precision: 0.779705117086
Recall: 0.940376569038
Roc_auc:0.604194048208
F1:0.852536747274


Unnamed: 0,logreg_averaged_scores
accuracy,0.76132
f1,0.852537
precision,0.779705
recall,0.940377
roc_auc,0.604194


### 2. ADT

In [50]:
adt = AdaBoostClassifier()  

In [51]:
adt_averaged_scores = test_50_times(adt, total_inputs, total_outputs)

Accuracy: 0.772570990023
Precision: 0.804929422849
Recall: 0.910732217573
Roc_auc:0.651331526654
F1:0.854568413689


In [52]:
show_scores(adt_averaged_scores, 'adt_averaged_scores')

Unnamed: 0,adt_averaged_scores
accuracy,0.772571
f1,0.854568
precision,0.804929
recall,0.910732
roc_auc,0.651332


##### How about filtered ones?

In [53]:
adt_avg_scores_on_chosen = test_50_times(adt, filtered_inputs, filtered_outputs)
show_scores(adt_avg_scores_on_chosen, 'adt_averaged_scores')

Accuracy: 0.77244819647
Precision: 0.804900559316
Recall: 0.910564853556
Roc_auc:0.651247844646
F1:0.854478458623


Unnamed: 0,adt_averaged_scores
accuracy,0.772448
f1,0.854478
precision,0.804901
recall,0.910565
roc_auc,0.651248


### 3. Random Forests

In [54]:
rf = RandomForestClassifier(max_depth=5)

In [55]:
rf_averaged_scores = test_50_times(rf, total_inputs, total_outputs)

  


Accuracy: 0.764343821949
Precision: 0.772314512568
Recall: 0.962656903766
Roc_auc:0.59031980635
F1:0.8570259344


In [56]:
show_scores(rf_averaged_scores, 'rf_averaged_scores')

Unnamed: 0,rf_averaged_scores
accuracy,0.764344
f1,0.857026
precision,0.772315
recall,0.962657
roc_auc,0.59032


##### How about filtered ones?

In [57]:
rf_avg_scores_on_chosen = test_50_times(rf, filtered_inputs, filtered_outputs)
show_scores(rf_avg_scores_on_chosen, 'rf_averaged_scores')

  


Accuracy: 0.76515732924
Precision: 0.773013681866
Recall: 0.962656903766
Roc_auc:0.591847183871
F1:0.857450266391


Unnamed: 0,rf_averaged_scores
accuracy,0.765157
f1,0.85745
precision,0.773014
recall,0.962657
roc_auc,0.591847


### 4. SVM-RBF

In [58]:
# C_2d_range = [1e-2, 1, 1e2]
# gamma_2d_range = [1e-1, 1, 1e1]
C=1
gamma=1e-1
svm_rbf = SVC(C=C, gamma=gamma)

In [59]:
svm_rbf_averaged_scores = test_50_times(svm_rbf,total_inputs, total_outputs)

Accuracy: 0.76055257099
Precision: 0.765238879736
Recall: 0.971757322176
Roc_auc:0.57521583688
F1:0.856221198157


In [60]:
show_scores(svm_rbf_averaged_scores, 'svm_rbf_averaged_scores')

Unnamed: 0,svm_rbf_averaged_scores
accuracy,0.760553
f1,0.856221
precision,0.765239
recall,0.971757
roc_auc,0.575216


##### How about filtered ones?

In [61]:
svm_rbf_avg_scores_on_chosen = test_50_times(svm_rbf, filtered_inputs, filtered_outputs)
show_scores(svm_rbf_avg_scores_on_chosen, 'svm_rbf_averaged_scores')

Accuracy: 0.76055257099
Precision: 0.765238879736
Recall: 0.971757322176
Roc_auc:0.57521583688
F1:0.856221198157


Unnamed: 0,svm_rbf_averaged_scores
accuracy,0.760553
f1,0.856221
precision,0.765239
recall,0.971757
roc_auc,0.575216


### 5. Decision Tree

In [62]:
dc_clf = DecisionTreeClassifier(max_depth=10)

In [63]:
dc_clf_averaged_scores = test_50_times(dc_clf, total_inputs, total_outputs)

Accuracy: 0.734013814275
Precision: 0.792428768304
Recall: 0.863723849372
Roc_auc:0.620190454946
F1:0.826531993211


In [64]:
show_scores(dc_clf_averaged_scores, 'dc_clf_averaged_scores')

Unnamed: 0,dc_clf_averaged_scores
accuracy,0.734014
f1,0.826532
precision,0.792429
recall,0.863724
roc_auc,0.62019


##### How about filtered ones?

In [65]:
dc_clf_avg_scores_on_chosen = test_50_times(dc_clf, filtered_inputs, filtered_outputs)
show_scores(dc_clf_avg_scores_on_chosen, 'dc_clf_averaged_scores')

Accuracy: 0.733430544896
Precision: 0.792121174648
Recall: 0.863221757322
Roc_auc:0.619535950707
F1:0.826131934996


Unnamed: 0,dc_clf_averaged_scores
accuracy,0.733431
f1,0.826132
precision,0.792121
recall,0.863222
roc_auc,0.619536
