### Imports

In [None]:
import glob
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from py2neo import Graph, Node, Relationship
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, auc, make_scorer, recall_score, roc_curve
from sklearn.model_selection import (
    cross_val_predict,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier

### Connect to graph

In [None]:
# Connection settings come from the environment so credentials do not have to
# be committed with the notebook. The fallbacks reproduce the values that were
# previously hard-coded, so existing setups keep working unchanged.
# NOTE(review): drop the password fallback once NEO4J_PASSWORD is provisioned
# wherever this notebook runs.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://neo4j-allquanta:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Fetch the counts once and reuse them for both figures in the message.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

### Build dataset

In [None]:
# Window configuration: papers published in [start_year, end_year - years_to_use]
# are fetched, with one citation-count column and one tspr column per calendar
# year in [start_year, end_year].
years_to_use = 3
start_year = 1950
end_year = 2018

print("Getting dataset...", end=" ")

# Separator that places each generated column on its own indented line of the
# RETURN clause.
column_separator = ',\n    '

# c<year>: number of citing Quanta nodes from that year, NULL for years before
# the paper's own publication year.
cites_template = 'CASE WHEN {} < q.year THEN NULL ELSE SIZE((q)<-[:CITES]-(:Quanta {{year: {}}})) END as c{}'
cites_str = column_separator.join(
    cites_template.format(yr, yr, yr) for yr in range(start_year, end_year + 1))

# tspr<year>: the node property of the same name, passed through as-is.
tspr_template = 'q.tspr{} as tspr{}'
tspr_str = column_separator.join(
    tspr_template.format(yr, yr) for yr in range(start_year, end_year + 1))

# The query text is assembled line by line so that the whitespace of the
# original statement is explicit and preserved exactly.
query_template = "\n".join([
    "",
    "MATCH (q:Quanta)",
    "WHERE ",
    '    (q.year>={} AND q.year <= {} AND q.venue="Nature") ',
    "RETURN",
    "    q.year as year,",
    "    q.title as title,",
    "    q.id as id,",
    "    {},",
    "    {}",
    " LIMIT 15000          ",
    "",
])
last_publication_year = end_year - years_to_use
query = query_template.format(start_year, last_publication_year, tspr_str, cites_str)
print(query)

# Run the query and report the wall-clock duration in minutes.
query_start_time = time.time()
df = graph.run(query).to_data_frame()
elapsed_minutes = (time.time() - query_start_time) / 60
print("Done ({:.2f} minutes).".format(elapsed_minutes))

In [None]:
# Re-index the per-calendar-year columns relative to each paper's publication
# year: c0/p0 take the values of c<year>/tspr<year>, c1/p1 those of
# c<year+1>/tspr<year+1>, and so on up to an offset of `years_to_use`.
#
# .copy() makes df_new an independent frame, so the column assignments below
# do not raise pandas' SettingWithCopyWarning.
df_new = df[['year', 'title', 'id']].copy()
for i in range(years_to_use + 1):
    # int() keeps the generated column name well-formed even if the year comes
    # back as a float (which would otherwise format as e.g. 'c1990.0').
    df_new['c{}'.format(i)] = df.apply(
        lambda row: row['c{}'.format(int(row['year']) + i)], axis=1)
    df_new['p{}'.format(i)] = df.apply(
        lambda row: row['tspr{}'.format(int(row['year']) + i)], axis=1)


# Add in community features


In [None]:
feature_paths = ["/tmp/data/result/FeatureExtractionResults/EarlyAdopters/"]

# Start from the per-paper series and join each feature directory onto the
# running result. Previously every pass re-merged against df_new and overwrote
# df_features, so only the last directory's features survived; with the single
# path configured above the outcome is unchanged.
df_features = df_new
for feature_path in feature_paths:
    all_files = sorted(glob.glob(feature_path + "*.csv"), reverse=True)
    if not all_files:
        # Fail with the offending directory instead of pd.concat's generic
        # "No objects to concatenate".
        raise FileNotFoundError(
            "No feature CSV files found in {}".format(feature_path))

    # Read every chunk, then concatenate once and reuse the result.
    feature_vec_chunks = [pd.read_csv(csv_path) for csv_path in all_files]
    total_feature = pd.concat(feature_vec_chunks)

    # Default (inner) join on title: papers without extracted features drop out.
    df_features = df_features.merge(total_feature, on='title')

# ONLY USES FEATURES WITH FULL SERIES
df_features = df_features.dropna()
df_features

In [None]:
def balanced_subsample(x, y, subsample_size=1.0):
    """Down-sample every class to the same number of rows.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature rows.
    y : array-like
        One label per row of ``x``. A pandas Series is used as-is (so the
        boolean mask aligns on its index); anything else, including an
        ``(n, 1)`` column vector, is flattened to 1-D and matched to ``x`` by
        position.
    subsample_size : float, default 1.0
        When below 1, each class keeps ``int(min_class_size * subsample_size)``
        rows instead of ``min_class_size``.

    Returns
    -------
    xs : pandas.DataFrame
        The kept rows, grouped by class in ascending label order. Rows keep
        their original index labels.
    ys : pandas.Series
        Float labels named ``'target'``, in the same row order as ``xs`` but
        with a fresh 0..n-1 index (not aligned with ``xs.index``).

    Raises
    ------
    ValueError
        If ``y`` is empty, or is not a Series and its flattened length differs
        from ``len(x)``.
    """
    if isinstance(y, pd.Series):
        labels = y
    else:
        # A 2-D label array would make `x[labels == value]` a 2-D mask, which
        # pandas treats as `where` (NaN-filling) rather than row selection.
        labels = np.asarray(y).ravel()
        if len(labels) != len(x):
            raise ValueError(
                "x and y must have the same number of rows "
                "(got {} and {})".format(len(x), len(labels)))

    class_xs = [(label, x[labels == label]) for label in np.unique(labels)]
    if not class_xs:
        raise ValueError("cannot subsample: y contains no labels")

    min_elems = min(elems.shape[0] for _, elems in class_xs)
    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems * subsample_size)

    xs = []
    ys = []
    for ci, this_xs in class_xs:
        if len(this_xs) > use_elems:
            # Fixed seed: the same rows are kept on every run.
            this_xs = this_xs.reindex(
                np.random.RandomState(seed=42).permutation(this_xs.index))
        xs.append(this_xs[:use_elems])
        ys.append(np.full(use_elems, ci, dtype=float))

    xs = pd.concat(xs)
    ys = pd.Series(data=np.concatenate(ys), name='target')
    return xs, ys

# Feature extraction + Class Labeling
# Train + Test Split
## Balanced Subsampling

In [None]:
# Candidate feature columns: the c0..cN and p0..pN series plus the
# early_adopters_1..N community features. `cs` ends up holding all of them.
cs = ['c{}'.format(x) for x in range(years_to_use+1)]
ps = ['p{}'.format(x) for x in range(years_to_use+1)]
ea = ['early_adopters_{}'.format(x) for x in range(1,years_to_use+1)]
cs.extend(ps)
cs.extend(ea)
X = df_features[cs]

# Boolean target: is the paper's p<years_to_use> value at or above the 95th
# percentile of that column?
target_col = 'p{}'.format(years_to_use)
y = df_features[target_col] >= df_features[target_col].quantile(0.95)

# Only the first `num_signals` years of the p-series are used as predictors.
# To experiment with other signals, extend `to_keep` with the matching
# 'c{}' or 'early_adopters_{}' column names.
num_signals = 1
to_keep = ['p{}'.format(i) for i in range(min(num_signals, years_to_use + 1))]

# NOTE(review): for a two-class problem label_binarize returns a single column,
# and with classes=[True, False] it appears to encode the *last* listed class
# (False, i.e. below the 95th percentile) as 1. Confirm which class the
# downstream `pos_label=1` scorers and ROC curve are meant to describe.
y = label_binarize(y, classes=[True,False])
n_classes = y.shape[1]

X_restricted = X[to_keep]
X_train, X_test, y_train, y_test = train_test_split(X_restricted, y, test_size=0.33, random_state=42)

# y_train is an (n, 1) column vector; flatten it so the per-class boolean mask
# inside balanced_subsample selects rows instead of acting as a 2-D mask.
X_bal_train, y_bal_train = balanced_subsample(X_train, y_train.ravel())

In [None]:
# scikit-learn is already imported by the first cell, so it has to be installed
# before the notebook starts; the unpinned mid-notebook `!pip install` that
# used to sit here was redundant and has been removed.

# Recall on label 1 and recall on label 0, respectively.
sensitivity = make_scorer(recall_score, pos_label = 1)
specificity = make_scorer(recall_score, pos_label = 0)

scoring = {'acc': 'accuracy',
           'sensitivity': sensitivity,
           'specificity': specificity}

clf = RandomForestClassifier(random_state=0)
cv_results = cross_validate(clf, X_bal_train, y_bal_train, scoring=scoring,
                         cv=5, return_train_score=True, return_estimator=True)

# Keep the fold estimator with the highest held-out accuracy.
best_est_index = np.argmax(cv_results['test_acc'])
best_estimator = cv_results['estimator'][best_est_index]

# Flatten the labels so the scorers receive a 1-D target even when y_test is an
# (n, 1) column vector.
y_test_flat = np.ravel(y_test)
sensitivity(best_estimator, X_test, y_test_flat), specificity(best_estimator, X_test, y_test_flat)


In [None]:
y_score = best_estimator.predict_proba(X_test)

# The columns of predict_proba follow best_estimator.classes_. Look up the
# column for label 1 explicitly instead of hard-coding its position.
positive_label = 1
positive_col = list(best_estimator.classes_).index(positive_label)

# y_test may be an (n, 1) column vector; roc_curve needs a 1-D target.
y_true = np.asarray(y_test)
if y_true.ndim == 2:
    y_true = y_true[:, 0]

# Compute the ROC curve and its area for the positive label. The dicts keep the
# original layout (results stored under key 0).
fpr = dict()
tpr = dict()
roc_auc = dict()
fpr[0], tpr[0], _thresholds = roc_curve(y_true, y_score[:, positive_col])
roc_auc[0] = auc(fpr[0], tpr[0])

fig, ax = plt.subplots()
lw = 2
ax.plot(fpr[0], tpr[0], color='darkorange',
        lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[0])
# Diagonal reference line for comparison with the curve.
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic curve')
ax.legend(loc="lower right")
plt.show()