### Imports

In [56]:
import json
import time
import pandas as pd
from py2neo import Graph, Node, Relationship
import numpy as np
import matplotlib.pyplot as plt
!pip install --user --upgrade scikit-learn

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

Requirement already up-to-date: scikit-learn in /home/jovyan/.local/lib/python3.6/site-packages (0.20.3)


### Connect to graph

In [None]:
graph = Graph("bolt://neo4j-magone:7687", auth=('neo4j','myneo'))
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    graph.database.primitive_counts['NumberOfNodeIdsInUse'], 
    graph.database.primitive_counts['NumberOfRelationshipIdsInUse']))

### Build dataset

In [None]:
years_to_use = 3
start_year = 2000
end_year = 2018

print("Getting dataset...", end=" ")
cites_str = ',\n    '.join(['CASE WHEN {} < q.year THEN NULL ELSE SIZE((q)<-[:CITES]-(:Quanta {{year: {}}})) END as c{}'.format(
    yr, yr, yr) for yr in range(start_year, end_year+1)])
tspr_str = ',\n    '.join(['q.tspr{} as tspr{}'.format(
    yr, yr) for yr in range(start_year, end_year+1)])
query = """
MATCH (q:Quanta)
WHERE 
    (q.year>={} AND q.year <= {} AND q.venue="Nature") 
RETURN
    q.year as year,
    q.title as title,
    q.id as id,
    {},
    {}
    
LIMIT 15000
""".format(start_year, end_year-years_to_use, tspr_str, cites_str)
print(query)
query_start_time = time.time()
df = graph.run(query).to_data_frame()
print("Done ({:.2f} minutes).".format((time.time()-query_start_time)/60))

In [None]:
df_new = df[['year','title','id']]
for i in range(years_to_use+1):
    df_new['c{}'.format(i)] = df.apply(lambda row: row['c{}'.format(row['year']+i)], axis=1)
    df_new['p{}'.format(i)] = df.apply(lambda row: row['tspr{}'.format(row['year']+i)], axis=1)


# Add in community features


In [None]:
import glob
feature_paths = ["/tmp/data/result/FeatureExtractionResults/EarlyAdopters/"]

for feature_path in feature_paths:
    all_files = sorted(glob.glob(feature_path +"*.csv"), reverse=True)
    feature_vec_chunks = []
    for file in all_files:
        feature_vec_chunks.append(pd.read_csv(file))
        
    total_feature = pd.concat(feature_vec_chunks)
#     print(sorted(total_feature.title))
    df_features = df_new.merge(pd.concat(feature_vec_chunks), on='title')

# ONLY USES FEATURES WITH FULL SERIES
df_features = df_features.dropna()
df_features

In [None]:
def balanced_subsample(x,y,subsample_size=1.0):
    class_xs = []
    min_elems = None
    for yi in np.unique(y):
        elems = x[(y == yi)]
        class_xs.append((yi, elems))
        if min_elems == None or elems.shape[0] < min_elems:
            min_elems = elems.shape[0]
    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems*subsample_size)
    xs = []
    ys = []
    for ci,this_xs in class_xs:
        if len(this_xs) > use_elems:
            this_xs = this_xs.reindex(np.random.RandomState(seed=42).permutation(this_xs.index))
        x_ = this_xs[:use_elems]
        y_ = np.empty(use_elems)
        y_.fill(ci)
        xs.append(x_)
        ys.append(y_)

    xs = pd.concat(xs)
    ys = pd.Series(data=np.concatenate(ys),name='target')
    return xs,ys

# Feature extraction + Class Labeling
# Train + Test Split
## Balanced Subsampling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

cs = ['c{}'.format(x) for x in range(years_to_use+1)]
ps = ['p{}'.format(x) for x in range(years_to_use+1)]
ea = ['early_adopters_{}'.format(x) for x in range(1,years_to_use+1)]
cs.extend(ps)
cs.extend(ea)
X = df_features[cs]
y = df_features['p{}'.format(years_to_use)] >= df_features['p{}'.format(years_to_use)].quantile(0.95)


to_keep = []
num_signals = 1
for i in range(years_to_use+1):
    
    if i == num_signals:
        break
#     to_keep.append('c{}'.format(i))
    to_keep.append('p{}'.format(i))
    pass

for i in range(1, years_to_use+1):
    if i-1 == num_signals:
        break
#     to_keep.append('early_adopters_{}'.format(i))

y = label_binarize(y, classes=[True,False])
n_classes = y.shape[1]


X_restricted = X[to_keep]
X_train, X_test, y_train, y_test = train_test_split(X_restricted, y, test_size=0.33, random_state=42)
X_bal_train, y_bal_train = balanced_subsample(X_train,y_train)

In [None]:
!pip install scikit-learn
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, recall_score, accuracy_score


sensitivity = make_scorer(recall_score, pos_label = 1)
specificity = make_scorer(recall_score, pos_label = 0)

scoring = {'acc': 'accuracy',
           'sensitivity': sensitivity,
           'specificity': specificity}
# clf = DecisionTreeClassifier(random_state=0)
clf = RandomForestClassifier(random_state=0)
cv_results = cross_validate(clf, X_bal_train, y_bal_train, scoring=scoring,
                         cv=5, return_train_score=True, return_estimator=True)


best_est_index = np.argmax(cv_results['test_acc'])
best_estimator = cv_results['estimator'][best_est_index]

sensitivity(best_estimator, X_test, y_test), specificity(best_estimator, X_test, y_test)


In [None]:
from sklearn.metrics import roc_curve, auc

y_score = best_estimator.predict_proba(X_test)



# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()


roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, 1]) # HACK MUST FIX THIS IMMEDIATELY, PROB(CLASS = 1) IS 1ST INDEX IN Y_SCORE
    roc_auc[i] = auc(fpr[i], tpr[i])


plt.figure()
lw = 2
plt.plot(fpr[0], tpr[0], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[0])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()

# Predicting Top Papers

## Select Training Data (first year of citations + first year of tsprn + early_adopters)

## Train on given period (balanced)

## Query for test sample

## Run predictions and sort by tsprn predictions

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

cs = ['c{}'.format(x) for x in range(years_to_use+1)]
ps = ['p{}'.format(x) for x in range(years_to_use+1)]
ea = ['early_adopters_{}'.format(x) for x in range(1,years_to_use+1)]
title = ['title']
cs.extend(ps)
cs.extend(ea)
cs.extend(title)
X = df_features[cs]
y = df_features['p{}'.format(years_to_use)] >= df_features['p{}'.format(years_to_use)].quantile(0.95)


to_keep = []
num_signals = 1
for i in range(years_to_use+1):
    
    if i == num_signals:
        break
    to_keep.append('c{}'.format(i))
    to_keep.append('p{}'.format(i))
    pass

for i in range(1, years_to_use+1):
    if i-1 == num_signals:
        break
    to_keep.append('early_adopters_{}'.format(i))

y = label_binarize(y, classes=[True,False])
n_classes = y.shape[1]

to_keep.append('title')

X_restricted = X[to_keep]
X_train, X_test, y_train, y_test = train_test_split(X_restricted, y, test_size=0.33, random_state=42)
X_bal_train, y_bal_train = balanced_subsample(X_train,y_train)



In [114]:
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, recall_score, accuracy_score


y = df_features['p3']
X_train, X_test, y_train, y_test = train_test_split(X_restricted, y, test_size=0.33, random_state=42)
X_bal_train, y_bal_train = balanced_subsample(X_train,y_train)
#changes invariant
X_train_numeric = X_train.drop(columns=['title'])
X_bal_train_numeric = X_bal_train.drop(columns=['title'])
X_test_numeric = X_test.drop(columns=['title'])

scoring = {'mse': 'neg_mean_squared_error', 'r2': 'r2'}
clf = RandomForestRegressor(random_state=0)
cv_results = cross_validate(clf, X_train_numeric, y_train, scoring=scoring,
                         cv=5, return_train_score=True)




clf.fit(X_train_numeric, y_train)
X_train_numeric



Unnamed: 0,c0,p0,early_adopters_1
1789,6.0,0.184145,12
6264,2.0,0.150000,225
2059,6.0,23.881220,133
4786,2.0,0.150000,7
3053,1.0,0.150000,83
7800,0.0,0.150000,0
1902,4.0,0.150000,45
2750,0.0,0.150000,0
2119,7.0,0.150000,165
7695,0.0,0.150000,31


In [173]:
years_to_use = 1
start_year = 2017
end_year = 2018

print("Getting dataset...", end=" ")
cites_str = ',\n    '.join(['CASE WHEN {} < q.year THEN NULL ELSE SIZE((q)<-[:CITES]-(:Quanta {{year: {}}})) END as c{}'.format(
    yr, yr, yr) for yr in range(start_year, end_year+1)])
tspr_str = ',\n    '.join(['q.tspr{} as tspr{}'.format(
    yr, yr) for yr in range(start_year, end_year+1)])
query = """
MATCH (q:Quanta)
WHERE 
    (q.year>={} AND q.year <= {} AND q.venue="Nature") 
RETURN
    q.year as year,
    q.title as title,
    q.id as id,
    {},
    {}
    
""".format(start_year, end_year-years_to_use, tspr_str, cites_str)
print(query)
query_start_time = time.time()
df_2018 = graph.run(query).to_data_frame()
print("Done ({:.2f} minutes).".format((time.time()-query_start_time)/60))

Getting dataset... 
MATCH (q:Quanta)
WHERE 
    (q.year>=2017 AND q.year <= 2017 AND q.venue="Nature") 
RETURN
    q.year as year,
    q.title as title,
    q.id as id,
    q.tspr2017 as tspr2017,
    q.tspr2018 as tspr2018,
    CASE WHEN 2017 < q.year THEN NULL ELSE SIZE((q)<-[:CITES]-(:Quanta {year: 2017})) END as c2017,
    CASE WHEN 2018 < q.year THEN NULL ELSE SIZE((q)<-[:CITES]-(:Quanta {year: 2018})) END as c2018
    

Done (0.07 minutes).


In [181]:
years_to_use = 1

df_new = df_2017[['year','title','id']]
for i in range(years_to_use):
    df_new['c{}'.format(i)] = df_2017.apply(lambda row: row['c{}'.format(row['year']+i)], axis=1)
    df_new['p{}'.format(i)] = df_2017.apply(lambda row: row['tspr{}'.format(row['year']+i)], axis=1)
    
df_new = df_new.drop(columns=['id','year'])

feature_paths = ["/tmp/data/result/FeatureExtractionResults/EarlyAdopters/"]

for feature_path in feature_paths:
    all_files = sorted(glob.glob(feature_path +"*.csv"), reverse=True)
    feature_vec_chunks = []
    for file in all_files:
        feature_vec_chunks.append(pd.read_csv(file))
        
    total_feature = pd.concat(feature_vec_chunks)
#     print(sorted(total_feature.title))
    df_2017_features = df_new.merge(pd.concat(feature_vec_chunks), on='title')

cs = ['c{}'.format(x) for x in range(years_to_use)]
ps = ['p{}'.format(x) for x in range(years_to_use)]
ea = ['early_adopters_{}'.format(x) for x in range(1,years_to_use+1)]
title = ['title']
cs.extend(ps)
cs.extend(ea)
cs.extend(title)
df_2017_features = df_2017_features[cs]
    

# # ONLY USES FEATURES WITH FULL SERIES
df_2017_features = df_2017_features.dropna()


TOP_N = 20

pagerank_preds = clf.predict(df_2017_features.drop(columns=['title']))

TOP_N_PAPERS = sorted(enumerate(pagerank_preds), key=lambda x: -x[1])[:TOP_N]

TOP_N_PAPERS = [df_2017_features.iloc[index] for index in [x[0] for x in TOP_N_PAPERS]]

pd.DataFrame(TOP_N_PAPERS).to_csv('/tmp/data/2017_top_20_in_3_years.csv', index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [175]:
df_2017

Unnamed: 0,c2017,c2018,id,title,tspr2017,tspr2018,year
0,0,0,0010fa71-f0e6-45c2-90f7-1c8e2c480b35,Human studies: Reforms overdue for ethical rev...,0.15000,0.15000,2017
1,0,0,00124ae5-87b7-4b62-99ca-a11d24174d8b,Marine litter: Sea change for plastic pollution,0.15000,0.15000,2017
2,2,0,005d6f9e-09f6-4dcb-8e1c-7ae4cd99d8cf,Stable colloids in molten inorganic salts,0.15000,0.15000,2017
3,1,0,00611e70-ca47-4bd9-8d2b-ba6a3d7ad742,Give the public the tools to trust scientists,0.15000,0.15000,2017
4,3,0,00636661-f9f7-48cc-8804-c5696c7f3c98,Recent increase in oceanic carbon uptake drive...,0.27750,0.27750,2017
5,0,0,0067cba2-6dcf-4ee6-b276-712700a9e12c,Surprising contenders emerge for Trump’s NIH c...,0.15000,0.15000,2017
6,0,0,00b3d255-1709-4db9-94c4-ffb5fc3105a2,Ensemble cryo-EM elucidates the mechanism of t...,0.15000,0.15000,2017
7,0,0,018b6bb9-aa32-48bd-8f47-b27c72fc4130,Trump budget would slash science programmes ac...,0.15000,0.15000,2017
8,10,0,01adb095-c662-4207-8e77-caad0b8f6737,Structure of a spliceosome remodelled for exon...,0.15000,0.15000,2017
9,2,0,01c26c30-5ce5-402d-aa50-dec7841d3b70,Supersolid formation in a quantum gas breaking...,0.19250,0.19250,2017
