### Setup

In [2]:
import json
import time
from functools import reduce

import pandas as pd
import numpy as np

from ast import literal_eval

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
from tqdm.autonotebook import tqdm

# import seaborn as sns
# sns.set(rc={'figure.figsize':(12,8)})
# sns.set(style='whitegrid')

# import matplotlib.pyplot as plt

from py2neo import Graph, Node, Relationship

In [3]:
# public_address = '54.174.175.98'
# public_address = '54.88.167.164'
public_address = 'top42_neo4j'

graph = Graph('bolt://{}:7687'.format(public_address), auth=('neo4j','myneo'))

def run_query(query, graph, print_query=False, run_query=True, 
              print_only=False, to_df=False, verbose=True):
    df = 1
    if print_only: 
        print_query = True
        run_query = False
    start_time = time.time()
    if print_query:
        print(query)
    if run_query:
        if to_df:
            df = graph.run(query).to_data_frame()
        else:
            graph.run(query)
    end_time = time.time()
    minutes_elapsed = (end_time-start_time)/60
    if verbose:
        print("Query completed in {:.2f} minutes.".format(minutes_elapsed))
    return df


In [4]:
years_to_consider = range(1950, 2019)
max_years_to_predict = 10
years_tracked = 5

vars_to_use = ['adopters', 'timeScaledPageRank', 'citations', 'node2vec']

max_year = max(years_to_consider)
min_year = min(years_to_consider)

### Write data to CSV

In [38]:
metrics_pattern = 'MATCH (q)-[m{y}:METRICS_IN]->(:Year {{year:y.year+{y}}})'
metrics_string_list = [metrics_pattern.format(y=i) for i in range(years_tracked+1)]
metrics_string = '\n'.join(metrics_string_list)

var_pattern = 'coalesce(m{y}.{v},0) AS {v}_{y},'
var_string_list = [var_pattern.format(y=i,v=j) for i in range(years_tracked+1) for j in vars_to_use]
var_string = '\n'.join(var_string_list)

where_pattern = 'exists(m{y}.node2vec)'
where_string_list = [where_pattern.format(y=i) for i in range(2)]#years_tracked+1)]
where_string = ' AND '.join(where_string_list)       

author_metrics_pattern = 'MATCH (q)-[ma{y}:METRICS_IN]->(:Year {{year:y.year+{y}}})'
author_metrics_string_list = [author_metrics_pattern.format(y=i) for i in range(years_tracked+1)]
author_metrics_string = '\n'.join(author_metrics_string_list)

author_vars = ['total_papers', 'author_age', 'max_citations', 'num_venues', 'total_citations']
author_funs = ['max', 'sum']
author_var_pattern = '{f}(coalesce(ma{y}.{v},0)) AS {v}_{f}_{y},'
author_var_string_list = [author_var_pattern.format(y=i, v=j, f=k)
                          for i in range(years_tracked+1)
                          for j in author_vars
                          for k in author_funs]
author_var_string = '\n'.join(author_var_string_list)


query = """
MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year)
WHERE y.year>={miny} AND y.year<={maxy}//-{ya}
{metrics_string}
{author_metrics_string}
WHERE {where_string}
RETURN 
    {var_string}
    {author_var_string}
    id(q) AS id, 
    y.year AS year
""".format(miny=min_year, 
           maxy=max_year,
           ya=max_years_to_predict,
           metrics_string=metrics_string,
           var_string=var_string, 
           where_string=where_string, 
           author_metrics_string='',
           author_var_string='')

query_tocsv = """
CALL apoc.export.csv.query('
{q}
','/import/quanta.predict.{miny}.{maxy}.{yt}.csv', 
{{quotes:true}});
""".format(q=query, 
           miny=min_year, 
           maxy=max_year,
           yt=years_tracked)

run_query(query_tocsv, graph, to_df=False, print_only=True)


CALL apoc.export.csv.query('

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year)
WHERE y.year>=1950 AND y.year<=2018//-10
MATCH (q)-[m0:METRICS_IN]->(:Year {year:y.year+0})
MATCH (q)-[m1:METRICS_IN]->(:Year {year:y.year+1})
MATCH (q)-[m2:METRICS_IN]->(:Year {year:y.year+2})
MATCH (q)-[m3:METRICS_IN]->(:Year {year:y.year+3})
MATCH (q)-[m4:METRICS_IN]->(:Year {year:y.year+4})
MATCH (q)-[m5:METRICS_IN]->(:Year {year:y.year+5})

WHERE exists(m0.node2vec) AND exists(m1.node2vec)
RETURN 
    coalesce(m0.adopters,0) AS adopters_0,
coalesce(m0.timeScaledPageRank,0) AS timeScaledPageRank_0,
coalesce(m0.citations,0) AS citations_0,
coalesce(m0.node2vec,0) AS node2vec_0,
coalesce(m1.adopters,0) AS adopters_1,
coalesce(m1.timeScaledPageRank,0) AS timeScaledPageRank_1,
coalesce(m1.citations,0) AS citations_1,
coalesce(m1.node2vec,0) AS node2vec_1,
coalesce(m2.adopters,0) AS adopters_2,
coalesce(m2.timeScaledPageRank,0) AS timeScaledPageRank_2,
coalesce(m2.citations,0) AS citations_

1

### Write Data To CSV (With Author Features)

In [None]:
query_ex = """
MATCH (q)-[:PUBLISHED_IN]->(y:year {{year: {}}}) 
MATCH (q)-[m:METRICS_IN]->(y)
MATCH (a)-[:AUTHORED]-(q)
MATCH ()

"""


metrics_pattern = 'MATCH (q)-[m{y}:METRICS_IN]->(:Year {{year:y.year+{y}}})'
metrics_string_list = [metrics_pattern.format(y=i) for i in range(years_tracked+1)]
metrics_string = '\n'.join(metrics_string_list)

var_pattern = 'coalesce(m{y}.{v},0) AS {v}_{y},'
var_string_list = [var_pattern.format(y=i,v=j) for i in range(years_tracked+1) for j in vars_to_use]
var_string = '\n'.join(var_string_list)

where_pattern = 'exists(m{y}.node2vec)'
where_string_list = [where_pattern.format(y=i) for i in range(years_tracked+1)]
where_string = ' AND '.join(where_string_list)       

author_metrics_pattern = 'MATCH (q)<-[:AUTHORED]-(a:Author)-[ma{y}:METRICS_IN]->(:Year {{year:y.year+{y}}})' #changed from q to a
author_metrics_string_list = [author_metrics_pattern.format(y=i) for i in range(years_tracked+1)]
author_metrics_string = '\n'.join(author_metrics_string_list)

# node2vec with different nodes types
# change only to average
#author_vars = ['total_papers', 'author_age', 'max_citations', 'num_venues', 'total_citations']
author_vars = [
    'hIndex', 'hIndexDelta', 'totalCitations', 'totalCitationsDelta', 'citationsPerPaper', 
    'citationsPerPaperDelta','citationsPerYear', 'totalPapers', 'totalPapersDelta', 'rankCitationsPerYear',
    'pageRank', 'weightedPageRank', 'authorAge', 'recentCoauthors', 'maxCitations', 'totalVenues', 
    'venueHIndexMean', 'venueHIndexDeltaMean', 'venueCitationsPerPaperMean', 'venueCitationsPerPaperDeltaMean',  
    'venueTotalPapersMean', 'venueTotalPapersDeltaMean', 'venueRankCitationsPerPaperMean', 'venueMaxCitationsMean']

author_funs = ['avg', 'max', 'sum'] # why are we doing sum here? should we do average instead or in addition to sum?
author_var_patterns = 
author_var_pattern = '{f}(coalesce(ma{y}.{v},0)) AS {v}_{f}_{y},'
author_var_string_list = [author_var_pattern.format(y=i, v=j, f=k)
                          for i in range(years_tracked+1)
                          for j in author_vars
                          for k in author_funs]
author_var_string = '\n'.join(author_var_string_list)

# Do we need any with statements
query = """
MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year)
WHERE y.year>={miny} AND y.year<={maxy}//-{ya}
{metrics_string}
{author_metrics_string}
WHERE {where_string}
RETURN 
    {var_string}
    {author_var_string}
    id(q) AS id, 
    y.year AS year
""".format(miny=min_year, 
           maxy=max_year,
           ya=max_years_to_predict,
           metrics_string=metrics_string,
           var_string=var_string, 
           where_string=where_string, 
           author_metrics_string='',
           author_var_string='')

query_tocsv = """
CALL apoc.export.csv.query('
{q}
','/import/quanta.predict.{miny}.{maxy}.{yt}.csv', 
{{quotes:true}});
""".format(q=query, 
           miny=min_year, 
           maxy=max_year,
           yt=years_tracked)

run_query(query_tocsv, graph, to_df=False, print_only=True)

### Read in data for each year, format it, and write it back

In [None]:
fpath = '/tmp/data/quanta.predict.{miny}.{maxy}.{yt}.csv'.format(
    miny=min_year, maxy=max_year, yt=years_tracked)
df = pd.read_csv(fpath, engine='python', error_bad_lines=False)
df = df.dropna()

for y in tqdm(range(years_tracked+1)):

    col = 'node2vec_{}'.format(y)    
    n2vdf = pd.DataFrame(df[col].apply(json.loads).tolist())
    n2v_dim = n2vdf.shape[1]
    n2vdf.columns = ['{}_{}'.format(col, i) for i in range(n2v_dim)]

    df = pd.concat([df.reset_index(drop=True), n2vdf.reset_index(drop=True)], axis=1)
    df = df.drop(col, axis=1)


df.to_csv('{}.out'.format(fpath))

### Read in prediction data and make predictions

In [18]:
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, RobustScaler
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier
from sklearn.metrics import f1_score, mean_squared_error, classification_report, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import make_pipeline, Pipeline

from sklearn.externals.joblib import parallel_backend


%env JOBLIB_TEMP_FOLDER=/tmp

env: JOBLIB_TEMP_FOLDER=/tmp




In [23]:
df = pd.read_csv('/tmp/data/quanta.predict.{miny}.{maxy}.{yt}.csv.out'.format(
    miny=min_year, maxy=max_year, yt=years_tracked))

In [24]:
df.describe()

Unnamed: 0.1,Unnamed: 0,adopters_0,adopters_1,adopters_2,adopters_3,adopters_4,adopters_5,citations_0,citations_1,citations_2,...,node2vec_5_2,node2vec_5_3,node2vec_5_4,node2vec_5_5,node2vec_5_6,node2vec_5_7,node2vec_5_8,node2vec_5_9,node2vec_5_10,node2vec_5_11
count,5262588.0,5262588.0,5262588.0,5262588.0,5262588.0,5262588.0,5262588.0,5262588.0,5262588.0,5262588.0,...,5262588.0,5262588.0,5262588.0,5262588.0,5262588.0,5262588.0,5262588.0,5262588.0,5262588.0,5262588.0
mean,2631294.0,11.32223,28.89139,31.20092,24.97449,0.5485345,0.07468759,0.8941316,2.060872,2.016791,...,7420117000000.0,9878786000000.0,70433230000000.0,-53304090000000.0,-13977890000000.0,1691927000000.0,24060430000000.0,31382540000000.0,90321890000000.0,-136677700000000.0
std,1519178.0,68.83398,119.661,142.1127,119.3722,39.36234,5.916535,2.479747,4.74452,4.934994,...,122876000000000.0,163254000000000.0,714060700000000.0,562413600000000.0,283140800000000.0,59398740000000.0,509614600000000.0,710947200000000.0,744173800000000.0,1055877000000000.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-4462150000000000.0,-5927330000000000.0,-1.65417e+16,-3.07207e+16,-1.47557e+16,-7837340000000000.0,-1.29399e+16,-1.8044e+16,-1.70534e+16,-6.34912e+16
25%,1315647.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.11738,-0.390313,-0.22098,-0.832286,-0.8384342,-0.441695,-0.393653,-0.172001,-0.517278,-0.718276
50%,2631294.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.03850575,-0.0268932,0.00868217,-0.053288,-0.0386948,-0.0360694,-0.0241307,0.0707772,0.0435653,-0.0909748
75%,3946940.0,4.0,20.0,21.0,16.0,0.0,0.0,1.0,2.0,2.0,...,0.549651,0.193415,0.2613402,0.144435,0.179696,0.163749,0.42826,0.5274802,0.451131,0.399981
max,5262587.0,2992.0,5025.0,5025.0,4999.0,4015.0,773.0,99.0,180.0,205.0,...,5361910000000000.0,7120870000000000.0,3.88743e+16,1.3558e+16,3.36914e+16,3422790000000000.0,5.631e+16,7.78044e+16,4.66942e+16,2.35095e+16


In [25]:
df.index[df['year'] <= max_year - years_tracked + 1]

Int64Index([      0,       1,       2,       3,       4,       5,       6,
                  7,       8,       9,
            ...
            5262578, 5262579, 5262580, 5262581, 5262582, 5262583, 5262584,
            5262585, 5262586, 5262587],
           dtype='int64', length=5262588)

In [None]:


results = []

for years_to_track in tqdm(range(years_tracked)):
#     for year_to_predict in range(years_to_track+1, max_years_to_predict+1):
#         print(years_to_track, year_to_predict)
#         pass
    year_to_predict = years_to_track + 1
    
    cols_to_keep = ['{v}_{y}'.format(y=i,v=j) 
        for i in range(years_to_track+1) 
        for j in [v for v in vars_to_use if v!='node2vec']]

    n2v_cols_to_keep =  ['node2vec_{y}_{i}'.format(y=y, i=i) 
                     for y in range(years_to_track+1)
                     for i in range(n2v_dim)]
        
    cols_to_keep = cols_to_keep + n2v_cols_to_keep
    X = df.loc[:, cols_to_keep]
    
    y_col = 'timeScaledPageRank_{y}'.format(y=year_to_predict)
    y = df.loc[:, y_col] > df[y_col].quantile(q=.95)


    pipeline = Pipeline(
        memory=None,
        steps=[
            ('spl', SMOTE()),
            ('scl', QuantileTransformer()),
            ('clf', RandomForestClassifier())
        ]
    )

    grid = {'clf__n_estimators': [int(x) for x in np.linspace(200, 2000, num=10)],
            'clf__max_features': ['auto', 'sqrt'],
            'clf__max_depth': [int(x) for x in np.linspace(10, 1000, num=10)],
            'clf__min_samples_split': [2, 5, 10],
            'clf__min_samples_leaf': [1, 2, 4],
            'clf__bootstrap': [True, False]}

    random_search = RandomizedSearchCV(
        estimator=pipeline, 
        param_distributions=grid, 
        n_iter=10, 
        cv=3, 
        n_jobs=-1,
        verbose=3,
        random_state=42
    )


    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    with parallel_backend('threading'):
        random_search.fit(X_train, y_train)

    y_pred = random_search.predict(X_test)

    results.append({
        'years_tracked': years_to_track, 
        'year_predicted': year_to_predict,
        'score': random_search.score(X=X_test, y=y_test),
        'f1': f1_score(y_pred=y_pred, y_true=y_test),
        'balanced_accuracy': balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
        'balanced_accuracy_adjusted': balanced_accuracy_score(y_true=y_test, y_pred=y_pred, adjusted=True),
        'classification_report': classification_report(y_true=y_test, y_pred=y_pred, output_dict=True),
        'random_search': random_search
    })

    pickle.dump(results, open('prediction_models_{}.pickle'.format(years_to_track),'wb'))
    
pickle.dump(results, open('prediction_models.pickle','wb'))

In [None]:
years_to_track=3
year_to_predict=5
n2v_dim=12

results = []

cols_to_keep = ['{v}_{y}'.format(y=i,v=j) 
        for i in range(years_to_track+1) 
        for j in [v for v in vars_to_use if v!='node2vec']]

n2v_cols_to_keep =  ['node2vec_{y}_{i}'.format(y=y, i=i) 
                 for y in range(years_to_track+1)
                 for i in range(n2v_dim)]

cols_to_keep = cols_to_keep + n2v_cols_to_keep
X = df.loc[:, cols_to_keep]

y_col = 'timeScaledPageRank_{y}'.format(y=year_to_predict)
y = df.loc[:, y_col] > df[y_col].quantile(q=.95)


pipeline = Pipeline(
    memory=None,
    steps=[
        ('spl', RandomUnderSampler()),
        ('scl', QuantileTransformer()),
        ('clf', RandomForestClassifier())
    ]
)

grid = {'clf__n_estimators': [int(x) for x in np.linspace(200, 2000, num=10)],
        'clf__max_features': ['auto', 'sqrt'],
        'clf__max_depth': [int(x) for x in np.linspace(10, 1000, num=10)],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__bootstrap': [True, False]}

random_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions=grid, 
    n_iter=15, 
    cv=3, 
    n_jobs=-1,
    verbose=3,
    random_state=42
)


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

with parallel_backend('threading'):
    random_search.fit(X_train, y_train)
    
y_pred = random_search.predict(X_test)

results.append({
    'years_tracked': years_to_track, 
    'year_predicted': year_to_predict,
    'score': random_search.score(X=X_test, y=y_test),
    'f1': f1_score(y_pred=y_pred, y_true=y_test),
    'balanced_accuracy': balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
    'balanced_accuracy_adjusted': balanced_accuracy_score(y_true=y_test, y_pred=y_pred, adjusted=True),
    'classification_report': classification_report(y_true=y_test, y_pred=y_pred, output_dict=True),
    'random_search': random_search
})

print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
results


In [None]:
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
pipeline.get_params()


In [None]:
years_to_track=1
year_to_predict=10
n2v_dim=12

results = pd.DataFrame(columns=['years_tracked', 'year_predicted', 'score'])


vars_to_use = ['adopters', 'citatons', 'node2vec']

cols_to_keep = ['{v}_{y}'.format(y=i,v=j) 
        for i in range(years_to_track+1) 
        for j in [v for v in vars_to_use if v!='node2vec']]

n2v_cols_to_keep =  ['node2vec_{y}_{i}'.format(y=y, i=i) 
                 for y in range(years_to_track+1)
                 for i in range(n2v_dim)]

cols_to_keep = cols_to_keep + n2v_cols_to_keep
X = df.loc[:, cols_to_keep]

y_col = ['timeScaledPageRank_{y}'.format(y=i) for i in range(2,10)]


y = df.loc[:, y_col] > df[y_col].quantile(q=.9)


param_grid = {
    "max_depth": [10, None],
    "max_features": [5, 10, 20],
    "n_estimators": [10, 100, 200]
}

# pipeline = make_pipeline(
# #     SMOTE(),
#     RandomUnderSampler(),
# #     QuantileTransformer(),
#     MinMaxScaler(),
#     GridSearchCV(RandomForestClassifier(class_weight={True:3, False:1}),
#                  param_grid=param_grid, 
#                  refit=True, 
#                  n_jobs=-1, 
#                  cv=2, 
#                  scoring='f1') 
# )


pipeline = Pipeline(
    memory=None,
    steps=[
#         ('spl', SMOTE()),
#         ('scl', MinMaxScaler()),
        ('clf', RandomForestClassifier())
    ]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

results = results.append({
    'years_tracked': years_to_track, 
    'year_predicted': year_to_predict,
    'score': pipeline.score(X=X_test, y=y_test),
#     'f1': f1_score(y_pred=y_pred, y_true=y_test)
}, ignore_index=True)

print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
pipeline.predict_proba(df.loc[df['id'] == 29052283, cols_to_keep])

In [None]:
from plot_metric.functions import  BinaryClassification

bc = BinaryClassification(y_test, y_prob, labels=["Low Impact", "High Impact"])

plt.figure(figsize=(15,10))

plt.subplot2grid(shape=(2,6), loc=(0,0), colspan=2)
bc.plot_roc_curve()

plt.subplot2grid((2,6), (0,2), colspan=2)
bc.plot_precision_recall_curve()

plt.subplot2grid((2,6), (0,4), colspan=2)
bc.plot_class_distribution()

plt.subplot2grid((2,6), (1,1), colspan=2)
bc.plot_confusion_matrix()

plt.subplot2grid((2,6), (1,3), colspan=2)
bc.plot_confusion_matrix(normalize=True)

plt.show()
bc.print_report()

In [None]:
y_pred = pipeline.predict(X_test)
f1_score(y_pred=y_pred, y_true=y_test)

In [None]:
sns.heatmap(data=results.pivot('years_tracked', 'year_predicted', 'score'), 
           annot=True, fmt='.2f', linewidth=.5, cbar=True, square=True, 
           cmap='YlGnBu', center=results['score'].mean())

In [None]:
plt.figure(figsize=(10,20))
feature_importance = pd.DataFrame.from_dict(
    dict(zip(X.columns,pipeline.steps[1][1].feature_importances_)), orient='index').T
sns.barplot(orient='h',data=feature_importance)

In [None]:
pipeline

In [None]:
.shape

In [None]:
from sklearn.model_selection import validation_curve

param = 'randomforestclassifier__max_depth'
param_range = list(range(1,100,25))
n_cv = 2

train_scores, valid_scores = validation_curve(pipeline, 
                                              X=X, 
                                              y=y,
                                              n_jobs=-1,
                                              param_name=param,
                                              scoring='roc_auc',
                                              param_range=param_range,
                                              cv=n_cv)

vdf = pd.DataFrame(np.concatenate([train_scores, valid_scores]),
             columns=['cv_fold_{}'.format(i) for i in range(n_cv)],
            )
vdf[param] = param_range*2
vdf['type'] = ['train']*len(param_range) + ['valid']*len(param_range)

sns.lineplot(data=vdf.melt(id_vars=['type', param]), 
             x=param, y='value', hue='type')

In [None]:
pd.concat([X_train, X_test]).head()

In [None]:
X.columns == 

### Predictions

In [None]:
from tpot import TPOTRegressor
tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2, n_jobs=-1, 
                                                    scoring='r2', config_dict='TPOT light', 
                                                    max_time_mins=30, max_eval_time_mins=5) 
tpot.fit(X_train, y_train)

In [None]:
tpot.score(X_test, y_test)

In [None]:
tpot.export('tpot_exported_pipeline.py')

In [None]:
!cat tpot_exported_pipeline.py

In [None]:
years_ahead = 4
df = pd.read_csv('/tmp/data/quanta.predict.{miny}.{maxy}.{ya}.csv.out'.format(
        miny=min_year, maxy=max_year, ya=years_ahead))
df.shape

In [None]:
from mlbox.preprocessing import *
from mlbox.optimisation import *
from mlbox.prediction import *