### Setup

In [5]:
import json, time, pickle
from functools import reduce

import pandas as pd
import numpy as np

from ast import literal_eval

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
from tqdm.autonotebook import tqdm

import seaborn as sns
sns.set(rc={'figure.figsize':(12,8)})
sns.set(style='whitegrid')

import matplotlib.pyplot as plt

from py2neo import Graph, Node, Relationship

In [2]:
import os

# Connection settings can be overridden through environment variables so the host
# and credentials no longer have to be edited inside the notebook. The defaults
# reproduce the values that were previously hard-coded.
# NOTE(review): the default password is still stored in plain text here — drop the
# default (and rotate the password) once every user exports NEO4J_PASSWORD.
public_address = os.environ.get('NEO4J_HOST', '18.27.79.39')
neo4j_user = os.environ.get('NEO4J_USER', 'neo4j')
neo4j_password = os.environ.get('NEO4J_PASSWORD', 'myneo')

graph = Graph('bolt://{}:7687'.format(public_address), auth=(neo4j_user, neo4j_password))

def run_query(query, graph, print_query=False, run_query=True, 
              print_only=False, to_df=False, verbose=True):
    """Optionally print and/or execute a query against ``graph`` and time the call.

    Parameters
    ----------
    query : str
        Query text passed to ``graph.run``.
    graph : object
        Object exposing ``run(query)``; with ``to_df=True`` the value returned by
        ``run`` must expose ``to_data_frame()``.
    print_query : bool
        Print the query text before (possibly) running it.
    run_query : bool
        Execute the query.
    print_only : bool
        Shortcut that forces printing and suppresses execution.
    to_df : bool
        Return the query result converted with ``to_data_frame()``.
    verbose : bool
        Print the elapsed wall-clock time in minutes (printed even when the
        query was not executed).

    Returns
    -------
    The data frame when the query ran with ``to_df=True``; otherwise the
    placeholder value ``1``.
    """
    # print_only overrides both of the individual switches.
    should_print = print_query or print_only
    should_run = run_query and not print_only

    outcome = 1
    started_at = time.time()

    if should_print:
        print(query)

    if should_run:
        cursor = graph.run(query)
        if to_df:
            outcome = cursor.to_data_frame()

    elapsed_minutes = (time.time() - started_at) / 60
    if verbose:
        print("Query completed in {:.2f} minutes.".format(elapsed_minutes))
    return outcome


In [15]:
# Publication years covered by the exports: 1950 through 2019 inclusive.
years_to_consider = range(1950, 2020)
# Metrics are read for year offsets 0..years_tracked relative to the publication year.
years_tracked = 5

# Property names read from each quanta's METRICS_IN relationship.
# NOTE(review): 'citatons' looks like a typo for 'citations' — confirm against the
# property name actually stored in the graph. The queries wrap every property in
# coalesce(..., 0), so a misspelled name would yield all-zero columns, not an error.
vars_to_use = ['adopters', 'timeScaledPageRank', 'citatons', 'node2vec']

# Bounds of the publication-year range, used in queries and file names.
max_year = max(years_to_consider)
min_year = min(years_to_consider)

### Write Data To CSV (With Author Features) Year By Year

In [None]:
def get_quanta_features(years):
    """Build, for each publication year, the APOC CSV-export query for quanta + author features.

    For every year in ``years`` a Cypher query is assembled that returns, per quanta
    published in that year, the metrics in ``vars_to_use`` for offsets
    ``0..years_tracked`` and the per-offset average of each author metric over the
    quanta's authors. The query is wrapped in ``apoc.export.csv.query`` (target
    ``/import/quanta.author.predict.<year>.csv``) and handed to ``run_query`` with
    ``print_only=True``, i.e. it is printed but not executed.

    Relies on the notebook-level names ``years_tracked``, ``vars_to_use``,
    ``graph`` and ``run_query``.

    Parameters
    ----------
    years : iterable of int
        Publication years to generate an export query for.
    """
    author_vars = [
        'hIndex', 'hIndexDelta', 'totalCitations', 'totalCitationsDelta', 'citationsPerPaper', 
        'citationsPerPaperDelta','citationsPerYear', 'totalPapers', 'totalPapersDelta', 'rankCitationsPerYear',
        'pageRank', 'authorAge', 'recentCoauthors', 'maxCitations', 'totalVenues', 
        'venueHIndexMean', 'venueHIndexDeltaMean', 'venueCitationsPerPaperMean', 'venueCitationsPerPaperDeltaMean',  
        'venueTotalPapersMean', 'venueTotalPapersDeltaMean', 'venueRankCitationsPerPaperMean', 'venueMaxCitationsMean']

    offsets = range(years_tracked+1)

    # None of the fragments below depends on the publication year, so they are
    # built once instead of once per loop iteration.

    # Quanta metrics for each offset; OPTIONAL so a missing later year keeps the row.
    metrics_pattern = 'OPTIONAL MATCH (q)-[m{y}:METRICS_IN]->(:Year {{year:y.year+{y}}})'
    metrics_string = '\n'.join(metrics_pattern.format(y=i) for i in offsets)

    # Carry q, y and every metrics relationship through the WITH clauses.
    with_string = 'WITH q, y, ' + ', '.join('m{y}'.format(y=i) for i in offsets)

    # Author metrics for each offset.
    author_metrics_pattern = 'OPTIONAL MATCH (a)-[ma{y}:METRICS_IN]->(:Year {{year:y.year+{y}}})'
    author_metrics_string = '\n'.join(author_metrics_pattern.format(y=i) for i in offsets)

    # Collapse the per-author rows into one list per (offset, author metric).
    collect_pattern = 'collect(ma{y}.{v}) as ma{y}_{v}'
    with_string2 = with_string + ', ' + ', \n '.join(
        collect_pattern.format(y=i, v=j) for i in offsets for j in author_vars)

    var_pattern = 'coalesce(m{y}.{v},0) AS {v}_{y},'
    var_string = '\n'.join(var_pattern.format(y=i, v=j) for i in offsets for j in vars_to_use)

    # Mean over the collected author values; 0 when there are none.
    author_var_pattern = 'coalesce(apoc.coll.avg(ma{y}_{v}), 0) AS {v}_{y}, \n'
    author_var_string = ''.join(
        author_var_pattern.format(y=i, v=j) for i in offsets for j in author_vars)

    # Only the publication-year metrics (m0) are required to carry a node2vec
    # vector; later offsets come from OPTIONAL MATCH and are coalesced to 0.
    query_template = """
        MATCH (q:Quanta)-[:PUBLISHED_IN]->(y:Year {{year:{the_year}}})
        WITH q, y
        {metrics_string}
        {with_string}
        WHERE exists(m0.node2vec)
        MATCH (q)<-[:AUTHORED]-(a:Author)
        {author_metrics_string}
        {with_string2}
        RETURN 
            {var_string}
            {author_var_string}
            id(q) AS id, 
            y.year AS year
        """

    export_template = """
        CALL apoc.export.csv.query('
        {q}
        ','/import/quanta.author.predict.{y}.csv', 
        {{quotes:true}});
        """

    for year in years:
        print('Running query for year {}'.format(year))

        query = query_template.format(the_year=year,
                                      metrics_string=metrics_string,
                                      with_string=with_string,
                                      with_string2=with_string2,
                                      var_string=var_string,
                                      author_metrics_string=author_metrics_string,
                                      author_var_string=author_var_string)

        query_tocsv = export_template.format(q=query, y=year)

        run_query(query_tocsv, graph, to_df=False, print_only=True)
        
# `years` is not defined anywhere in this notebook; the configured range of
# publication years is `years_to_consider`.
get_quanta_features(years_to_consider)

### Concatenate CSVs into one file

In [None]:
import os
import glob
import pandas as pd
from tqdm.autonotebook import tqdm

extension = 'csv'
# Sorted so the row order of the combined file is reproducible; the year in each
# file name is four digits wide, so lexicographic order is chronological.
all_files = sorted(glob.glob('/ltmp/data/quanta.author.predict.*{}'.format(extension)))
print(len(all_files))

if not all_files:
    raise FileNotFoundError(
        'No per-year export files matched /ltmp/data/quanta.author.predict.*{}'.format(extension))

# Read every file exactly once and concatenate in a single call. The previous
# version grew a frame file-by-file (quadratic copying) and then read all the
# files a second time to build the frame that was actually written out.
combined_csv = pd.concat([pd.read_csv(f) for f in tqdm(all_files)], axis=0)
result = combined_csv  # same content under the name the incremental loop used to bind

# NOTE(review): the formatting cell below reads
# '/tmp/data/quanta.author.predict.allyrs.csv', which is neither this directory
# nor this file name — confirm which path is intended.
combined_csv.to_csv("/ltmp/data/quanta.author.test.predict.allyrs.csv", index=False, encoding='utf-8-sig')

### Write Data To CSV (With Author Features) All Years At Once

# Read in data for each year, format it, and write it back

In [None]:
# Input: the combined all-years export. The expanded frame is written next to it
# with an extra '.out' suffix.
fpath = '/tmp/data/quanta.author.predict.allyrs.csv'
df = pd.read_csv(fpath)
df = df.dropna().reset_index(drop=True)

# Expand whichever node2vec_<offset> columns the file actually contains instead
# of a hard-coded number of offsets (the loop used range(11), while the export
# queries above emit offsets 0..years_tracked).
n2v_prefix = 'node2vec_'
n2v_cols = sorted(
    (c for c in df.columns if c.startswith(n2v_prefix) and c[len(n2v_prefix):].isdigit()),
    key=lambda c: int(c[len(n2v_prefix):]))
if not n2v_cols:
    raise ValueError('No node2vec_<offset> columns found in {}'.format(fpath))

n2v_frames = []
for col in tqdm(n2v_cols):
    # Each cell holds a JSON-encoded vector; spread it over one column per dimension.
    n2vdf = pd.DataFrame(df[col].apply(json.loads).tolist())
    n2v_dim = n2vdf.shape[1]  # read again by the modelling cell further down
    n2vdf.columns = ['{}_{}'.format(col, i) for i in range(n2v_dim)]
    n2v_frames.append(n2vdf)

# Single concat: the remaining columns first, then the expanded vectors in offset order.
df = pd.concat([df.drop(n2v_cols, axis=1)] + n2v_frames, axis=1)

# NOTE(review): the modelling section reads
# '/tmp/data/quanta.predict.{miny}.{maxy}.{yt}.csv.out', not the file written
# here — confirm which data set is meant to feed the models.
df.to_csv('{}.out'.format(fpath))

### Read in prediction data and make predictions

In [4]:
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, RobustScaler
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier
from sklearn.metrics import f1_score, mean_squared_error, classification_report, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import make_pipeline, Pipeline

from sklearn.externals.joblib import parallel_backend


%env JOBLIB_TEMP_FOLDER=/tmp

env: JOBLIB_TEMP_FOLDER=/tmp




In [None]:
# Resolve the expanded ('.out') prediction file for the configured year range
# and number of tracked years, then load it.
prediction_data_path = '/tmp/data/quanta.predict.{miny}.{maxy}.{yt}.csv.out'.format(
    miny=min_year,
    maxy=max_year,
    yt=years_tracked,
)
df = pd.read_csv(prediction_data_path)

In [None]:
# NOTE(review): this overrides the notebook-level years_tracked (5 in the config
# cell), so the loaded data must contain metric columns up to offset 6 — confirm.
years_tracked = 6

results = []

# Hyper-parameter search space for the random forest ('clf' step). It does not
# depend on the loop variables, so it is built once.
# NOTE(review): for classifiers 'auto' has historically meant the same as 'sqrt'
# and newer scikit-learn releases reject it — confirm against the installed version.
grid = {'clf__n_estimators': [int(x) for x in np.linspace(200, 2000, num=10)],
        'clf__max_features': ['auto', 'sqrt'],
        'clf__max_depth': [int(x) for x in np.linspace(10, 1000, num=10)],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__bootstrap': [True, False]}

for years_to_track in tqdm(range(years_tracked)):
    # Features: every tracked metric for offsets 0..years_to_track, plus the
    # expanded node2vec dimensions for the same offsets.
    cols_to_keep = ['{v}_{y}'.format(y=i,v=j) 
        for i in range(years_to_track+1) 
        for j in [v for v in vars_to_use if v!='node2vec']]

    n2v_cols_to_keep =  ['node2vec_{y}_{i}'.format(y=y, i=i) 
                     for y in range(years_to_track+1)
                     for i in range(n2v_dim)]

    cols_to_keep = cols_to_keep + n2v_cols_to_keep
    X = df.loc[:, cols_to_keep]

    # Predict every later offset. The loop variable used to be overwritten with
    # `years_to_track + 1` on its first line, so each inner iteration retrained
    # the same target and stored duplicate (years_tracked, year_predicted) rows.
    for year_to_predict in range(years_to_track+1, years_tracked+1):

        # Binary target: is the quanta in the top 5% of timeScaledPageRank at that offset?
        y_col = 'timeScaledPageRank_{y}'.format(y=year_to_predict)
        y = df.loc[:, y_col] > df[y_col].quantile(q=.95)

        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

        # Seeded so that resampling and the forest are reproducible, like the
        # split and the search already are.
        pipeline = Pipeline(
            memory=None,
            steps=[
                ('spl', SMOTE(random_state=42)),
                ('scl', QuantileTransformer()),
                ('clf', RandomForestClassifier(random_state=42))
            ]
        )

        random_search = RandomizedSearchCV(
            estimator=pipeline, 
            param_distributions=grid, 
            n_iter=10, 
            cv=3, 
            n_jobs=-1,
            verbose=3,
            random_state=42
        )

        with parallel_backend('threading'):
            random_search.fit(X_train, y_train)
        y_pred = random_search.predict(X_test)

        results.append({
            'years_tracked': years_to_track, 
            'year_predicted': year_to_predict,
            'score': random_search.score(X=X_test, y=y_test),
            'f1': f1_score(y_pred=y_pred, y_true=y_test),
            'balanced_accuracy': balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
            'balanced_accuracy_adjusted': balanced_accuracy_score(y_true=y_test, y_pred=y_pred, adjusted=True),
            'classification_report': classification_report(y_true=y_test, y_pred=y_pred, output_dict=True),
            'random_search': random_search
        })

        # Checkpoint everything fitted so far; `with` closes the handle even on error.
        checkpoint_path = 'predmodel_{yt}_{yp}.pickle'.format(
            yt=years_to_track, yp=year_to_predict)
        with open(checkpoint_path, 'wb') as fh:
            pickle.dump(results, fh)

# NOTE(review): this is written to the working directory, while the next cell
# loads '/tmp/data/predmodels_{yst}.pickle' — confirm the intended location.
with open('predmodels_{yst}.pickle'.format(yst=years_tracked), 'wb') as fh:
    pickle.dump(results, fh)

In [16]:
# Reload the list of fitted-model records. pickle.load can execute arbitrary code,
# so only point this at files produced by this notebook.
with open('/tmp/data/predmodels_{yst}.pickle'.format(yst=years_tracked), 'rb') as fh:
    results = pickle.load(fh)

In [None]:
from plot_metric.functions import  BinaryClassification

# `y_prob` was used below without ever being assigned in this notebook. Take the
# positive-class probability from the most recently fitted search, which matches
# the `X_test` / `y_test` split left in scope by the modelling loop.
y_prob = random_search.predict_proba(X_test)[:, 1]

bc = BinaryClassification(y_test, y_prob, labels=["Low Impact", "High Impact"])

plt.figure(figsize=(15,10))

# Top row: ROC curve, precision/recall curve, class distribution.
plt.subplot2grid(shape=(2,6), loc=(0,0), colspan=2)
bc.plot_roc_curve()

plt.subplot2grid((2,6), (0,2), colspan=2)
bc.plot_precision_recall_curve()

plt.subplot2grid((2,6), (0,4), colspan=2)
bc.plot_class_distribution()

# Bottom row: raw and normalised confusion matrices.
plt.subplot2grid((2,6), (1,1), colspan=2)
bc.plot_confusion_matrix()

plt.subplot2grid((2,6), (1,3), colspan=2)
bc.plot_confusion_matrix(normalize=True)

plt.show()
bc.print_report()

In [None]:
# RandomizedSearchCV fits clones of `pipeline`, so `pipeline` itself is never
# fitted; predict with the refitted best estimator instead.
y_pred = random_search.best_estimator_.predict(X_test)
f1_score(y_pred=y_pred, y_true=y_test)

In [None]:
# `results` is a list of per-model dicts; build a frame before pivoting
# (a no-op copy if it is already a DataFrame).
results_df = pd.DataFrame(results)
score_grid = results_df.pivot(index='years_tracked', columns='year_predicted', values='score')

# Rows: number of tracked years used as features; columns: offset being predicted.
sns.heatmap(data=score_grid, 
           annot=True, fmt='.2f', linewidth=.5, cbar=True, square=True, 
           cmap='YlGnBu', center=results_df['score'].mean())

In [None]:
plt.figure(figsize=(10,20))
# The forest is the 'clf' step (index 1 of `pipeline` is the QuantileTransformer),
# and only the search's refitted best estimator has been fitted.
fitted_forest = random_search.best_estimator_.named_steps['clf']
feature_importance = pd.DataFrame.from_dict(
    dict(zip(X.columns, fitted_forest.feature_importances_)), orient='index').T
sns.barplot(orient='h',data=feature_importance)

In [None]:
# Display the current `pipeline` object (steps and their parameters).
pipeline

In [None]:
# NOTE(review): the original cell was just `.shape` with no object (a syntax
# error); `X` is assumed to be the intended frame — confirm.
X.shape

In [None]:
from sklearn.model_selection import validation_curve

# `pipeline` registers the forest under the step name 'clf', so its parameters
# are addressed as 'clf__<name>' (the make_pipeline-style prefix
# 'randomforestclassifier__' does not exist in that pipeline).
param = 'clf__max_depth'
param_range = list(range(1,100,25))
n_cv = 2

train_scores, valid_scores = validation_curve(pipeline, 
                                              X=X, 
                                              y=y,
                                              n_jobs=-1,
                                              param_name=param,
                                              scoring='roc_auc',
                                              param_range=param_range,
                                              cv=n_cv)

# Stack train scores on top of validation scores: one row per
# (score type, parameter value), one column per CV fold.
vdf = pd.DataFrame(np.concatenate([train_scores, valid_scores]),
             columns=['cv_fold_{}'.format(i) for i in range(n_cv)],
            )
vdf[param] = param_range*2
vdf['type'] = ['train']*len(param_range) + ['valid']*len(param_range)

sns.lineplot(data=vdf.melt(id_vars=['type', param]), 
             x=param, y='value', hue='type')

In [None]:
# Preview the first rows of the train and test feature frames stacked together.
pd.concat([X_train, X_test]).head()

In [None]:
# NOTE(review): the original cell was `X.columns == ` with nothing on the
# right-hand side (a syntax error). The comparison target is unknown, so the
# cell now just displays the column index.
X.columns

### Predictions

In [None]:
from tpot import TPOTRegressor

# Search budget and scoring for the automated pipeline search.
# NOTE(review): a regressor scored with r2 is fitted on whatever `y_train` is in
# scope; the modelling loop above leaves a boolean target there — confirm this is
# the intended target/estimator type.
tpot_options = dict(
    generations=5,
    population_size=20,
    verbosity=2,
    n_jobs=-1,
    scoring='r2',
    config_dict='TPOT light',
    max_time_mins=30,
    max_eval_time_mins=5,
)
tpot = TPOTRegressor(**tpot_options)
tpot.fit(X_train, y_train)

In [None]:
# Score the `tpot` estimator on the held-out split.
tpot.score(X_test, y_test)

In [None]:
# Write the pipeline found by TPOT to a Python file in the working directory.
tpot.export('tpot_exported_pipeline.py')

In [None]:
# Print the exported pipeline source (IPython shell escape; needs a `cat` command).
!cat tpot_exported_pipeline.py

In [None]:
years_ahead = 4

# Load the expanded ('.out') prediction file keyed by the year range and
# `years_ahead`, replacing whatever `df` held before, and show its shape.
years_ahead_path = '/tmp/data/quanta.predict.{miny}.{maxy}.{ya}.csv.out'.format(
    miny=min_year,
    maxy=max_year,
    ya=years_ahead,
)
df = pd.read_csv(years_ahead_path)
df.shape

In [None]:
from mlbox.preprocessing import *
from mlbox.optimisation import *
from mlbox.prediction import *

### Write data to CSV (just quanta features)

In [None]:
# One required METRICS_IN hop per tracked offset (0..years_tracked).
metrics_pattern = 'MATCH (q)-[m{y}:METRICS_IN]->(:Year {{year:y.year+{y}}})'
metrics_string = '\n'.join(metrics_pattern.format(y=i) for i in range(years_tracked+1))

# One output column per (offset, metric); missing properties become 0.
var_pattern = 'coalesce(m{y}.{v},0) AS {v}_{y},'
var_string = '\n'.join(
    var_pattern.format(y=i, v=j) for i in range(years_tracked+1) for j in vars_to_use)

# Keep only quanta that have a node2vec vector at every tracked offset.
where_pattern = 'exists(m{y}.node2vec)'
where_string = ' AND '.join(where_pattern.format(y=i) for i in range(years_tracked+1))

# The author-metric fragments that used to be assembled here were always replaced
# by empty strings when the query was formatted, so they have been removed.
# NOTE(review): the query still starts from (a:Author)-[:AUTHORED]->(q); with no
# aggregation or DISTINCT this returns one row per (author, quanta) pair and skips
# quanta without authors — confirm that is wanted for a quanta-only export.
query = """
MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year)
WHERE y.year>={miny} AND y.year<={maxy}
{metrics_string}
WHERE {where_string}
RETURN 
    {var_string}
    id(q) AS id, 
    y.year AS year
""".format(miny=min_year, 
           maxy=max_year,
           metrics_string=metrics_string,
           var_string=var_string, 
           where_string=where_string)

# `yt` was passed to format() but missing from the file name, while the cell that
# reads the prediction data expects 'quanta.predict.{miny}.{maxy}.{yt}.csv[.out]'.
query_tocsv = """
CALL apoc.export.csv.query('
{q}
','/import/quanta.predict.{miny}.{maxy}.{yt}.csv', 
{{quotes:true}});
""".format(q=query, 
           miny=min_year, 
           maxy=max_year,
           yt=years_tracked)

# print_only=True: the export statement is printed, not executed.
run_query(query_tocsv, graph, to_df=False, print_only=True)

In [None]:
def write_features_all_years():
    """Export quanta + author features for every configured year in one APOC call.

    Builds a single Cypher query covering publication years
    ``min_year..max_year`` that returns, per quanta, the metrics in
    ``vars_to_use`` for offsets ``0..years_tracked`` together with the per-offset
    average of each author metric over the quanta's authors. The query is wrapped
    in ``apoc.export.csv.query`` (target
    ``/import/quanta.author.predict.<min_year>.<max_year>.csv``) and executed
    through ``run_query``.

    Only quanta with a node2vec vector at every tracked offset are kept, and both
    the quanta and the author metrics use required (non-OPTIONAL) matches.

    Relies on the notebook-level names ``years_tracked``, ``vars_to_use``,
    ``min_year``, ``max_year``, ``graph`` and ``run_query``.
    """
    # todo add weighted_pagerank
    # NOTE(review): the year-by-year export names the venue-count property
    # 'totalVenues'; 'numVenues' here may be a stale name — confirm against the graph.
    author_vars = ['hIndex', 'totalPapers', 'authorAge', 'maxCitations', 'totalCitations', 'numVenues', 'pageRank']

    offsets = range(years_tracked+1)

    # Quanta metrics for each offset.
    metrics_pattern = 'MATCH (q)-[m{y}:METRICS_IN]->(:Year {{year:y.year+{y}}})'
    metrics_string = '\n'.join(metrics_pattern.format(y=i) for i in offsets)

    # Require a node2vec vector at every offset.
    where_pattern = 'exists(m{y}.node2vec)'
    where_string = 'WHERE ' + ' AND '.join(where_pattern.format(y=i) for i in offsets)

    # Carry q, y and every metrics relationship through the WITH clauses.
    with_string = 'WITH q, y, ' + ', '.join('m{y}'.format(y=i) for i in offsets)

    # Author metrics for each offset.
    author_metrics_pattern = 'MATCH (a)-[ma{y}:METRICS_IN]->(:Year {{year:y.year+{y}}})'
    author_metrics_string = '\n'.join(author_metrics_pattern.format(y=i) for i in offsets)

    # Collapse the per-author rows into one list per (offset, author metric).
    collect_pattern = 'collect(ma{y}.{v}) as ma{y}_{v}'
    with_string2 = with_string + ', ' + ', \n '.join(
        collect_pattern.format(y=i, v=j) for i in offsets for j in author_vars)

    var_pattern = 'coalesce(m{y}.{v},0) AS {v}_{y},'
    var_string = '\n'.join(var_pattern.format(y=i, v=j) for i in offsets for j in vars_to_use)

    # Mean over the collected author values; 0 when there are none.
    author_var_pattern = 'coalesce(apoc.coll.avg(ma{y}_{v}), 0) AS {v}_{y}, \n'
    author_var_string = ''.join(
        author_var_pattern.format(y=i, v=j) for i in offsets for j in author_vars)

    query = """
    MATCH (q:Quanta)-[:PUBLISHED_IN]->(y:Year)
    WHERE y.year>={miny} AND y.year<={maxy}
    WITH q, y
    {metrics_string}
    {with_string}
    {where_string} 
    MATCH (q)<-[:AUTHORED]-(a:Author)
    {author_metrics_string}
    {with_string2}
    RETURN 
        {var_string}
        {author_var_string}
        id(q) AS id, 
        y.year AS year
    """.format(miny=min_year, 
               maxy=max_year,
               metrics_string=metrics_string,
               where_string=where_string,
               with_string=with_string,
               with_string2=with_string2,
               var_string=var_string, 
               author_metrics_string=author_metrics_string,
               author_var_string=author_var_string)

    query_tocsv = """
    CALL apoc.export.csv.query('
    {q}
    ','/import/quanta.author.predict.{miny}.{maxy}.csv', 
    {{quotes:true}});
    """.format(q=query, 
               miny=min_year, 
               maxy=max_year)

    run_query(query_tocsv, graph, to_df=False, print_only=False)