In [None]:
import glob
import json
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from py2neo import Graph, Node, Relationship
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited into (or committed with) the notebook.
# The fallbacks are the values that used to be hard-coded here, so the cell
# behaves exactly as before when the variables are unset.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://neo4j-magone:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "myneo")
graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Journal names (42 entries). Presumably the journals behind the
# "top_42_*.pkl" dataset files loaded further down -- TODO confirm.
top_42 = [
    'Angewandte Chemie',
    'Blood',
    'Cancer Cell',
    'Cancer Discovery',
    'Cancer Research',
    'Cell',
    'Cell Host & Microbe',
    'Cell Metabolism',
    'Cell Stem Cell',
    'Chemistry & Biology',
    'The EMBO Journal',
    'Genes & Development',
    'Immunity',
    'Journal of Neurology',
    'Journal of the American Chemical Society',
    'JAMA',
    'Journal of Biological Chemistry',
    'Journal of Cell Biology',
    'Journal of Clinical Investigation',
    'Journal of Experimental Medicine',
    'Journal of Medicinal Chemistry',
    'The Lancet',
    'Nature Cell Biology',
    'Nature Chemical Biology',
    'Nature Chemistry',
    'Nature Medicine',
    'Nature Methods',
    'Nature',
    'Nature Biotechnology',
    'The New England Journal of Medicine',
    'Neuron',
    'Nature Genetics',
    'Nature Immunology',
    'Nature Neuroscience',
    'Nature Structural & Molecular Biology',
    'PLOS Biology',
    'PLOS Genetics',
    'PLOS Pathogens',
    'Proceedings of the National Academy of Sciences of the United States of America',
    'Science Signaling',
    'Science Translational Medicine',
    'Science',
]

# Read the counters once so both printed numbers come from the same lookup.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

# Load and Build Dataset from Disk

In [17]:

def read_build_dataset(datapath, years_to_use, target_year, feature_paths=None):
    """Build a modelling frame with year-relative signals, a target and extra features.

    The pickled frame at ``datapath`` must hold the columns ``year``, ``title``
    and ``id`` plus per-calendar-year columns named ``c<YYYY>`` and
    ``tspr<YYYY>``. For every row the calendar-year columns are re-indexed
    relative to that row's own ``year``:

    * ``c{i}``  <- ``c{year + i}``    for ``i`` in ``0..years_to_use`` (inclusive)
    * ``p{i}``  <- ``tspr{year + i}`` for ``i`` in ``0..years_to_use`` (inclusive)
    * ``target_pr{target_year}`` <- ``tspr{year + target_year}``

    Note that when ``target_year <= years_to_use`` the column ``p{target_year}``
    is computed from the very same source column as the target, so it must not
    be used as a model input.

    Parameters
    ----------
    datapath : str
        Path of the pickled DataFrame. Unpickling can execute arbitrary code,
        so only point this at trusted files.
    years_to_use : int
        Highest year offset to extract as ``c{i}``/``p{i}`` columns.
    target_year : int
        Year offset of the ``tspr`` column used as the target.
    feature_paths : list of str, optional
        Directory prefixes (each ending with a path separator) whose ``*.csv``
        files are concatenated and inner-merged on ``title``. Defaults to the
        EarlyAdopters feature-extraction directory.

    Returns
    -------
    pandas.DataFrame
        The re-indexed columns merged with the features of every feature
        path, with all rows containing a missing value dropped.

    Raises
    ------
    ValueError
        If a feature path contains no ``*.csv`` file.
    KeyError
        If a required column is missing from the pickled frame.
    """
    if feature_paths is None:
        feature_paths = ["/tmp/data/result/FeatureExtractionResults/EarlyAdopters/"]

    # SECURITY: read_pickle unpickles the file; never use it on untrusted input.
    df = pd.read_pickle(datapath)

    def column_at_offset(prefix, offset):
        # Pick, per row, the calendar-year column that lies `offset` years
        # after the row's own year. int() keeps the column name free of a
        # trailing ".0" should the year have been up-cast to float.
        return df.apply(
            lambda row: row['{}{}'.format(prefix, int(row['year']) + offset)],
            axis=1,
        )

    # Work on an explicit copy: assigning new columns to a slice of `df`
    # is what triggered pandas' SettingWithCopyWarning.
    df_new = df[['year', 'title', 'id']].copy()
    for i in range(years_to_use + 1):
        df_new['c{}'.format(i)] = column_at_offset('c', i)
        df_new['p{}'.format(i)] = column_at_offset('tspr', i)

    # ADD TARGET
    df_new['target_pr{}'.format(target_year)] = column_at_offset('tspr', target_year)

    # Merge the features of every path into the running result so that
    # several feature directories accumulate instead of overwriting each other.
    df_features = df_new
    for feature_path in feature_paths:
        all_files = sorted(glob.glob(feature_path + "*.csv"), reverse=True)
        if not all_files:
            raise ValueError(
                "No feature CSV files found under {!r}".format(feature_path))
        feature_chunks = pd.concat([pd.read_csv(file) for file in all_files])
        df_features = df_features.merge(feature_chunks, on='title')

    return df_features.dropna()

# Settings for the first dataset: pickle location, highest year offset to
# extract as c{i}/p{i} columns, and year offset of the target column.
DATAPATH = "/tmp/data/result/datasets/top_42_2000_2015.pkl"
YEARS_TO_USE = 3
TARGET_YEAR = 3

df_features = read_build_dataset(DATAPATH, YEARS_TO_USE, TARGET_YEAR)

# Split the titles off the frame; they are kept in their own Series.
titles = df_features.pop('title')

# Target Creation
y = df_features['target_pr' + str(TARGET_YEAR)]
# Natural log of every target value, computed element by element.
log_y = y.map(np.log)







A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


# Signal Selection + Training/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Passed to signal_select() below as `num_years`: how many year offsets of c/p/early_adopters columns are kept as inputs.
NUM_YEARS = 1
def signal_select(df_features, num_years):
    """Return only the signal columns belonging to the first ``num_years`` years.

    The selected columns are, in this order:

    * ``c0, p0, c1, p1, ...`` up to offset ``num_years - 1``
    * ``early_adopters_1 ... early_adopters_{num_years}``

    Parameters
    ----------
    df_features : pandas.DataFrame
        Frame that contains the columns listed above.
    num_years : int
        Number of year offsets to keep. Zero or a negative value selects no
        columns (previously a negative value never terminated).

    Returns
    -------
    pandas.DataFrame
        ``df_features`` restricted to the selected columns.

    Raises
    ------
    KeyError
        If one of the selected columns is missing from ``df_features``.
    """
    to_keep = []
    for offset in range(num_years):
        to_keep.append('c{}'.format(offset))
        to_keep.append('p{}'.format(offset))

    # early_adopters columns are numbered from 1, not from 0.
    to_keep.extend(
        'early_adopters_{}'.format(number) for number in range(1, num_years + 1))

    return df_features[to_keep]

# Keep only the signal columns of the first NUM_YEARS year offsets.
df_cleaned = signal_select(df_features, num_years=NUM_YEARS)

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned,
    log_y,
    test_size=0.33,
    random_state=42,
)

# Show the training inputs together with their log-transformed targets.
(X_train, y_train)


# Fit Regression Models

In [None]:
!pip install scikit-learn
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor



scoring = ['r2', 'neg_mean_squared_error']


model = RandomForestRegressor(random_state=0)
cv_results = cross_validate(model, X_train, y_train, scoring=scoring,
                         cv=5, return_train_score=True)

# Predict Top N

In [None]:

# Fit on the training split, then score every row of the selected features
# (training and test rows alike).
fitted_model = model.fit(X_train, y_train)
prediction_results = fitted_model.predict(df_cleaned)

# Re-index features, titles and predictions from 0 so they line up by
# position, then rank the rows by predicted value, highest first.
result_df = (
    df_cleaned.copy(deep=True)
    .reset_index(drop=True)
    .assign(titles=titles.reset_index(drop=True))
    .assign(predictions=pd.Series(prediction_results))
    .sort_values('predictions', ascending=False)
)

# Size of the top slice; the export of result_df.iloc[:TOP_N] to CSV is
# currently disabled.
TOP_N = 100






In [18]:
# Settings for the second dataset; note that these rebind the constants of
# the same name that were set for the first dataset.
DATAPATH = "/tmp/data/result/datasets/top_42_2016_2018.pkl"
YEARS_TO_USE = 1
TARGET_YEAR = 1

df_features_new = read_build_dataset(DATAPATH, YEARS_TO_USE, TARGET_YEAR)

# Split the titles off the frame; they are kept in their own Series.
titles_new = df_features_new.pop('title')

# Keep only the signal columns of the first YEARS_TO_USE year offsets.
df_cleaned_new = signal_select(df_features_new, num_years=YEARS_TO_USE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,year,id,c0,p0,c1,p1,target_pr1,early_adopters_1,early_adopters_2,early_adopters_3
2508,2016,0008fd5b-530a-4453-a79b-bbf028689c05,2.0,0.150000,15.0,0.150000,0.150000,20,0,0
2509,2017,005d6f9e-09f6-4dcb-8e1c-7ae4cd99d8cf,2.0,0.150000,0.0,0.150000,0.150000,0,0,0
2510,2017,00636661-f9f7-48cc-8804-c5696c7f3c98,3.0,0.277500,0.0,0.277500,0.277500,0,0,0
2511,2016,008b3a44-5798-410e-8f15-eb56fcbb4ad3,1.0,0.150000,7.0,0.150000,0.150000,14,0,0
2512,2016,0093a06f-8b8c-4dde-89d7-7984f665383d,7.0,0.192500,2.0,0.192500,0.192500,0,0,0
2513,2016,00a63c28-2a96-49f3-8a9f-837b9d3aea51,3.0,0.532500,0.0,0.532500,0.532500,0,0,0
2514,2016,00ae9401-baac-420e-aa47-a86e5c48d5ba,11.0,0.150000,13.0,0.150000,0.150000,17,0,0
2515,2016,00bf64aa-c105-492b-8e3e-2739b269b169,1.0,0.150000,0.0,0.150000,0.150000,0,0,0
2516,2016,00c27392-0465-4f77-ade8-e2b55c8e76dd,3.0,0.150000,11.0,0.150000,0.150000,3,0,0
2517,2016,00e2f7fc-ca61-4ecb-a7ed-3648b271bb15,6.0,0.245625,31.0,0.245625,0.245625,0,0,0


# Data Visualization
## Target PageRank distribution

In [None]:
import seaborn as sns

In [None]:
# Recompute the natural log of the target element by element and plot its distribution.
log_y = y.map(np.log)
sns.distplot(log_y)


In [None]:
def _log_or_floor(value):
    """Return log(value) for values above zero and the placeholder -15 otherwise."""
    if value > 0:
        return np.log(value)
    return -15


# Distribution of each input signal on the log scale.
sns.distplot(df_cleaned['p0'].apply(_log_or_floor))
sns.distplot(df_cleaned['c0'].apply(_log_or_floor))
sns.distplot(df_cleaned['early_adopters_1'].apply(_log_or_floor))

In [None]:
# Display the titles that were split off from df_features when the first dataset was loaded.
titles