In [None]:
import json
from py2neo import Graph, Node, Relationship
#from py2neo.Graph import database 

# Authentication still needs to be set up; the server currently runs with NEO4J_AUTH=none.
graph = Graph("bolt://neo4j:7687")
# graph = Graph('bolt://localhost:7687', bolt=True)

#graph.delete_all()

# Look up the node and relationship id counts and print them as a connectivity check.
n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
connected_message = "Connected to graph database with {:,} nodes and {:,} relationships!"
print(connected_message.format(n_nodes, n_relationships))

In [None]:
## Load the 2008 English-language publications whose venue is 'Science' or 'Nature' and that have a fos property
import pandas as pd
import time
print("load english science and nature publication into df")
start_time = time.time()

# The query text is sent to the database verbatim; results come back ordered by
# n.pageRank_2018, highest first.
# NOTE(review): confirm every matched node has pageRank_2018 -- rows with a missing
# value may not sort where a "highest first" reading expects them.
query = """
MATCH (n:Quanta) WHERE n.lang = 'en' AND ( n.venue = 'Science' OR n.venue = 'Nature') AND n.year =2008 AND EXISTS(n.fos)
RETURN 
n.venue as venue,
n.pageRank_2018 as PR_2018,
n.pageRank_2008 as PR_2008,
n.fos as fos,
n.title as title,
n.keywords as keywords,
n.publisher as publisher
ORDER BY n.pageRank_2018 DESC
"""

# Run the query and materialise the whole result as a pandas DataFrame.
query_result = graph.run(query)
dfs_2008_test = query_result.to_data_frame()

end_time = time.time()
elapsed_minutes = (end_time-start_time)/60
print("Finished all calculations in {:.2f} minutes.".format(elapsed_minutes))

In [None]:
# Fractions of the ranked rows that are labelled "popular", one experiment run per value.
top_range = [
    0.01,
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    0.99,
]

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import itertools

# Work on a copy so the frame loaded from the database stays untouched.
dfs_2008_test_copy = dfs_2008_test.copy()
start_time = time.time()

# Rows whose fos value is None get an empty label list, so every row holds a list.
fos_list = [labels if labels is not None else []
            for labels in dfs_2008_test_copy["fos"].tolist()]
dfs_2008_test_copy['fos'] = pd.Series(fos_list).values

# One indicator column per distinct fos label, attached to the frame by row index.
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(dfs_2008_test_copy.fos)
fos_indicator_frame = pd.DataFrame(X, columns=mlb.classes_)
dfs_2008_test_copy = dfs_2008_test_copy.join(fos_indicator_frame)

# The raw label lists are redundant once they are encoded as columns.
del dfs_2008_test_copy['fos']

end_time = time.time()
elapsed_minutes = (end_time-start_time)/60
print("Finished all calculations in {:.2f} minutes.".format(elapsed_minutes))
dfs_2008_test_copy

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Encode every title as TF-IDF weights over unigrams and bigrams (English stop
# words removed, terms must occur in at least 5 titles).
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

# A missing title is not a valid document for the vectorizer, so it is
# treated as an empty string instead of aborting the whole cell.
features = tfidf.fit_transform(dfs_2008_test.title.fillna('')).toarray()
#labels = df.category_id
print(features.shape)

# Newer scikit-learn releases expose get_feature_names_out(); older ones only
# have get_feature_names(). Use whichever the installed version provides.
_feature_name_getter = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
title_feature_name = list(_feature_name_getter())
count_featurename = len(title_feature_name)
print(count_featurename)
print(type(features))

# Build all 'title_<term>' columns as one frame and attach them in a single
# concat; inserting them one by one fragments the DataFrame and is slow.
title_columns = ['title_' + name for name in title_feature_name]
title_frame = pd.DataFrame(features, columns=title_columns, index=dfs_2008_test_copy.index)

# Remove same-named columns first so that re-running this cell replaces them
# (as plain column assignment would) instead of duplicating them.
dfs_2008_test_copy = pd.concat(
    [dfs_2008_test_copy.drop(title_columns, axis=1, errors='ignore'), title_frame],
    axis=1)

dfs_2008_test_copy

In [None]:
# One indicator column per venue value, named 'venue_<value>'.
dfDummies = pd.get_dummies(dfs_2008_test_copy['venue'], prefix = 'venue')

# Attach the indicators, then drop the raw columns so that only the encoded
# feature columns remain in the frame.
raw_columns = ['venue', 'keywords', 'publisher', 'PR_2008', 'title', 'PR_2018']
dfs_2008_test_copy = pd.concat([dfs_2008_test_copy, dfDummies], axis=1).drop(raw_columns, axis=1)
dfs_2008_test_copy

In [None]:
from sklearn import (metrics, cross_validation, linear_model, preprocessing)
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np
import statistics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve



def _balanced_rows(df, index, sample_seed=None):
    """Return equally many rows from ``df.iloc[:index]`` and ``df.iloc[index:]``.

    Whichever slice is larger is randomly down-sampled to the size of the
    smaller one. The result is a new frame with a fresh 0..n-1 index.
    """
    df_high = df.iloc[:index, :]
    df_low = df.iloc[index:, :]
    n_keep = min(len(df_high), len(df_low))
    if len(df_high) > n_keep:
        df_high = df_high.sample(n_keep, random_state=sample_seed)
    if len(df_low) > n_keep:
        df_low = df_low.sample(n_keep, random_state=sample_seed)
    return pd.concat([df_low, df_high], ignore_index=True)


def _nn_scores(X_train, y_train, X_cv):
    """Fit a small dense network on the training split and score the held-out rows.

    Features are standardised with statistics from the training split only.
    Returns the sigmoid outputs for ``X_cv`` as a flat array.
    """
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_cv = sc.transform(X_cv)

    # NOTE(review): the output_dim / init / nb_epoch keyword names are kept as
    # the notebook had them -- confirm the installed Keras accepts them.
    classifier = Sequential()
    # Input layer and first hidden layer
    classifier.add(Dense(output_dim = 6, init = 'uniform', activation = 'relu', input_dim = X_train.shape[1]))
    # Second hidden layer
    classifier.add(Dense(output_dim = 6, init = 'uniform', activation = 'relu'))
    # Output layer
    classifier.add(Dense(output_dim = 1, init = 'uniform', activation = 'sigmoid'))
    classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    classifier.fit(X_train, y_train, batch_size = 10, nb_epoch = 20)

    # Keep the raw scores: thresholding them before computing the AUC would
    # collapse the ROC curve to a single operating point.
    return classifier.predict(X_cv).ravel()


def auc_machine(df, top_pt, algo, n_splits=4, test_size=0.30, seed=43, sample_seed=None):
    """Measure how well a classifier separates the first ``top_pt`` share of rows from the rest.

    The first ``int(len(df) * top_pt)`` rows of ``df`` are labelled 1 and all
    other rows 0, so the caller must pass the rows already ordered with the
    "popular" ones first. The larger class is randomly down-sampled to the size
    of the smaller one, and the chosen model is trained and scored on
    ``n_splits`` random train/hold-out splits of that balanced set.

    Side effect: ``df`` is modified in place -- any existing 'popular' column is
    removed and a fresh 'popular' label column is appended as the last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns only (plus, optionally, a 'popular' column left over
        from a previous call).
    top_pt : float
        Share of rows, counted from the top of ``df``, labelled as popular.
    algo : {'rf', 'lg', 'nn'}
        'rf' = RandomForestClassifier, 'lg' = LogisticRegression,
        'nn' = small Keras dense network.
    n_splits : int, default 4
        Number of random train/hold-out splits.
    test_size : float, default 0.30
        Share of the balanced rows held out for scoring in each split.
    seed : int, default 43
        Split ``i`` uses ``random_state = i * seed``.
    sample_seed : int or None, default None
        ``random_state`` for the down-sampling step; None leaves it unseeded.

    Returns
    -------
    (float, float)
        Mean and standard deviation of the AUC over the splits.

    Raises
    ------
    ValueError
        If ``algo`` is not one of the supported names, or if ``top_pt`` leaves
        one of the two classes without any rows.
    """
    if algo not in ('rf', 'lg', 'nn'):
        raise ValueError("algo must be 'rf', 'lg' or 'nn', got %r" % (algo,))

    row_n = len(df)
    index = int(row_n * top_pt)
    if index <= 0 or index >= row_n:
        raise ValueError(
            "top_pt=%r puts %d of %d rows in the popular class; both classes need at least one row"
            % (top_pt, index, row_n))

    # (Re)build the label column; deleting it first keeps it the last column.
    if 'popular' in df:
        del df['popular']
    df['popular'] = (np.arange(row_n) < index).astype(int)

    # Balanced working set: same number of popular and non-popular rows.
    df_new = _balanced_rows(df, index, sample_seed)
    feature_list = [column for column in df_new.columns if column != 'popular']
    df_X = df_new[feature_list]
    df_y = df_new['popular']

    if algo == 'rf':
        clf = RandomForestClassifier(min_samples_split =7, n_estimators=100)
    elif algo == 'lg':
        clf = LogisticRegression()
    else:
        # The network is rebuilt for every split and is fed plain arrays.
        clf = None
        df_X, df_y = df_X.values, df_y.values.astype(float)

    mean_auc = []
    for i in range(n_splits):
        # Hold out `test_size` of the balanced rows for scoring in this split.
        X_train, X_cv, y_train, y_cv = train_test_split(
            df_X, df_y, test_size=test_size, random_state=i * seed)

        if clf is None:
            preds = _nn_scores(X_train, y_train, X_cv)
            roc_auc = roc_auc_score(y_cv, preds)
        else:
            clf.fit(X_train, y_train)
            preds = clf.predict_proba(X_cv)[:, 1]
            fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
            roc_auc = metrics.auc(fpr, tpr)

        print ("AUC (fold %d/%d): %f" % (i + 1, n_splits, roc_auc))
        mean_auc.append(roc_auc)

    print ("Mean AUC: %f" % ( np.mean(mean_auc)))
    print ("std AUC: %f" % np.std(mean_auc))
    return (np.mean(mean_auc),np.std(mean_auc))

In [None]:
# (mean AUC, std AUC) per entry of top_range, one list per model.
rf_list = []
lg_list = []

for top_pt in top_range:
    print('currently running top: ' + str(top_pt))

    # Random forest first, then logistic regression, on the same feature frame.
    rf_list.append(auc_machine(dfs_2008_test_copy, top_pt, 'rf'))
    lg_list.append(auc_machine(dfs_2008_test_copy, top_pt, 'lg'))

In [None]:
# (mean AUC, std AUC) of the neural-network model for every entry of top_range.
nn_list = []
# time.time must be *called* here; storing the function object itself makes
# the elapsed-time subtraction below raise a TypeError.
start_time = time.time()

for i in top_range:
    print('currently running top: ' + str(i))
    nn_result = auc_machine(dfs_2008_test_copy,i, 'nn')
    nn_list.append(nn_result)

end_time = time.time()
print("Finished all calculations in {:.2f} minutes.".format((end_time-start_time)/60))


In [None]:
# Show the (mean AUC, std AUC) pairs collected for each entry of top_range by the 'nn' run.
print(nn_list)

In [None]:
# print(rf_list[1])
import matplotlib.pyplot as plt


# Each result is a (mean AUC, std AUC) pair; split them into parallel lists per model.
rf_auc = [pair[0] for pair in rf_list]
rf_err = [pair[1] for pair in rf_list]
lg_auc = [pair[0] for pair in lg_list]
lg_err = [pair[1] for pair in lg_list]
nn_auc = [pair[0] for pair in nn_list]
nn_err = [pair[1] for pair in nn_list]

plt.xlabel("Default top percentage")
plt.ylabel("modified auc")
plt.title("top-pct to auc")

auc_series = [
    (rf_auc, rf_err, 'rf_auc'),
    (lg_auc, lg_err, 'lg_auc'),
    (nn_auc, nn_err, 'nn_auc'),
]

# Markers with error bars for all three models first, then the connecting
# lines -- the same drawing order as before, so the colour cycle is unchanged.
for auc_values, auc_errors, _ in auc_series:
    plt.errorbar(top_range, auc_values, yerr=auc_errors, fmt='o')
for auc_values, _, series_label in auc_series:
    plt.plot(top_range, auc_values, label = series_label)

plt.legend(loc='best')
plt.show()