# Part 8: "Graph" Model  - Logistic Regression with FastRP Embeddings

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import glob

## Data Load and Formatting

In [2]:
path = '/data/neo-export/export/proj-features-labeled/' 
all_files = glob.glob(path + "nodes_Paper_[0-9]*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, header=None, 
                     names = ["nodeId", "embedding","encoding","ogbIndex", "split_segment","subject",
                              "subject_status","year"])
    li.append(df)


papers_df = pd.concat(li, axis=0, ignore_index=True)

#### Below breakouts of split segment and subject labels should match those from part 2

In [3]:
papers_df[['split_segment', 'ogbIndex']].groupby('split_segment').count()

Unnamed: 0_level_0,ogbIndex
split_segment,Unnamed: 1_level_1
TRAIN,1112392
VALIDATE,138949


In [4]:
papers_df[['subject', 'ogbIndex']].groupby('subject').count()

Unnamed: 0_level_0,ogbIndex
subject,Unnamed: 1_level_1
0,28041
1,2856
2,3907
3,1530
4,1910
...,...
148,865
149,815
150,837
151,22696


In [5]:
# Expand embedding vectors...sorry for the ugly performance warning :( - if you see a better way please recommend!
def string_to_float(x):
    return np.array(x.split(';')).astype(float)
papers_df[[f'embedding_{x}' for x in range(256)]] = papers_df.apply(
    lambda row: string_to_float(row.embedding), axis = 1, result_type ='expand')
papers_df

  self[k1] = value[k2]


Unnamed: 0,nodeId,embedding,encoding,ogbIndex,split_segment,subject,subject_status,year,embedding_0,embedding_1,...,embedding_246,embedding_247,embedding_248,embedding_249,embedding_250,embedding_251,embedding_252,embedding_253,embedding_254,embedding_255
0,205580338,0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0....,-4.32037;-0.81883;2.67975;2.31663;-3.12459;-2....,83179065,TRAIN,98,KNOWN,2018,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,205580373,0.0013625429;-9.808112E-4;4.393044E-4;-0.00290...,-7.31162;1.27658;1.8888;-0.87184;2.52357;-0.53...,83179100,TRAIN,106,KNOWN,2011,0.001363,-0.000981,...,-0.075629,0.062345,0.263585,-0.277317,-0.305459,-0.129912,0.197675,-0.090285,0.204399,0.027731
2,205580582,-0.0027561677;0.0019617418;-0.0030122166;0.001...,1.62784;-2.5646;-0.07218;-1.67851;3.02409;-1.0...,83181475,TRAIN,56,KNOWN,2012,-0.002756,0.001962,...,-0.177606,-0.167020,-0.138167,0.091243,0.195253,-0.106206,0.181296,-0.052775,-0.119006,-0.206935
3,205580641,5.627003E-4;-6.950726E-4;-0.0023442905;5.60485...,2.75564;-3.55;4.72421;-0.70463;-0.09119;-2.252...,83181534,TRAIN,128,KNOWN,2010,0.000563,-0.000695,...,-0.004462,0.146282,-0.412124,-0.086960,0.144308,-0.066715,-0.148467,-0.114543,-0.368114,-0.107802
4,205581193,-0.001374663;0.003907469;0.0010971766;-0.00113...,-2.8812;1.11153;3.17897;0.28759;1.80633;0.6327...,83179374,TRAIN,18,KNOWN,2012,-0.001375,0.003907,...,-0.271622,-0.020034,0.248432,0.019380,-0.187105,-0.047686,0.106186,-0.070420,0.294178,0.014652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1251336,189297276,0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0....,4.24787;-5.7E-4;-0.42477;-4.18265;2.68804;2.38...,66895631,TRAIN,90,KNOWN,2014,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1251337,189297335,-2.1663477E-4;0.0010571239;0.001026029;-0.0010...,-0.26447;3.84805;-0.54991;-3.11971;-2.58163;-0...,66895690,TRAIN,75,KNOWN,2018,-0.000217,0.001057,...,0.002889,0.013994,-0.170945,0.327328,0.041509,0.034720,0.121717,-0.018826,0.230952,0.070828
1251338,189297681,0.002675809;0.0013950182;2.3017207E-4;-0.00165...,1.02664;-1.2927;-2.50304;-3.02091;0.26892;-1.0...,66897904,TRAIN,34,KNOWN,2011,0.002676,0.001395,...,-0.238014,-0.050306,-0.025272,0.053312,0.107086,0.043762,0.017888,-0.369173,-0.216644,-0.192462
1251339,189297829,0.0020234333;6.1786047E-4;-7.3964836E-4;-9.056...,1.27171;7.01788;0.80883;5.77526;-1.82221;2.676...,66898052,TRAIN,4,KNOWN,2012,0.002023,0.000618,...,0.007818,-0.085743,-0.008575,0.036522,0.025726,0.165496,0.146139,-0.049635,-0.050294,-0.053110


In [6]:
X = papers_df[[f'embedding_{x}' for x in range(256)]]
y = papers_df.subject

## Logistic Regression with FastRP Features

In [7]:
X_train = X[papers_df.split_segment == "TRAIN"]
X_validate = X[papers_df.split_segment == "VALIDATE"]
y_train = y[papers_df.split_segment == "TRAIN"]
y_validate = y[papers_df.split_segment == "VALIDATE"]

In [8]:
model = LogisticRegression(multi_class='ovr', solver='saga', n_jobs=60)

In [9]:
model.fit(X_train, y_train)

LogisticRegression(multi_class='ovr', n_jobs=60, solver='saga')

In [10]:
print('Accuracy of logistic regression classifier on VALIDATE set: {:.2f}'\
      .format(model.score(X_validate, y_validate)))

Accuracy of logistic regression classifier on VALIDATE set: 0.58


#### Note: We were able to increase classification accuracy by about 9% points from part 2 by substituting the FastRP graph features