# Mining & Modelling Character Networks

In [8]:
import pandas as pd
import numpy as np
import networkx as nx
import io
import random

Paper Reference [Here](https://math.ryerson.ca/~abonato/papers/CharacterNetworks_WAW_Aug1_BDAEGH.pdf)

In [2]:
# constants
data_path = './data/data.csv'

## Mining Character Networks

```python
def main():
    filename = './twilightEdgesNames.csv'
    savename = './twilightEdgesIDs.txt'
    savename2 = './twilightEdgesIDsWeights.txt'
    
    E = pd.read_csv(filename)
    E1 = E['Source']
    E2 = E['Target']
    namesText = np.unique(np.vstack((E1,E2)))
    namesInds = [i for i in range(len(namesText))]
    # print namesText,namesInds
    E1 = E1.replace(namesText,namesInds)
    E2 = E2.replace(namesText,namesInds)
    #write to file
    out = np.column_stack((E1,E2))
    # labelNames = 'Source,Target'
    np.savetxt(savename,out,fmt=('%d','%d'),delimiter='\t',comments='')
    #save weights too
    np.savetxt(savename2,np.column_stack((out,E['weight'])),fmt=('%d','%d','%d'),delimiter='\t',comments='')

    print "n: %d" % len(namesText)
    # print "E: %d" % E['weight'].sum()
    print "E: %d" % E.shape[0]
```

## Create Network

In [5]:
twilight_names = [
    'Bella Swan', 'Edward Cullen', 'Jacob Black', 'Carlisle Cullen', 'Esme Cullen', 'Alice Cullen', 'Emmett Cullen',
    'Rosalie Hale', 'Jasper Hale', 'Renesmee Cullen', 'James', 'Victoria', 'Laurent', 'Riley Biers', 'Bree Tanner',
    'Sam Uley', 'Quil Ateara V', 'Embry Call', 'Paul Lahote', 'Jared Cameron', 'Leah Clearwater', 'Seth Clearwater',
    'Collin Littlesea', 'Brady Fuller', 'Charlie Swan', 'Renée Dwyer', 'Harry Clearwater', 'Billy Black', 'Tyler Crowley',
    'Lauren Mallory', 'Mike Newton', 'Jessica Stanley', 'Angela Weber', 'Eric Yorkie', 'Emily Young', 'Sue Clearwater',
    'Quil Ateara III', 'J. Jenks'
]

In [16]:
def sample_data(n, char_list = twilight_names):
    '''
    This function will generate random data
    '''
    d = pd.DataFrame(
        {
            'Source' : [random.choice(twilight_names) for _ in range(n)],
            'Target' : [random.choice(twilight_names) for _ in range(n)],
            'Weight' : [random.randint(1, 25) for _ in range(n)]
        }
    ).drop_duplicates()
    return d

In [17]:
E = sample_data(1000)

In [18]:
E1 = E['Source']
E2 = E['Target']

In [19]:
namesText = np.unique(np.vstack((E1,E2)))
namesInds = [i for i in range(len(namesText))]

In [20]:
# print namesText,namesInds
E1 = E1.replace(namesText,namesInds)
E2 = E2.replace(namesText,namesInds)

In [21]:
#write to file
out = np.column_stack((E1,E2))

In [24]:
# labelNames = 'Source,Target'
np.savetxt(savename,out,fmt=('%d','%d'),delimiter='\t',comments='')

In [25]:
#save weights too
np.savetxt(savename2,np.column_stack((out,E['weight'])),fmt=('%d','%d','%d'),delimiter='\t',comments='')

## k-Profile

In [72]:
import numpy as np
import pandas as pd
from itertools import combinations
import io
from snap import GenPrefAttach,SaveEdgeList,TRnd
import subprocess

def getDegreeList(A):
    # n = np.unique(np.vstack((A[:,0],A[:,1]))).shape[0]
    n = int(np.max(np.vstack((A[:,0],A[:,1]))) + 1)
    degreeVec = np.zeros(n,dtype=int)
    for e in range(A.shape[0]):
        degreeVec[int(A[e,0])] += 1
        degreeVec[int(A[e,1])] += 1
    return degreeVec

def makeWeightedEdgelist(A,outname):
    #still remove self loops, as they make no sense in this context
    Atmp = np.array([row for row in A if row[0] != row[1]])
    inds = np.lexsort((Atmp[:,1],Atmp[:,0]))
    Asort = Atmp[inds,:]
    #get number of unique entries by taking diff
    Adiff1 = np.vstack((np.array([1,1]),np.diff(Asort,axis=0)))
    Adiff = np.any(Adiff1!=0,axis=1)
    #find where the diffs are equal to 1 a and diff that to get counts of unique
    outUnique = Asort[Adiff==1]
    outCounts = np.diff(np.hstack((np.where(Adiff==1)[0],Adiff.shape[0])))
    out = np.column_stack((outUnique,outCounts))
    if outname:
        np.savetxt(outname,out,fmt=('%d','%d','%d'),delimiter='\t',comments='')
    return out


def removeDuplicateEdges(X):
    #remove duplicates and self loops (and also sort)
    # xtmp = np.vstack({tuple(row) for row in X})
    xtmp = np.vstack({tuple(row) for row in X if row[0] != row[1]})
    inds = np.lexsort((xtmp[:,1],xtmp[:,0]))
    out = xtmp[inds,:]
    return out
        

def myPA(nodes,m,seed=4639):
    np.random.seed(seed)
    edgeList = []
    degreeVec = np.zeros(nodes)
    #initialize first step
    degreeVec[0:2] = np.array([1, 1])
    edgeList.append((0,1))
    for n in np.arange(2,nodes):
        #connect to existing vertices according to preferential attachment model
        # weighting of distribution is degreeVec[:n]
        probs = np.double(degreeVec[:n])
        neighbors = np.random.choice(np.arange(n),m,replace=True,p=probs/np.sum(probs))
        # print neighbors
        degreeVec[n] = m
        for dit in np.arange(m):
            #if edge included, increment both degrees and append edge to the list
            degreeVec[neighbors[dit]] += 1
            edgeList.append((neighbors[dit],n))
        # print degreeVec
        # print "avg degree: " + str(np.sum(degreeVec)/n)
    return np.asarray(edgeList)


def generateGraphs(params):
    graphname = params['graph']
    n = int(params['n'])
    numit = int(params['numGen'])
    graphType = params['type']
        
    if graphType == 'GNP':
        deg = int(params['d'])
        #every node has average degree deg, total number of edges is deg*n/2, divide by total possible edges 2/(n*(n-1))
        p = float(deg)/(n-1)
        # print "degree is " + str(p)
        np.random.seed(4639)
        #generate all randomness at once
        pairs = np.array([t for t in combinations(np.arange(n),2)])
        ps = np.random.rand(pairs.shape[0],numit) <= p
        for it in np.arange(numit):
            #keep the edges that are sampled
            pairsKeep = pairs[ps[:,it]==1]
            outname = graphname + '_' + graphType + '_' + str(it) + '.txt'
            np.savetxt(outname,pairsKeep,fmt=('%d','%d'),delimiter='\t',comments='')

    elif graphType == 'PA':
        deg = int(params['d'])
        for it in np.arange(numit):
            #is this degree right? or scale by 2
            #solve directly: 2/n + 2m = deg = 2|E|/n
            # x = myPA(n, int(deg-2./n), seed=it*4639+5011)
            x = myPA(n, int(deg/2.-1./n), seed=it*4639+5011)
            # x = myPA(n, int(deg/2.), seed=it*4639+5011)
            tmpname = graphname + '_' + graphType + '_' + str(it) + '_dup.txt'
            outname = graphname + '_' + graphType + '_' + str(it) + '.txt'
            # outname = graphname + '_' + graphType + 'mult_' + str(it) + '.txt'
            # makeWeightedEdgelist(x,tmpname)
            # np.savetxt(tmpname,x,fmt=('%d','%d'),delimiter='\t',comments='')
            xfinal = removeDuplicateEdges(x)
            np.savetxt(outname,xfinal,fmt=('%d','%d'),delimiter='\t',comments='')
            #make a weighted graph, keep track of weights for direct comparison with twilightEdgesIDsWeights.txt
            
    #keep the top edges that correspond to target |E| in original graph
    elif graphType == 'Pthresh':
        deg = int(params['d'])
        # Etarget = deg*n/2
        for it in np.arange(numit):
            #is this degree right? or scale by 2
            #solve directly: 2/n + 2m = deg = 2|E|/n
            x = myPA(n, int(deg/2.-1./n), seed=it*4639+5011)
            tmpname = graphname + '_' + graphType + '_' + str(it) + '_dup.txt'
            outname = graphname + '_' + graphType + '_' + str(it) + '.txt'
            xweighted = makeWeightedEdgelist(x,tmpname)
            #take the Etarget edges with largest weight
            Etarget = min(np.floor(deg*n/2.),xweighted.shape[0])
            eind = np.argsort(xweighted[:,2])[::-1] #sort by weight
            xtop = removeDuplicateEdges(xweighted[eind[:Etarget],:2])
            np.savetxt(outname,xfinal,fmt=('%d','%d'),delimiter='\t',comments='')
            

    elif graphType == 'PAsnap':
        deg = int(params['d'])
        Trnd1 = TRnd()
        for it in np.arange(numit):
            #generate graph
            Trnd1.PutSeed(it*4639+5011)
            x = GenPrefAttach(n,deg,Trnd1)
            #save output
            outname = graphname + '_' + graphType + '_' + str(it) + '.txt'
            SaveEdgeList(x,outname)
            #remove the top 3 lines, sed -i '' -e 1,3d tmp.txt
            emp = ''
            out = subprocess.call(["sed", "-i", emp, "-e", "1,3d", outname])
            
    elif graphType == 'CL':
        #get degree sequence from input
        w = params['dList']
        wnorm = float(np.sum(w))
        nc2 = int(n*(n-1)/2)
        pairs = np.zeros((nc2,2))
        pairComp = np.zeros(nc2)
        for e,(i,j) in enumerate(combinations(np.arange(n),2)):
            #array comparison
            pairComp[e] = w[i]*w[j]/wnorm
            pairs[e,0] = i
            pairs[e,1] = j
        rands = np.random.rand(nc2,numit)
        for it in np.arange(numit):
                pairsKeep = pairs[rands[:,it] < pairComp]
                outname = graphname + '_' + graphType + '_' + str(it) + '.txt'
                np.savetxt(outname,pairsKeep,fmt=('%d','%d'),delimiter='\t',comments='')

    elif graphType == 'CNFG':
        w = params['dList']
        wnorm = int(np.sum(w))
        elist = np.zeros(wnorm)
        st = 0
        for i,wi in enumerate(w):
            elist[st:(st+wi)] = i
            st += wi
        for it in np.arange(numit):
            plist = np.random.permutation(elist)
            x = plist.reshape(-1,2)
            #if column 1 is greater than column 0 then swap that column
            xswap = x[:,0] > x[:,1]
            x[xswap,0:2] = np.column_stack((x[xswap,1],x[xswap,0]))
            tmpname = graphname + '_' + graphType + '_' + str(it) + '_wt.txt'
            outname = graphname + '_' + graphType + '_' + str(it) + '.txt'
            #sort correctly and remove self loops, duplicates
            xweighted = makeWeightedEdgelist(x,tmpname)
            np.savetxt(outname,xweighted[:,:2],fmt=('%d','%d'),delimiter='\t',comments='')
            
if __name__ == '__main__':
    #example parameters
    #123 undirected, but 1031 total weight if including multiedges
#     params = {'graph': 'twilight','type':'PA','n': 27,'d': int(2*1031/27),'numGen': 3}
    params = {'graph': 'twilight','type':'CL','n': 3,'dList': [int(2*1031/27), 72, 76],'numGen': 3}

    # params = {'graph': 'twilight','type':'PA','n': 27,'d': int(2*123/27),'numGen': 3}
    # 575 undirected, but 9464 total weight if including multiedges
#     params = {'graph': 'goblet','type':'PA','n': 62,'d': int(2*575/62),'numGen': 3}
    generateGraphs(params)

# Modelling Character Networks

## Preferential Attachement

In [89]:
import numpy as np
import pandas as pd
from itertools import combinations
import io
from snap import GenPrefAttach,SaveEdgeList,TRnd
from sklearn import svm, base, feature_selection, linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve,auc,classification_report,f1_score,accuracy_score,roc_auc_score
# from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import subprocess
# from generateGraphsMain import *
from os import listdir

#to visualize
from six import StringIO
import pydot

In [90]:
def doSVMcvPrediction(Xin,yin,Xtest,ytest,modType):
    #input: training/test data and labels, model type
    #supported models: SVM (l1 and l2), AdaBoost, decision tree, and random forest
    #train with 5-fold cross validation, then test once using test (holdout) data
    #once the best estimator is chosen here, train on the entire dataset (in + test) outside this function
    #output: training accuracy, gereralization accuracy, feature weights/importances, classifier, 
    #  classification report, training f1-score and generalization f1-score
    nfolds = 5
    cv = cross_val_score(yin,nfolds,shuffle=True)
    #l1 penalty enforces sparsity in weights, only available for linear SVM classifier
    if modType in ('SVM-L2','svm-l2'):
        clasf = svm.LinearSVC(loss='squared_hinge', penalty='l2', tol=.001, dual=False, class_weight='balanced')
        cvclasf = GridSearchCV(clasf, param_grid = {
            'C' : [0.05, 0.1, 0.5, 1, 5, 10, 500, 1000]
            }, verbose=0,refit=True,
            cv=cv,
            # scoring='roc_auc',
            scoring='f1_weighted',
        n_jobs=4)

    elif modType in ('SVM-L1','svm-l1'):
        clasf = svm.LinearSVC(loss='squared_hinge', penalty='l1', tol=.001, dual=False, class_weight='balanced')
        cvclasf = GridSearchCV(clasf, param_grid = {
            'C' : [0.05, 0.1, 0.5, 1, 5, 10, 500, 1000]
            }, verbose=0,refit=True,
            cv=cv,
            scoring='f1_weighted',
        n_jobs=4)
    
    #decision tree classifiers
    elif modType in ('ada','adaboost','adaboost-tree'):
        clasf = AdaBoostClassifier()
        cvclasf = GridSearchCV(clasf, param_grid = {
            'n_estimators' : [5,10,25,50,100],
            'learning_rate' : [0.1,0.3,0.5]
            }, verbose=0,refit=True,
            cv=cv,
            scoring='f1_weighted',
        n_jobs=4)

    elif modType in ('dtree','decision-tree'):
        clasf = DecisionTreeClassifier()
        cvclasf = GridSearchCV(clasf, param_grid = {
            'splitter' : ['best'],
            'criterion' : ['entropy','gini'],
            'max_features' : [0.2,'sqrt',1.],
            'max_depth' : [2,4], 
            'class_weight' : ['balanced'], 
            }, verbose=0,refit=True,
            cv=cv,
            scoring='f1_weighted',
        n_jobs=4)

    elif modType in ('rf','random-forest'):
        clasf = RandomForestClassifier()
        cvclasf = GridSearchCV(clasf, param_grid = {
            'n_estimators' : [5,10,25,50,100],
            'criterion' : ['entropy','gini'],
            'max_features' : [0.2,'sqrt',1.],
            'max_depth' : [2,4], 
            'class_weight' : ['balanced'], 
            }, verbose=0,refit=True,
            cv=cv,
            scoring='f1_weighted',
        n_jobs=4)
        
    #TODO: add linear regression, logistic regression, etc. 

    cvclasf.fit(Xin,yin)
    bclasf = cvclasf.best_estimator_
    print("%s %d-fold CV params: %s" % (modType,nfolds,cvclasf.best_params_))
    
    if modType in ('ada','adaboost-tree','dtree','decision-tree','rf','random-forest'):
        w = bclasf.feature_importances_
    elif modType in ('SVM-L1','svm-l1','SVM-L2','svm-l2'):
        w = bclasf.coef_
    
    bclasf.fit(Xin,yin)
    y_train_pred = bclasf.predict(Xin)
    acTrain = accuracy_score(yin,y_train_pred)
    f1Train = f1_score(yin,y_train_pred,average="weighted")
    
    y_pred = bclasf.predict(Xtest)
    report = classification_report(ytest, y_pred)
    acGeneral = accuracy_score(ytest, y_pred)
    f1Gen = f1_score(ytest,y_pred,average="weighted")

    return(acTrain,np.squeeze(w),bclasf,report,(acTrain,acGeneral),(f1Train,f1Gen))

def initializeDirectory(origGraph):
    #compute graph profile features using GraphLab PowerGraph
    print("initializing directory and taking features of original graph...")
    out = subprocess.check_output(["mkdir", "graphs"])
    hdr = "#graph\tsample_prob_keep\tn3_3\tn3_2\tn3_1\tn3_0\tn4_0\tn4_1\tn4_2\tn4_3\tn4_4\tn4_5\tn4_6\tn4_7\tn4_8\tn4_9\tn4_10\truntime\n"
    with open('counts_4_profilesLocal.txt', 'w') as fpt:
        fpt.write(hdr)
    pcommand = '/Users/vatsalpatel'
    out = subprocess.check_output([pcommand, "--format", "tsv", "--graph", origGraph])
    hdr2 = "#graph\tevbin0\tevbin1\tevbin2\tevbin3\tevbin4\n"
    with open('counts_eval_bins.txt', 'w') as fpt:
        fpt.write(hdr2)
    generateEigenvalueBins(origGraph,"counts_eval_bins.txt")
    return 0

def generateEigenvalueBins(gname,outDir,nbins=5):
    #get normalized laplacian
    hbins = np.histogram(np.array([0,2]),bins=nbins)[1]
    E = np.loadtxt(gname,delimiter='\t')
    #map everything to number of unique vertices
    un = np.unique(np.vstack((E[:,0],E[:,1])))
    n = len(un)
    A = np.zeros((n,n))
    for e in np.arange(E.shape[0]):
        tmp0 = np.argwhere(un==E[e,0])
        tmp1 = np.argwhere(un==E[e,1])
        A[tmp0,tmp1] = 1
        A[tmp1,tmp0] = 1
    D = np.diag(np.sum(A,1))
    Di = np.linalg.inv(np.sqrt(D))
    L = np.eye(n) - Di.dot(A).dot(Di)
    teigs = np.linalg.eigvalsh(L)
    #take histogram
    neig = len(teigs.flatten()) + nbins
    nh = np.histogram(teigs,bins=hbins)[0]    
    ep = (nh + 1.)/neig #add smoothing and normalize
    #append to file
    with open(outDir, "a") as myfile:
        myfile.write(gname + "\t" +  "\t".join([str(e) for e in ep]) + "\n")
    return 0

def writeTree(treeModel,namesList,filename):
    #utility function that plots a decision tree and saves to file
    dot_data = StringIO()
    export_graphviz(treeModel,out_file=dot_data,feature_names=namesList)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf(filename) 

if __name__ == '__main__':
    
    genData = 1 #flag to generate random graphs
    classify = 0 #flag to classify
    useSpectral = 0 #flag to include eigenvalue histograms

    if genData:

        print("generating random graphs...")
        # #generate graphs, these parameters should be easy to automate
        # n,E,filename,outname = 27,123,'./twilightEdgesIDsWeights.txt','graphs/twilight'
        # n,E,filename,outname = 27,1031,'./twilightEdgesIDsWeights.txt','graphs/twilight'
        n,E,filename,outname = 39,280,'./thestandEdgesIDsWeights.txt','graphs/thestand'
        # n,E,filename,outname = 39,6539,'./thestandEdgesIDsWeights.txt','graphs/thestand'
        # n,E,filename,outname = 62,575,'./gobletEdgesIDsWeights.txt','graphs/goblet'
        # n,E,filename,outname = 62,9464,'./gobletEdgesIDsWeights.txt','graphs/goblet'
        
        #automate initializing directory
        #this will throw an error if there is already a folder named graphs
        initializeDirectory(filename)

        A = np.loadtxt(filename,delimiter='\t')
        degreeVec = getDegreeList(A)

        params = {'graph': outname,'type':'CNFG','n': n,'dList': degreeVec,'numGen': 100}
        generateGraphs(params)
        params = {'graph': outname,'type':'CL','n': n,'dList': degreeVec,'numGen': 100}
        generateGraphs(params)
        
        # params = {'graph': outname,'type':'PA','n': n,'d': int(2*E/n),'numGen': 50}
        params = {'graph': outname,'type':'PA','n': n,'d': int(2*E/n),'numGen': 100}
        generateGraphs(params)
        params = {'graph': outname,'type':'GNP','n': n,'d': int(2*E/n),'numGen': 100}
        generateGraphs(params)
        # #generate 100 graphs each from 3 classes in < 3 seconds
        #thresholded PA model
       

        graph_dir = '/Users/vatsalpatel/Documents/novels/graphs/'
        #feature design 1
        #analyze global 4-profiles
        #separate by generative model?
        # /Users/ethan/graphlab-master-2/release/apps/4-profiles/4profile --format tsv --graph /Users/ethan/Documents/novels/twilightEdgesIDs.txt
        print("computing global subgraph counts...")
        pcommand = '/Users/vatsalpatel'
        for gname in listdir(graph_dir):
            if gname.endswith('.txt') and "_wt" not in gname:
            # if "_CL_" in gname:
                # out = subprocess.check_output([pcommand, "--format", "tsv", "--graph", gname, "--per_vertex", gname])
                out = subprocess.check_output([pcommand, "--format", "tsv", "--graph", graph_dir + gname])
                if useSpectral:
                    generateEigenvalueBins(graph_dir+gname,"counts_eval_bins.txt")
        out = subprocess.check_output(["mv", "counts_4_profilesLocal.txt", "graphs"])
        out = subprocess.check_output(["mv", "counts_eval_bins.txt", "graphs"])
        #global 4 profiles for 100 graphs each from 3 classes in ~1 minute

        # additional feature design could include distributions of local 4-profiles throughout graph
        # or pagerank or centrality measures
        
    if classify:
        #build classifiers
        #as a baseline, split data into train and test 
        #this will verify the classifier can differentiate between graph families
        
        np.random.seed(423322) #for repeatability during writeup
        graphFolder = 'graphsGoblet/'
        # graphFolder = 'graphsTwilight/'
        # graphFolder = 'graphsTheStand/'
        print("Reading data from folder %s" % graphFolder)

        #read data from 4-profile output file, read labels and split into train and test
        featInds = np.arange(2,17)
        D = pd.read_csv(graphFolder + 'counts_4_profilesLocal.txt',delimiter='\t')
        X = np.array(D.ix[1:,featInds])
        # print X[0,:]
        #add eigenvalue histogram to X
        if useSpectral:
            #assume its in the exact same order
            featInds2 = np.arange(1,6) #5 bins
            D2 = pd.read_csv(graphFolder + 'counts_eval_bins.txt',delimiter='\t')
            X = np.hstack((X,np.array(D2.ix[1:,featInds2])))
            # X = np.array(D2.ix[1:,featInds2])
        # print X[0,:]
        y = np.zeros(X.shape[0])
        # D.ix[D['#graph'].str.contains('CL'),2:17]
        RGfamilies = ['CL','GNP','PA','CNFG']
        # RGfamilies = ['CL','GNP','PA','PAmult']
        for i,s in enumerate(RGfamilies):
        #     print(i)
            y[np.array(D['#graph'].ix[1:].str.contains(s))] = i
        # print y
        holdfrac = 0.5
        Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=holdfrac,stratify=y)

        #preprocess
        scaler1=StandardScaler()
        Xtrain = scaler1.fit_transform(Xtrain.astype(np.double))
        Xtest = scaler1.transform(Xtest.astype(np.double))
        
        # modelType = 'SVM-L2'
        # modelType = 'SVM-L1'
        # modelType = 'adaboost-tree'
        modelType = 'decision-tree'
        # modelType = 'random-forest'
        score,optWeights,clasf,rep,accs,f1s = doSVMcvPrediction(Xtrain, ytrain, Xtest, ytest, modelType)
        #classifier works perfectly
        # print clasf.get_params
        print("Checking distinctness of random graph families...")
        print(rep)
        # print accs[0],accs[1],f1s[0],f1s[1]
        # print np.column_stack((optWeights.T,D.columns[featInds]))
        
        #see which random graph model the novel gets classified as
        x = np.array(D.ix[0,featInds]).reshape(1,-1)
        if useSpectral:
            x = np.hstack((x,np.array(D2.ix[0,featInds2]).reshape(1,-1)))
            # x = np.array(D2.ix[0,featInds2]).reshape(1,-1)
        scaler=StandardScaler()
        X = scaler.fit_transform(X.astype(np.double))
        x = scaler.transform(x.astype(np.double)) 
        clasf.fit(X,y)
        novelRG = clasf.predict(x)
        if useSpectral:
            Fcol = D.columns[featInds].append(D2.columns[featInds2])
            # Fcol = D2.columns[featInds2]
        else:
            Fcol = D.columns[featInds]
        # print Fcol
        if modelType in ('SVM-L1','SVM-L2'):
            #TODO: print statements when useSpectral
            print("SVM feature weights:")
            print(np.column_stack((clasf.coef_.T,Fcol)))
            sc = x.dot(clasf.coef_.T) + clasf.intercept_
            print("scores:")
            # print np.vstack((RGfamilies,(sc-np.min(sc))/np.sum(sc-np.min(sc))))
            print(np.vstack((RGfamilies,sc)))
        elif modelType in ('adaboost-tree'):
            print("Adaboost feature weights:")
            print(np.column_stack((clasf.feature_importances_.T,Fcol)))
            # print "prediction probabilities:" 
            # print np.vstack((RGfamilies,clasf.predict_proba(x)))
            print("decision function:")
            print(np.vstack((RGfamilies,clasf.decision_function(x))))
            #also the actual tree?
        elif modelType in ('decision-tree'):
            print("Decision tree feature weights:")
            print(np.column_stack((clasf.feature_importances_.T,Fcol)))
            print("prediction probabilities:" )
            print(np.vstack((RGfamilies,clasf.predict_proba(x))))
            #also the actual tree
            Fcol2 = [s.replace('n3','H').replace('n4','F') for s in Fcol]
            writeTree(clasf,Fcol2,graphFolder[:-1]+'_dTree.pdf')
        elif modelType in ('random-forest'):
            print("Random Forest feature weights:")
            print(np.column_stack((clasf.feature_importances_.T,Fcol)))
            print("prediction probabilities:" )
            print(np.vstack((RGfamilies,clasf.predict_proba(x))))
        #print the prediction (and confidence score?)
        print("Fiction novel classified as: %s" % RGfamilies[int(novelRG)])

generating random graphs...
initializing directory and taking features of original graph...


CalledProcessError: Command '['mkdir', 'graphs']' returned non-zero exit status 1.

## Chung-Lu

## Binomial Random Graph

## Configuration Model

# Machine Learning