In [5]:
def encodeDNASeq(seq, encoding='OneHot'):
    """Encode a nucleotide sequence, one output element per character.

    With ``encoding="Index"`` each nucleotide becomes a float:
    A->1.0; C->2.0; G->3.0; T->4.0; any other character->0.0
    With any other ``encoding`` value (default ``"OneHot"``) each nucleotide
    becomes a 4-element SparseVector:
    A->1000; C->0100; G->0010; T->0001; any other character->0000

    The lookup is case-sensitive: only upper-case A/C/G/T are recognised,
    so lower-case letters are encoded like any other unknown character.

    @param seq: a string (or any iterable of single characters) containing
        a sequence of nucleotides
    @param encoding: output encoding, "OneHot" or "Index"
    @return: a list with one encoded element per character of ``seq``
    """
    if encoding == "Index":
        mymap = {'A': 1.0, 'C': 2.0, 'G': 3.0, 'T': 4.0, 'N': 0.0}
    else:
        mymap = {'A': SparseVector(4, [0], [1]),
                 'C': SparseVector(4, [1], [1]),
                 'G': SparseVector(4, [2], [1]),
                 'T': SparseVector(4, [3], [1]),
                 'N': SparseVector(4, [0], [0])}

    # Unknown characters are encoded exactly like 'N' in the *selected*
    # encoding, so the Index encoding never mixes floats with vectors.
    unknown = mymap['N']
    return [mymap.get(n, unknown) for n in seq]


#Split each line in single features
#encode each nucleotide using function encodeDNASeq
def load_dna_dataset(file_name, label_value=1.0, nrows=0, encoding='OneHot', setLabel=True):
    """Load a nucleotide text file into a Spark DataFrame.

    Every line of ``file_name`` is split into its characters and each
    character is encoded with ``encodeDNASeq`` using ``encoding``; the
    resulting RDD of per-line lists is converted with ``toDF()``.
    When ``setLabel`` is true a constant ``label`` column holding
    ``label_value`` is appended so the result can feed an ML algorithm.

    ``nrows`` is accepted but not used by this function.
    """
    raw_lines = sc.textFile(file_name)
    # One record per input line: the list of encoded characters of that line.
    encoded_rows = raw_lines.map(lambda line: encodeDNASeq(list(line), encoding))

    df = encoded_rows.toDF()
    if not setLabel:
        return df
    return df.withColumn("label", lit(label_value))

# Initialize the Spark context.
# findspark is pointed at the local Spark installation before pyspark is imported.
import findspark
findspark.init("/home/osboxes/spark-2.3.1-bin-hadoop2.7")

import os
import pyspark
from pyspark.ml.linalg import SparseVector
from pyspark.ml.feature import VectorAssembler
import time
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession

# Load the external jar (brain-scala-utils) through the PySpark submit arguments.
# This is set before the session below is created; presumably the JVM only
# picks the jar up if the variable is set before it starts - verify if reordering.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' --jars /home/osboxes/brainscala/target/scala-2.11/brain-scala-utils_2.11-1.0.jar pyspark-shell'


# Create (or reuse) the Spark session and expose its SparkContext as `sc`,
# which load_dna_dataset and the JVM bridge calls below rely on.
spark = SparkSession \
    .builder \
    .appName("test") \
    .getOrCreate()
sc = spark.sparkContext

# Training set load
# The input dataset is divided into 4 files: positive and negative instances for the training and the test set.
# Each row of the dataset contains a string of nucleotides: A G C T.
# We load each line as a single experiment and each character as a feature.
# The label of each row is 1.0 for positive instances and 0.0 for negative ones.
start_time = time.time()

# Read the positive instances from the file system into DataFrames (label 1.0).
#training_set=load_dna_dataset("ipdata_tra_t_2018_toy.txt",label_value=1.0)
training_set=load_dna_dataset("ipdata_tra_t_2018.txt",label_value=1.0)
test_set=load_dna_dataset("ipdata_test_t_2018.txt",label_value=1.0)

# Read the negative instances (label 0.0).
#neg_tra=load_dna_dataset("ipdata_tra_f_2018_toy.txt",label_value=0.0)
neg_tra=load_dna_dataset("ipdata_tra_f_2018.txt",label_value=0.0)
neg_test=load_dna_dataset("ipdata_test_f_2018.txt",label_value=0.0)

# Create the training and test datasets by appending the negative rows to the positive ones.
training_set = training_set.union(neg_tra)
test_set = test_set.union(neg_test)

# Spark ML algorithms require a single vector column containing all the features.
# Assemble the feature vector from every column except the last one
# (the "label" column appended by load_dna_dataset).
assembler = VectorAssembler(inputCols=training_set.columns[0:len(training_set.columns)-1],outputCol="features")
training=assembler.transform(training_set).select("label","features")
# Note: we are using the same vector assembler instantiated for the training set,
# so the test set is assumed to have the same column names.
test=assembler.transform(test_set).select("label","features")

# NOTE(review): Spark transformations are lazy, so this elapsed time likely does
# not include fully reading the files; the count() calls below trigger the jobs.
print("Input loading Time: %f seconds ---" % (time.time() - start_time))
print("Training Tot instances: %s" %training.count())
print("Test Tot instances: %s" %test.count())

# Split the training data by label and register each part as a temporary SQL view.
training.filter("label=1.0").createOrReplaceTempView("positivi")
training.filter("label=0.0").createOrReplaceTempView("negativi")

# Keep only the feature vectors of the positive and the negative instances.
positive = spark.sql("select features from positivi")
negative = spark.sql("select features from negativi")

# Train a model.
# The Scala static method BrainScalaMRBP.fit is invoked from the SP-BRAIN library
# through the Py4J gateway; it receives the underlying JVM DataFrames (_jdf).
# NOTE(review): the meaning of the two boolean arguments is defined by the
# external jar and is not visible here - confirm against its documentation.
model = sc._jvm.brain.scala.BrainScalaMRBP.fit(positive._jdf , negative._jdf, False, True)

# Persist the trained model under the name "ipdata-model".
model.saveModel("ipdata-model")

# Apply the trained model to the test set. `model` is a JVM object, so it is
# given the underlying JVM DataFrame and returns a JVM DataFrame as well.
testClassified = model.transform(test._jdf)

# Wrap the returned JVM DataFrame into a PySpark DataFrame.
from pyspark.sql import SQLContext
from pyspark.sql import DataFrame
sqlContext = SQLContext(sc)

dfTestClassified = DataFrame(testClassified,sqlContext)

dfTestClassified.show()

# Compute accuracy: the fraction of rows whose predicted "class" equals "label".
# The total is counted once and guarded so that an empty result yields NaN
# instead of raising ZeroDivisionError.
total_rows = dfTestClassified.count()
correct_rows = dfTestClassified.filter("label=class").count()
accuracy = correct_rows / total_rows if total_rows else float("nan")
print ("Accuracy = ",accuracy)

Input loading Time: 2.294711 seconds ---
Training Tot instances: 2002
Test Tot instances: 1186
+-----+--------------------+-----+
|label|            features|class|
+-----+--------------------+-----+
|  1.0|(240,[2,6,10,13,1...|  1.0|
|  1.0|(240,[3,6,9,14,17...|  1.0|
|  1.0|(240,[0,6,8,12,17...|  0.0|
|  1.0|(240,[2,6,9,15,18...|  1.0|
|  1.0|(240,[1,7,10,13,1...|  1.0|
|  1.0|(240,[0,6,10,12,1...|  0.0|
|  1.0|(240,[2,5,11,13,1...|  1.0|
|  1.0|(240,[2,6,10,13,1...|  1.0|
|  1.0|(240,[1,6,9,15,17...|  1.0|
|  1.0|(240,[0,6,8,12,17...|  0.0|
|  1.0|(240,[2,7,9,15,18...|  1.0|
|  1.0|(240,[1,7,10,13,1...|  1.0|
|  1.0|(240,[2,4,8,12,16...|  1.0|
|  1.0|(240,[2,6,9,15,17...|  1.0|
|  1.0|(240,[2,6,10,13,1...|  1.0|
|  1.0|(240,[3,6,9,15,17...|  1.0|
|  1.0|(240,[0,6,8,12,17...|  0.0|
|  1.0|(240,[2,6,9,15,18...|  1.0|
|  1.0|(240,[1,7,10,13,1...|  0.0|
|  1.0|(240,[2,4,8,12,16...|  1.0|
+-----+--------------------+-----+
only showing top 20 rows

Accuracy =  0.954468802698145
