In [1]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.context import SparkConf
from pyspark.sql import Row
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import pyspark.sql.types as T 
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
from operator import add
from functools import reduce
from bio_spark.io.fasta_reader import FASTAReade
import collections
import numpy as np

### Data Input

In [2]:
sConf = SparkConf("spark://localhost:7077")
sc = SparkContext(conf=sConf)
spark = SparkSession(sc)

plain_df = sc.textFile("/home/thiago/Dados/sparkAAI-1/data/SP1.fq").map(lambda x: Row(row=x)).toDF()

reader = FASTAReade(sc)
parsedDF = reader.read(plain_df)
parsedDF.coalesce(4)

DataFrame[seqID: struct<row:string>, seq: struct<row:string>, +: struct<row:string>, quality: struct<row:string>]

### Calculate Kmers

In [140]:
Seq2kmerTy = T.ArrayType(T.StringType())
def seq2kmer(seq_):
    value = seq_[0].strip()    
    k = 3
    num_kmers = len(value) - k + 1
    kmers_list = [value[n*k:k*(n+1)] for n in range(0, num_kmers)]
    
    # return len(value)
    return kmers_list

seq2kmer_udf = udf(seq2kmer,Seq2kmerTy)

In [141]:
kmers_of_seqs_df = parsedDF\
        .withColumn("kmers", seq2kmer_udf("seq"))\

In [142]:
kmers_of_seqs_df.select("seqID", "kmers").show()

+--------------------+--------------------+
|               seqID|               kmers|
+--------------------+--------------------+
|[@cluster_2:UMI_A...|[TTT, CCG, GGG, C...|
|[@cluster_8:UMI_C...|[TAT, CCT, TGC, A...|
|[@cluster_12:UMI_...|[GCA, GTT, TAA, G...|
|[@cluster_21:UMI_...|[GGC, ATT, GCA, A...|
|[@cluster_29:UMI_...|[CCC, CCT, TAA, A...|
|[@cluster_34:UMI_...|[TCT, TGC, AAA, A...|
|[@cluster_36:UMI_...|[TCC, CCC, CCC, C...|
|[@cluster_37:UMI_...|[GTC, TTT, GTA, C...|
|[@cluster_39:UMI_...|[CCT, TCC, ATC, A...|
|[@cluster_43:UMI_...|[GAG, TTA, TAA, T...|
|[@cluster_53:UMI_...|[GGA, AAA, TGA, A...|
|[@cluster_55:UMI_...|[AGA, CAA, AAG, G...|
|[@cluster_58:UMI_...|[AAA, GCA, ATC, C...|
|[@cluster_62:UMI_...|[GAG, TTG, CGA, C...|
|[@cluster_63:UMI_...|[GCA, ACC, ATA, C...|
|[@cluster_66:UMI_...|[CCA, ACT, AAC, T...|
|[@cluster_70:UMI_...|[CTT, TTT, CTT, C...|
|[@cluster_71:UMI_...|[CAG, CTT, TGC, A...|
|[@cluster_82:UMI_...|[GCT, TAT, GTT, T...|
|[@cluster_83:UMI_...|[CTC, CCA,

### UDFs

In [143]:
n = np.array([["1",'2'], ["3","4"]])

In [144]:
unique, counts = np.unique(n.flatten(), return_counts=True)
kmers_list = list(zip(unique.tolist(), counts.tolist()))

In [145]:
u,c = np.unique([["1", "1", "2"]], return_counts=True)

In [146]:
{k:v for k, v in zip(u, c)}

{'1': 2, '2': 1}

In [169]:
if not "":
    print("ok")

ok


In [175]:
KmerFreqTuple = T.MapType(T.StringType(), T.IntegerType())

def kmers_list2kmers_freq_dict(kmers_list):
    unique, counts = np.unique(kmers_list[0], return_counts=True)
    kmers_map = {str(k):int(v) for k, v in zip(unique, counts) if k}
    return kmers_map
#     return len(unique)

# kmers_list2kmers_freq_dict_udf = udf(kmers_list2kmers_freq_dict,KmerFreqTuple)
kmers_list2kmers_freq_dict_udf = udf(kmers_list2kmers_freq_dict)

In [176]:
agg_by_seq_df = kmers_of_seqs_df\
            .groupby("seqID")\
            .agg(F.collect_list('kmers').alias('kmers_list'))\

In [177]:
kmers_pofile_df = kmers_of_seqs_df\
            .groupby("seqID")\
            .agg(F.collect_list('kmers').alias('kmers_list'))\
            .withColumn('kmers_freq', kmers_list2kmers_freq_dict_udf('kmers_list'))

In [178]:
kmers_pofile_df.printSchema()

root
 |-- seqID: struct (nullable = true)
 |    |-- row: string (nullable = true)
 |-- kmers_list: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- kmers_freq: string (nullable = true)



In [179]:
kmers_pofile_df.select("seqID", "kmers_freq").show()

+--------------------+--------------------+
|               seqID|          kmers_freq|
+--------------------+--------------------+
|[@cluster_159:UMI...|{TTA=1, TGC=1, AA...|
|[@cluster_323:UMI...|{GGA=1, ATC=1, AA...|
|[@cluster_63:UMI_...|{ACC=1, AAA=1, CC...|
|[@cluster_626:UMI...|{GGA=1, ATC=1, AA...|
|[@cluster_389:UMI...|{CCA=2, CGG=1, CC...|
|[@cluster_475:UMI...|{CCA=1, AAA=1, CG...|
|[@cluster_823:UMI...|{TTA=2, CCA=2, AA...|
|[@cluster_131:UMI...|{ATT=1, T=1, TTT=...|
|[@cluster_36:UMI_...|{GGA=1, ATC=1, AA...|
|[@cluster_373:UMI...|{CTA=1, ATT=1, AA...|
|[@cluster_545:UMI...|{TTA=1, CTT=1, CA...|
|[@cluster_126:UMI...|{GGA=1, ATC=1, AA...|
|[@cluster_144:UMI...|{CCA=2, AAA=1, CG...|
|[@cluster_376:UMI...|{TTA=2, ATT=1, AA...|
|[@cluster_491:UMI...|{TGT=1, CCA=1, GG...|
|[@cluster_566:UMI...|{GGA=1, AAA=1, C=...|
|[@cluster_572:UMI...|{GGA=1, ATC=1, AC...|
|[@cluster_298:UMI...|{TTA=1, GGA=1, CC...|
|[@cluster_408:UMI...|{TTA=1, CCA=1, CT...|
|[@cluster_537:UMI...|{TTA=2, AC

In [163]:
agg_by_seq_df.count()

250