In [97]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.context import SparkConf
from pyspark.sql import Row
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import pyspark.sql.types as T 
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
from operator import add
from functools import reduce
from bio_spark.io.fasta_reader import FASTAReader, FASTAQReader
import collections
import numpy as np
import sys

from operator import add

### Data Input

In [2]:
sConf = SparkConf("spark://localhost:7077")
sc = SparkContext(conf=sConf)
spark = SparkSession(sc)

In [84]:
fasta_plain_df = sc.textFile("/home/thiago/Dados/sparkAAI-1/data/genomes/Prochlorococcus_sp_W2_genomic.fna")\
            .map(lambda x: Row(row=x))\
            .zipWithIndex()\
            .toDF(["row","idx"])

In [85]:
fasta_plain_df.show()

+--------------------+---+
|                 row|idx|
+--------------------+---+
|[>ALPB01000001.1 ...|  0|
|[GACACTCATCCAATTT...|  1|
|[AGAAAAAAATTTACTC...|  2|
|[GAACTGATATTGCTAA...|  3|
|[GCCAGATATGGAGAAG...|  4|
|[CATACCTATTATCGAG...|  5|
|[CAAATTTTATTTTGTC...|  6|
|[GCCGAACTAGATCCAA...|  7|
|[AGGAAAAATTGATAGA...|  8|
|[TGGGTTTTGAAATTAA...|  9|
|[TGGGTTGGTCCAACAC...| 10|
|[TGATCCTGTTGGAGAA...| 11|
|[TGAATCTGAAAGCCCT...| 12|
|[CGAAAATGCCATGTTA...| 13|
|[TATAGGTAAAATCGGA...| 14|
|[AAGCAGAAATAGTTGT...| 15|
|[GAAGTTAAATTTATTG...| 16|
|[>ALPB01000002.1 ...| 17|
|[CATTTCTTTAGGTATT...| 18|
|[AACTCAATCAATTTGA...| 19|
+--------------------+---+
only showing top 20 rows



In [162]:
def parse_fasta_id_line(l):
    if l[0][0] == ">":
        heaer_splits = l[0][1:].split(" ")[0]
        seq_id_split = heaer_splits.split(".")
        return seq_id_split[0]
    else:
        return None
seq2kmer_udf = udf(parse_fasta_id_line, T.StringType())

In [163]:
fasta_null_ids_df = fasta_plain_df.withColumn("seqID_wNull", seq2kmer_udf("row"))
fasta_null_ids_df.show()

+--------------------+---+------------+
|                 row|idx| seqID_wNull|
+--------------------+---+------------+
|[>ALPB01000001.1 ...|  0|ALPB01000001|
|[GACACTCATCCAATTT...|  1|        null|
|[AGAAAAAAATTTACTC...|  2|        null|
|[GAACTGATATTGCTAA...|  3|        null|
|[GCCAGATATGGAGAAG...|  4|        null|
|[CATACCTATTATCGAG...|  5|        null|
|[CAAATTTTATTTTGTC...|  6|        null|
|[GCCGAACTAGATCCAA...|  7|        null|
|[AGGAAAAATTGATAGA...|  8|        null|
|[TGGGTTTTGAAATTAA...|  9|        null|
|[TGGGTTGGTCCAACAC...| 10|        null|
|[TGATCCTGTTGGAGAA...| 11|        null|
|[TGAATCTGAAAGCCCT...| 12|        null|
|[CGAAAATGCCATGTTA...| 13|        null|
|[TATAGGTAAAATCGGA...| 14|        null|
|[AAGCAGAAATAGTTGT...| 15|        null|
|[GAAGTTAAATTTATTG...| 16|        null|
|[>ALPB01000002.1 ...| 17|ALPB01000002|
|[CATTTCTTTAGGTATT...| 18|        null|
|[AACTCAATCAATTTGA...| 19|        null|
+--------------------+---+------------+
only showing top 20 rows



In [164]:
fasta_n_filter_df = fasta_null_ids_df.withColumn(
    "seqID", F.last('seqID_wNull', ignorenulls=True)\
    .over(Window\
    .orderBy('idx')\
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)))

fasta_df = fasta_n_filter_df\
                .where(F.col("seqID_wNull").isNull())\
                .select("seqID","row")\
                .toDF("seqID","seq")

In [165]:
fasta_df.show()

+------------+--------------------+
|       seqID|                 seq|
+------------+--------------------+
|ALPB01000001|[GACACTCATCCAATTT...|
|ALPB01000001|[AGAAAAAAATTTACTC...|
|ALPB01000001|[GAACTGATATTGCTAA...|
|ALPB01000001|[GCCAGATATGGAGAAG...|
|ALPB01000001|[CATACCTATTATCGAG...|
|ALPB01000001|[CAAATTTTATTTTGTC...|
|ALPB01000001|[GCCGAACTAGATCCAA...|
|ALPB01000001|[AGGAAAAATTGATAGA...|
|ALPB01000001|[TGGGTTTTGAAATTAA...|
|ALPB01000001|[TGGGTTGGTCCAACAC...|
|ALPB01000001|[TGATCCTGTTGGAGAA...|
|ALPB01000001|[TGAATCTGAAAGCCCT...|
|ALPB01000001|[CGAAAATGCCATGTTA...|
|ALPB01000001|[TATAGGTAAAATCGGA...|
|ALPB01000001|[AAGCAGAAATAGTTGT...|
|ALPB01000001|[GAAGTTAAATTTATTG...|
|ALPB01000002|[CATTTCTTTAGGTATT...|
|ALPB01000002|[AACTCAATCAATTTGA...|
|ALPB01000002|[GAGATAAGTGAATTTG...|
|ALPB01000002|[TAGTTCTTATAAACCT...|
+------------+--------------------+
only showing top 20 rows



In [166]:
fasta_df.printSchema()

root
 |-- seqID: string (nullable = true)
 |-- seq: struct (nullable = true)
 |    |-- row: string (nullable = true)



### Calculate Kmers

In [173]:
Seq2kmerTy = T.ArrayType(T.StringType())
def seq2kmer(seq_):
    value = seq_[0].strip()    
    k = 3
    num_kmers = len(value) - k + 1
    kmers_list = [value[n*k:k*(n+1)] for n in range(0, num_kmers)]
    
    # return len(value)
    return kmers_list

seq2kmer_udf = udf(seq2kmer,Seq2kmerTy)

In [174]:
fasta_kmers_df = fasta_df\
        .withColumn("kmers", seq2kmer_udf("seq"))\

In [175]:
fasta_kmers_df.show()

+------------+--------------------+--------------------+
|       seqID|                 seq|               kmers|
+------------+--------------------+--------------------+
|ALPB01000001|[GACACTCATCCAATTT...|[GAC, ACT, CAT, C...|
|ALPB01000001|[AGAAAAAAATTTACTC...|[AGA, AAA, AAA, T...|
|ALPB01000001|[GAACTGATATTGCTAA...|[GAA, CTG, ATA, T...|
|ALPB01000001|[GCCAGATATGGAGAAG...|[GCC, AGA, TAT, G...|
|ALPB01000001|[CATACCTATTATCGAG...|[CAT, ACC, TAT, T...|
|ALPB01000001|[CAAATTTTATTTTGTC...|[CAA, ATT, TTA, T...|
|ALPB01000001|[GCCGAACTAGATCCAA...|[GCC, GAA, CTA, G...|
|ALPB01000001|[AGGAAAAATTGATAGA...|[AGG, AAA, AAT, T...|
|ALPB01000001|[TGGGTTTTGAAATTAA...|[TGG, GTT, TTG, A...|
|ALPB01000001|[TGGGTTGGTCCAACAC...|[TGG, GTT, GGT, C...|
|ALPB01000001|[TGATCCTGTTGGAGAA...|[TGA, TCC, TGT, T...|
|ALPB01000001|[TGAATCTGAAAGCCCT...|[TGA, ATC, TGA, A...|
|ALPB01000001|[CGAAAATGCCATGTTA...|[CGA, AAA, TGC, C...|
|ALPB01000001|[TATAGGTAAAATCGGA...|[TAT, AGG, TAA, A...|
|ALPB01000001|[AAGCAGAAATAGTTGT

### Obtém perfis para as squências

In [18]:
KmerFreqTuple = T.MapType(T.StringType(), T.IntegerType())

def kmers_list2kmers_freq_dict(kmers_list):
    unique, counts = np.unique(kmers_list[0], return_counts=True)
    kmers_map = {str(k):int(v) for k, v in zip(unique, counts) if k}
    return kmers_map

kmers_list2kmers_freq_dict_udf = udf(kmers_list2kmers_freq_dict)

In [176]:
kmers_pofile_df = fasta_kmers_df\
            .groupby("seqID")\
            .agg(F.collect_list('kmers').alias('kmers_list'))\
            .withColumn('kmers_freq', kmers_list2kmers_freq_dict_udf('kmers_list'))

In [177]:
kmers_pofile_df.printSchema()

root
 |-- seqID: string (nullable = true)
 |-- kmers_list: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- kmers_freq: string (nullable = true)



In [178]:
kmers_pofile_df.select("seqID", "kmers_freq").show()

+------------+--------------------+
|       seqID|          kmers_freq|
+------------+--------------------+
|ALPB01000035|{TTA=2, TT=1, ATT...|
|ALPB01000095|{ATT=1, AAA=1, CC...|
|ALPB01000016|{TTA=2, CCA=1, AC...|
|ALPB01000097|{TTA=1, GG=1, CCA...|
|ALPB01000087|{TTA=5, ATT=2, AA...|
|ALPB01000043|{TTA=1, TGT=2, GG...|
|ALPB01000044|{TTA=1, TT=1, TGT...|
|ALPB01000093|{TTA=3, TGT=1, AT...|
|ALPB01000036|{GGA=1, ATT=3, AA...|
|ALPB01000053|{TTA=1, AAA=1, TT...|
|ALPB01000070|{TTA=3, AA=1, ATT...|
|ALPB01000017|{GGA=2, ATT=2, AC...|
|ALPB01000001|{TTA=1, CCA=1, AT...|
|ALPB01000059|{TTA=2, AA=1, AAA...|
|ALPB01000018|{TTA=3, ATT=2, AA...|
|ALPB01000085|{AA=1, AAA=8, CCC...|
|ALPB01000037|{CCA=2, ATT=2, AC...|
|ALPB01000028|{TTA=2, ATT=1, AA...|
|ALPB01000022|{GGA=2, CC=1, ATT...|
|ALPB01000081|{TTA=2, GG=1, ATT...|
+------------+--------------------+
only showing top 20 rows



In [179]:
kmers_pofile_df.count()

108