# Load Package

In [1]:
# Mount Google Drive into the Colab filesystem; the project files are read from under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

import os
# Project root on the mounted Drive; later cells build their data paths from it.
cur_path = "/content/drive/MyDrive/Bigdataproject/"
# Work from the project root so relative output paths land inside the project folder.
os.chdir(cur_path)
# Print the working directory to confirm the change took effect.
!pwd

/content/drive/MyDrive/Bigdataproject


In [3]:
!pip install pyspark
!pip install pyspark[sql]
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=2cd340c3ba8386f00ca719d94b2f624980759b73fdaddbd7d844a8b9ff5ab7c2
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [4]:
import pyspark as spark
import time
from operator import add
from pyspark.sql import SparkSession

# Step 0: Read Data

## Set spark session

In [5]:
# Create the Spark session.
# Resource limits are configured here because this getOrCreate() is what starts
# the underlying SparkContext; a SparkConf handed to a later
# SparkContext.getOrCreate() is ignored once a context is already running.
ss = (SparkSession
  .builder
  .master("local[5]")
  .appName("fqproject")
  .config("spark.executor.memory", "4G")
  .config("spark.driver.memory", "4G")
  .config("spark.driver.maxResultSize", "4G")
  .getOrCreate())


## Set spark context

In [6]:
# Set spark context (the low-level RDD entry point used by the cells below).
# NOTE(review): SparkContext.getOrCreate returns the already-active context when
# one exists, and the SparkSession cell above starts one. This conf (master,
# memory limits) is therefore presumably not applied — confirm with
# sc.getConf().getAll().
sc = (spark
  .SparkContext
  .getOrCreate(spark
    .SparkConf()
    .setAppName("fqproject")
    .setMaster('local[*]')
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '4G')
    .set('spark.driver.maxResultSize', '4G')))

## Read reads.fq

In [7]:
# Read reads.fq and keep only the second line of every group of four lines,
# i.e. the lines whose 0-based index is 1 modulo 4.
reads = (sc
  .textFile(cur_path + "data/reads.fq")
  .zipWithIndex()
  .filter(lambda line_and_pos: line_and_pos[1] % 4 == 1)
  .map(lambda line_and_pos: line_and_pos[0]))

In [8]:
# Peek at the first ten extracted lines to check the filter picked the sequence lines.
reads.take(10)

['TCCTTACTGGTTTTGCAGGTAACTTATAGAGTATTTCCACTTCCCTTCTCCTATCCCTTGAAAAATTGTCATTTATTTCTCTTATCCATATGGCATAATC',
 'GGTTTTTCAGGTAACTTATAGAGTATTTCCACTTCCCTTCTCCTATCCCTGGAAAAATTGTCATTTATTTCTCTTATCCATATGGCATAATCAAAGAATA',
 'AGGTAACTTATAGAGTACTTCCACTTCCCTTCTCCTATCCCTTGAAAAATTGTCATTGATTTCTCTTATCCATATGGCATAATCAAAGAATAAATTGGTG',
 'CACTTCCCTTCTACTATCCCTTGAAAAATTGTCATTTATTTCTCTTATCCATATGGCATAATCAAAGAATAAATTGGTGATATTTGTTCAAAAATCCATG',
 'CCTTCTCCTATCCCTTGAAAAATTGTCATTTATTTCCCTTATCCATATGGCATAATCAAAGAATAAATTGGTGATATTTGTTCAAAAATCCATGCCTTTT',
 'ATCCCTTGAAAAATTGTCATTTATTTCTCTTATCCATATGGCATAATCAAAGAATAAATTGGTGATATTTGTTCAAAAATCCGTGCCTATTAGATTCATT',
 'TCTTATCCATATGGCATAATCAAAGAATAAATTGGTGATATTTGTTCAAAAATCCATGCCTATTAGATTCATTTAGAATATAAAAAAATTTTATTTTATT',
 'TATCCATATGGCATAATCAAAGAATCAATTGTTGATATTTGTTCAAAAATCCATGCCTATTAGATTCATTTAGAATATAAAAAAATTTTATTTTATTTTC',
 'TCAAAGAATAAATTGGTGATATTTGTTCAAAAATCCATGCCTATTAGATTCATTTAGATTATAAAAAAATTTTATTTTATTTTCACTTATTTCTTCTCCA',
 'TGAATAAATTGGTGATATTTGTTCAAAAATCCATGCCTACTAGATTCATTTAG

## Read reference.fa

In [9]:
# Read the reference bins into a DataFrame. The file is parsed as CSV: the first
# row supplies the column names and column types are inferred from the data.
reference = (ss
  .read
  .options(header=True, inferSchema=True)
  .csv(cur_path + "data/reference_chr21_20000000_20050000.fa"))

In [10]:
# Preview the first ten reference rows (columns: start, end, sequence).
reference.show(10)

+--------+--------+--------------------+
|   start|     end|            sequence|
+--------+--------+--------------------+
|20000000|20000100|CCCTTCTCCTATCCCTT...|
|20000100|20000200|TAGATTCATTTAGAATA...|
|20000200|20000300|TTCTTCTTCCTGAAGAA...|
|20000300|20000400|ACTTTTCAAGGATAGTT...|
|20000400|20000500|CTGACAGGACTTCTGCC...|
|20000500|20000600|TTTCCTTTTTTTTTTCT...|
|20000600|20000700|ATTATAAAAAGGGAGGG...|
|20000700|20000800|TTCTTTTCTTTTTCTCC...|
|20000800|20000900|ATAAATTTCTGCTTGAA...|
|20000900|20001000|TTCGTTAGTGTTTTTTA...|
+--------+--------+--------------------+
only showing top 10 rows



# Step 1: Cut reads into k-mers (k = 15)

## Define K-mers

In [11]:
# input a string s and cut k-mers
def extract_kmers(s: str, k: int = 15) -> list:
    """Return every length-``k`` substring of ``s``, in left-to-right order.

    Args:
        s: The string to slice.
        k: Window length (defaults to 15).

    Returns:
        A list of ``len(s) - k + 1`` overlapping substrings; an empty list when
        ``s`` is shorter than ``k``.
    """
    windows = []
    last_start = len(s) - k
    for start in range(last_start + 1):
        windows.append(s[start:start + k])
    return windows

## Map K-mers to reads

In [12]:
# K-mers of reads: each read becomes the list of its k-mers (default k = 15).
reads_kmers = reads.map(extract_kmers)

In [13]:
# Inspect the k-mer lists of the first two reads.
reads_kmers.take(2)

[['TCCTTACTGGTTTTG',
  'CCTTACTGGTTTTGC',
  'CTTACTGGTTTTGCA',
  'TTACTGGTTTTGCAG',
  'TACTGGTTTTGCAGG',
  'ACTGGTTTTGCAGGT',
  'CTGGTTTTGCAGGTA',
  'TGGTTTTGCAGGTAA',
  'GGTTTTGCAGGTAAC',
  'GTTTTGCAGGTAACT',
  'TTTTGCAGGTAACTT',
  'TTTGCAGGTAACTTA',
  'TTGCAGGTAACTTAT',
  'TGCAGGTAACTTATA',
  'GCAGGTAACTTATAG',
  'CAGGTAACTTATAGA',
  'AGGTAACTTATAGAG',
  'GGTAACTTATAGAGT',
  'GTAACTTATAGAGTA',
  'TAACTTATAGAGTAT',
  'AACTTATAGAGTATT',
  'ACTTATAGAGTATTT',
  'CTTATAGAGTATTTC',
  'TTATAGAGTATTTCC',
  'TATAGAGTATTTCCA',
  'ATAGAGTATTTCCAC',
  'TAGAGTATTTCCACT',
  'AGAGTATTTCCACTT',
  'GAGTATTTCCACTTC',
  'AGTATTTCCACTTCC',
  'GTATTTCCACTTCCC',
  'TATTTCCACTTCCCT',
  'ATTTCCACTTCCCTT',
  'TTTCCACTTCCCTTC',
  'TTCCACTTCCCTTCT',
  'TCCACTTCCCTTCTC',
  'CCACTTCCCTTCTCC',
  'CACTTCCCTTCTCCT',
  'ACTTCCCTTCTCCTA',
  'CTTCCCTTCTCCTAT',
  'TTCCCTTCTCCTATC',
  'TCCCTTCTCCTATCC',
  'CCCTTCTCCTATCCC',
  'CCTTCTCCTATCCCT',
  'CTTCTCCTATCCCTT',
  'TTCTCCTATCCCTTG',
  'TCTCCTATCCCTTGA',
  'CTCCTATCCC

# Step 2: Cut reference bins into k-mers (k = 15)

In [14]:
# K-mers of reference: each bin's sequence becomes the list of its k-mers.
reference_kmers = (reference
  .select('sequence')
  .rdd
  .map(lambda row: extract_kmers(row['sequence'])))

In [15]:
# Inspect the k-mer lists of the first two reference bins.
reference_kmers.take(2)

[['CCCTTCTCCTATCCC',
  'CCTTCTCCTATCCCT',
  'CTTCTCCTATCCCTT',
  'TTCTCCTATCCCTTG',
  'TCTCCTATCCCTTGA',
  'CTCCTATCCCTTGAA',
  'TCCTATCCCTTGAAA',
  'CCTATCCCTTGAAAA',
  'CTATCCCTTGAAAAA',
  'TATCCCTTGAAAAAT',
  'ATCCCTTGAAAAATT',
  'TCCCTTGAAAAATTG',
  'CCCTTGAAAAATTGT',
  'CCTTGAAAAATTGTC',
  'CTTGAAAAATTGTCA',
  'TTGAAAAATTGTCAT',
  'TGAAAAATTGTCATT',
  'GAAAAATTGTCATTT',
  'AAAAATTGTCATTTA',
  'AAAATTGTCATTTAT',
  'AAATTGTCATTTATT',
  'AATTGTCATTTATTT',
  'ATTGTCATTTATTTC',
  'TTGTCATTTATTTCT',
  'TGTCATTTATTTCTC',
  'GTCATTTATTTCTCT',
  'TCATTTATTTCTCTT',
  'CATTTATTTCTCTTA',
  'ATTTATTTCTCTTAT',
  'TTTATTTCTCTTATC',
  'TTATTTCTCTTATCC',
  'TATTTCTCTTATCCA',
  'ATTTCTCTTATCCAT',
  'TTTCTCTTATCCATA',
  'TTCTCTTATCCATAT',
  'TCTCTTATCCATATG',
  'CTCTTATCCATATGG',
  'TCTTATCCATATGGC',
  'CTTATCCATATGGCA',
  'TTATCCATATGGCAT',
  'TATCCATATGGCATA',
  'ATCCATATGGCATAA',
  'TCCATATGGCATAAT',
  'CCATATGGCATAATC',
  'CATATGGCATAATCA',
  'ATATGGCATAATCAA',
  'TATGGCATAATCAAA',
  'ATGGCATAAT

# Step 3: Collect all kmers and build a distinct kmer set

• Hint: you can use python “set” function

• Report the number of distinct kmers (N)

In [16]:
# Pool the k-mer lists of reads and reference, flatten them into single k-mers,
# de-duplicate them in Spark, and collect the result into a Python set.
all_kmer_lists = reads_kmers.union(reference_kmers)
distinct_kmers = (all_kmer_lists
  .flatMap(lambda kmer_list: kmer_list)
  .distinct())
kmers = set(distinct_kmers.collect())

In [17]:
# N: the number of distinct k-mers across reads and reference.
len(kmers)

72530

**The number of distinct kmers is 72,530**


In [18]:
# Shows only the RDD object's repr, not its contents (use reads.take(n) to see data).
reads

PythonRDD[33] at RDD at PythonRDD.scala:53

# Step 4: Build k-mer presence features across reads

In [19]:
from pyspark.ml.linalg import Vectors
import pandas as pd

In [20]:
# Collect every read into a Python list on the driver; check_kmer_features
# scans this list once per k-mer.
r_list = reads.collect()

In [21]:
# check feature vector of every kmer in each read
def check_kmer_features(kmer):
  """Build the per-read presence vector of one k-mer.

  Entry ``i`` of the vector is 1.0 when ``kmer`` occurs as a substring of
  ``r_list[i]`` and 0.0 otherwise.

  Returns:
      A ``(kmer, dense_vector)`` tuple; the vector has ``len(r_list)`` entries.
  """
  hit_positions = [pos for pos, read in enumerate(r_list) if kmer in read]
  ones = [1.0] * len(hit_positions)
  presence = Vectors.sparse(len(r_list), hit_positions, ones)
  return (kmer, Vectors.dense(presence))

In [22]:
# Distribute the distinct k-mers as an RDD so per-k-mer features can be computed with map().
kmers_rdd = sc.parallelize(list(kmers))

In [23]:
# One (kmer, presence-vector-over-reads) pair per distinct k-mer.
reads_features = kmers_rdd.map(check_kmer_features)

In [24]:
# Turn the (kmer, vector) pairs into a two-column DataFrame.
reads_features_df = reads_features.toDF(['kmer','read_features'])

In [27]:
# Preview one row of the k-mer feature DataFrame. The read_features column holds
# a dense vector (check_kmer_features converts the sparse vector to dense).
reads_features_df.show(1)

+---------------+--------------------+
|           kmer|       read_features|
+---------------+--------------------+
|CAACCTTTTTCATTC|[0.0,0.0,0.0,0.0,...|
+---------------+--------------------+
only showing top 1 row



In [28]:
from pyspark.sql.functions import col
from pyspark.ml.functions import vector_to_array

In [29]:
# Expand the dense per-read vector into one column per read.
# The column count comes from r_list, which is the length check_kmer_features
# gives every vector, rather than a hard-coded 2000. A different number of reads
# then no longer truncates the table or produces null columns.
reads_features_df = (reads_features_df
  .withColumn("read", vector_to_array("read_features"))
  .select(["kmer"] + [col("read")[i] for i in range(len(r_list))]))

In [47]:
# old code
def check_kmer_features_old(read):
  """Build the k-mer presence vector of one sequence (earlier approach).

  Position ``i`` refers to the ``i``-th element in the iteration order of the
  global ``kmers`` set; the entry is 1.0 when that k-mer is a substring of
  ``read``.

  Returns:
      A ``(read, sparse_vector)`` tuple; the vector has ``len(kmers)`` entries.
  """
  present = [pos for pos, kmer in enumerate(kmers) if kmer in read]
  ones = [1.0] * len(present)
  return (read, Vectors.sparse(len(kmers), present, ones))


In [48]:
# Earlier approach: one (read, sparse k-mer presence vector) pair per read.
# Note this rebinds reads_features, which previously held per-k-mer vectors.
reads_features = reads.map(check_kmer_features_old)

In [49]:
# Inspect the sparse k-mer presence vectors of the first two reads.
reads_features.take(2)

[('TCCTTACTGGTTTTGCAGGTAACTTATAGAGTATTTCCACTTCCCTTCTCCTATCCCTTGAAAAATTGTCATTTATTTCTCTTATCCATATGGCATAATC',
  SparseVector(72530, {173: 1.0, 849: 1.0, 1689: 1.0, 2455: 1.0, 4004: 1.0, 4616: 1.0, 6086: 1.0, 6582: 1.0, 7637: 1.0, 7933: 1.0, 9005: 1.0, 11640: 1.0, 13194: 1.0, 16389: 1.0, 17650: 1.0, 18185: 1.0, 18284: 1.0, 19258: 1.0, 19561: 1.0, 20329: 1.0, 20407: 1.0, 21799: 1.0, 22096: 1.0, 23407: 1.0, 23745: 1.0, 23949: 1.0, 23971: 1.0, 24731: 1.0, 25536: 1.0, 25951: 1.0, 26534: 1.0, 26772: 1.0, 27988: 1.0, 28205: 1.0, 30102: 1.0, 31467: 1.0, 34055: 1.0, 34662: 1.0, 35092: 1.0, 36083: 1.0, 37849: 1.0, 38450: 1.0, 38455: 1.0, 40039: 1.0, 40467: 1.0, 40756: 1.0, 41109: 1.0, 41682: 1.0, 42486: 1.0, 42558: 1.0, 42904: 1.0, 43058: 1.0, 43480: 1.0, 44205: 1.0, 45089: 1.0, 46236: 1.0, 50527: 1.0, 51927: 1.0, 53792: 1.0, 54043: 1.0, 54196: 1.0, 54915: 1.0, 55897: 1.0, 56225: 1.0, 58227: 1.0, 59433: 1.0, 59625: 1.0, 61053: 1.0, 61058: 1.0, 61214: 1.0, 61334: 1.0, 61466: 1.0, 61883: 1.0, 61971: 1

# Step 5: Build k-mer presence features across reference bins

In [34]:
# RDD holding just the sequence string of each reference row.
reference_rdd = reference.select("sequence").rdd.map(lambda x: x['sequence'])

In [35]:
# Collect the reference sequences into a Python list on the driver
# (check_kmer_features_sequence scans it), then show the first one.
reference_list = reference_rdd.collect()
reference_list[0]

'CCCTTCTCCTATCCCTTGAAAAATTGTCATTTATTTCTCTTATCCATATGGCATAATCAAAGAATAAATTGGTGATATTTGTTCAAAAATCCATGCCTAT'

In [36]:
# Generate the column names: one "bin<start>_<end>" label per reference row.
start_rdd = reference.select("start").rdd.map(lambda row: row['start'])
end_rdd = reference.select("end").rdd.map(lambda row: row['end'])
s_list = start_rdd.collect()
e_list = end_rdd.collect()
reference_name = [f"bin{lo}_{hi}" for lo, hi in zip(s_list, e_list)]
reference_name[0]

'bin20000000_20000100'

In [37]:
# check feature vector of every kmer in each reference
def check_kmer_features_sequence(kmer):
  """Build the per-reference-bin presence vector of one k-mer.

  Entry ``i`` of the vector is 1.0 when ``kmer`` occurs as a substring of
  ``reference_list[i]`` and 0.0 otherwise.

  Returns:
      A ``(kmer, dense_vector)`` tuple; the vector has ``len(reference_list)``
      entries.
  """
  hit_positions = [pos for pos, seq in enumerate(reference_list) if kmer in seq]
  ones = [1.0] * len(hit_positions)
  presence = Vectors.sparse(len(reference_list), hit_positions, ones)
  return (kmer, Vectors.dense(presence))

In [38]:
# One (kmer, presence-vector-over-reference-bins) pair per distinct k-mer.
reference_features = kmers_rdd.map(check_kmer_features_sequence)

In [39]:
# Turn the (kmer, vector) pairs into a two-column DataFrame.
reference_features_df = reference_features.toDF(['kmer','reference_features'])

In [40]:
# Preview one row of the per-k-mer reference feature DataFrame.
reference_features_df.show(1) 

+---------------+--------------------+
|           kmer|  reference_features|
+---------------+--------------------+
|CAACCTTTTTCATTC|[0.0,0.0,0.0,0.0,...|
+---------------+--------------------+
only showing top 1 row



In [41]:
# Expand the dense vector into separate columns, one per entry of reference_name.
# This rebinds reference_features_df and drops the vector column, so the cell
# cannot simply be re-run without rebuilding the DataFrame first.
reference_features_df = reference_features_df.withColumn("xs", vector_to_array("reference_features")).select(["kmer"] + [col("xs")[i] for i in range(len(reference_name))])

In [42]:
# Label the columns: 'kmer' for the key column, then one bin name per feature column.
# The guard keeps the cell idempotent: re-running it must not prepend a second
# 'kmer' label, which would make the name list longer than the column list.
if reference_name[:1] != ['kmer']:
    reference_name = ['kmer'] + reference_name
# toDF renames all columns positionally in one call. It also avoids looking the
# generated "xs[i]" column names up by string.
reference_features_df = reference_features_df.toDF(*reference_name)

In [50]:
# old code: one (sequence, sparse k-mer presence vector) pair per reference bin.
# Note this rebinds reference_features, which previously held per-k-mer vectors.
reference_features = reference_rdd.map(check_kmer_features_old)
reference_features.take(1)

[('CCCTTCTCCTATCCCTTGAAAAATTGTCATTTATTTCTCTTATCCATATGGCATAATCAAAGAATAAATTGGTGATATTTGTTCAAAAATCCATGCCTAT',
  SparseVector(72530, {173: 1.0, 849: 1.0, 2582: 1.0, 5003: 1.0, 6086: 1.0, 7344: 1.0, 7637: 1.0, 8855: 1.0, 9005: 1.0, 9272: 1.0, 11640: 1.0, 13478: 1.0, 13828: 1.0, 14095: 1.0, 15251: 1.0, 15326: 1.0, 15836: 1.0, 16315: 1.0, 16667: 1.0, 17220: 1.0, 17650: 1.0, 18185: 1.0, 19258: 1.0, 19561: 1.0, 20329: 1.0, 20407: 1.0, 21799: 1.0, 22096: 1.0, 23022: 1.0, 25536: 1.0, 25631: 1.0, 26031: 1.0, 26534: 1.0, 27988: 1.0, 33455: 1.0, 33762: 1.0, 34055: 1.0, 34652: 1.0, 36121: 1.0, 38455: 1.0, 38721: 1.0, 40467: 1.0, 40756: 1.0, 41109: 1.0, 41682: 1.0, 42036: 1.0, 42486: 1.0, 42558: 1.0, 43058: 1.0, 43784: 1.0, 44205: 1.0, 45089: 1.0, 46236: 1.0, 46870: 1.0, 47314: 1.0, 48111: 1.0, 49405: 1.0, 49872: 1.0, 50165: 1.0, 51927: 1.0, 53091: 1.0, 53758: 1.0, 53792: 1.0, 54043: 1.0, 54093: 1.0, 57833: 1.0, 58227: 1.0, 58706: 1.0, 59433: 1.0, 61053: 1.0, 61214: 1.0, 61971: 1.0, 62465: 1.0, 62579: 


# Step 6

In [None]:
def check_kmer_features_new(read,k):
    """Build the k-mer presence vector of ``read`` for an explicit k-mer ordering.

    Args:
        read: The sequence string to test.
        k: Ordered list of k-mers; position ``i`` of the vector refers to ``k[i]``.

    Returns:
        A ``(read, sparse_vector)`` tuple. The vector has ``len(k)`` entries and
        holds 1.0 at every position whose k-mer is a substring of ``read``.
    """
    hits = [pos for pos, kmer in enumerate(k) if kmer in read]
    return (read, Vectors.sparse(len(k), hits, [1.0] * len(hits)))

In [56]:
def _first_one_index(vec):
    """Return the position of the first entry equal to 1.0 in ``vec``.

    Raises:
        ValueError: if no entry equals 1.0 (same error the list-based lookup raised).
    """
    indices = getattr(vec, "indices", None)
    if indices is None:
        # Not a sparse vector: scan the dense representation as before.
        return vec.toArray().tolist().index(1.0)
    # A sparse vector already lists its stored positions (kept in ascending
    # order by pyspark's SparseVector), so there is no need to materialise
    # the whole vector as a Python list just to find one position.
    for idx, val in zip(indices, vec.values):
        if val == 1.0:
            return int(idx)
    raise ValueError("1.0 is not in list")


def extract_sparsevector_index(rdd):
    """Map each ``(key, vector)`` pair to ``(key, index of its first 1.0 entry)``.

    Args:
        rdd: An RDD of ``(key, vector)`` pairs, as produced by
            ``check_kmer_features_new``.

    Returns:
        An RDD of ``(key, int)`` pairs. Evaluating it fails with ``ValueError``
        for any vector that contains no 1.0 entry.
    """
    return rdd.map(lambda x: (x[0], _first_one_index(x[1])))

In [None]:
def permutaion(seed=None):
    """Shuffle the k-mer order once and locate each sequence's first present k-mer.

    Args:
        seed: Optional seed for this shuffle. When ``None`` (the default) the
            global ``random`` state is used, so existing callers are unaffected
            and ``random.seed(...)`` still controls the result.

    Returns:
        A pair ``(reads_rdd, reference_rdd)``. Each RDD holds
        ``(sequence, index)`` pairs, where ``index`` is the position in the
        shuffled k-mer list of the first k-mer contained in that sequence.
    """
    import random

    # Start from a sorted list: set iteration order can differ between
    # interpreter sessions, which would make even a seeded shuffle unrepeatable.
    kmers_list = sorted(kmers)
    if seed is None:
        random.shuffle(kmers_list)
    else:
        random.Random(seed).shuffle(kmers_list)

    # new ref features
    reference_features = reference.select("sequence").rdd.map(lambda x:check_kmer_features_new(x['sequence'],kmers_list))
    # new read features
    reads_features = reads.map(lambda x: check_kmer_features_new(x,kmers_list))

    # pick out the first row number of 1.0 for every read / reference bin
    return extract_sparsevector_index(reads_features), extract_sparsevector_index(reference_features)

In [None]:
# Run a single permutation as a smoke test; both results are lazy RDDs until an action runs.
sample_permu_read, sample_permu_ref = permutaion()

In [None]:
# Inspect the (read, first-k-mer index) pairs of the first four reads.
sample_permu_read.take(4)

In [None]:
# Inspect the (sequence, first-k-mer index) pairs of the first four reference bins.
sample_permu_ref.take(4)

In [None]:
def signature_matrix(n):
    """Build signature tables from ``n`` independent k-mer permutations.

    Each permutation contributes one row: for every read (and every reference
    bin) the index of the first k-mer, in shuffled order, that the sequence
    contains.

    Args:
        n: Number of permutations, i.e. rows per table.

    Returns:
        A pair ``(read_signatures, reference_signatures)`` of pandas DataFrames
        with ``n`` rows each. Columns are labelled with the read / reference
        sequences themselves.
    """
    def _labels_and_row(pairs_rdd):
        # A single collect() per RDD: labels and indices come back together,
        # so the expensive featurization is not recomputed for each of them.
        pairs = pairs_rdd.collect()
        return [pair[0] for pair in pairs], [pair[1] for pair in pairs]

    read_labels = ref_labels = None
    read_rows, ref_rows = [], []

    for _ in range(n):
        one_permu_read, one_permu_ref = permutaion()

        labels, row = _labels_and_row(one_permu_read)
        if read_labels is None:
            read_labels = labels
        read_rows.append(row)

        labels, row = _labels_and_row(one_permu_ref)
        if ref_labels is None:
            ref_labels = labels
        ref_rows.append(row)

    if read_labels is None:
        # No permutation was run (n <= 0): the labels are still needed so the
        # empty tables carry the expected columns.
        sample_permu_read, sample_permu_ref = permutaion()
        read_labels = sample_permu_read.map(lambda x: x[0]).collect()
        ref_labels = sample_permu_ref.map(lambda x: x[0]).collect()

    # Build each frame once from the accumulated rows. DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row copies it every time.
    read_sig_temp = pd.DataFrame(read_rows, columns=read_labels)
    ref_sig_temp = pd.DataFrame(ref_rows, columns=ref_labels)
    return read_sig_temp, ref_sig_temp

In [None]:
# First batch: signatures from 500 permutations (each permutation runs Spark jobs, so this is slow).
read_sig_temp, ref_sig_temp = signature_matrix(500)

In [None]:
# Persist the first batch of read signatures (relative path, so it lands in the working directory).
read_sig_temp.to_csv('read_sig_temp_1.csv', index=False)

In [None]:
# Persist the first batch of reference signatures.
ref_sig_temp.to_csv('ref_sig_temp_1.csv', index=False)

In [None]:
# Second batch: another 500 permutations, kept in separate variables from the first batch.
read_sig_temp_2, ref_sig_temp_2 = signature_matrix(500)

In [None]:
# Persist the second batch of read signatures.
read_sig_temp_2.to_csv('read_sig_temp_2.csv', index=False)

In [None]:
# Persist the second batch of reference signatures.
ref_sig_temp_2.to_csv('ref_sig_temp_2.csv', index=False)