# Clean data analysis

## 1. Imports

In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
import seaborn as sns

In [2]:
# Directory prefixes used when building input and output file paths.
# Both currently point to the same folder; keep the trailing '/' because
# file names are appended to these prefixes by plain string formatting.
input_path = '../../../projects/ONT/data/pcr2persons/output/'
output_path = '../../../projects/ONT/data/pcr2persons/output/'

In [3]:
# Full path of the tab-separated table analysed in this notebook
data = input_path + 'clean_00_67.tsv'

## 2. Functions

In [4]:
def modes(s, num_of_modes=20):
    """Return the most frequent values of ``s`` together with their counts.

    Parameters
    ----------
    s : pandas.Series
        Values to tally.
    num_of_modes : int, default 20
        Maximum number of distinct values to keep.

    Returns
    -------
    pandas.Series
        Counts indexed by value, limited to the ``num_of_modes`` largest.
    """
    frequencies = s.value_counts()
    top_frequencies = frequencies.nlargest(num_of_modes)
    return top_frequencies

In [5]:
def consensus_string(strings):
    """Return the per-position majority-vote consensus of equal-length strings.

    For every position, the character that occurs most often across the
    inputs is chosen. On a tie, the tied character that is seen first (in
    input order) wins.

    No alignment is performed: position ``i`` of every string is compared
    directly with position ``i`` of all the others.

    Parameters
    ----------
    strings : iterable of str
        Any iterable (list, tuple, generator, pandas Series, ...) of strings
        that all have the same length.

    Returns
    -------
    str
        Consensus string with the same length as each input.

    Raises
    ------
    ValueError
        If no strings are given or their lengths differ.
    """
    # Materialise once so generators and label-indexed containers work too.
    strings = list(strings)
    if not strings or any(len(s) != len(strings[0]) for s in strings):
        raise ValueError("Input strings must be non-empty and of equal length")

    # zip(*strings) yields one tuple of characters per position. Counter keeps
    # first-seen order and max() returns the first maximal key, so ties resolve
    # to the earliest-seen character.
    return ''.join(
        max(counts, key=counts.get)
        for counts in map(Counter, zip(*strings))
    )

## 3. Load the data

In [6]:
# Load the tab-separated table (read_table uses '\t' as its default separator)
df = pd.read_table(data)

In [7]:
# Dimensions of the loaded table as (rows, columns)
df.shape

(64504, 14)

## 4. Overview

In [8]:
# Column names of the loaded table
df.columns

Index(['id', 'direction', 'seq', 'opt', 'qual', 'read_length', 'start', 'end',
       'prefix_flank', 'suffix_flank', 'ins', 'ins_len', 'prefix_flank_len',
       'suffix_flank_len'],
      dtype='object')

In [9]:
# Summary statistics of the 'ins_len' column
df['ins_len'].describe()

count    64504.000000
mean        70.740853
std         79.791442
min          2.000000
25%         15.000000
50%         47.000000
75%        106.000000
max       3367.000000
Name: ins_len, dtype: float64

In [10]:
# Sanity check: prefix flank + insert + suffix flank lengths should add up to
# the read length; display every row where they do not.
expected_read_length = df['prefix_flank_len'] + df['ins_len'] + df['suffix_flank_len']
cond = df['read_length'].ne(expected_read_length)
df[cond]

Unnamed: 0,id,direction,seq,opt,qual,read_length,start,end,prefix_flank,suffix_flank,ins,ins_len,prefix_flank_len,suffix_flank_len


## 5. Strand specific analysis

In [11]:
# Keep only the reads whose 'direction' column matches the chosen value
direction = 'rev'
cond = df['direction'].eq(direction)
dfc = df.loc[cond]

In [12]:
# How often each prefix-flank length occurs among the reads selected into dfc
dfc['prefix_flank_len'].value_counts()

prefix_flank_len
346     2548
347     2548
348     2516
345     2496
349     2355
        ... 
121        1
845        1
1202       1
1190       1
796        1
Name: count, Length: 754, dtype: int64

In [13]:
# Consensus of the prefix flanks that share the most frequent prefix-flank
# length (consensus_string needs inputs of equal length).
prefix_len_counts = dfc['prefix_flank_len'].value_counts()
cond = dfc['prefix_flank_len'] == prefix_len_counts.index[0]
inss = dfc.loc[cond, 'prefix_flank'].tolist()
consensus_string(inss)

'CCACGTATTGCTGGGGGGAGGGGGTAAAAAAAAACCAAAACCCCCAAAGGGGGGGGGGCGGGGGGGGGCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCCCCAAAAAAGGGGGGTGGGAAAAAAAAATTCCCCCAAAAAGGGGGGGAAGGCCCCCCCCCCCCCCCCCCAGGGGGGGGGGCCCAAAAAAAAAATTTTTCCTTTTGGGAAAAAAAAAAGGGGGGGGGGGGGGGGGGGGGGGAAAACGGGCGGCCCGGCTTGCCCCCTTCCCCGGCCCGCAGTTTGCCCCTCCCCCTCAGGGCCCCAGCCTGGCCGAAAGAAAGAAATGGTCTGTGATCCCCC'

In [14]:
# Consensus of the suffix flanks that share the most frequent suffix-flank
# length (consensus_string needs inputs of equal length).
suffix_len_counts = dfc['suffix_flank_len'].value_counts()
cond = dfc['suffix_flank_len'] == suffix_len_counts.index[0]
inss = dfc.loc[cond, 'suffix_flank'].tolist()
consensus_string(inss)

'CATTCCCGGCTACAAGGACCCTTCGAGCCCCGTTCGCCGGCCGCGGACCCGGCCCCCCCCCCCCCGGCCCCGGGGGGGGGGGCCCGGAAAAAGGGGGGGGGGGGGGGGGGGCCCCCCCCCGGGGGGGGGTGGGAAGGGGGGGGGGGGGGGGGGGGCTTTCGCCCCCCCCCCCCCCCCCCCTTTTCCCCCGGGGGCCCCGGGGGGAAAAAAATTTTTGGGGGGGGGGGGGGGGGGGGGGCCCCGGGGGGGGCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGCCCCCCCCCCCACCCCCCCGGGGGGGGGGGGGGAAAAAAAAAAAAGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAGAGAACAA'

## 6. Find cool reads

In [15]:
# Reads with direction 'fwd' whose insert length is exactly 15
is_fwd = df['direction'] == 'fwd'
has_ins_len_15 = df['ins_len'] == 15
cond = is_fwd & has_ins_len_15
df[cond]

Unnamed: 0,id,direction,seq,opt,qual,read_length,start,end,prefix_flank,suffix_flank,ins,ins_len,prefix_flank_len,suffix_flank_len
0,@90c36059-b1b8-435f-912d-533eaf89cedd runid=83...,fwd,ACTGCTTGCTGGAACCGTTGGTCTTGCTCGGAGGACGAGGTCGATA...,+,"$$$$&,%#$&&)')','+*('('&'&./0676::85''''''&%%(...",692,340.0,355.0,ACTGCTTGCTGGAACCGTTGGTCTTGCTCGGAGGACGAGGTCGATA...,CATTCCCGGCTGCGAGGACCCTTCAGCCCGTTCGCCGGCCGCGGAC...,CAGCAGCAGCAGCAG,15,340,337
197,@44d753ee-8f19-4b39-bce0-836f728b01b1 runid=83...,fwd,GCTGCTTGGTACAGTTGGTGTTGCTGGAGGACGAGGTCGAATCAAT...,+,"$$$$##$$%''-,,---+,)))),-&&')+)'&%&%&&$$#$%'''...",708,358.0,373.0,GCTGCTTGGTACAGTTGGTGTTGCTGGAGGACGAGGTCGAATCAAT...,CATTCCCGGCTACAAGGACCTCGAGCCGTTCGCCGGCCGCAGACCC...,CAGCAGCAGCAGCAG,15,358,335
296,@21cc6eba-1009-4d2b-8cbf-538b61ce8ba0 runid=83...,fwd,TGTGTCTCTCGTCAGTTACGTATTGCTCGGAGGACGAGGTCGATGA...,+,"$%%&(&%%&$%%$$'''&%%&**,'()()0141.+**,++(&&'()...",704,349.0,364.0,TGTGTCTCTCGTCAGTTACGTATTGCTCGGAGGACGAGGTCGATGA...,CATTCCCGGCTACAAGGACCCTTCGAGCCCCGTTCGCCGGCCGCGG...,CAGCAGCAGCAGCAG,15,349,340
453,@300e2c13-f820-4fa0-b502-a7a390891700 runid=83...,fwd,TTCGTTTACTTGGTTCAGTTACGTATTGCTCGGAGGACGAGGTCGA...,+,')&%&&(&&&(*.223)(().)((''''))''((=>=;211;<3))...,717,369.0,384.0,TTCGTTTACTTGGTTCAGTTACGTATTGCTCGGAGGACGAGGTCGA...,CATTCCCGGCTACAAGGACCCTTCGAGCCCCGTTCGCCGGCCGCAG...,CAGCAGCAGCAGCAG,15,369,333
459,@0887f2a9-8c34-43d0-b1e7-75920fef05e6 runid=83...,fwd,TTGTCCTGCTGGACAGATTACGTATTGCTCGGAGGACGAGGTCAGA...,+,"$%$%###$%%&(*/0..-..01221/(+,''./6886-,,++)&&'...",712,359.0,374.0,TTGTCCTGCTGGACAGATTACGTATTGCTCGGAGGACGAGGTCAGA...,TCATCCCGGCTACAAGGACCCTTCGAGCCGTTCACCAGCCGCGGAC...,AGCAGCAGCAGCAGC,15,359,338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64318,@fe255d4c-6e90-52f9-8d90-a3302024deaa runid=83...,fwd,ATGTGGTCTACTGGTTCAGTTCGTATTGCTCGGAGGACGAGGTCAA...,+,"$$$%&$$###$%$%%%$$$$&##&/,,++,,-01;:96.,++'')(...",695,361.0,376.0,ATGTGGTCTACTGGTTCAGTTCGTATTGCTCGGAGGACGAGGTCAA...,CATTCCCGGCTACAAGAGACCCTCGACCCGTACCGCCGGCCGCAGA...,CAGCAGCAGCAGCAG,15,361,319
64396,@51819eeb-5ddd-5e99-9651-f2edc786d2f9 runid=83...,fwd,ACAGCAATAACTACTTGGTTCCATTACGTATTGCTCGGAGGACGAG...,+,"./0..(((&#""""""#'''(,+++)++.24577976911?A@<<977:...",712,365.0,380.0,ACAGCAATAACTACTTGGTTCCATTACGTATTGCTCGGAGGACGAG...,CATTCCCGGCTGCAAGGACCCTTCGACCCCGTTCGCCGGCCGCGGA...,CAGCAGCAGCAGCAG,15,365,332
64413,@8f9e1ecc-0fc8-43ec-b161-d50645301df5 runid=83...,fwd,TTTGGTCTACTGGTTCCATTAGCGTATTGCTGGGAGGACAAGGTCG...,+,"#$&%%%##%'-))-.0*%%%'&&()457,/+%%&'8866'''(*'%...",708,360.0,375.0,TTTGGTCTACTGGTTCCATTAGCGTATTGCTGGGAGGACAAGGTCG...,CATTCCCGGCTACAAGGACCCCTCGACCCCGTTCGCCGGCCGCGAC...,CAGCAGCAGCGGCGG,15,360,333
64435,@ee63ad53-9ccb-5d27-97e7-290637d59033 runid=83...,fwd,GGTGATGTCCTCGACTCGTTCAGTTACGTATTGCTCGGAGGACGAG...,+,"-$$$#$&%''*+++-.'&''*('()*))(*,+*--**219<<4-,,...",1439,351.0,366.0,GGTGATGTCCTCGACTCGTTCAGTTACGTATTGCTCGGAGGACGAG...,CATTCCCGGCTACAAGGACCCTTCAAGCCGTTCACCAGCCGCGGAC...,AGCAGCAGCAGCAGC,15,351,1073
