In [None]:
from pathlib import Path

import pandas as pd
from Bio import SeqIO

# Star imports are kept in their original order so that any name exported by
# both modules keeps resolving to the same definition as before.
from preprocess_methods import *
from kmers import *

#### Positive and negative data from `experiments.tsv`

In [2]:
# Read the experiments table. extract_experiments returns three objects; going
# by their names these are the full table plus its positive and negative
# subsets (NOTE(review): confirm against preprocess_methods).
experiments_path = "./data/experiments.tsv"
experiments_df, positive_df, negative_df_v1 = extract_experiments(experiments_path)

# Preview the first rows of the full table.
experiments_df.head()

Unnamed: 0,curation_status,coordinate_hg38,seq_hg38
0,positive,chr16:86396481-86397120,AACTGAAGGGACCCCGTTAGCATATAAACAAAAGGTGGGGGGTAGC...
1,negative,chr16:85586489-85588130,GGCCCTGGTATGTTTGTTCTTCCAGGGGCTCCCAGGATGGATCCAG...
2,negative,chr16:80389446-80390755,AAGATTGCCATTTGGGGTGTTTCTTGGGGCTAAGAACCATGAAGAC...
3,positive,chr16:80338700-80339858,CAGAGACAGACAGTGACAGAGACAGATTTTAGAATTTGAACAAAGG...
4,negative,chr16:79936010-79937400,TGACACCCACTATTATCCAGTCCTTGATAAACCTCTTTATTTGTTC...


In [5]:
# Number of rows in the first negative set.
negative_df_v1.shape[0]

1913

In [3]:
# How many more positives than negatives the experiments table provides.
n_positive = len(positive_df)
n_negative_v1 = len(negative_df_v1)
neg_v2_len = n_positive - n_negative_v1

print(f'We need {neg_v2_len} more negative samples')

We need 354 more negative samples


#### Random negatives

In [4]:
# Materialise every FASTA record of the genome assembly in memory.
# NOTE(review): `records` is not read by any later cell visible here
# (translate_records is given the file path, not this list); if nothing else
# uses it, this cell can be dropped to save memory and time.
genome_fasta_path = "./data/GRCh38.primary_assembly.genome.fa"
records = [record for record in SeqIO.parse(genome_fasta_path, "fasta")]

In [5]:
# Build the record structure consumed by generate_random_negatives directly
# from the genome FASTA path (what "translate" means is defined in the helper
# module, not in this notebook).
genome_fasta = "./data/GRCh38.primary_assembly.genome.fa"
translated_records = translate_records(genome_fasta)



In [6]:
# Build a second negative set from the genome records, using the positive set
# as the second input to the helper.
# NOTE(review): in the displayed preview the interval lengths match those of
# the positive rows shown earlier (e.g. 640 bp), so the helper presumably
# length-matches negatives to positives — confirm in preprocess_methods.
# NOTE(review): every previewed row lies on an unplaced scaffold (KI…/GL…);
# verify that sampling from those contigs is intended.
negative_df_v2 = generate_random_negatives(translated_records, positive_df)
# Preview the first generated rows.
negative_df_v2.head()

Generated 2132 valid negative samples.


Unnamed: 0,curation_status,coordinate_hg38,seq_hg38
0,negative,KI270589.1:2486-3125,AAGGAAATATCTTCAAATAGAAACTACACAGAAGCATTCAGGGAAC...
1,negative,KI270591.1:283-1441,TAGTTTTTATGTGAAGATATTTCCTTTTCCATCATAGGCCTCAAGG...
2,negative,KI270591.1:3868-5204,GTCTACTTTTTATGTGAAGATATTTCGTTTTCCACCATAGGCCTCA...
3,negative,GL000208.1:3265-4910,GCAGATACTACAAAAAGACTGTTTCATAACTGCTCTCTCAAAAGGA...
4,negative,KI270315.1:130-1711,AAACTAGACAGAATGATTCTCAGAAACTCCTTTGTGATGTGTGCGT...


## K-mer counts

#### Example usage

In [7]:
# Demonstration: compute 4-mer features for the first ten positive rows.
k = 4
mini_df = positive_df.iloc[:10]

result_df = process_dataframe_with_kmers(mini_df, k)

# Preview the resulting feature table.
result_df.head()

Unnamed: 0,curation_status,AACT,ACTG,CTGA,TGAA,CTTC,AAGG,AGGG,GGGA,GGAC,...,AGTA,ATAG,ACGA,GTAC,CGTA,GCGC,ACCG,CCGG,CGAC,CGCG
0,positive,0.00625,0.010937,0.014063,0.007812,0.0125,0.015625,0.00625,0.00625,0.007812,...,0.001563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,positive,0.010354,0.011217,0.013805,0.009491,0.010354,0.011217,0.014668,0.012942,0.005177,...,0.007765,0.004314,0.000863,0.003451,0.0,0.0,0.0,0.0,0.0,0.0
2,positive,0.005984,0.005984,0.00374,0.014211,0.011967,0.011219,0.010471,0.010471,0.002244,...,0.006731,0.010471,0.002244,0.000748,0.001496,0.000748,0.000748,0.0,0.0,0.0
3,positive,0.006075,0.007898,0.014581,0.009113,0.010328,0.009721,0.014581,0.012758,0.006075,...,0.004253,0.000608,0.003038,0.000608,0.001823,0.0,0.00486,0.006683,0.001215,0.000608
4,positive,0.012642,0.013906,0.013274,0.013274,0.009482,0.009482,0.003793,0.005689,0.004425,...,0.008217,0.013274,0.000632,0.0,0.000632,0.000632,0.001896,0.0,0.000632,0.0


### Test set

In [14]:
# Hold out a fixed-size test set from each source.
TEST_SIZE = 400  # rows held out per class
SEED = 42        # arbitrary value; fixed so the sampled rows are the same on every run

# Positives and experiment-derived negatives: take the last TEST_SIZE rows.
positive_test = positive_df[-TEST_SIZE:]
negative_test = negative_df_v1[-TEST_SIZE:]

# Random negatives: draw TEST_SIZE rows. Without random_state the draw (and the
# train remainder derived from it) would change on every kernel restart.
# NOTE(review): full reproducibility also requires generate_random_negatives
# itself to be deterministic — confirm.
negative_random_test = negative_df_v2.sample(TEST_SIZE, random_state=SEED)

# Two test sets that share the same positives but differ in the negatives.
test_experiments = pd.concat([positive_test, negative_test])
test_random_negatives = pd.concat([positive_test, negative_random_test])

In [15]:
# Training data is everything that was not held out for testing.
n_holdout = 400

# Positives and experiment-derived negatives: all rows except the last n_holdout.
positive_train = positive_df.iloc[:-n_holdout]
negative_train = negative_df_v1.iloc[:-n_holdout]

# Random negatives: remove the rows whose index labels were sampled for the test set.
negative_random_train = negative_df_v2.drop(index=negative_random_test.index)

### Generating dataframes

In [24]:
# Compute k-mer feature tables for every k in k_list and write them to disk:
#   train_data/{positive,negative,negative_random}_train_{k}.csv
#   test_data/test_{experiments,random_negatives}_{k}.csv
k_list = [3, 4, 5]
dfs_list = []  # kept for inspection: three train frames per k, in k_list order

train_dir = Path('train_data')
test_dir = Path('test_data')
# to_csv does not create missing parent directories, so make sure they exist.
train_dir.mkdir(parents=True, exist_ok=True)
test_dir.mkdir(parents=True, exist_ok=True)

for k in k_list:
    # train
    positive_train_local = process_dataframe_with_kmers(positive_train, k)
    negative_train_local = process_dataframe_with_kmers(negative_train, k)
    negative_random_train_local = process_dataframe_with_kmers(negative_random_train, k)
    dfs_list.extend([positive_train_local, negative_train_local, negative_random_train_local])

    # Name the files with the actual k rather than reconstructing it from the
    # position in dfs_list, which was only right for k_list == [3, 4, 5].
    positive_train_local.to_csv(train_dir / f'positive_train_{k}.csv', index=False)
    negative_train_local.to_csv(train_dir / f'negative_train_{k}.csv', index=False)
    negative_random_train_local.to_csv(train_dir / f'negative_random_train_{k}.csv', index=False)

    # test
    positive_test_local = process_dataframe_with_kmers(positive_test, k)
    negative_test_local = process_dataframe_with_kmers(negative_test, k)
    negative_random_test_local = process_dataframe_with_kmers(negative_random_test, k)

    # NOTE(review): pd.concat aligns on column names; if a k-mer column is
    # missing from one of the frames it is filled with NaN — confirm that
    # process_dataframe_with_kmers always emits the same column set.
    test_experiments_local = pd.concat([positive_test_local, negative_test_local])
    test_random_negatives_local = pd.concat([positive_test_local, negative_random_test_local])

    test_experiments_local.to_csv(test_dir / f'test_experiments_{k}.csv', index=False)
    test_random_negatives_local.to_csv(test_dir / f'test_random_negatives_{k}.csv', index=False)