In [None]:
import prospectdataset
import pandas as pd
import numpy as np
import os
import re
from collections import Counter
from itertools import chain
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import glob
# glob returns paths in arbitrary filesystem order; sort them so that every
# run processes the metadata files in the same sequence.
files = sorted(glob.glob("./data/*.parquet"))

In [None]:
def get_aa_ptm_matrix_counts(metadata_files=None):
    """Count (preceding character, UNIMOD id) pairs across parquet files.

    Each file is read with ``pd.read_parquet`` and its ``modified_sequence``
    column is scanned for ``[UNIMOD:<id>]`` tags. Every tag adds one count to
    the key ``<character directly before the tag><id>`` (for example
    ``"M35"`` for ``M[UNIMOD:35]``).

    Parameters
    ----------
    metadata_files : iterable of path-like, optional
        Parquet files to scan. ``None`` (the default) is treated as an empty
        collection.

    Returns
    -------
    collections.Counter
        Maps ``<character><unimod id>`` strings to their occurrence counts.
    """
    # Avoid a mutable default argument; None stands for "no files".
    if metadata_files is None:
        metadata_files = []

    counter = Counter()
    # The lookahead keeps the tag itself out of the match, so only the single
    # character directly before "[UNIMOD:<id>]" is consumed. A tag at the very
    # start of a sequence has no preceding character and is not counted.
    pattern = re.compile(r"(.)(?=\[UNIMOD:(\d+)\])")

    for filepath in metadata_files:
        # Only one column is needed, so do not load the rest of the file.
        sequences = pd.read_parquet(filepath, columns=["modified_sequence"])[
            "modified_sequence"
        ]
        # na=False: a missing sequence counts as "no modification" instead of
        # producing a NaN mask that cannot be used for boolean indexing.
        modified = sequences[sequences.str.contains("UNIMOD", na=False)]

        for sequence in modified:
            counter.update(
                match.group(1) + match.group(2)
                for match in pattern.finditer(sequence)
            )
    return counter


        
# Tally "<residue><unimod id>" pairs over every metadata file found above.
counter = get_aa_ptm_matrix_counts(files)

# Keys have the form "<single character><digits>"; split them into the two
# axes of the matrix. np.unique already returns its values in sorted order.
unique_aas = list(np.unique([key[0] for key in counter]))
unique_mods = list(np.unique([int(key[1:]) for key in counter]))

# Rows follow unique_aas, columns follow unique_mods. A pair that never
# occurs falls back to a count of 1, which becomes log(1) = 0.
data = np.log(
    np.array(
        [
            [counter.get(aa + str(mod), 1) for mod in unique_mods]
            for aa in unique_aas
        ],
        dtype=float,
    )
).reshape(len(unique_aas), len(unique_mods))



In [None]:
# Inspect the sorted residue characters and UNIMOD ids that label the
# heatmap rows and columns.
unique_aas, unique_mods


In [None]:
# Hand the tick labels to seaborn directly. With the default "auto" setting
# seaborn may draw only a subset of the ticks to avoid overlap, and assigning
# the full label list afterwards would then not line up with those ticks.
heatmap = sns.heatmap(
    data,
    cmap="RdBu_r",
    linewidths=1,
    xticklabels=["[UNIMOD:" + str(x) + "]" for x in unique_mods],
    yticklabels=unique_aas,
)

plt.yticks(rotation=45)
plt.xticks(rotation=15)

# Save before plt.show(), as in the original cell order.
plt.savefig("heatmap.pdf", format="pdf")
plt.show()

In [None]:
# NOTE(review): `d` is not defined in any cell visible in this notebook, so
# this cell would raise NameError on a fresh kernel — confirm what was meant
# to be displayed here, or delete the cell.
d

In [None]:
def get_aas_mods_frequencies(counter):
    """Collapse pair counts into per-residue and per-modification totals.

    Every key of ``counter`` is split into its first character (the residue)
    and the remainder (the modification id); the associated count is added to
    the running total of each.

    Returns a tuple ``(totals_by_residue, totals_by_modification)`` of plain
    dicts, each in first-seen key order.
    """
    totals_by_aa = {}
    totals_by_mod = {}

    for key, count in counter.items():
        residue, mod_id = key[0], key[1:]

        if mod_id not in totals_by_mod:
            totals_by_mod[mod_id] = 0
        totals_by_mod[mod_id] += count

        if residue not in totals_by_aa:
            totals_by_aa[residue] = 0
        totals_by_aa[residue] += count

    return totals_by_aa, totals_by_mod

# Aggregate the pair counts into per-residue and per-modification totals.
f_aas, f_mods = get_aas_mods_frequencies(counter)

In [None]:
# Display both frequency dictionaries.
f_aas, f_mods

In [None]:
# Display only the per-residue totals, as a list.
list(f_aas.values())

In [None]:
# Residue characters and their total modification counts, in matching order.
labels = [*f_aas.keys()]
bar_sizes = [*f_aas.values()]

# Tabulate the totals and order the rows from most to least frequent.
df = pd.DataFrame({'Amino Acid': labels, 'Count': bar_sizes})
df = df.sort_values(by="Count", ascending=False)

# Replace the raw totals with their natural logarithm before plotting.
df["Count"] = np.log(df["Count"])

# Horizontal bars: one per residue, length given by the log-transformed total.
sns.barplot(data=df, x='Count', y='Amino Acid', palette="RdBu_r")
plt.xlabel("Count (log scale)")
plt.savefig("aa_freq.pdf", format="pdf")
plt.show()


In [None]:
# `df.Count` was already log-transformed in the plotting cell above; applying
# np.log a second time would show log(log(count)) and -inf for a count of 1.
df.Count

In [None]:
# Modification ids and their total counts, in matching order.
labels = [*f_mods.keys()]
bar_sizes = [*f_mods.values()]

# Tabulate, order from most to least frequent, then swap the raw totals for
# their natural logarithm.
df = (
    pd.DataFrame({'PTM - Unimod ID': labels, 'Count': bar_sizes})
    .sort_values(by="Count", ascending=False)
    .assign(Count=lambda frame: np.log(frame["Count"]))
)

# Horizontal bars: one per modification id, length given by the log total.
ax = sns.barplot(data=df, x='Count', y='PTM - Unimod ID', palette="RdBu_r")
plt.xlabel("Count (log scale)")
plt.savefig("ptm_freq.pdf", format="pdf")

plt.show()
