In [1]:
import biotite.sequence.io.fasta as bioseq
import pandas as pd
import numpy as np

In [8]:
# Load the FASTA file. `read` is invoked on the class, which is the form
# documented by biotite: it returns the populated file object.
fasta_file = bioseq.FastaFile.read("sequences/p450_edit.fasta")
sequences = bioseq.get_sequences(fasta_file)

# One header per sequence, in the order the mapping yields them
headers = list(sequences.keys())

# Split every header on '~', '-' or '/'; piece i of a header lands in integer
# column i, and headers with fewer pieces are padded with missing values.
header_parts = pd.DataFrame(headers, columns=['header'])['header'].str.split('~|-|/', expand=True)

# Pieces 2, 3 and 4 are the labels that get turned into indicator columns.
# NOTE(review): any pieces beyond column 4 are ignored here - confirm that no
# header carries more than three labels.
label_parts = header_parts[[2, 3, 4]]

# Distinct labels in order of first appearance, with missing pieces removed
unique_values = pd.unique(label_parts.values.ravel('K'))
unique_values = unique_values[~pd.isna(unique_values)]
# NOTE(review): labels are matched verbatim, so spelling variants in the
# headers (e.g. 'unknown' vs 'Unkown') end up as separate columns.

# Alphabetical label order = order of the indicator columns
new_columns = sorted(unique_values)

# One boolean column per label: True where the label occurs in piece 2, 3 or 4.
# A vectorised comparison replaces a row-by-row apply with a Python lambda.
indicators = pd.DataFrame(
    {value: label_parts.eq(value).any(axis=1) for value in new_columns},
    index=header_parts.index,
)

# Sanitised copy of the complete header: '.', '~', '/' and '-' become '_'
full_header = pd.Series(headers, index=header_parts.index, name='full_header')
full_header = full_header.str.replace(r'[.~/-]', '_', regex=True)

# Final layout: sanitised header first, then header pieces 0 and 1, then the
# indicator columns in alphabetical order (pieces 2-4 themselves are left out).
all_columns = [0, 1] + new_columns
cols = ['full_header'] + all_columns
df = pd.concat([full_header, header_parts, indicators], axis=1)[cols]

In [None]:
# Count how many sequences carry each label and rank the labels by frequency.
# The indicator columns are picked by dtype rather than by position: the
# positional slice iloc[:, 2:] also includes column 1, a string piece of the
# header, which is not a count and must not be part of the sum.
df2 = df.select_dtypes(include='bool')

# Summing a boolean column counts its True entries; largest total first
summed_series = df2.sum(axis=0).sort_values(ascending=False)

# One-column DataFrame ('sum') indexed by label
sorted_df = summed_series.to_frame(name='sum')

print(sorted_df)


                   sum
C_28                47
C_16                13
C_24                11
C_22                10
C_6                  7
C_2                  7
C_23                 6
C_29                 5
C_7                  4
C_3                  4
unknown              4
C_19                 4
C_11                 4
C_25                 3
C_15                 3
C_12                 3
C_21                 2
C_20                 2
C_30                 2
Baeyer_Villiger      1
C_26                 1
C_13                 1
C_5                  1
C_1                  1
Unkown               1
cycloartane          1
14α_demethylation    1
