<a href="https://colab.research.google.com/github/vihan-lakshman/mutagenic/blob/main/interpro_scan_data_formatting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# prompt: read in /content/merged_data_noduplicates.csv and rename 'InterPro accession' to 'InterPro_original' and "InterPro description" to "InterPro_original_description". Then, delete any elements that are "-" in these two columns. Then, separate all elements in the list for these two columns to be a single string separated by semicolons

import pandas as pd
import ast

# Load the CSV file into a pandas DataFrame
try:
    df = pd.read_csv('/content/merged_data_noduplicates.csv')
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down, while an exception simply stops this cell with a traceback
    # and keeps the session (and any other state) alive.
    raise FileNotFoundError(
        "Error: File not found. Please make sure the file path is correct."
    ) from err

# Rename columns
df = df.rename(columns={'InterPro accession': 'InterPro_original',
                        'InterPro description': 'InterPro_original_description'})


def _parse_list_cell(cell):
    """Convert one raw cell of an InterPro column into a Python list.

    Args:
        cell: The value pandas produced for the cell. Expected to be the
            string representation of a list (e.g. "['IPR012674', '-']"),
            but may also be NaN for an empty CSV field or an existing
            list/tuple.

    Returns:
        A list of the parsed elements. Missing (non-string, non-sequence)
        cells become an empty list; a string that parses to a single scalar
        is wrapped in a one-element list.

    Raises:
        ValueError / SyntaxError: propagated from ``ast.literal_eval`` when a
            string cell is not a valid Python literal.
    """
    if not isinstance(cell, str):
        # Empty CSV fields arrive as float NaN; treating them as "no entries"
        # keeps the later list comprehension from failing on a non-iterable.
        return list(cell) if isinstance(cell, (list, tuple)) else []
    parsed = ast.literal_eval(cell)
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # A bare scalar (e.g. "'IPR012674'") must not be iterated character by
    # character further down, so wrap it.
    return [parsed]


# Convert string representations of lists into actual lists
df['InterPro_original'] = df['InterPro_original'].apply(_parse_list_cell)
df['InterPro_original_description'] = df['InterPro_original_description'].apply(_parse_list_cell)

print(df.head())

# Remove '-' elements from the lists
df['InterPro_original'] = df['InterPro_original'].apply(lambda x: [item for item in x if item != '-'])
df['InterPro_original_description'] = df['InterPro_original_description'].apply(lambda x: [item for item in x if item != '-'])

# Helper that flattens one list cell into a single delimited string
def process_column(text):
    """Join the elements of ``text`` into one semicolon-separated string.

    Args:
        text: An iterable of strings (one cell's list of entries).

    Returns:
        The entries concatenated with ';' between them; an empty iterable
        yields an empty string.
    """
    delimiter = ';'
    joined = delimiter.join(text)
    return joined

# Collapse each cleaned list into one ';'-delimited string, column by column
for column_name in ('InterPro_original', 'InterPro_original_description'):
    df[column_name] = df[column_name].apply(process_column)

# Preview the first rows of the reformatted frame
df.head()

  Protein Accession  Seq Length  \
0      5R1U_1|Chain         419   
1      6KBQ_1|Chain         373   
2      6KFD_1|Chain         162   
3      6L9T_1|Chain         595   
4      6LM2_1|Chain         184   

                                   InterPro_original  \
0  [IPR034163, -, IPR021109, IPR001461, IPR001969...   
1                                                [-]   
2                                        [IPR035992]   
3               [IPR019791, -, IPR037120, IPR010255]   
4                             [IPR012674, IPR015304]   

                       InterPro_original_description  
0  [Aspergillopepsin-like catalytic domain, -, As...  
1                                                [-]  
2                             [Ricin B-like lectins]  
3  [Haem peroxidase, animal-type, -, Haem peroxid...  
4                             [Calycin, ZinT domain]  


Unnamed: 0,Protein Accession,Seq Length,InterPro_original,InterPro_original_description
0,5R1U_1|Chain,419,IPR034163;IPR021109;IPR001461;IPR001969;IPR033121,Aspergillopepsin-like catalytic domain;Asparti...
1,6KBQ_1|Chain,373,,
2,6KFD_1|Chain,162,IPR035992,Ricin B-like lectins
3,6L9T_1|Chain,595,IPR019791;IPR037120;IPR010255,"Haem peroxidase, animal-type;Haem peroxidase d..."
4,6LM2_1|Chain,184,IPR012674;IPR015304,Calycin;ZinT domain


In [20]:
# Persist the reformatted frame to a new CSV in the working directory;
# index=False keeps the pandas row index out of the file.
# NOTE(review): the preview above shows an 'Unnamed: 0' column carried over
# from the input file; it will be written out as well - confirm that is wanted.
df.to_csv(path_or_buf='merged_data_noduplicates_new.csv', index=False)
