# Protein Dataset Analysis
Following [Brian Risk's](https://www.kaggle.com/code/devraai/bioinformatics-protein-dataset-analysis/notebook) Kaggle guide, this notebook analyses a dataset of 20,000 synthetic proteins.

In [None]:
# One-time setup step, intentionally left disabled: extracts every member of
# ../Data/archive.zip into ../Data. The load cell below reads its CSV files
# from ../Data, so uncomment and run this once if that folder has not been
# populated yet.
#
# # unzip data
# from zipfile import ZipFile

# with ZipFile("../Data/archive.zip", "r") as zfile:
#     zfile.extractall("../Data")

In [6]:
# load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [66]:
# load dataset
# Reads the three CSV files from ../Data. Only `train` and `test` are renamed
# in this cell; `enriched` keeps the column names it has in the file.
test = pd.read_csv("../Data/proteinas_test.csv")
train = pd.read_csv("../Data/proteinas_train.csv")
enriched = pd.read_csv("../Data/proteinas_20000_enriquecido.csv")

# preview with the original (Portuguese) headers before renaming
print(train.head())
# rename cols to english
col_names = [
    "protein_id",
    "sequence",
    "molecular_mass",
    "isoelectric_point",
    "hydrophobicity",
    "total_charge",
    "polar_proportion",
    "nonpolar_proportion",
    "sequence_length",
    "class"
]
# rename
# The rename is positional, so it is only valid when both files share the same
# columns in the same order. Fail here with a clear message instead of silently
# skipping the rename and hitting a KeyError on 'class' in a later cell.
# Index.equals also copes with differing column counts, where an element-wise
# `==` would raise "Lengths must match to compare".
if not test.columns.equals(train.columns):
    raise ValueError(
        "train/test column mismatch: "
        f"train={list(train.columns)} test={list(test.columns)}"
    )
if len(col_names) != len(train.columns):
    raise ValueError(
        f"expected {len(col_names)} columns, found {len(train.columns)}: "
        f"{list(train.columns)}"
    )
train.columns = col_names
test.columns = col_names
print(train.columns)
print(test.columns==train.columns)

    ID_Proteína                                          Sequência  \
0  TRAIN_P00001  GNMRFVLHDEETHWGTLRTTLNCVPSDIYTISGEDSLFWGMAHPFC...   
1  TRAIN_P00002  LFKMQCSFYLLYLAKEAASYQVSMNMLCYEWYNYVYQVTVILRLSR...   
2  TRAIN_P00003  PAHLWPYWRFYVWIVFYGYHNPNYHFGMKEVKERPDCKNCTVAVLF...   
3  TRAIN_P00004  GEAFSRPHCFACAATKKGFPWARMCCTTSMAMDGVQSKMHKSKHRF...   
4  TRAIN_P00005  HYVFQGLMLHCGGYMITACGFGVIFPEQMTREGLIMHTARAHHFLI...   

   Massa_Molecular  Ponto_Isoelétrico  Hidrofobicidade  Carga_Total  \
0       20362.9468           4.866123         0.149425           -3   
1        9328.7909           6.298636         0.217105            0   
2       17616.3852           8.458977         0.192568            8   
3       35244.2968           8.448340         0.160473           21   
4       34557.9931           7.696306         0.140411           18   

   Proporção_Polar  Proporção_Apolar  Comprimento_Sequência      Classe  
0         0.241379          0.408046                    174  Estrutural  
1   

In [None]:
# rename class values to english
print(train['class'].unique())
# define new class keys (Portuguese label -> English label)
class_mapping = {'Estrutural':'Structural','Receptora':'Receptor','Enzima':'Enzyme', 'Transporte':'Transport', 'Outras':'Other'}
# Series.replace only rewrites values that are keys of the mapping and leaves
# everything else untouched. That makes this cell safe to re-run: once the
# labels are English nothing matches, so it is a no-op. Series.map would
# instead turn every already-translated (or unexpected) label into NaN, and
# guarding on the first row alone would not catch a partially translated column.
train['class'] = train['class'].replace(class_mapping)
test['class'] = test['class'].replace(class_mapping)

# confirmation
print(train['class'].unique(), test['class'].unique())
print(train.head())

['Structural' 'Receptor' 'Enzyme' 'Transport' 'Other']
['Structural' 'Receptor' 'Enzyme' 'Transport' 'Other'] ['Receptor' 'Enzyme' 'Structural' 'Other' 'Transport']
     protein_id                                           sequence  \
0  TRAIN_P00001  GNMRFVLHDEETHWGTLRTTLNCVPSDIYTISGEDSLFWGMAHPFC...   
1  TRAIN_P00002  LFKMQCSFYLLYLAKEAASYQVSMNMLCYEWYNYVYQVTVILRLSR...   
2  TRAIN_P00003  PAHLWPYWRFYVWIVFYGYHNPNYHFGMKEVKERPDCKNCTVAVLF...   
3  TRAIN_P00004  GEAFSRPHCFACAATKKGFPWARMCCTTSMAMDGVQSKMHKSKHRF...   
4  TRAIN_P00005  HYVFQGLMLHCGGYMITACGFGVIFPEQMTREGLIMHTARAHHFLI...   

   molecular_mass  isoelectric_point  hydrophobicity  total_charge  \
0      20362.9468           4.866123        0.149425            -3   
1       9328.7909           6.298636        0.217105             0   
2      17616.3852           8.458977        0.192568             8   
3      35244.2968           8.448340        0.160473            21   
4      34557.9931           7.696306        0.140411            

### Cleaning + Preprocessing

In [None]:
# check for missing data
print(train.isnull().sum())
print(test.isnull().sum())
# no missing data in any column bc synthetic data

# convert class levels to numericals
# Both splits must be encoded with the SAME label -> code table; otherwise
# y_train holds integer codes while y_test still holds string labels and any
# metric comparing predictions against y_test is meaningless.
# The table is built once from the training labels (sorted, which is also the
# order .astype('category') uses for sortable values) and applied to both.
class_levels = sorted(train['class'].dropna().unique())
class_dtype = pd.CategoricalDtype(categories=class_levels)
#  .astype(class_dtype) --> pd Categorical dtype with the explicit level list
# .cat = categorical accessor, ie to access the categories
# .codes = integer position of each value in class_levels; -1 marks a missing
#          value or a label that is not in class_levels
train_codes = train['class'].astype(class_dtype).cat.codes
test_codes = test['class'].astype(class_dtype).cat.codes
# a -1 in the test codes means the test set has a label never seen in training
assert (test_codes >= 0).all(), "test set contains class labels absent from the training set"
train['class'] = train_codes
test['class'] = test_codes

# target feature split
y_train = train['class']
X_train = train.drop(columns=['class'])
y_test = test['class']
X_test = test.drop(columns=['class'])




protein_id             0
sequence               0
molecular_mass         0
isoelectric_point      0
hydrophobicity         0
total_charge           0
polar_proportion       0
nonpolar_proportion    0
sequence_length        0
class                  0
dtype: int64
protein_id             0
sequence               0
molecular_mass         0
isoelectric_point      0
hydrophobicity         0
total_charge           0
polar_proportion       0
nonpolar_proportion    0
sequence_length        0
class                  0
dtype: int64
