<a href="https://colab.research.google.com/github/yaneura-no-gomi/yaneura-no-gomi/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
import os
import lightgbm as lgb
import pandas as pd
import torch
import random
import numpy as np
from sklearn import preprocessing as pp
from typing import List

In [2]:
# Colab-only: mount Google Drive at /content/drive so the dataset stored there
# can be read like a local file. This cell is interactive (asks for authorisation).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Columns kept for the analysis; every other column of the CSV is dropped.
use_cols = [
    'experimentalTechnique',
    'macromoleculeType',
    'residueCount',
    'resolution',
    'structureMolecularWeight',
    'crystallizationMethod',
    'crystallizationTempK',
    'densityMatthews',
    'densityPercentSol',
    'phValue',
    'publicationYear',
    'classification',
]

# Directory on the mounted Drive that holds the dataset.
input_path = os.path.join("/content/drive/MyDrive/Protein")

# Read the full CSV, then keep only the selected columns (in `use_cols` order).
train = pd.read_csv(os.path.join(input_path, "pdb_data_no_dups.csv"))[use_cols]

In [4]:
# Preview the first rows of the selected columns.
train.head()

Unnamed: 0,experimentalTechnique,macromoleculeType,residueCount,resolution,structureMolecularWeight,crystallizationMethod,crystallizationTempK,densityMatthews,densityPercentSol,phValue,publicationYear,classification
0,X-RAY DIFFRACTION,DNA/RNA Hybrid,20,1.9,6360.3,"VAPOR DIFFUSION, HANGING DROP",,1.78,30.89,7.0,1994.0,DNA-RNA HYBRID
1,X-RAY DIFFRACTION,DNA,24,2.25,7939.35,,,2.0,38.45,,1995.0,DNA
2,X-RAY DIFFRACTION,Protein,154,2.07,18112.8,,,3.09,60.2,9.0,1999.0,OXYGEN TRANSPORT
3,X-RAY DIFFRACTION,DNA,24,2.2,7637.17,"VAPOR DIFFUSION, SITTING DROP",277.0,2.28,46.06,7.0,1995.0,DNA
4,X-RAY DIFFRACTION,Protein,165,1.74,18926.61,,,2.75,55.28,,1993.0,HYDROLASE(O-GLYCOSYL)


In [5]:
class DefaultPreprocessor:
    """Label-encode categorical columns on a private copy of a DataFrame.

    The frame passed to the constructor is copied, so the caller's frame is
    never modified.

    Attributes:
        train: the working copy; ``fit_transform`` encodes its columns in place.
        encoders: ``{column name: fitted LabelEncoder}``, filled by
            ``fit_transform`` so the integer codes can be mapped back to the
            original labels later (``encoders[col].inverse_transform``).
    """

    def __init__(self, df):
        self.train = df.copy()
        self.encoders = {}

    def fit_transform(self, category_cols: List) -> pd.DataFrame:
        """Replace each column in ``category_cols`` with integer label codes.

        Args:
            category_cols: names of the columns of ``self.train`` to encode.
                An empty list leaves the frame unchanged.

        Returns:
            ``self.train`` (the same object, not a new copy) with the listed
            columns replaced by the codes produced by ``LabelEncoder``.
        """
        for col in category_cols:
            # One encoder per column: a single shared encoder would be refit
            # for every column and keep only the last column's classes.
            encoder = pp.LabelEncoder()
            # Assign the column directly rather than via `.loc[:, cols] = ...`;
            # on pandas >= 2.0 the `.loc` form writes into the existing object
            # column in place, so the codes would stay `object` dtype.
            self.train[col] = encoder.fit_transform(self.train[col])
            self.encoders[col] = encoder
        return self.train

In [6]:
# Columns that hold string labels and are passed to the label encoder.
category_cols = [
    'experimentalTechnique',
    'macromoleculeType',
    'crystallizationMethod',
    'classification',
]

In [7]:
# Encode the categorical columns on a copy; `train` itself keeps its raw labels
# because DefaultPreprocessor copies the frame it is given.
dp = DefaultPreprocessor(df=train)
p_train = dp.fit_transform(category_cols=category_cols)

In [8]:
# Count the distinct values of `classification`.
train['classification'].nunique()

5050

In [9]:
# How are rows whose macromolecule type is DNA classified?
# (Shows that the same class appears under several spellings/casings.)
(
    train
    .query("macromoleculeType=='DNA'")
    ['classification']
    .value_counts()
)

DNA                         1714
PEPTIDE NUCLEIC ACID           7
DNA/ANTIBIOTIC                 6
PEPTIDE NUCLEIC ACID/DNA       3
NUCLEIC ACID                   3
DNA/PEPTIDE NUCLEIC ACID       2
DNA/antibiotic                 2
dna/antibiotic                 2
DNA/INHIBITOR                  1
DNA/DNA INHIBITOR              1
Peptide Nucleic Acid           1
DRUG/DNA                       1
DNA BINDING PROTEIN            1
Name: classification, dtype: int64

In [10]:
# The 40 most frequent `classification` labels, most common first.
(
    train['classification']
    .value_counts()
    .head(40)
)

HYDROLASE                                  20915
TRANSFERASE                                15777
OXIDOREDUCTASE                             12494
LYASE                                       4329
IMMUNE SYSTEM                               4075
TRANSCRIPTION                               3691
TRANSPORT PROTEIN                           3251
SIGNALING PROTEIN                           2904
HYDROLASE/HYDROLASE INHIBITOR               2677
ISOMERASE                                   2602
VIRAL PROTEIN                               2320
LIGASE                                      2133
PROTEIN BINDING                             1892
DNA                                         1765
STRUCTURAL GENOMICS, UNKNOWN FUNCTION       1738
MEMBRANE PROTEIN                            1731
TRANSFERASE/TRANSFERASE INHIBITOR           1695
DNA BINDING PROTEIN                         1517
RIBOSOME                                    1462
METAL BINDING PROTEIN                       1383
SUGAR BINDING PROTEI

In [33]:
# Frequency of each raw `experimentalTechnique` label, most common first.
train['experimentalTechnique'].value_counts()

X-RAY DIFFRACTION                                            126432
SOLUTION NMR                                                  12268
ELECTRON MICROSCOPY                                            2252
SOLID-STATE NMR                                                  99
ELECTRON CRYSTALLOGRAPHY                                         74
NEUTRON DIFFRACTION                                              60
FIBER DIFFRACTION                                                40
NEUTRON DIFFRACTION, X-RAY DIFFRACTION                           38
X-RAY DIFFRACTION, NEUTRON DIFFRACTION                           33
SOLUTION SCATTERING                                              32
POWDER DIFFRACTION                                               19
SOLUTION SCATTERING, SOLUTION NMR                                 9
SOLUTION NMR, SOLUTION SCATTERING                                 6
SOLID-STATE NMR, ELECTRON MICROSCOPY                              5
X-RAY DIFFRACTION, EPR                          

In [35]:
# Labels that name exactly one technique, i.e. contain no comma separator.
[
    label
    for label in train['experimentalTechnique'].unique()
    if ',' not in label
]

['X-RAY DIFFRACTION',
 'SOLUTION NMR',
 'FIBER DIFFRACTION',
 'ELECTRON CRYSTALLOGRAPHY',
 'ELECTRON MICROSCOPY',
 'NEUTRON DIFFRACTION',
 'SOLID-STATE NMR',
 'INFRARED SPECTROSCOPY',
 'SOLUTION SCATTERING',
 'POWDER DIFFRACTION',
 'CRYO-ELECTRON MICROSCOPY',
 'ELECTRON DIFFRACTION',
 'FLUORESCENCE TRANSFER']

## `experimentalTechnique` のクリーニング
### 目的
- `experimentalTechnique` の値を個々の手法に分解し、少数のカテゴリのみからなる表現に変換する

### idea
- `unique_tech` 単位に `experimentalTechnique` を分解
- `unique_tech` は `.unique()` で得た各値を `.split(', ')` で分割して取得


In [98]:
# Break every distinct label into its individual techniques.
#   techs         -> flat list of all technique names (with repeats)
#   sorted_unique -> one alphabetically sorted technique list per distinct label
techs = []
sorted_unique = []
for label in train['experimentalTechnique'].unique():
    parts = label.split(', ')
    sorted_unique.append(sorted(parts))
    techs.extend(parts)

In [99]:
# Display the sorted technique lists collected for each distinct label.
sorted_unique

[['X-RAY DIFFRACTION'],
 ['SOLUTION NMR'],
 ['FIBER DIFFRACTION'],
 ['ELECTRON CRYSTALLOGRAPHY'],
 ['ELECTRON MICROSCOPY'],
 ['NEUTRON DIFFRACTION'],
 ['SOLID-STATE NMR'],
 ['INFRARED SPECTROSCOPY'],
 ['SOLUTION NMR', 'THEORETICAL MODEL'],
 ['SOLUTION SCATTERING'],
 ['POWDER DIFFRACTION'],
 ['CRYO-ELECTRON MICROSCOPY'],
 ['ELECTRON DIFFRACTION'],
 ['SOLUTION NMR', 'X-RAY DIFFRACTION'],
 ['SOLUTION NMR', 'THEORETICAL MODEL'],
 ['FLUORESCENCE TRANSFER'],
 ['EPR', 'X-RAY DIFFRACTION'],
 ['SOLUTION NMR', 'SOLUTION SCATTERING'],
 ['SOLID-STATE NMR', 'SOLUTION NMR'],
 ['EPR', 'SOLUTION NMR'],
 ['SOLUTION NMR', 'SOLUTION SCATTERING'],
 ['SOLID-STATE NMR', 'SOLUTION NMR'],
 ['ELECTRON MICROSCOPY', 'SOLID-STATE NMR'],
 ['EPR', 'X-RAY DIFFRACTION'],
 ['NEUTRON DIFFRACTION', 'X-RAY DIFFRACTION'],
 ['FIBER DIFFRACTION', 'SOLID-STATE NMR'],
 ['ELECTRON MICROSCOPY', 'SOLUTION SCATTERING'],
 ['ELECTRON MICROSCOPY', 'SOLUTION SCATTERING'],
 ['NEUTRON DIFFRACTION', 'X-RAY DIFFRACTION'],
 ['ELECTRON MIC

In [101]:
# Sanity check: exactly one technique list was collected per distinct label.
(
    len(sorted_unique),
    len(pd.unique(train['experimentalTechnique'])),
)

(33, 33)

In [80]:
# Vocabulary of individual techniques: drop repeats, then order alphabetically.
unique_tech = list(set(techs))
unique_tech.sort()
unique_tech

['CRYO-ELECTRON MICROSCOPY',
 'ELECTRON CRYSTALLOGRAPHY',
 'ELECTRON DIFFRACTION',
 'ELECTRON MICROSCOPY',
 'EPR',
 'FIBER DIFFRACTION',
 'FLUORESCENCE TRANSFER',
 'INFRARED SPECTROSCOPY',
 'NEUTRON DIFFRACTION',
 'POWDER DIFFRACTION',
 'SOLID-STATE NMR',
 'SOLUTION NMR',
 'SOLUTION SCATTERING',
 'THEORETICAL MODEL',
 'X-RAY DIFFRACTION']

In [100]:
# Give each technique an integer id equal to its position in `unique_tech`.
rep_dict = dict(zip(unique_tech, range(len(unique_tech))))
rep_dict

{'CRYO-ELECTRON MICROSCOPY': 0,
 'ELECTRON CRYSTALLOGRAPHY': 1,
 'ELECTRON DIFFRACTION': 2,
 'ELECTRON MICROSCOPY': 3,
 'EPR': 4,
 'FIBER DIFFRACTION': 5,
 'FLUORESCENCE TRANSFER': 6,
 'INFRARED SPECTROSCOPY': 7,
 'NEUTRON DIFFRACTION': 8,
 'POWDER DIFFRACTION': 9,
 'SOLID-STATE NMR': 10,
 'SOLUTION NMR': 11,
 'SOLUTION SCATTERING': 12,
 'THEORETICAL MODEL': 13,
 'X-RAY DIFFRACTION': 14}

In [86]:
# All distinct labels plus one made-up label that matches no known technique,
# used to exercise the unknown-label path of the encoding.
include_dummy = [
    *train['experimentalTechnique'].unique(),
    "dummy strings here",
]
include_dummy

['X-RAY DIFFRACTION',
 'SOLUTION NMR',
 'FIBER DIFFRACTION',
 'ELECTRON CRYSTALLOGRAPHY',
 'ELECTRON MICROSCOPY',
 'NEUTRON DIFFRACTION',
 'SOLID-STATE NMR',
 'INFRARED SPECTROSCOPY',
 'SOLUTION NMR, THEORETICAL MODEL',
 'SOLUTION SCATTERING',
 'POWDER DIFFRACTION',
 'CRYO-ELECTRON MICROSCOPY',
 'ELECTRON DIFFRACTION',
 'X-RAY DIFFRACTION, SOLUTION NMR',
 'THEORETICAL MODEL, SOLUTION NMR',
 'FLUORESCENCE TRANSFER',
 'X-RAY DIFFRACTION, EPR',
 'SOLUTION NMR, SOLUTION SCATTERING',
 'SOLID-STATE NMR, SOLUTION NMR',
 'SOLUTION NMR, EPR',
 'SOLUTION SCATTERING, SOLUTION NMR',
 'SOLUTION NMR, SOLID-STATE NMR',
 'SOLID-STATE NMR, ELECTRON MICROSCOPY',
 'EPR, X-RAY DIFFRACTION',
 'NEUTRON DIFFRACTION, X-RAY DIFFRACTION',
 'FIBER DIFFRACTION, SOLID-STATE NMR',
 'SOLUTION SCATTERING, ELECTRON MICROSCOPY',
 'ELECTRON MICROSCOPY, SOLUTION SCATTERING',
 'X-RAY DIFFRACTION, NEUTRON DIFFRACTION',
 'ELECTRON MICROSCOPY, SOLUTION SCATTERING, SOLID-STATE NMR',
 'ELECTRON MICROSCOPY, SOLID-STATE NMR',
 '

In [104]:
# Translate every raw label (plus the dummy entry) into the list of ids of the
# techniques it contains. Ids are positions in `unique_tech`, so each list is
# in ascending order. A label containing no known technique gets the single
# out-of-vocabulary id len(unique_tech).
rep_dict_ = {}
for ut in include_dummy:
    parts = set(ut.split(', '))  # split once per label, not once per technique
    ids = [i for i, t in enumerate(unique_tech) if t in parts]
    # Fix: the fallback must look at the ids found for this label. The original
    # tested `rep_dict[ut]` (the technique -> id dict), which raises TypeError
    # for single techniques (len of an int) and KeyError for combined labels.
    rep_dict_[ut] = ids if ids else [len(unique_tech)]

In [105]:
# Display the mapping from each raw label to its list of technique ids.
rep_dict_

{'CRYO-ELECTRON MICROSCOPY': [0],
 'ELECTRON CRYSTALLOGRAPHY': [1],
 'ELECTRON DIFFRACTION': [2],
 'ELECTRON MICROSCOPY': [3],
 'ELECTRON MICROSCOPY, SOLID-STATE NMR': [3, 10],
 'ELECTRON MICROSCOPY, SOLUTION SCATTERING': [3, 12],
 'ELECTRON MICROSCOPY, SOLUTION SCATTERING, SOLID-STATE NMR': [3, 10, 12],
 'EPR, X-RAY DIFFRACTION': [4, 14],
 'FIBER DIFFRACTION': [5],
 'FIBER DIFFRACTION, SOLID-STATE NMR': [5, 10],
 'FLUORESCENCE TRANSFER': [6],
 'INFRARED SPECTROSCOPY': [7],
 'NEUTRON DIFFRACTION': [8],
 'NEUTRON DIFFRACTION, SOLUTION NMR': [8, 11],
 'NEUTRON DIFFRACTION, X-RAY DIFFRACTION': [8, 14],
 'POWDER DIFFRACTION': [9],
 'SOLID-STATE NMR': [10],
 'SOLID-STATE NMR, ELECTRON MICROSCOPY': [3, 10],
 'SOLID-STATE NMR, SOLUTION NMR': [10, 11],
 'SOLUTION NMR': [11],
 'SOLUTION NMR, EPR': [4, 11],
 'SOLUTION NMR, SOLID-STATE NMR': [10, 11],
 'SOLUTION NMR, SOLUTION SCATTERING': [11, 12],
 'SOLUTION NMR, THEORETICAL MODEL': [11, 13],
 'SOLUTION SCATTERING': [12],
 'SOLUTION SCATTERING, 