In [1]:
import numpy as np
import pandas as pd
import math
import Bio.PDB
from Bio.PDB import PDBParser
from Bio.PDB.Polypeptide import is_aa
from Bio.PDB.Polypeptide import three_to_one
import json
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import os
import pickle

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [2]:
df = pd.read_pickle('df_structand_seqfeature.pkl')

In [3]:
df

Unnamed: 0,Filename,QS_state,QS_type,Symmetry,Chain_name,Sequence,Interfaces,Area_interface,Volume_interface,Planarity_interface,Symm,aa_composition,entropy,dipeptide_composition
0,104L.pdb,dimer,Homomer,"C2,C2",A,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAA...,[TEGYKSPSLNAAMGVAGAKSRQ],1081.171052,1959.211619,71.578643,4.137568,"[36, 26, 22, 20, 0, 10, 16, 22, 2, 20, 30, 26,...",4.044585,"[10, 0, 0, 0, 0, 0, 4, 2, 0, 2, 2, 8, 0, 0, 0,..."
1,10GS.pdb,dimer,Homomer,"C2,C2",A,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,[CLYGDLTLYQSTHQQAALDMVDGGP],1639.802349,3123.466070,83.403667,4.574406,"[30, 16, 16, 26, 8, 26, 20, 36, 4, 14, 64, 24,...",4.052816,"[4, 2, 0, 4, 0, 0, 0, 2, 0, 0, 6, 0, 0, 4, 2, ..."
2,117E.pdb,dimer,Homomer,C2,A,TYTTRQIGAKNTLEYKVYIEKDGKPVSAFHDIPLYADKENNIFNMV...,[RWFPHHIGETIYFPKSIDKWFFI],1174.448255,1873.311200,34.983343,4.263864,"[22, 6, 16, 23, 1, 7, 20, 15, 6, 27, 18, 29, 2...",4.070819,"[1, 0, 0, 2, 0, 0, 0, 1, 0, 2, 3, 3, 0, 3, 2, ..."
3,11AS.pdb,dimer,Homomer,C2,A,AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLS...,[AYIQIIEVQAPILSRAVQVKVKALHKLRPDEDYQGVP],2877.227380,9656.993779,211.699849,5.829962,"[29, 19, 3, 24, 0, 20, 23, 29, 13, 14, 40, 14,...",4.025078,"[2, 0, 0, 1, 0, 1, 1, 4, 0, 2, 4, 4, 0, 1, 3, ..."
4,11BA.pdb,dimer,Homomer,C2,A,KESAAAKFERQHMDSGNSPSSSSNYCNLMMCCRKMTQGKCKPVNTF...,[SAAAKFERQHMDSGNSPSSSYNLMMCCRTFVHESVCGGV],2326.011812,6042.335876,91.126663,6.854454,"[8, 4, 7, 4, 10, 6, 5, 6, 4, 3, 2, 14, 5, 3, 5...",4.051906,"[1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108844,5CUS.pdb,dodecamer,Heteromer,C2,M,SVLTQPPSASGTPGQRVTISCSGSLSNIGLNYVSWYQQLPGTAPKL...,"[QNRRYNSSPPG, STGLSSGLNRY, GEATTGDPVSPPSRNYVSY...",1285.654901,3561.140808,74.262558,3.873111,"[17, 6, 6, 7, 4, 9, 8, 17, 2, 5, 15, 11, 0, 4,...",3.951123,"[3, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 2, ..."
108845,5CUS.pdb,dodecamer,Heteromer,C2,N,SVLTQPPSASGTPGQRVTISCSGSLSNIGLNYVSWYQQLPGTAPKL...,"[GLSSSVGLNRY, GEATTGDPVSPPKSRNYVSYLTTFVQLIPTEG...",1076.897386,2497.628782,51.495621,3.280988,"[16, 5, 6, 7, 4, 8, 6, 17, 1, 5, 15, 8, 0, 4, ...",3.926629,"[3, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 2, ..."
108846,5CUS.pdb,dodecamer,Heteromer,C2,O,SVLTQPPSASGTPGQRVTISCSGSLSNIGLNYVSWYQQLPGTAPKL...,"[SGLSSGLNRY, GEATGDPVAPPSSRNYVSYKLTEPFATSQLTAE...",1188.844358,3414.091866,69.186915,3.807491,"[18, 5, 6, 7, 4, 10, 8, 16, 0, 5, 15, 10, 0, 4...",3.914308,"[3, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 3, ..."
108847,5D17.pdb,dodecamer,Homomer,D3,A,QDATNYNSIFANRFAAFDELLSILKTKFACRVLFEETLVLPKVGRS...,"[GLGELQVS, TYSIFANDGSP, GRSRLCKDGGVSSL]",375.821023,345.402726,24.310102,2.277767,"[8, 13, 8, 10, 2, 5, 12, 13, 3, 4, 17, 10, 0, ...",3.998582,"[1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [7]:
features = []
for i in tqdm(range(len(df))):
    features.append(df['aa_composition'][i] + [df['entropy'][i]] + df['dipeptide_composition'][i]+[df['Area_interface'][i]]+[df['Volume_interface'][i]]+[df['Planarity_interface'][i]]+[df['Symm'][i]])
features = np.array(features)

100%|██████████| 108849/108849 [00:04<00:00, 22850.32it/s]


In [8]:
features.shape

(108849, 425)

In [10]:
symmetry = df['Symmetry']
qs = df['QS_state']
qt = df['QS_type']

In [11]:
def assign_others(arr, threshold):
    arr = list(arr)
    from collections import Counter
    counter = dict(Counter(arr))

    remove_entries = []
    for key in counter:
        if counter[key] <= threshold:
            remove_entries.append(key)
    
    for i in range(len(arr)):
        if arr[i] in remove_entries:
            arr[i] = 'others'
    
    return arr

In [12]:
threshold = 30
symmetry = assign_others(symmetry, threshold)
qs = assign_others(qs, threshold)
qt = assign_others(qt, threshold)

In [13]:
le1 = LabelEncoder()
symmetry = le1.fit_transform(symmetry)
le2 = LabelEncoder()
qs = le2.fit_transform(qs)
le3 = LabelEncoder()
qt = le3.fit_transform(qt)

In [14]:
print(le1.classes_)
print(le2.classes_)
print(le3.classes_)

['C1' 'C2' 'C2,C2' 'C3' 'C4' 'C5' 'C6' 'D2' 'D2,D2' 'D3' 'D4' 'D5' 'D6'
 'others']
['decamer' 'dimer' 'dodecamer' 'hexamer' 'octamer' 'pentamer' 'tetramer'
 'trimer']
['Heteromer' 'Homomer']


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x1_train, x1_test, y1_train, y1_test = train_test_split(features, symmetry, test_size=0.2, random_state=42, shuffle=True, stratify=symmetry)
x2_train, x2_test, y2_train, y2_test = train_test_split(features, qs, test_size=0.2, random_state=42, shuffle=True, stratify=qs)
x3_train, x3_test, y3_train, y3_test = train_test_split(features, qt, test_size=0.2, random_state=42, shuffle=True, stratify=qt)


In [17]:
path_to_results = 'results'
if not os.path.exists(path_to_results):
    os.makedirs(path_to_results)
    os.makedirs(os.path.join(path_to_results, 'symmetry'))
    os.makedirs(os.path.join(path_to_results, 'qs'))
    os.makedirs(os.path.join(path_to_results, 'qt'))
    

In [18]:
dirs = os.listdir(path_to_results)
for i in range(len(dirs)):
    if os.listdir(os.path.join(path_to_results, dirs[i])) == []:
        os.makedirs(os.path.join(os.path.join(path_to_results, dirs[i]), 'models'))
        os.makedirs(os.path.join(os.path.join(path_to_results, dirs[i]), 'metrics'))

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [20]:
model = LogisticRegression(random_state=1)
params = {'penalty' : ['l2'], 'C' : [0.01,0.1,1], 'class_weight' : [None, 'balanced'], 'multi_class' : ['auto']}
#params = {'penalty' : ['l2'], 'C' : [0.1], 'class_weight' : [None], 'multi_class' : ['auto']}
scoring = make_scorer(matthews_corrcoef)
n_jobs = -1
refit = True
cv = 5

model_type = 'lr'
model_name = 'Logistic Regression'

In [21]:
print('Training ' + model_name + ' For Symmetry')
lr1 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr1.fit(x1_train, y1_train)
print('Training ' + model_name + ' For Quaternary State')
lr2 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr2.fit(x2_train, y2_train)
print('Training ' + model_name + ' For Quaternary Type')
lr3 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr3.fit(x3_train, y3_train)

Training Logistic Regression For Symmetry


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training Logistic Regression For Quaternary State


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training Logistic Regression For Quaternary Type


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [25]:
print('{} Model Results'.format(model_name))
print('\n')

print('Cross-validation Results')
print('\n')
print('Symmetry Prediction')
print('Best Model is {}'.format(lr1.best_estimator_))
print('Crossvalidation score is {}'.format(lr1.best_score_))
print('Quaternary State Prediction')
print('Best Model is {}'.format(lr2.best_estimator_))
print('Crossvalidation score is {}'.format(lr2.best_score_))
print('Quaternary Type Prediction')
print('Best Model is {}'.format(lr3.best_estimator_))
print('Crossvalidation score is {}'.format(lr3.best_score_))

with open('results/symmetry/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr1, f)
with open('results/qs/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr2, f)
with open('results/qt/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr3, f)

y1_pred = lr1.predict(x1_test)
y2_pred = lr2.predict(x2_test)
y3_pred = lr3.predict(x3_test)

def metrics(y_true, y_pred):
    metrics_dict = {}
    metrics_dict['f1'] = f1_score(y_true, y_pred, average = None)
    metrics_dict['macro_f1'] = f1_score(y_true, y_pred, average='macro')
    metrics_dict['micro_f1'] = f1_score(y_true, y_pred, average='micro')
    metrics_dict['weighted_f1'] = f1_score(y_true, y_pred, average='weighted')
    metrics_dict['acc'] = accuracy_score(y_true, y_pred)
    metrics_dict['mcc'] = matthews_corrcoef(y_true, y_pred)
    return metrics_dict

lr1_metrics = metrics(y1_test, y1_pred)
lr2_metrics = metrics(y2_test, y2_pred)
lr3_metrics = metrics(y3_test, y3_pred)

with open('results/symmetry/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr1_metrics, f)
with open('results/qs/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr2_metrics, f)
with open('results/qt/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr3_metrics, f)

print('\n')
print('Results on Test Data')
print('\n')
print('For Symmetry Prediction')
print('Accuracy = {}'.format(lr1_metrics['acc']))
print('MCC = {}'.format(lr1_metrics['mcc']))
print('Class wise F1 = {}'.format(lr1_metrics['f1']))
print('Macro F1 = {}'.format(lr1_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr1_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr1_metrics['weighted_f1']))

print('\n')
print('For Quaternary State Prediction')
print('Accuracy = {}'.format(lr2_metrics['acc']))
print('MCC = {}'.format(lr2_metrics['mcc']))
print('Class wise F1 = {}'.format(lr2_metrics['f1']))
print('Macro F1 = {}'.format(lr2_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr2_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr2_metrics['weighted_f1']))

print('\n')
print('For Quaternary Type Prediction')
print('Accuracy = {}'.format(lr3_metrics['acc']))
print('MCC = {}'.format(lr3_metrics['mcc']))
print('Class wise F1 = {}'.format(lr3_metrics['f1']))
print('Macro F1 = {}'.format(lr3_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr3_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr3_metrics['weighted_f1']))


Decision Tree Model Results


Cross-validation Results


Symmetry Prediction
Best Model is LogisticRegression(C=0.01, class_weight='balanced', random_state=1)
Crossvalidation score is 0.049036894958602875
Quaternary State Prediction
Best Model is LogisticRegression(C=1, class_weight='balanced', random_state=1)
Crossvalidation score is 0.025574291028815744
Quaternary Type Prediction
Best Model is LogisticRegression(C=0.01, class_weight='balanced', random_state=1)
Crossvalidation score is 0.28959733944571575


Results on Test Data


For Symmetry Prediction
Accuracy = 0.09251263206247129
MCC = 0.03417894754742169
Class wise F1 = [0.06008584 0.24962247 0.00481762 0.02228047 0.01243232 0.01511758
 0.004158   0.05144695 0.0160901  0.0326087  0.02726388 0.01512287
 0.03174603 0.05286344]
Macro F1 = 0.04254687639626258
Micro F1 = 0.09251263206247129
Weighted F1 = 0.1405249369685618


For Quaternary State Prediction
Accuracy = 0.2277446026642168
MCC = 0.02475282768069177
Class wise F1 = [0.0519

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
model = DecisionTreeClassifier(random_state=1)
params = {'max_depth' : [10, 20, 40, -1], 'min_samples_split' : [16, 32], 'class_weight' : [None, 'balanced']}
#params = {'penalty' : ['l2'], 'C' : [0.1], 'class_weight' : [None], 'multi_class' : ['auto']}
scoring = make_scorer(matthews_corrcoef)
n_jobs = -1
refit = True
cv = 5

model_type = 'dt'
model_name = 'Decision Tree'

In [26]:
print('Training ' + model_name + ' For Symmetry')
lr1 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr1.fit(x1_train, y1_train)
print('Training ' + model_name + ' For Quaternary State')
lr2 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr2.fit(x2_train, y2_train)
print('Training ' + model_name + ' For Quaternary Type')
lr3 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr3.fit(x3_train, y3_train)

Training Decision Tree For Symmetry


20 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mtech/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mtech/.local/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/home/mtech/.local/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/home/mtech/.local/lib/python3.10/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/home/mtech/.local

Training Decision Tree For Quaternary State


20 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mtech/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mtech/.local/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/home/mtech/.local/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/home/mtech/.local/lib/python3.10/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/home/mtech/.local

Training Decision Tree For Quaternary Type


20 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mtech/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mtech/.local/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/home/mtech/.local/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/home/mtech/.local/lib/python3.10/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/home/mtech/.local

In [27]:
print('{} Model Results'.format(model_name))
print('\n')

print('Cross-validation Results')
print('\n')
print('Symmetry Prediction')
print('Best Model is {}'.format(lr1.best_estimator_))
print('Crossvalidation score is {}'.format(lr1.best_score_))
print('Quaternary State Prediction')
print('Best Model is {}'.format(lr2.best_estimator_))
print('Crossvalidation score is {}'.format(lr2.best_score_))
print('Quaternary Type Prediction')
print('Best Model is {}'.format(lr3.best_estimator_))
print('Crossvalidation score is {}'.format(lr3.best_score_))

with open('results/symmetry/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr1, f)
with open('results/qs/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr2, f)
with open('results/qt/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr3, f)

y1_pred = lr1.predict(x1_test)
y2_pred = lr2.predict(x2_test)
y3_pred = lr3.predict(x3_test)

def metrics(y_true, y_pred):
    metrics_dict = {}
    metrics_dict['f1'] = f1_score(y_true, y_pred, average = None)
    metrics_dict['macro_f1'] = f1_score(y_true, y_pred, average='macro')
    metrics_dict['micro_f1'] = f1_score(y_true, y_pred, average='micro')
    metrics_dict['weighted_f1'] = f1_score(y_true, y_pred, average='weighted')
    metrics_dict['acc'] = accuracy_score(y_true, y_pred)
    metrics_dict['mcc'] = matthews_corrcoef(y_true, y_pred)
    return metrics_dict

lr1_metrics = metrics(y1_test, y1_pred)
lr2_metrics = metrics(y2_test, y2_pred)
lr3_metrics = metrics(y3_test, y3_pred)

with open('results/symmetry/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr1_metrics, f)
with open('results/qs/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr2_metrics, f)
with open('results/qt/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr3_metrics, f)

print('\n')
print('Results on Test Data')
print('\n')
print('For Symmetry Prediction')
print('Accuracy = {}'.format(lr1_metrics['acc']))
print('MCC = {}'.format(lr1_metrics['mcc']))
print('Class wise F1 = {}'.format(lr1_metrics['f1']))
print('Macro F1 = {}'.format(lr1_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr1_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr1_metrics['weighted_f1']))

print('\n')
print('For Quaternary State Prediction')
print('Accuracy = {}'.format(lr2_metrics['acc']))
print('MCC = {}'.format(lr2_metrics['mcc']))
print('Class wise F1 = {}'.format(lr2_metrics['f1']))
print('Macro F1 = {}'.format(lr2_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr2_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr2_metrics['weighted_f1']))

print('\n')
print('For Quaternary Type Prediction')
print('Accuracy = {}'.format(lr3_metrics['acc']))
print('MCC = {}'.format(lr3_metrics['mcc']))
print('Class wise F1 = {}'.format(lr3_metrics['f1']))
print('Macro F1 = {}'.format(lr3_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr3_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr3_metrics['weighted_f1']))


Decision Tree Model Results


Cross-validation Results


Symmetry Prediction
Best Model is DecisionTreeClassifier(max_depth=40, min_samples_split=16, random_state=1)
Crossvalidation score is 0.7027515902508092
Quaternary State Prediction
Best Model is DecisionTreeClassifier(max_depth=40, min_samples_split=16, random_state=1)
Crossvalidation score is 0.5753173382748862
Quaternary Type Prediction
Best Model is DecisionTreeClassifier(max_depth=40, min_samples_split=16, random_state=1)
Crossvalidation score is 0.7659259591880728


Results on Test Data


For Symmetry Prediction
Accuracy = 0.8182361047312816
MCC = 0.7388854598638617
Class wise F1 = [0.78333333 0.87743414 0.76411846 0.75934335 0.61538462 0.72517321
 0.52892562 0.75734355 0.81818182 0.71566054 0.67549669 0.64516129
 0.32       0.73015873]
Macro F1 = 0.6939796669717965
Micro F1 = 0.8182361047312816
Weighted F1 = 0.8167189027171811


For Quaternary State Prediction
Accuracy = 0.7447404685346808
MCC = 0.6183320496595376
Class wis

In [28]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
params = {}
#params = {'penalty' : ['l2'], 'C' : [0.1], 'class_weight' : [None], 'multi_class' : ['auto']}
scoring = make_scorer(matthews_corrcoef)
n_jobs = 4
refit = True
cv = 5

model_type = 'nb'
model_name = 'Gaussian Naive Bayes'


print('Training ' + model_name + ' For Symmetry')
lr1 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr1.fit(x1_train, y1_train)
print('Training ' + model_name + ' For Quaternary State')
lr2 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr2.fit(x2_train, y2_train)
print('Training ' + model_name + ' For Quaternary Type')
lr3 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr3.fit(x3_train, y3_train)


print('{} Model Results'.format(model_name))
print('\n')

print('Cross-validation Results')
print('\n')
print('Symmetry Prediction')
print('Best Model is {}'.format(lr1.best_estimator_))
print('Crossvalidation score is {}'.format(lr1.best_score_))
print('Quaternary State Prediction')
print('Best Model is {}'.format(lr2.best_estimator_))
print('Crossvalidation score is {}'.format(lr2.best_score_))
print('Quaternary Type Prediction')
print('Best Model is {}'.format(lr3.best_estimator_))
print('Crossvalidation score is {}'.format(lr3.best_score_))

with open('results/symmetry/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr1, f)
with open('results/qs/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr2, f)
with open('results/qt/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr3, f)

y1_pred = lr1.predict(x1_test)
y2_pred = lr2.predict(x2_test)
y3_pred = lr3.predict(x3_test)

def metrics(y_true, y_pred):
    metrics_dict = {}
    metrics_dict['f1'] = f1_score(y_true, y_pred, average = None)
    metrics_dict['macro_f1'] = f1_score(y_true, y_pred, average='macro')
    metrics_dict['micro_f1'] = f1_score(y_true, y_pred, average='micro')
    metrics_dict['weighted_f1'] = f1_score(y_true, y_pred, average='weighted')
    metrics_dict['acc'] = accuracy_score(y_true, y_pred)
    metrics_dict['mcc'] = matthews_corrcoef(y_true, y_pred)
    return metrics_dict

lr1_metrics = metrics(y1_test, y1_pred)
lr2_metrics = metrics(y2_test, y2_pred)
lr3_metrics = metrics(y3_test, y3_pred)

with open('results/symmetry/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr1_metrics, f)
with open('results/qs/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr2_metrics, f)
with open('results/qt/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr3_metrics, f)

print('\n')
print('Results on Test Data')
print('\n')
print('For Symmetry Prediction')
print('Accuracy = {}'.format(lr1_metrics['acc']))
print('MCC = {}'.format(lr1_metrics['mcc']))
print('Class wise F1 = {}'.format(lr1_metrics['f1']))
print('Macro F1 = {}'.format(lr1_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr1_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr1_metrics['weighted_f1']))

print('\n')
print('For Quaternary State Prediction')
print('Accuracy = {}'.format(lr2_metrics['acc']))
print('MCC = {}'.format(lr2_metrics['mcc']))
print('Class wise F1 = {}'.format(lr2_metrics['f1']))
print('Macro F1 = {}'.format(lr2_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr2_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr2_metrics['weighted_f1']))

print('\n')
print('For Quaternary Type Prediction')
print('Accuracy = {}'.format(lr3_metrics['acc']))
print('MCC = {}'.format(lr3_metrics['mcc']))
print('Class wise F1 = {}'.format(lr3_metrics['f1']))
print('Macro F1 = {}'.format(lr3_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr3_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr3_metrics['weighted_f1']))


Training Gaussian Naive Bayes For Symmetry
Training Gaussian Naive Bayes For Quaternary State
Training Gaussian Naive Bayes For Quaternary Type
Gaussian Naive Bayes Model Results


Cross-validation Results


Symmetry Prediction
Best Model is GaussianNB()
Crossvalidation score is 0.1438124152947955
Quaternary State Prediction
Best Model is GaussianNB()
Crossvalidation score is 0.06111738488690817
Quaternary Type Prediction
Best Model is GaussianNB()
Crossvalidation score is 0.2706467862571123


Results on Test Data


For Symmetry Prediction
Accuracy = 0.2935691318327974
MCC = 0.14457143116809673
Class wise F1 = [0.35265318 0.3238558  0.27812848 0.19278937 0.10029499 0.19455253
 0.07194245 0.12140992 0.77777778 0.15693904 0.28971963 0.18412698
 0.07142857 0.47727273]
Macro F1 = 0.25663510317124133
Micro F1 = 0.2935691318327974
Weighted F1 = 0.28119637132869985


For Quaternary State Prediction
Accuracy = 0.17606798346348185
MCC = 0.056308613285166266
Class wise F1 = [0.05611574 0.3341073

In [29]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=1)
params = {'n_estimators' : [100, 400, 800], 'min_samples_split' : [16, 32], 'class_weight' : [None, 'balanced'],
           'max_features' : ['sqrt', 'log2']}
#params = {'penalty' : ['l2'], 'C' : [0.1], 'class_weight' : [None], 'multi_class' : ['auto']}
scoring = make_scorer(matthews_corrcoef)
n_jobs = 4
refit = True
cv = 5

model_type = 'rf'
model_name = 'Random Forest'


print('Training ' + model_name + ' For Symmetry')
lr1 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr1.fit(x1_train, y1_train)
print('Training ' + model_name + ' For Quaternary State')
lr2 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr2.fit(x2_train, y2_train)
print('Training ' + model_name + ' For Quaternary Type')
lr3 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr3.fit(x3_train, y3_train)


print('{} Model Results'.format(model_name))
print('\n')

print('Cross-validation Results')
print('\n')
print('Symmetry Prediction')
print('Best Model is {}'.format(lr1.best_estimator_))
print('Crossvalidation score is {}'.format(lr1.best_score_))
print('Quaternary State Prediction')
print('Best Model is {}'.format(lr2.best_estimator_))
print('Crossvalidation score is {}'.format(lr2.best_score_))
print('Quaternary Type Prediction')
print('Best Model is {}'.format(lr3.best_estimator_))
print('Crossvalidation score is {}'.format(lr3.best_score_))

with open('results/symmetry/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr1, f)
with open('results/qs/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr2, f)
with open('results/qt/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr3, f)

y1_pred = lr1.predict(x1_test)
y2_pred = lr2.predict(x2_test)
y3_pred = lr3.predict(x3_test)

def metrics(y_true, y_pred):
    metrics_dict = {}
    metrics_dict['f1'] = f1_score(y_true, y_pred, average = None)
    metrics_dict['macro_f1'] = f1_score(y_true, y_pred, average='macro')
    metrics_dict['micro_f1'] = f1_score(y_true, y_pred, average='micro')
    metrics_dict['weighted_f1'] = f1_score(y_true, y_pred, average='weighted')
    metrics_dict['acc'] = accuracy_score(y_true, y_pred)
    metrics_dict['mcc'] = matthews_corrcoef(y_true, y_pred)
    return metrics_dict

lr1_metrics = metrics(y1_test, y1_pred)
lr2_metrics = metrics(y2_test, y2_pred)
lr3_metrics = metrics(y3_test, y3_pred)

with open('results/symmetry/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr1_metrics, f)
with open('results/qs/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr2_metrics, f)
with open('results/qt/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr3_metrics, f)

print('\n')
print('Results on Test Data')
print('\n')
print('For Symmetry Prediction')
print('Accuracy = {}'.format(lr1_metrics['acc']))
print('MCC = {}'.format(lr1_metrics['mcc']))
print('Class wise F1 = {}'.format(lr1_metrics['f1']))
print('Macro F1 = {}'.format(lr1_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr1_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr1_metrics['weighted_f1']))

print('\n')
print('For Quaternary State Prediction')
print('Accuracy = {}'.format(lr2_metrics['acc']))
print('MCC = {}'.format(lr2_metrics['mcc']))
print('Class wise F1 = {}'.format(lr2_metrics['f1']))
print('Macro F1 = {}'.format(lr2_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr2_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr2_metrics['weighted_f1']))

print('\n')
print('For Quaternary Type Prediction')
print('Accuracy = {}'.format(lr3_metrics['acc']))
print('MCC = {}'.format(lr3_metrics['mcc']))
print('Class wise F1 = {}'.format(lr3_metrics['f1']))
print('Macro F1 = {}'.format(lr3_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr3_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr3_metrics['weighted_f1']))




Training Random Forest For Symmetry




Training Random Forest For Quaternary State




Training Random Forest For Quaternary Type




Random Forest Model Results


Cross-validation Results


Symmetry Prediction
Best Model is RandomForestClassifier(class_weight='balanced', min_samples_split=16,
                       n_estimators=400, random_state=1)
Crossvalidation score is 0.8349906483292987
Quaternary State Prediction
Best Model is RandomForestClassifier(class_weight='balanced', min_samples_split=16,
                       n_estimators=800, random_state=1)
Crossvalidation score is 0.7227110223866658
Quaternary Type Prediction
Best Model is RandomForestClassifier(class_weight='balanced', min_samples_split=16,
                       n_estimators=800, random_state=1)
Crossvalidation score is 0.8573120014300315


Results on Test Data


For Symmetry Prediction
Accuracy = 0.9028020211299954
MCC = 0.8612344563972448
Class wise F1 = [0.88120805 0.91843627 0.87155346 0.93333333 0.88888889 0.92982456
 0.89705882 0.8863685  0.85714286 0.89053498 0.85314685 0.76744186
 0.91525424 0.94285714]
Macro F1 = 0.8880749875371746
Micro

In [None]:
from sklearn import svm

model = svm.SVC(random_state=1)
params = {'C' : [0.01, .1, 1], 'kernel' : ['linear', 'rbf'], 'class_weight' : [None, 'balanced']}
#params = {'penalty' : ['l2'], 'C' : [0.1], 'class_weight' : [None], 'multi_class' : ['auto']}
scoring = make_scorer(matthews_corrcoef)
n_jobs = 4
refit = True
cv = 5

model_type = 'svm'
model_name = 'Support Vector Machine'


print('Training ' + model_name + ' For Symmetry')
lr1 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr1.fit(x1_train, y1_train)
print('Training ' + model_name + ' For Quaternary State')
lr2 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr2.fit(x2_train, y2_train)
print('Training ' + model_name + ' For Quaternary Type')
lr3 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr3.fit(x3_train, y3_train)


print('{} Model Results'.format(model_name))
print('\n')

print('Cross-validation Results')
print('\n')
print('Symmetry Prediction')
print('Best Model is {}'.format(lr1.best_estimator_))
print('Crossvalidation score is {}'.format(lr1.best_score_))
print('Quaternary State Prediction')
print('Best Model is {}'.format(lr2.best_estimator_))
print('Crossvalidation score is {}'.format(lr2.best_score_))
print('Quaternary Type Prediction')
print('Best Model is {}'.format(lr3.best_estimator_))
print('Crossvalidation score is {}'.format(lr3.best_score_))

with open('results/symmetry/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr1, f)
with open('results/qs/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr2, f)
with open('results/qt/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr3, f)

y1_pred = lr1.predict(x1_test)
y2_pred = lr2.predict(x2_test)
y3_pred = lr3.predict(x3_test)

def metrics(y_true, y_pred):
    metrics_dict = {}
    metrics_dict['f1'] = f1_score(y_true, y_pred, average = None)
    metrics_dict['macro_f1'] = f1_score(y_true, y_pred, average='macro')
    metrics_dict['micro_f1'] = f1_score(y_true, y_pred, average='micro')
    metrics_dict['weighted_f1'] = f1_score(y_true, y_pred, average='weighted')
    metrics_dict['acc'] = accuracy_score(y_true, y_pred)
    metrics_dict['mcc'] = matthews_corrcoef(y_true, y_pred)
    return metrics_dict

lr1_metrics = metrics(y1_test, y1_pred)
lr2_metrics = metrics(y2_test, y2_pred)
lr3_metrics = metrics(y3_test, y3_pred)

with open('results/symmetry/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr1_metrics, f)
with open('results/qs/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr2_metrics, f)
with open('results/qt/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr3_metrics, f)

print('\n')
print('Results on Test Data')
print('\n')
print('For Symmetry Prediction')
print('Accuracy = {}'.format(lr1_metrics['acc']))
print('MCC = {}'.format(lr1_metrics['mcc']))
print('Class wise F1 = {}'.format(lr1_metrics['f1']))
print('Macro F1 = {}'.format(lr1_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr1_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr1_metrics['weighted_f1']))

print('\n')
print('For Quaternary State Prediction')
print('Accuracy = {}'.format(lr2_metrics['acc']))
print('MCC = {}'.format(lr2_metrics['mcc']))
print('Class wise F1 = {}'.format(lr2_metrics['f1']))
print('Macro F1 = {}'.format(lr2_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr2_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr2_metrics['weighted_f1']))

print('\n')
print('For Quaternary Type Prediction')
print('Accuracy = {}'.format(lr3_metrics['acc']))
print('MCC = {}'.format(lr3_metrics['mcc']))
print('Class wise F1 = {}'.format(lr3_metrics['f1']))
print('Macro F1 = {}'.format(lr3_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr3_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr3_metrics['weighted_f1']))




In [None]:
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(random_state=1)
params = {'n_estimators' : [100, 400, 800], 'learning_rate' : [0.1, 1]}
#params = {'penalty' : ['l2'], 'C' : [0.1], 'class_weight' : [None], 'multi_class' : ['auto']}
scoring = make_scorer(matthews_corrcoef)
n_jobs = 4
refit = True
cv = 5

model_type = 'AdaBoost'
model_name = 'AdaBoost Classifier'


print('Training ' + model_name + ' For Symmetry')
lr1 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr1.fit(x1_train, y1_train)
print('Training ' + model_name + ' For Quaternary State')
lr2 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr2.fit(x2_train, y2_train)
print('Training ' + model_name + ' For Quaternary Type')
lr3 = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv)
lr3.fit(x3_train, y3_train)


print('{} Model Results'.format(model_name))
print('\n')

print('Cross-validation Results')
print('\n')
print('Symmetry Prediction')
print('Best Model is {}'.format(lr1.best_estimator_))
print('Crossvalidation score is {}'.format(lr1.best_score_))
print('Quaternary State Prediction')
print('Best Model is {}'.format(lr2.best_estimator_))
print('Crossvalidation score is {}'.format(lr2.best_score_))
print('Quaternary Type Prediction')
print('Best Model is {}'.format(lr3.best_estimator_))
print('Crossvalidation score is {}'.format(lr3.best_score_))

with open('results/symmetry/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr1, f)
with open('results/qs/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr2, f)
with open('results/qt/models/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr3, f)

y1_pred = lr1.predict(x1_test)
y2_pred = lr2.predict(x2_test)
y3_pred = lr3.predict(x3_test)

def metrics(y_true, y_pred):
    metrics_dict = {}
    metrics_dict['f1'] = f1_score(y_true, y_pred, average = None)
    metrics_dict['macro_f1'] = f1_score(y_true, y_pred, average='macro')
    metrics_dict['micro_f1'] = f1_score(y_true, y_pred, average='micro')
    metrics_dict['weighted_f1'] = f1_score(y_true, y_pred, average='weighted')
    metrics_dict['acc'] = accuracy_score(y_true, y_pred)
    metrics_dict['mcc'] = matthews_corrcoef(y_true, y_pred)
    return metrics_dict

lr1_metrics = metrics(y1_test, y1_pred)
lr2_metrics = metrics(y2_test, y2_pred)
lr3_metrics = metrics(y3_test, y3_pred)

with open('results/symmetry/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr1_metrics, f)
with open('results/qs/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr2_metrics, f)
with open('results/qt/metrics/{}.pkl'.format(model_type), 'wb') as f:
    pickle.dump(lr3_metrics, f)

print('\n')
print('Results on Test Data')
print('\n')
print('For Symmetry Prediction')
print('Accuracy = {}'.format(lr1_metrics['acc']))
print('MCC = {}'.format(lr1_metrics['mcc']))
print('Class wise F1 = {}'.format(lr1_metrics['f1']))
print('Macro F1 = {}'.format(lr1_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr1_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr1_metrics['weighted_f1']))

print('\n')
print('For Quaternary State Prediction')
print('Accuracy = {}'.format(lr2_metrics['acc']))
print('MCC = {}'.format(lr2_metrics['mcc']))
print('Class wise F1 = {}'.format(lr2_metrics['f1']))
print('Macro F1 = {}'.format(lr2_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr2_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr2_metrics['weighted_f1']))

print('\n')
print('For Quaternary Type Prediction')
print('Accuracy = {}'.format(lr3_metrics['acc']))
print('MCC = {}'.format(lr3_metrics['mcc']))
print('Class wise F1 = {}'.format(lr3_metrics['f1']))
print('Macro F1 = {}'.format(lr3_metrics['macro_f1']))
print('Micro F1 = {}'.format(lr3_metrics['micro_f1']))
print('Weighted F1 = {}'.format(lr3_metrics['weighted_f1']))


