<a href="https://colab.research.google.com/github/vdhulappanavar/bioinformatics/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load the data
pdb_data_no_dups = pd.read_csv('./pdb_data_no_dups.csv')
pdb_data_seq = pd.read_csv('./pdb_data_seq.csv')

# Merge the datasets on 'structureId'
merged_data = pd.merge(pdb_data_no_dups, pdb_data_seq, on='structureId')

# Ensure 'sequence' column is of type str and handle missing values
merged_data['sequence'] = merged_data['sequence'].fillna('').astype(str)

# Feature Extraction: Create 'sequence_length' by calculating the length of each sequence
if 'sequence' in merged_data.columns:
    merged_data['sequence_length'] = merged_data['sequence'].apply(len)
else:
    raise KeyError("The 'sequence' column is missing from the dataset")

# Convert 'residueCount_x' to numeric (this assumes 'residueCount_x' is the correct column name)
merged_data['residueCount_x'] = pd.to_numeric(merged_data['residueCount_x'], errors='coerce')

# Handle any remaining NaN values in numeric columns
merged_data.fillna(0, inplace=True)

# Example: Calculate amino acid composition
def amino_acid_composition(sequence):
    aa_dict = {aa: sequence.count(aa) for aa in set(sequence)}
    return aa_dict

merged_data['aa_composition'] = merged_data['sequence'].apply(amino_acid_composition)

# Feature Scaling
scaler = StandardScaler()
X = scaler.fit_transform(merged_data[['sequence_length', 'residueCount_x']])

# Target Variable
y = merged_data['classification']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Prediction and Evaluation
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


TypeError: '<' not supported between instances of 'str' and 'int'

In [4]:
# Check the data types of all columns
print(merged_data.dtypes)

# Optionally, inspect the first few rows to get an overview
print(merged_data.head())

# Identify non-numeric entries in the columns of interest
for col in ['sequence_length', 'residueCount_x']:
    non_numeric = merged_data[pd.to_numeric(merged_data[col], errors='coerce').isna()]
    if not non_numeric.empty:
        print(f"Non-numeric values found in column {col}:")
        print(non_numeric[[col]])


structureId                  object
classification               object
experimentalTechnique        object
macromoleculeType_x          object
residueCount_x                int64
resolution                  float64
structureMolecularWeight    float64
crystallizationMethod        object
crystallizationTempK        float64
densityMatthews             float64
densityPercentSol           float64
pdbxDetails                  object
phValue                     float64
publicationYear             float64
chainId                      object
sequence                     object
residueCount_y                int64
macromoleculeType_y          object
sequence_length               int64
aa_composition               object
dtype: object
  structureId    classification experimentalTechnique macromoleculeType_x  \
0        100D    DNA-RNA HYBRID     X-RAY DIFFRACTION      DNA/RNA Hybrid   
1        100D    DNA-RNA HYBRID     X-RAY DIFFRACTION      DNA/RNA Hybrid   
2        101D               DNA    

In [5]:
# Convert the columns to numeric, forcing any non-convertible values to NaN
merged_data['residueCount_x'] = pd.to_numeric(merged_data['residueCount_x'], errors='coerce')
merged_data['sequence_length'] = pd.to_numeric(merged_data['sequence_length'], errors='coerce')

# Handle NaN values after conversion
merged_data.fillna(0, inplace=True)

# Now proceed with the rest of the analysis


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Feature Scaling
scaler = StandardScaler()
X = scaler.fit_transform(merged_data[['sequence_length', 'residueCount_x']])

# Target Variable
y = merged_data['classification']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Prediction and Evaluation
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


TypeError: '<' not supported between instances of 'str' and 'int'

In [6]:
# Check for unexpected string values in numeric columns
# Convert any placeholders to NaN, then handle them

columns_to_check = ['phValue', 'crystallizationTempK', 'densityMatthews', 'densityPercentSol']

for col in columns_to_check:
    merged_data[col] = pd.to_numeric(merged_data[col], errors='coerce')

# Handle NaN values that arise after coercion
merged_data.fillna(0, inplace=True)

# Now proceed with the rest of the analysis
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Feature Scaling
scaler = StandardScaler()
X = scaler.fit_transform(merged_data[['sequence_length', 'residueCount_x']])

# Target Variable
y = merged_data['classification']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Prediction and Evaluation
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


TypeError: '<' not supported between instances of 'str' and 'int'

In [7]:
# Inspect unique values in the target variable
print(y_train.unique())


['OXIDOREDUCTASE' 'SIGNALING PROTEIN' 'MEMBRANE PROTEIN, LIPID TRANSPORT'
 ... 'CHOLESTEROL-BINDING PROTEIN' 'VIRAL PROTEIN/WINGED HELIX'
 'FUNGICIDE']


In [16]:
from sklearn.preprocessing import LabelEncoder

# Encoding categorical target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y_train)

# Fit the model using encoded target variable
rf.fit(X_train, y_encoded)

# If using y_test for evaluation, ensure it is also encoded the same way
y_test_encoded = le.transform(y_test)

# Prediction and Evaluation
y_pred = rf.predict(X_test)
y_pred_decoded = le.inverse_transform(y_pred)  # Decode the predictions back to original labels
print("Accuracy:", accuracy_score(y_test_encoded, y_pred))
print(classification_report(y_test_encoded, y_pred))


TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['int', 'str']

In [17]:
# Convert all values in y_train and y_test to strings
y_train = y_train.astype(str)
y_test = y_test.astype(str)

# Now use LabelEncoder to encode the categorical labels
from sklearn.preprocessing import LabelEncoder

# Encoding categorical target variable
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Proceed with model training
rf.fit(X_train, y_train_encoded)

# Prediction and Evaluation
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test_encoded, y_pred))
print(classification_report(y_test_encoded, y_pred))


ValueError: y contains previously unseen labels: 'TRANSCARBAMYLASE'

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Combine y_train and y_test for fitting the encoder
y_combined = pd.concat([y_train, y_test])

# Fit the LabelEncoder on the combined data
le = LabelEncoder()
le.fit(y_combined)

# Transform y_train and y_test separately
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

# Now proceed with model training
rf.fit(X_train, y_train_encoded)

# Prediction and Evaluation
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test_encoded, y_pred))
print(classification_report(y_test_encoded, y_pred))


In [8]:
# Sample 10% of the data
sampled_data = merged_data.sample(frac=0.1, random_state=42)

# Proceed with the sampled data
X = sampled_data[['sequence_length', 'residueCount_x']]
y = sampled_data['classification']


In [9]:
# Downcast numerical columns to save memory
merged_data['residueCount_x'] = pd.to_numeric(merged_data['residueCount_x'], downcast='integer')
merged_data['resolution'] = pd.to_numeric(merged_data['resolution'], downcast='float')

# Check memory usage
print(merged_data.memory_usage(deep=True))


Index                             128
structureId                  28740089
classification               33675773
experimentalTechnique        34893444
macromoleculeType_x          29334644
residueCount_x                1884596
resolution                    1884596
structureMolecularWeight      3769192
crystallizationMethod        31258762
crystallizationTempK          3769192
densityMatthews               3769192
densityPercentSol             3769192
pdbxDetails                  61748955
phValue                       3769192
publicationYear               3769192
chainId                      27372132
sequence                    143441710
residueCount_y                3769192
macromoleculeType_y          29334644
sequence_length               3769192
aa_composition              288228248
dtype: int64


In [None]:
pip install dask-ml dask[complete]


Collecting dask-ml
  Downloading dask_ml-2024.4.4-py3-none-any.whl.metadata (5.9 kB)
Collecting dask-glm>=0.2.0 (from dask-ml)
  Downloading dask_glm-0.3.2-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting lz4>=4.3.2 (from dask[complete])
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting sparse>=0.7.0 (from dask-glm>=0.2.0->dask-ml)
  Downloading sparse-0.15.4-py2.py3-none-any.whl.metadata (4.5 kB)
Collecting dask-expr<1.2,>=1.1 (from dask[array,dataframe]>=2.4.0->dask-ml)
  Downloading dask_expr-1.1.10-py3-none-any.whl.metadata (2.5 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.9-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_ml-2024.4.4-py3-none-any.whl (149 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.8/149.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDo

In [None]:
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split
from dask_ml.ensemble import RandomForestClassifier

# Load your large CSV file in chunks using Dask
ddf = dd.read_csv('/path/to/your/large_file.csv')

# Split the data
X = ddf[['sequence_length', 'residueCount_x']]
y = ddf['classification']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Dask-ML's RandomForestClassifier
clf = RandomForestClassifier()

# Fit the model (this will be done in parallel, in chunks)
clf.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = clf.predict(X_test)
print(clf.score(X_test, y_test))


ImportError: cannot import name 'RandomForestClassifier' from 'dask_ml.ensemble' (/usr/local/lib/python3.10/dist-packages/dask_ml/ensemble/__init__.py)

In [None]:
pip install dask[complete] scikit-learn




In [None]:
import dask.dataframe as dd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your large CSV file in chunks using Dask
pdb_data_no_dups = pd.read_csv('./pdb_data_no_dups.csv')
pdb_data_seq = pd.read_csv('./pdb_data_seq.csv')

# Merge the datasets on 'structureId'
merged_data = pd.merge(pdb_data_no_dups, pdb_data_seq, on='structureId')
ddf = merged_data

# Define your features and target
X = ddf[['sequence_length', 'residueCount_x']]
y = ddf['classification']

# Initialize the model
model = SGDClassifier(loss='log', random_state=42)

# Iterate over chunks of data
for X_chunk, y_chunk in ddf.to_delayed().compute():
    # Convert chunk to numpy arrays
    X_chunk = X_chunk.to_numpy()
    y_chunk = y_chunk.to_numpy()

    # Incremental fitting of the model
    model.partial_fit(X_chunk, y_chunk, classes=np.unique(y.compute()))

# Predict on the entire test set (using Dask for scalability)
y_pred = model.predict(X.compute())

# Evaluate the model
print("Accuracy:", accuracy_score(y.compute(), y_pred))
print(classification_report(y.compute(), y_pred))


KeyError: "['sequence_length'] not in index"

In [None]:
import dask.dataframe as dd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your large CSV file in chunks using Dask
ddf = merged_data
# Ensure 'sequence' column is of type str and create 'sequence_length' column
ddf['sequence'] = ddf['sequence'].astype(str)
ddf['sequence_length'] = ddf['sequence'].apply(len, meta=('sequence', 'int64'))

# Inspect the columns to ensure 'sequence_length' exists
print(ddf.columns)

# Define your features and target
X = ddf[['sequence_length', 'residueCount_x']]
y = ddf['classification']

# Initialize the model
model = SGDClassifier(loss='log', random_state=42)

# Iterate over chunks of data
for X_chunk, y_chunk in ddf.to_delayed().compute():
    # Convert chunk to numpy arrays
    X_chunk = X_chunk.to_numpy()
    y_chunk = y_chunk.to_numpy()

    # Incremental fitting of the model
    model.partial_fit(X_chunk, y_chunk, classes=np.unique(y.compute()))

# Predict on the entire test set (using Dask for scalability)
y_pred = model.predict(X.compute())

# Evaluate the model
print("Accuracy:", accuracy_score(y.compute(), y_pred))
print(classification_report(y.compute(), y_pred))


TypeError: len() takes no keyword arguments

In [None]:
import dask.dataframe as dd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your large CSV file in chunks using Dask
ddf = merged_data

# Ensure 'sequence' column is of type str and create 'sequence_length' column
ddf['sequence'] = ddf['sequence'].astype(str)
ddf['sequence_length'] = ddf['sequence'].apply(lambda x: len(x), meta=('sequence_length', 'int64'))

# Inspect the columns to ensure 'sequence_length' exists
print(ddf.columns)

# Define your features and target
X = ddf[['sequence_length', 'residueCount_x']]
y = ddf['classification']

# Initialize the m


TypeError: <lambda>() got an unexpected keyword argument 'meta'

In [1]:
import dask.dataframe as dd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your large CSV file in chunks using Dask
ddf = merged_data

# Ensure 'sequence' column is of type str
ddf['sequence'] = ddf['sequence'].astype(str)

# Create 'sequence_length' column using map_partitions to apply len function
ddf['sequence_length'] = ddf['sequence'].map_partitions(lambda df: df.apply(len), meta=('sequence_length', 'int64'))

# Inspect the columns to ensure 'sequence_length' exists
print(ddf.columns)

# Define your features and target
X = ddf[['sequence_length', 'residueCount_x']]
y = ddf['classification']

# Initialize the model
model = SGDClassifier(loss='log', random_state=42)

# Iterate over chunks of data
for X_chunk, y_chunk in zip(X.to_delayed(), y.to_delayed()):
    X_chunk = X_chunk.compute()
    y_chunk = y_chunk.compute()

    # Incremental fitting of the model
    model.partial_fit(X_chunk, y_chunk, classes=y.unique().compute())

# Predict on the entire test set (using Dask for scalability)
y_pred = model.predict(X.compute())

# Evaluate the model
print("Accuracy:", accuracy_score(y.compute(), y_pred))
print(classification_report(y.compute(), y_pred))


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



NameError: name 'merged_data' is not defined

In [None]:
import dask.dataframe as dd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your large CSV file in chunks using Dask
ddf = merged_data

# Ensure 'sequence' column is of type str
ddf['sequence'] = ddf['sequence'].astype(str)

# Create 'sequence_length' column by applying len() using map
ddf['sequence_length'] = ddf['sequence'].map(len)

# Inspect the columns to ensure 'sequence_length' exists
print(ddf.columns)

# Define your features and target
X = ddf[['sequence_length', 'residueCount_x']]
y = ddf['classification']

# Initialize the model
model = SGDClassifier(loss='log', random_state=42)

# Iterate over chunks of data
for X_chunk, y_chunk in zip(X.to_delayed(), y.to_delayed()):
    X_chunk = X_chunk.compute()
    y_chunk = y_chunk.compute()

    # Incremental fitting of the model
    model.partial_fit(X_chunk, y_chunk, classes=y.unique().compute())

# Predict on the entire test set (using Dask for scalability)
y_pred = model.predict(X.compute())

# Evaluate the model
print("Accuracy:", accuracy_score(y.compute(), y_pred))
print(classification_report(y.compute(), y_pred))


Index(['structureId', 'classification', 'experimentalTechnique',
       'macromoleculeType_x', 'residueCount_x', 'resolution',
       'structureMolecularWeight', 'crystallizationMethod',
       'crystallizationTempK', 'densityMatthews', 'densityPercentSol',
       'pdbxDetails', 'phValue', 'publicationYear', 'chainId', 'sequence',
       'residueCount_y', 'macromoleculeType_y', 'sequence_length'],
      dtype='object')


AttributeError: 'DataFrame' object has no attribute 'to_delayed'

In [None]:
import dask.dataframe as dd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your large CSV file in chunks using Dask
ddf = merged_data

# Ensure 'sequence' column is of type str
ddf['sequence'] = ddf['sequence'].astype(str)

# Create 'sequence_length' column by applying len() using map
ddf['sequence_length'] = ddf['sequence'].map(len)

# Convert to Pandas DataFrame in chunks and train incrementally
model = SGDClassifier(loss='log', random_state=42)

# Iterate over Dask DataFrame in chunks and convert to Pandas DataFrame
for i in range(0, len(ddf), 10000):
    X_chunk = ddf[['sequence_length', 'residueCount_x']].iloc[i:i+10000].compute()
    y_chunk = ddf['classification'].iloc[i:i+10000].compute()

    if i == 0:
        model.partial_fit(X_chunk, y_chunk, classes=y_chunk.unique())
    else:
        model.partial_fit(X_chunk, y_chunk)

# Predict and evaluate on the entire dataset
X = ddf[['sequence_length', 'residueCount_x']].compute()
y = ddf['classification'].compute()

y_pred = model.predict(X)
print("Accuracy:", accuracy_score(y, y_pred))
print(classification_report(y, y_pred))


AttributeError: 'DataFrame' object has no attribute 'compute'

In [None]:
import dask.dataframe as dd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your large CSV file using Dask
ddf = merged_data

# Ensure 'sequence' column is of type str and create 'sequence_length'
ddf['sequence'] = ddf['sequence'].astype(str)
# ddf['sequence_length'] = ddf['sequence'].apply(len, meta=('sequence_length', 'int64'))

ddf['sequence_length'] = ddf['sequence'].map(len)

# Convert to Pandas in chunks
chunks = ddf.to_delayed()

model = SGDClassifier(loss='log', random_state=42)

# Iterate through each chunk
for chunk in chunks:
    chunk = chunk.compute()
    X_chunk = chunk[['sequence_length', 'residueCount_x']]
    y_chunk = chunk['classification']

    if model.coef_ is None:
        model.partial_fit(X_chunk, y_chunk, classes=y_chunk.unique())
    else:
        model.partial_fit(X_chunk, y_chunk)

# Evaluate the model
X_full = ddf[['sequence_length', 'residueCount_x']].compute()
y_full = ddf['classification'].compute()

y_pred = model.predict(X_full)
print("Accuracy:", accuracy_score(y_full, y_pred))
print(classification_report(y_full, y_pred))


AttributeError: 'DataFrame' object has no attribute 'to_delayed'

In [None]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'merged_data' is your large Pandas DataFrame
df = merged_data

# Initialize the model
model = SGDClassifier(loss='log', random_state=42)

# Process data in chunks
chunksize = 10000
for start in range(0, len(df), chunksize):
    # Extract the chunk
    chunk = df.iloc[start:start + chunksize]

    # Prepare the features and target
    X_chunk = chunk[['sequence_length', 'residueCount_x']]
    y_chunk = chunk['classification']

    # Fit the model incrementally
    if start == 0:
        model.partial_fit(X_chunk, y_chunk, classes=y_chunk.unique())
    else:
        model.partial_fit(X_chunk, y_chunk)

# Evaluate the model on the entire dataset
X_full = df[['sequence_length', 'residueCount_x']]
y_full = df['classification']

y_pred = model.predict(X_full)
print("Accuracy:", accuracy_score(y_full, y_pred))
print(classification_report(y_full, y_pred))


InvalidParameterError: The 'loss' parameter of SGDClassifier must be a str among {'squared_hinge', 'squared_error', 'modified_huber', 'log_loss', 'perceptron', 'hinge', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'}. Got 'log' instead.

In [None]:
import dask.dataframe as dd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your large CSV file using Dask
ddf = merged_data

# Ensure 'sequence' column is of type str and create 'sequence_length'
ddf['sequence'] = ddf['sequence'].astype(str)
ddf['sequence_length'] = ddf['sequence'].map(len)

# Convert to Dask Delayed objects for chunk processing
chunks = ddf.to_delayed()

model = SGDClassifier(loss='log', random_state=42)

# Iterate through each chunk
for chunk in chunks:
    chunk = chunk.compute()
    X_chunk = chunk[['sequence_length', 'residueCount_x']]
    y_chunk = chunk['classification']

    if 'coef_' not in dir(model):
        model.partial_fit(X_chunk, y_chunk, classes=y_chunk.unique())
    else:
        model.partial_fit(X_chunk, y_chunk)

# Evaluate the model
X_full = ddf[['sequence_length', 'residueCount_x']].compute()
y_full = ddf['classification'].compute()

y_pred = model.predict(X_full)
print("Accuracy:", accuracy_score(y_full, y_pred))
print(classification_report(y_full, y_pred))


AttributeError: 'DataFrame' object has no attribute 'to_delayed'

In [None]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'merged_data' is your original large Pandas DataFrame

# Reduce the dataset size by sampling 10% of the data
sampled_data = merged_data.sample(frac=0.1, random_state=42)

# Ensure 'sequence' column is of type str and create 'sequence_length'
sampled_data['sequence'] = sampled_data['sequence'].astype(str)
sampled_data['sequence_length'] = sampled_data['sequence'].apply(len)

# Split the data into features and target
X = sampled_data[['sequence_length', 'residueCount_x']]
y = sampled_data['classification']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = SGDClassifier(loss='log', random_state=42)
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


InvalidParameterError: The 'loss' parameter of SGDClassifier must be a str among {'squared_hinge', 'squared_error', 'modified_huber', 'log_loss', 'perceptron', 'hinge', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'}. Got 'log' instead.

In [None]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'merged_data' is your original large Pandas DataFrame

# Reduce the dataset size by sampling 10% of the data
sampled_data = merged_data.sample(frac=0.1, random_state=42)

# Ensure 'sequence' column is of type str and create 'sequence_length'
sampled_data['sequence'] = sampled_data['sequence'].astype(str)
sampled_data['sequence_length'] = sampled_data['sequence'].apply(len)

# Split the data into features and target
X = sampled_data[['sequence_length', 'residueCount_x']]
y = sampled_data['classification']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = SGDClassifier(loss='log_loss', random_state=42)  # Corrected loss parameter
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.15483391701156743


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                    precision    recall  f1-score   support

                     ACETYLCHOLINE BINDING PROTEIN       0.00      0.00      0.00         1
                 ACETYLCHOLINE RECEPTOR ANTAGONIST       0.00      0.00      0.00         1
                     ACETYLCHOLINE-BINDING PROTEIN       0.00      0.00      0.00         2
                             ACTIN BINDING PROTEIN       0.00      0.00      0.00         1
                             ACTIN-BINDING PROTEIN       0.00      0.00      0.00         1
                                   ACYLTRANSFERASE       0.00      0.00      0.00         1
                                  ADHESION PROTEIN       0.00      0.00      0.00         1
                              ALKALINE PHOSPHATASE       0.00      0.00      0.00         1
                                          ALLERGEN       0.00      0.00      0.00         2
                                  AMIDOTRANSFERASE       0.00      0.00      0.

In [None]:
from sklearn.svm import SVC

sampled_data = merged_data.sample(frac=0.1, random_state=42)
sampled_data['sequence'] = sampled_data['sequence'].astype(str)
sampled_data['sequence_length'] = sampled_data['sequence'].apply(len)
X = sampled_data[['sequence_length', 'residueCount_x']]
y = sampled_data['classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


In [1]:
pip install biopython


Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [2]:
from Bio.Blast import NCBIWWW, NCBIXML
import pandas as pd

# Load the data
pdb_data_seq = pd.read_csv('pdb_data_seq.csv')

# Extract protein sequences
protein_sequences = pdb_data_seq[pdb_data_seq['macromoleculeType'] == 'Protein']

# Function to run BLASTP
def run_blastp(sequence, sequence_id):
    result_handle = NCBIWWW.qblast("blastp", "nr", sequence)
    with open(f"blast_results_{sequence_id}.xml", "w") as out_handle:
        out_handle.write(result_handle.read())
    result_handle.close()

# Run BLASTP for each protein sequence
for index, row in protein_sequences.iterrows():
    run_blastp(row['sequence'], row['structureId'])

print("BLASTP search completed for protein sequences.")


KeyboardInterrupt: 

In [3]:
from Bio.Blast import NCBIWWW, NCBIXML
import pandas as pd

# Load the data
pdb_data_seq = pd.read_csv('pdb_data_seq.csv')

# Extract protein sequences
protein_sequences = pdb_data_seq[pdb_data_seq['macromoleculeType'] == 'Protein'].head(10)

# Function to run BLASTP
def run_blastp(sequence, sequence_id):
    result_handle = NCBIWWW.qblast("blastp", "nr", sequence)
    with open(f"blast_results_{sequence_id}.xml", "w") as out_handle:
        out_handle.write(result_handle.read())
    result_handle.close()

# Run BLASTP for each of the first 10 protein sequences
for index, row in protein_sequences.iterrows():
    print(f"Running BLASTP for sequence ID: {row['structureId']}")
    run_blastp(row['sequence'], row['structureId'])

print("BLASTP search completed for the first 10 protein sequences.")


Running BLASTP for sequence ID: 101M
Running BLASTP for sequence ID: 102L
Running BLASTP for sequence ID: 102M
Running BLASTP for sequence ID: 103L
Running BLASTP for sequence ID: 103M
Running BLASTP for sequence ID: 104L
Running BLASTP for sequence ID: 104L
Running BLASTP for sequence ID: 104M
Running BLASTP for sequence ID: 105M
Running BLASTP for sequence ID: 106M
BLASTP search completed for the first 10 protein sequences.


In [6]:
!apt-get update
!apt-get install -y ncbi-blast+


0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to cloud.r-project.org (108.139.15.91)] [C                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                                                    Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [2 InRelease 21.4 kB/128 kB 17%] [3 InRelease 46.0 kB/129 kB 36%] [Connected to cloud.r-project.o                                                                                                    Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [2 InRelease 63.4 kB/128 kB 50%] [3 InRelease 93.8 kB/129 kB 73%] [4 InRelease 3,626 B/3,626 B 100% [2 InRelease 63.4 kB/128 kB 50%] [3 InRelease 94.2 kB/129 kB 73%] [Waiting

In [7]:
import subprocess

# Create a BLAST database
command = "makeblastdb -in sequences.fasta -dbtype nucl"
subprocess.run(command, shell=True, check=True)

# Run BLASTN against the database
command = "blastn -query sequences.fasta -db sequences.fasta -out blast_results.xml -outfmt 5"
subprocess.run(command, shell=True, check=True)


CompletedProcess(args='blastn -query sequences.fasta -db sequences.fasta -out blast_results.xml -outfmt 5', returncode=0)

In [8]:
import os
from Bio.Blast.Applications import NcbimakeblastdbCommandline, NcbiblastnCommandline

# Load the data
pdb_data_seq = pd.read_csv('pdb_data_seq.csv')

protein_sequences = pdb_data_seq[pdb_data_seq['macromoleculeType'] == 'Protein'].head(10)


# Save sequences to a FASTA file
with open('sequences.fasta', 'w') as f:
    for index, row in protein_sequences.iterrows():
        f.write(f">{row['structureId']}_{row['chainId']}\n")
        f.write(f"{row['sequence']}\n")

# Create a BLAST database
makeblastdb_cline = NcbimakeblastdbCommandline(
    dbtype="nucl", input_file="sequences.fasta")
makeblastdb_cline()

# Run BLASTN against the database
blastn_cline = NcbiblastnCommandline(
    query="sequences.fasta", db="sequences.fasta", outfmt=5, out="blast_results.xml")
stdout, stderr = blastn_cline()

print("Local BLASTN search completed for all sequences.")


Local BLASTN search completed for all sequences.


In [2]:
!apt-get install clustalo


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libargtable2-0
The following NEW packages will be installed:
  clustalo libargtable2-0
0 upgraded, 2 newly installed, 0 to remove and 45 not upgraded.
Need to get 273 kB of archives.
After this operation, 694 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libargtable2-0 amd64 13-1.1 [14.1 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 clustalo amd64 1.2.4-7 [259 kB]
Fetched 273 kB in 0s (775 kB/s)
Selecting previously unselected package libargtable2-0.
(Reading database ... 123835 files and directories currently installed.)
Preparing to unpack .../libargtable2-0_13-1.1_amd64.deb ...
Unpacking libargtable2-0 (13-1.1) ...
Selecting previously unselected package clustalo.
Preparing to unpack .../clustalo_1.2.4-7_amd64.deb ...
Unpacking clustalo (1.2.4-7) ...
Setting up

In [3]:
# Run Clustal Omega in Colab
!clustalo -i hydrolase_sequences.fasta -o hydrolase_aligned.aln


ERROR: Failed to open sequence file hydrolase_sequences.fasta for reading
FATAL: Reading sequence file 'hydrolase_sequences.fasta' failed


In [5]:
import pandas as pd

# Load your data
merged_data = pd.read_csv('pdb_data_no_dups.csv')
seq_data = pd.read_csv('pdb_data_seq.csv')

# Filter by a specific classification, e.g., 'HYDROLASE'
filtered_data = merged_data[merged_data['classification'] == 'HYDROLASE']
filtered_seq_data = seq_data[seq_data['structureId'].isin(filtered_data['structureId'])]

# Write filtered sequences to a FASTA file
def write_fasta(data, filename):
    with open(filename, 'w') as fasta_file:
        for index, row in data.iterrows():
            fasta_file.write(f">{row['structureId']}|{row['chainId']}\n")
            fasta_file.write(f"{row['sequence']}\n")

write_fasta(filtered_seq_data, 'hydrolase_sequences.fasta')


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.utils import to_categorical

# Assuming 'sampled_data' is your preprocessed data
# Convert categorical labels to numeric
label_encoder = LabelEncoder()
merged_data['classification'] = label_encoder.fit_transform(merged_data['classification'])

# Split data into features and target
X = merged_data[['sequence_length', 'residueCount']]
y = to_categorical(merged_data['classification'])

# Reshape X to fit CNN input (samples, time steps, features)
X = np.expand_dims(X, axis=2)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the CNN model
cnn_model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(y_train.shape[1], activation='softmax')
])

cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
cnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
cnn_score = cnn_model.evaluate(X_test, y_test)
print(f'CNN Accuracy: {cnn_score[1]}')


KeyError: "None of [Index(['sequence_length', 'residueCount_x'], dtype='object')] are in the [columns]"

In [11]:
pdb_data_no_dups = pd.read_csv('./pdb_data_no_dups.csv')
pdb_data_seq = pd.read_csv('./pdb_data_seq.csv')

# Merge the datasets on 'structureId'
merged_data = pd.merge(pdb_data_no_dups, pdb_data_seq, on='structureId')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.utils import to_categorical

# Assuming 'merged_data' is your original DataFrame
# Ensure 'sequence' column is of type str and create 'sequence_length'
merged_data['sequence'] = merged_data['sequence'].astype(str)
merged_data['sequence_length'] = merged_data['sequence'].apply(len)

# Convert categorical labels to numeric
label_encoder = LabelEncoder()
merged_data['classification'] = label_encoder.fit_transform(merged_data['classification'])

# Split data into features and target
X = merged_data[['sequence_length', 'residueCount']]  # Use the correct column names
y = to_categorical(merged_data['classification'])

# Reshape X to fit CNN input (samples, time steps, features)
X = np.expand_dims(X, axis=2)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the CNN model
cnn_model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(y_train.shape[1], activation='softmax')
])

cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
cnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
cnn_score = cnn_model.evaluate(X_test, y_test)
print(f'CNN Accuracy: {cnn_score[1]}')


KeyError: "['residueCount'] not in index"

In [12]:
print(merged_data.columns)


Index(['structureId', 'classification', 'experimentalTechnique',
       'macromoleculeType_x', 'residueCount_x', 'resolution',
       'structureMolecularWeight', 'crystallizationMethod',
       'crystallizationTempK', 'densityMatthews', 'densityPercentSol',
       'pdbxDetails', 'phValue', 'publicationYear', 'chainId', 'sequence',
       'residueCount_y', 'macromoleculeType_y', 'sequence_length'],
      dtype='object')


In [2]:
pdb_data_no_dups = pd.read_csv('./pdb_data_no_dups.csv')
pdb_data_seq = pd.read_csv('./pdb_data_seq.csv')

# Merge the datasets on 'structureId'
merged_data = pd.merge(pdb_data_no_dups, pdb_data_seq, on='structureId')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.utils import to_categorical

# Assuming 'merged_data' is your original DataFrame
# Ensure 'sequence' column is of type str and create 'sequence_length'
merged_data['sequence'] = merged_data['sequence'].astype(str)
merged_data['sequence_length'] = merged_data['sequence'].apply(len)

# Check the column names
print(merged_data.columns)

# If 'residueCount' does not exist, identify the correct column name
# For example, if it's 'residueCount_x' or 'residueCount_y', use that
if 'residueCount_x' in merged_data.columns:
    residue_count_col = 'residueCount_x'
elif 'residueCount_y' in merged_data.columns:
    residue_count_col = 'residueCount_y'
else:
    residue_count_col = 'residueCount'  # Adjust based on your dataset

# Convert categorical labels to numeric
label_encoder = LabelEncoder()
merged_data['classification'] = label_encoder.fit_transform(merged_data['classification'])
# Sample a smaller portion of the data, e.g., 10% of the dataset
merged_data = merged_data.sample(frac=0.1, random_state=42)

# Split data into features and target
X = merged_data[['sequence_length', residue_count_col]]  # Use the correct column name
y = to_categorical(merged_data['classification'])

# Reshape X to fit CNN input (samples, time steps, features)
X = np.expand_dims(X, axis=2)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the CNN model
cnn_model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(y_train.shape[1], activation='softmax')
])

cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
cnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
cnn_score = cnn_model.evaluate(X_test, y_test)
print(f'CNN Accuracy: {cnn_score[1]}')


Index(['structureId', 'classification', 'experimentalTechnique',
       'macromoleculeType_x', 'residueCount_x', 'resolution',
       'structureMolecularWeight', 'crystallizationMethod',
       'crystallizationTempK', 'densityMatthews', 'densityPercentSol',
       'pdbxDetails', 'phValue', 'publicationYear', 'chainId', 'sequence',
       'residueCount_y', 'macromoleculeType_y', 'sequence_length'],
      dtype='object')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10


ValueError: Exception encountered when calling Conv1D.call().

[1mNegative dimension size caused by subtracting 3 from 2 for '{{node sequential_1/conv1d_1/convolution}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], explicit_paddings=[], padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true](sequential_1/conv1d_1/convolution/ExpandDims, sequential_1/conv1d_1/convolution/ExpandDims_1)' with input shapes: [?,1,2,1], [1,3,1,64].[0m

Arguments received by Conv1D.call():
  • inputs=tf.Tensor(shape=(None, 2, 1), dtype=float32)

In [3]:
pdb_data_no_dups = pd.read_csv('./pdb_data_no_dups.csv')
pdb_data_seq = pd.read_csv('./pdb_data_seq.csv')

# Merge the datasets on 'structureId'
merged_data = pd.merge(pdb_data_no_dups, pdb_data_seq, on='structureId')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.utils import to_categorical

# Assuming 'merged_data' is your original DataFrame
# Ensure 'sequence' column is of type str and create 'sequence_length'
merged_data['sequence'] = merged_data['sequence'].astype(str)
merged_data['sequence_length'] = merged_data['sequence'].apply(len)

# Convert categorical labels to numeric
label_encoder = LabelEncoder()
merged_data['classification'] = label_encoder.fit_transform(merged_data['classification'])

# Use a smaller sample of data if the dataset is too large
sampled_data = merged_data.sample(frac=0.1, random_state=42)

# Use the correct column for residue count
residue_count_col = 'residueCount_x' if 'residueCount_x' in merged_data.columns else 'residueCount_y'

# Split data into features and target
X = sampled_data[['sequence_length', residue_count_col]]
y = to_categorical(sampled_data['classification'])

# Reshape X to fit CNN input (samples, time steps, features)
X = np.expand_dims(X, axis=2)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the CNN model with a smaller kernel size or same padding
cnn_model = Sequential([
    Conv1D(64, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)),  # Adjusted kernel size
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(y_train.shape[1], activation='softmax')
])

cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
cnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
cnn_score = cnn_model.evaluate(X_test, y_test)
print(f'CNN Accuracy: {cnn_score[1]}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10


ValueError: Exception encountered when calling MaxPooling1D.call().

[1mNegative dimension size caused by subtracting 2 from 1 for '{{node sequential_1_1/max_pooling1d_1_1/MaxPool1d}} = MaxPool[T=DT_FLOAT, data_format="NHWC", explicit_paddings=[], ksize=[1, 1, 2, 1], padding="VALID", strides=[1, 1, 2, 1]](sequential_1_1/max_pooling1d_1_1/MaxPool1d/ExpandDims)' with input shapes: [?,1,1,64].[0m

Arguments received by MaxPooling1D.call():
  • inputs=tf.Tensor(shape=(None, 1, 64), dtype=float32)