<a href="https://colab.research.google.com/github/vdhulappanavar/bioinformatics/blob/main/FinalBioInformaticsProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the two input tables from the working directory
pdb_data_no_dups = pd.read_csv('./pdb_data_no_dups.csv')
pdb_data_seq = pd.read_csv('./pdb_data_seq.csv')

# Merge the datasets on 'structureId'. Columns that exist in both inputs get
# pandas' default '_x' (left) / '_y' (right) suffixes, which is where the
# 'residueCount_x' name used below comes from.
merged_data = pd.merge(pdb_data_no_dups, pdb_data_seq, on='structureId')

# Validate the required column BEFORE it is first accessed; checking after the
# access (as before) could never trigger, because the access itself would
# already have raised.
if 'sequence' not in merged_data.columns:
    raise KeyError("The 'sequence' column is missing from the dataset")

# Ensure 'sequence' is of type str; missing sequences become the empty string
merged_data['sequence'] = merged_data['sequence'].fillna('').astype(str)

# Feature extraction: number of characters in each sequence
merged_data['sequence_length'] = merged_data['sequence'].str.len()

# Convert 'residueCount_x' to numeric; values that cannot be parsed become NaN
merged_data['residueCount_x'] = pd.to_numeric(merged_data['residueCount_x'], errors='coerce')

# Replace every remaining NaN with 0.
# NOTE(review): this applies to ALL columns, text columns included, so a
# missing label/text value turns into the integer 0 -- confirm that is intended.
merged_data = merged_data.fillna(0)

# Example: Calculate amino acid composition
def amino_acid_composition(sequence):
    """Count how many times each character occurs in ``sequence``.

    Args:
        sequence: String of residue letters; may be empty.

    Returns:
        dict mapping each distinct character to its number of occurrences,
        keyed in order of first appearance. An empty string yields ``{}``.
    """
    # Local import keeps the function usable without touching other cells.
    from collections import Counter

    # Counter tallies in a single pass (instead of one ``str.count`` scan per
    # distinct character) and yields a reproducible key order, whereas
    # iterating a set of strings can change order between interpreter runs.
    return dict(Counter(sequence))

# Per-row character counts, stored as one dict per sequence
composition_per_row = merged_data['sequence'].apply(amino_acid_composition)
merged_data['aa_composition'] = composition_per_row

# Overview of the frame: column dtypes first, then the leading rows
print(merged_data.dtypes)
print(merged_data.head())

# Report rows whose value cannot be parsed as a number in the modelling columns
for col in ('sequence_length', 'residueCount_x'):
    unparseable_mask = pd.to_numeric(merged_data[col], errors='coerce').isna()
    non_numeric = merged_data[unparseable_mask]
    if non_numeric.empty:
        continue
    print(f"Non-numeric values found in column {col}:")
    print(non_numeric[[col]])

In [None]:
# Coerce both modelling columns to a numeric dtype; entries that cannot be
# parsed are turned into NaN by errors='coerce'
for numeric_col in ('residueCount_x', 'sequence_length'):
    merged_data[numeric_col] = pd.to_numeric(merged_data[numeric_col], errors='coerce')

# Replace the NaN values left after coercion (and any others) with 0
merged_data.fillna(0, inplace=True)

# Now proceed with the rest of the analysis

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Features and target
feature_frame = merged_data[['sequence_length', 'residueCount_x']]
y = merged_data['classification']

# Train-Test Split is done on the RAW features first, so that the scaler below
# never sees the held-out rows. Fitting the scaler on all rows before
# splitting would leak test-set mean/variance into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    feature_frame, y, test_size=0.2, random_state=42)

# Feature Scaling: learn mean/std on the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled version of the full feature matrix, kept under its previous name
X = scaler.transform(feature_frame)

# Inspect unique values in the target variable
print(y_train.unique())


['OXIDOREDUCTASE' 'SIGNALING PROTEIN' 'MEMBRANE PROTEIN, LIPID TRANSPORT'
 ... 'CHOLESTEROL-BINDING PROTEIN' 'VIRAL PROTEIN/WINGED HELIX'
 'FUNGICIDE']


In [None]:
# Shrink numeric columns to the smallest dtype of the requested kind that can
# hold their values, to save memory
downcast_plan = {'residueCount_x': 'integer', 'resolution': 'float'}
for column_name, target_kind in downcast_plan.items():
    merged_data[column_name] = pd.to_numeric(merged_data[column_name], downcast=target_kind)

# Per-column memory footprint in bytes (deep=True also measures object contents)
print(merged_data.memory_usage(deep=True))

Index                             128
structureId                  28740089
classification               33675773
experimentalTechnique        34893444
macromoleculeType_x          29334644
residueCount_x                1884596
resolution                    1884596
structureMolecularWeight      3769192
crystallizationMethod        31258762
crystallizationTempK          3769192
densityMatthews               3769192
densityPercentSol             3769192
pdbxDetails                  61748955
phValue                       3769192
publicationYear               3769192
chainId                      27372132
sequence                    143441710
residueCount_y                3769192
macromoleculeType_y          29334644
sequence_length               3769192
aa_composition              288228248
dtype: int64


In [None]:
# Explicit %pip magic (does not rely on automagic) installs into the kernel's
# environment; the extra is quoted so the shell cannot treat [..] as a glob.
%pip install dask-ml "dask[complete]"




In [None]:
# Explicit %pip magic (does not rely on automagic) installs into the kernel's
# environment; the extra is quoted so the shell cannot treat [..] as a glob.
%pip install "dask[complete]" scikit-learn




In [None]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'merged_data' is the large merged Pandas DataFrame built earlier

# Imported here so this cell does not depend on another cell's imports
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Reduce the dataset size by sampling 10% of the data
sampled_data = merged_data.sample(frac=0.1, random_state=42)

# Ensure 'sequence' column is of type str and create 'sequence_length'
sampled_data['sequence'] = sampled_data['sequence'].astype(str)
sampled_data['sequence_length'] = sampled_data['sequence'].apply(len)

# Split the data into features and target
X = sampled_data[['sequence_length', 'residueCount_x']]
y = sampled_data['classification']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model.
# SGD is sensitive to feature scale, so the classifier is wrapped in a pipeline
# that standardizes the features; the scaler is fitted on the training rows
# only and re-applied automatically at predict time.
model = make_pipeline(
    StandardScaler(),
    SGDClassifier(loss='log_loss', random_state=42),
)
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 reports 0.0 for classes that were never predicted (the value
# already shown) without emitting one warning per undefined metric
print(classification_report(y_test, y_pred, zero_division=0))




Accuracy: 0.044018643190056966


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                   precision    recall  f1-score   support

                                         ALDOLASE       0.00      0.00      0.00         1
                                         ALLERGEN       0.00      0.00      0.00         1
                                          AMIDASE       0.00      0.00      0.00         1
                                    AMINE OXIDASE       0.00      0.00      0.00         1
                          AMINOACYL-TRNA SYNTHASE       0.00      0.00      0.00         1
                        AMINOACYL-TRNA SYNTHETASE       0.00      0.00      0.00         1
                                 AMINOTRANSFERASE       0.00      0.00      0.00         1
                                 ANTI-HIV PROTEIN       0.00      0.00      0.00         1
                                    ANTI-ONCOGENE       0.00      0.00      0.00         1
                                       ANTIBIOTIC       0.00      0.00      0.00        1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Row count of the merged frame, taken from the length of its index
num_rows = len(merged_data.index)
print(f'The number of rows in the merged data is: {num_rows}')

The number of rows in the merged data is: 471149


In [None]:
from sklearn.svm import SVC

# Imported here so this cell does not depend on another cell's imports
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Take a reproducible 10% sample, rebuild the length feature, and cap the
# working set at 10,000 rows
sampled_data = merged_data.sample(frac=0.1, random_state=42)
sampled_data['sequence'] = sampled_data['sequence'].astype(str)
sampled_data['sequence_length'] = sampled_data['sequence'].apply(len)
sampled_data = sampled_data.head(10000)

# Features / target and hold-out split
X = sampled_data[['sequence_length', 'residueCount_x']]
y = sampled_data['classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMs are not scale invariant, so the features are standardized inside a
# pipeline; the scaler is fitted on the training rows only and re-applied
# automatically at predict time.
svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel='linear', random_state=42),
)
svm_model.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred_svm = svm_model.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
# zero_division=0 reports 0.0 for never-predicted classes without emitting one
# warning per undefined metric
print(classification_report(y_test, y_pred_svm, zero_division=0))


In [None]:
# len() of a DataFrame is its number of rows
num_rows = len(merged_data)
print(f'The number of rows in the merged data is: {num_rows}')

In [None]:
# Install Biopython into the running kernel's environment
%pip install biopython


In [None]:
from Bio.Blast import NCBIWWW, NCBIXML
import pandas as pd

# Read the sequence table from the working directory
pdb_data_seq = pd.read_csv('pdb_data_seq.csv')

# Keep the first ten rows whose macromolecule type is exactly 'Protein'
is_protein = pdb_data_seq['macromoleculeType'] == 'Protein'
protein_sequences = pdb_data_seq.loc[is_protein].head(10)

# Function to run BLASTP
def run_blastp(sequence, sequence_id):
    """Run an online BLASTP query for ``sequence`` and save the raw reply.

    Args:
        sequence: Query sequence handed to ``NCBIWWW.qblast`` (program
            "blastp", database "nr").
        sequence_id: Identifier embedded in the output file name,
            ``blast_results_<sequence_id>.xml``, written to the working
            directory. An existing file of that name is overwritten.

    Returns:
        None.
    """
    result_handle = NCBIWWW.qblast("blastp", "nr", sequence)
    # try/finally guarantees the result handle is closed even when opening or
    # writing the output file fails.
    try:
        with open(f"blast_results_{sequence_id}.xml", "w") as out_handle:
            out_handle.write(result_handle.read())
    finally:
        result_handle.close()

# Run BLASTP for each of the first 10 protein sequences.
# The table carries a 'chainId' column next to 'structureId', so the same
# structureId can occur on several rows; the chain id is appended to the
# identifier so that each row gets its own result file instead of overwriting
# the file written for an earlier chain of the same structure.
for index, row in protein_sequences.iterrows():
    query_id = f"{row['structureId']}_{row['chainId']}"
    print(f"Running BLASTP for sequence ID: {query_id}")
    run_blastp(row['sequence'], query_id)

print("BLASTP search completed for the first 10 protein sequences.")


In [None]:
# Refresh the apt package index, then install the 'ncbi-blast+' package
# without prompting (-y). Presumably this supplies the command-line BLAST
# executables that the following cell invokes -- confirm on the target image.
!apt-get update
!apt-get install -y ncbi-blast+


In [None]:
import os
from Bio.Blast.Applications import NcbimakeblastdbCommandline, NcbiblastnCommandline

# Re-read the sequence table and keep the first ten 'Protein' rows
pdb_data_seq = pd.read_csv('pdb_data_seq.csv')

protein_only = pdb_data_seq[pdb_data_seq['macromoleculeType'].eq('Protein')]
protein_sequences = protein_only.iloc[:10]


# Write one FASTA record per row: a '>structureId_chainId' header line
# followed by the sequence on its own line
with open('sequences.fasta', 'w') as fasta_file:
    for _row_label, record in protein_sequences.iterrows():
        header_line = f">{record['structureId']}_{record['chainId']}\n"
        sequence_line = f"{record['sequence']}\n"
        fasta_file.write(header_line + sequence_line)

# sequences.fasta holds PROTEIN sequences, so both the database and the search
# must use the protein tools; a nucleotide database / blastn would misread the
# amino-acid letters. The protein search wrapper is imported here so this fix
# does not depend on edits to the import line above.
from Bio.Blast.Applications import NcbiblastpCommandline

# Create a protein BLAST database from the FASTA file
makeblastdb_cline = NcbimakeblastdbCommandline(
    dbtype="prot", input_file="sequences.fasta")
makeblastdb_cline()

# Run BLASTP of the sequences against the database built from them;
# outfmt=5 selects XML output, written to blast_results.xml
blastp_cline = NcbiblastpCommandline(
    query="sequences.fasta", db="sequences.fasta", outfmt=5, out="blast_results.xml")
blastn_cline = blastp_cline  # previous variable name, kept for any later cell that uses it
stdout, stderr = blastp_cline()

print("Local BLASTP search completed for all sequences.")
