In [1]:
import csv

In [4]:
def convert_ss2_to_csv(ss2_file, csv_file):
    """Extract the first three columns of an ``.ss2`` file into a CSV file.

    Lines beginning with ``#`` are ignored, as is any line with fewer than six
    whitespace-separated fields (blank lines included). From every remaining
    line the first three fields are kept, in file order.

    Args:
        ss2_file: Path of the text file to read.
        csv_file: Path of the CSV file to create or overwrite. It receives a
            ``Residue_Index,Amino_Acid,Secondary_Structure`` header row
            followed by one row per kept line.
    """
    with open(ss2_file, 'r') as source:
        rows = [
            fields[:3]
            for fields in (raw.split() for raw in source if not raw.startswith("#"))
            if len(fields) >= 6
        ]

    # The input has been fully read and closed before the output is opened.
    with open(csv_file, 'w', newline='') as sink:
        writer = csv.writer(sink)
        writer.writerow(['Residue_Index', 'Amino_Acid', 'Secondary_Structure'])
        writer.writerows(rows)

# Convert the secondary-structure file for this dataset; edit both paths to
# point at your own input (.ss2) and output (.csv) files.
convert_ss2_to_csv(
    ss2_file='newdata/new_nep.ss2',
    csv_file='newdata/new_nep.csv',
)


In [2]:
import pandas as pd

# Kyte-Doolittle hydropathy index keyed by one-letter amino-acid code.
# Built once at definition time instead of on every call (the function is
# applied row-by-row to a DataFrame column).
_KYTE_DOOLITTLE_SCALE = {
    'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
    'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
}


def calculate_kyte_doolittle_hydrophobicity(residue):
    """Return the Kyte-Doolittle hydropathy value for one amino acid.

    Args:
        residue: One-letter amino-acid code. String input is matched
            case-insensitively and surrounding whitespace is ignored, so
            ``'a'`` and ``' A '`` both resolve to alanine.

    Returns:
        The hydropathy value as a float, or ``0.0`` when ``residue`` is not one
        of the 20 standard codes (including non-string values such as ``None``
        or NaN). Note that ``0.0`` is therefore a sentinel, not a measured
        value.
    """
    if isinstance(residue, str):
        # Without normalisation a lower-case or padded code would silently
        # fall through to the 0.0 default.
        residue = residue.strip().upper()
    return _KYTE_DOOLITTLE_SCALE.get(residue, 0.0)

def main():
    """Add a ``Hydrophobicity`` column to a spreadsheet and save a copy.

    Reads ``newdata/spotone-ml.xlsx``, maps every value of its ``Amino_Acid``
    column through ``calculate_kyte_doolittle_hydrophobicity`` and writes the
    augmented table to ``newdata/spotone-ml-hydro.xlsx`` without the index.

    Raises:
        FileNotFoundError: If the input workbook does not exist.
        KeyError: If the workbook has no ``Amino_Acid`` column.
    """
    # Replace these with your own input/output Excel file names
    input_file = 'newdata/spotone-ml.xlsx'
    output_file = 'newdata/spotone-ml-hydro.xlsx'

    # Read the Excel workbook (pandas loads the first sheet by default)
    df = pd.read_excel(input_file)

    # Add a new column with Kyte-Doolittle hydrophobicity values
    df['Hydrophobicity'] = df['Amino_Acid'].apply(calculate_kyte_doolittle_hydrophobicity)

    # DataFrame has no `to_xlsx` method; `to_excel` is the Excel writer.
    df.to_excel(output_file, index=False)

# Entry-point guard: call main() only when this code runs as the top-level
# module (`__name__ == "__main__"`), not when it is imported.
if __name__ == "__main__":
    main()


FileNotFoundError: [Errno 2] No such file or directory: 'newdata/spotone-ml.xlsx'

In [5]:
import pandas as pd

def calculate_amino_acid_propensities(protein_sequence, amino_acid, half_window=3):
    """Compute the local fraction of one amino acid around every residue.

    For each position the window spans up to ``half_window`` residues on either
    side (``2 * half_window + 1`` wide in the interior) and is clipped at the
    sequence ends, so terminal residues are averaged over fewer neighbours.

    Args:
        protein_sequence: Sequence string to scan.
        amino_acid: Substring counted inside each window (case-sensitive;
            the callers in this notebook pass one-letter codes).
        half_window: Integer number of neighbours included on each side of
            the residue. Defaults to 3, i.e. a 7-residue window.

    Returns:
        A list of ``(position, fraction)`` tuples, one per residue, where
        ``position`` is 1-based and ``fraction`` is the number of occurrences
        of ``amino_acid`` in the window divided by the window length. The list
        is empty for an empty sequence.

    Raises:
        ValueError: If ``half_window`` is negative.
    """
    if half_window < 0:
        raise ValueError("half_window must be non-negative")

    length = len(protein_sequence)
    propensities = []

    for i in range(length):
        start = max(0, i - half_window)
        end = min(length, i + half_window + 1)
        window = protein_sequence[start:end]

        # The window always contains residue i itself, so it is never empty.
        propensities.append((i + 1, window.count(amino_acid) / len(window)))  # Residue numbering starts from 1

    return propensities

def calculate_combined_propensities(protein_sequence, amino_acids=('R', 'W', 'Y')):
    """Compute windowed fractions for several amino acids side by side.

    Calls ``calculate_amino_acid_propensities`` once per entry of
    ``amino_acids`` and merges the per-residue results into one row per
    position.

    Args:
        protein_sequence: Sequence string to scan.
        amino_acids: One-letter codes to evaluate, in output column order.
            Defaults to arginine, tryptophan and tyrosine.

    Returns:
        A list with one tuple per residue:
        ``(position, fraction_1, ..., fraction_n)`` where ``position`` is
        1-based and the fractions follow the order of ``amino_acids``. With
        the default this is ``(position, arginine, tryptophan, tyrosine)``.
        The list is empty if the sequence or ``amino_acids`` is empty.
    """
    # One list of fractions per amino acid; the 1-based position carried in
    # each helper result is dropped here and regenerated below.
    fraction_columns = [
        [fraction for _, fraction in calculate_amino_acid_propensities(protein_sequence, amino_acid)]
        for amino_acid in amino_acids
    ]

    combined_propensities = [
        (residue, *fractions)
        for residue, fractions in enumerate(zip(*fraction_columns), start=1)
    ]

    return combined_propensities

def main():
    """Export R/W/Y windowed fractions for every sequence in a FASTA file.

    Reads ``newdata/newns.fasta``, runs ``calculate_combined_propensities`` on
    each record's sequence and writes one CSV row per residue to
    ``newdata/outputns.csv``. Records are labelled ``Sequence_1``,
    ``Sequence_2``, ... in file order; FASTA header text is not written.

    Raises:
        FileNotFoundError: If the input FASTA file does not exist.
    """
    # Replace these with your own input FASTA and output CSV file names
    input_file = 'newdata/newns.fasta'
    output_file = 'newdata/outputns.csv'

    with open(input_file, 'r') as file:
        # Each record starts with '>'; anything before the first '>' is not a
        # record and is discarded by the [1:].
        fasta_records = file.read().split('>')[1:]

    propensities_list = []

    for i, record in enumerate(fasta_records):
        # partition() never fails: a header-only record (no newline after the
        # header) simply yields an empty sequence instead of an IndexError.
        _header, _, body = record.partition('\n')
        # Join multi-line sequences and drop every whitespace character so
        # stray spaces/tabs are not counted as residues in the windows.
        sequence = ''.join(body.split())

        # Calculate combined propensities for the sequence
        propensities = calculate_combined_propensities(sequence)
        propensities_list.extend((f"Sequence_{i + 1}", *row) for row in propensities)

    # Create a DataFrame from the propensities
    df = pd.DataFrame(propensities_list, columns=['Sequence', 'Residue', 'Arginine_Propensity', 'Tryptophan_Propensity', 'Tyrosine_Propensity'])

    # Save the result to a combined CSV file
    df.to_csv(output_file, index=False)

# Entry-point guard. `main` is redefined in several cells of this notebook, so
# this call resolves to whichever definition was executed most recently.
if __name__ == "__main__":
    main()


In [6]:
import pandas as pd

# NOTE(review): this function is also defined, with the same name, in the
# previous cell; consider keeping a single shared definition.
def calculate_amino_acid_propensities(protein_sequence, amino_acid, half_window=3):
    """Compute the local fraction of one amino acid around every residue.

    For each position the window spans up to ``half_window`` residues on either
    side (``2 * half_window + 1`` wide in the interior) and is clipped at the
    sequence ends, so terminal residues are averaged over fewer neighbours.

    Args:
        protein_sequence: Sequence string to scan.
        amino_acid: Substring counted inside each window (case-sensitive;
            the callers in this notebook pass one-letter codes).
        half_window: Integer number of neighbours included on each side of
            the residue. Defaults to 3, i.e. a 7-residue window.

    Returns:
        A list of ``(position, fraction)`` tuples, one per residue, where
        ``position`` is 1-based and ``fraction`` is the number of occurrences
        of ``amino_acid`` in the window divided by the window length. The list
        is empty for an empty sequence.

    Raises:
        ValueError: If ``half_window`` is negative.
    """
    if half_window < 0:
        raise ValueError("half_window must be non-negative")

    length = len(protein_sequence)
    propensities = []

    for i in range(length):
        start = max(0, i - half_window)
        end = min(length, i + half_window + 1)
        window = protein_sequence[start:end]

        # The window always contains residue i itself, so it is never empty.
        propensities.append((i + 1, window.count(amino_acid) / len(window)))  # Residue numbering starts from 1

    return propensities

def calculate_combined_propensities(protein_sequence, amino_acids=('V', 'S', 'M', 'T', 'L')):
    """Compute windowed fractions for several amino acids side by side.

    Calls ``calculate_amino_acid_propensities`` once per entry of
    ``amino_acids`` and merges the per-residue results into one row per
    position.

    Args:
        protein_sequence: Sequence string to scan.
        amino_acids: One-letter codes to evaluate, in output column order.
            Defaults to valine, serine, methionine, threonine and leucine.

    Returns:
        A list with one tuple per residue:
        ``(position, fraction_1, ..., fraction_n)`` where ``position`` is
        1-based and the fractions follow the order of ``amino_acids``. With
        the default this is
        ``(position, valine, serine, methionine, threonine, leucine)``.
        The list is empty if the sequence or ``amino_acids`` is empty.
    """
    # One list of fractions per amino acid; the 1-based position carried in
    # each helper result is dropped here and regenerated below.
    fraction_columns = [
        [fraction for _, fraction in calculate_amino_acid_propensities(protein_sequence, amino_acid)]
        for amino_acid in amino_acids
    ]

    combined_propensities = [
        (residue, *fractions)
        for residue, fractions in enumerate(zip(*fraction_columns), start=1)
    ]

    return combined_propensities

def main():
    """Export V/S/M/T/L windowed fractions for every sequence in a FASTA file.

    Reads ``newdata/newns.fasta``, runs ``calculate_combined_propensities`` on
    each record's sequence and writes one CSV row per residue to
    ``newdata/outputns1.csv``. Records are labelled ``Sequence_1``,
    ``Sequence_2``, ... in file order; FASTA header text is not written.

    Raises:
        FileNotFoundError: If the input FASTA file does not exist.
    """
    # Replace these with your own input FASTA and output CSV file names
    input_file = 'newdata/newns.fasta'
    output_file = 'newdata/outputns1.csv'

    with open(input_file, 'r') as file:
        # Each record starts with '>'; anything before the first '>' is not a
        # record and is discarded by the [1:].
        fasta_records = file.read().split('>')[1:]

    propensities_list = []

    for i, record in enumerate(fasta_records):
        # partition() never fails: a header-only record (no newline after the
        # header) simply yields an empty sequence instead of an IndexError.
        _header, _, body = record.partition('\n')
        # Join multi-line sequences and drop every whitespace character so
        # stray spaces/tabs are not counted as residues in the windows.
        sequence = ''.join(body.split())

        # Calculate combined propensities for the sequence
        propensities = calculate_combined_propensities(sequence)
        propensities_list.extend((f"Sequence_{i + 1}", *row) for row in propensities)

    # Create a DataFrame from the propensities
    df = pd.DataFrame(propensities_list, columns=['Sequence', 'Residue', 'Valine_Propensity', 'Serine_Propensity', 'Methionine_Propensity', 'Threonine_Propensity', 'Leucine_Propensity'])

    # Save the result to a combined CSV file
    df.to_csv(output_file, index=False)

# Entry-point guard. `main` is redefined in several cells of this notebook, so
# this call resolves to whichever definition was executed most recently.
if __name__ == "__main__":
    main()


In [13]:
# import pandas as pd

# def calculate_amino_acid_propensities(protein_sequence, amino_acid):
#     propensities = []

#     for i in range(len(protein_sequence)):
#         start = max(0, i - 3)
#         end = min(len(protein_sequence), i + 4)

#         aa_count = protein_sequence[start:end].count(amino_acid)
#         total_residues = end - start

#         aa_propensity = aa_count / total_residues

#         propensities.append((i + 1, aa_propensity))  # Residue numbering starts from 1

#     return propensities

# def calculate_combined_propensities(protein_sequence):
#     arginine_propensities = calculate_amino_acid_propensities(protein_sequence, 'R')
#     tryptophan_propensities = calculate_amino_acid_propensities(protein_sequence, 'W')
#     tyrosine_propensities = calculate_amino_acid_propensities(protein_sequence, 'Y')

#     combined_propensities = [
#         (residue, arginine, tryptophan, tyrosine)
#         for (residue, arginine), (_, tryptophan), (_, tyrosine)
#         in zip(arginine_propensities, tryptophan_propensities, tyrosine_propensities)
#     ]

#     return combined_propensities

# def main():
#     # Replace 'input.csv' and 'output_combined.csv' with your file names
#     input_file = 'newdata/PB1.csv'
#     output_file = 'newdata/try.csv'  # Written to a separate file; the input CSV is left untouched

#     # Read the input CSV file
#     df = pd.read_csv(input_file)

#     propensities_list = []

#     for i, sequence in enumerate(df['Amino_Acid']):
#         # Calculate combined propensities for the sequence
#         propensities = calculate_combined_propensities(sequence)
#         propensities_list.extend([(f"Sequence_{i + 1}", residue, arginine, tryptophan, tyrosine)
#                                  for residue, arginine, tryptophan, tyrosine in propensities])

#     # Create a DataFrame from the propensities
#     new_columns = ['Sequence', 'Residue', 'Arginine_Propensity', 'Tryptophan_Propensity', 'Tyrosine_Propensity']
#     df = pd.concat([df, pd.DataFrame(propensities_list, columns=new_columns)], axis=1)

#     # Save the original columns plus the appended propensity columns to output_file
#     df.to_csv(output_file, index=False)

# if __name__ == "__main__":
#     main()
