In [2]:
import pandas as pd
import glob
import os

# Folder holding the tab-separated .txt files to combine
folder_path = './9_Strains'  # Replace with your folder path

# glob returns paths in whatever order the filesystem yields them; sorting
# keeps the column order of the combined table reproducible across machines.
txt_files = sorted(glob.glob(os.path.join(folder_path, '*.txt')))

# Start from an empty frame. `merged_any` records whether a file has been
# merged yet; testing `combined_df.empty` instead would also be True for a
# frame that has columns but zero rows.
combined_df = pd.DataFrame()
merged_any = False

for file in txt_files:
    try:
        # Files have no header row
        df = pd.read_csv(file, sep='\t', header=None)  # Assuming tab-separated values
        # File name without directory or extension, used as the column prefix
        base_filename = os.path.splitext(os.path.basename(file))[0]
        # First column becomes the join key 'x'; the rest get unique,
        # file-prefixed names so columns from different files cannot clash
        df.columns = ['x'] + [f'{base_filename}_s{i}' for i in range(1, df.shape[1])]
        # Outer join keeps every x value seen in any file
        if merged_any:
            combined_df = pd.merge(combined_df, df, on='x', how='outer')
        else:
            combined_df = df
        merged_any = True
    except Exception as e:
        # Best effort: report the failing file and continue with the others
        print(f"Error processing file {file}: {e}")

if merged_any:
    # Gaps left by the outer join are filled with each column's median.
    # Reassigning (rather than inplace=True) leaves the per-file frame untouched.
    combined_df = combined_df.fillna(combined_df.median())

    print(combined_df.head())  # To check the top few rows

    # Save to a new CSV file
    os.makedirs('csv_data', exist_ok=True)  # Ensure the directory exists
    combined_df.to_csv('csv_data/combined_spectra.csv', index=False)
else:
    # Writing an empty CSV here would only make later read_csv calls fail
    print(f"No readable .txt files found in {folder_path}; nothing was written.")


         x  Enterobacter_aerogenes_1_s1  Enterobacter_aerogenes_1_s2  \
0  542.245                        462.8                        463.8   
1  542.328                        464.4                        460.6   
2  542.411                        464.0                        462.6   
3  542.493                        462.6                        461.0   
4  542.576                        461.8                        461.0   

   Enterobacter_aerogenes_1_s3  Enterobacter_aerogenes_1_s4  \
0                        464.6                        463.0   
1                        462.4                        464.2   
2                        462.4                        465.0   
3                        460.4                        463.0   
4                        461.4                        467.4   

   Enterobacter_aerogenes_1_s5  Enterobacter_aerogenes_1_s6  \
0                        466.8                        464.2   
1                        468.2                        467.8   

In [1]:
import pandas as pd
from scipy.io import savemat

# Step 1: Read the CSV file into a Pandas DataFrame
csv_file = './csv_data/combined_spectra.csv'  # Replace with your CSV file path
data = pd.read_csv(csv_file)

# Step 2: Wrap the table for MATLAB.
# savemat takes a dict whose keys become variable names inside the .mat file;
# the nested dict of column lists is stored under the single variable 'data',
# with one field per CSV column.
data_dict = {'data': data.to_dict(orient='list')}

# Step 3: Save the dictionary to a .mat file.
# Field names are taken from the CSV column headers. By default savemat
# rejects struct field names longer than 31 characters; long_field_names=True
# raises the limit to 63 (readable by MATLAB 7.6+), so longer headers do not
# abort the export.
mat_file = './csv_data/combined_spectra.mat'  # Desired output .mat file name
savemat(mat_file, data_dict, long_field_names=True)

print(f"CSV file converted to {mat_file}")


CSV file converted to ./csv_data/combined_spectra.mat


In [1]:
import pandas as pd

# Load the combined table; its first column is the x-axis and every other
# column is searched for the maximum.
df = pd.read_csv("./csv_data/combined_spectra.csv")

# Step 1: Restrict the search to the non-axis columns. Including the first
# column would let an axis value be reported as the "largest value" whenever
# it exceeds every other entry.
intensities = df.iloc[:, 1:]

# Step 2: Locate the maximum.
# stack() reshapes the table into a Series indexed by (row label, column name),
# so idxmax() returns both coordinates of the largest entry at once.
location = intensities.stack().idxmax()

# Extract the row and column index from the location
row_index, column_index = location

# Read the value back from its location so value and coordinates always agree
max_value = df.loc[row_index, column_index]

# Output results
print(f"Largest value in the DataFrame: {max_value}")
print(f"Location of the largest value: Row {row_index}, Column '{column_index}'")
# row_index is an index label (not a position), so look the x value up by label
print(f"raman shift {df.loc[row_index, df.columns[0]]}")


Largest value in the DataFrame: 3269.75
Location of the largest value: Row 1076, Column 'Eschericia_coli_3_s13'
raman shift 630.427


In [2]:
# Take the whole row at the peak position. `row_index` is the row label found
# by the max search in the previous cell; using it instead of a number copied
# by hand from earlier output keeps this cell correct when the data changes.
peak_band = df.loc[row_index, :]
peak_band.head()

x                              630.427
Enterobacter_aerogenes_1_s1    584.600
Enterobacter_aerogenes_1_s2    584.600
Enterobacter_aerogenes_1_s3    576.600
Enterobacter_aerogenes_1_s4    563.200
Name: 1076, dtype: float64

In [6]:
import pandas as pd
import glob
import os
from pathlib import Path

# List of TXT file names
txt_files = ['./9_Strains/setA/Enterobacter_aerogenes_1.txt',
             './9_Strains/setA/Enterococcus_faecalis_1.txt',
             './9_Strains/setA/Eschericia_coli_1.txt',
             './9_Strains/setA/Listeria_innocua_1.txt',
             './9_Strains/setA/Listeria_monocytogenes_1.txt',
             './9_Strains/setA/Rhodococcus_equii_1.txt',
             './9_Strains/setA/S_abony_1.txt',
             './9_Strains/setA/Shigella_boydi_1.txt',
             './9_Strains/setA/Staphylococcus_aureus_1.txt']

# Settings
columns = 5             # known spectra are file columns 1 .. columns-1 (4 per file)
unknown_column = 6      # file column exported as the "unknown" spectrum
output_dir = './csv_data2'
# NOTE(review): file column 5 is read by neither selection below - confirm that is intended.

# Accumulators filled while looping over the files
spectra_list = []
uspectra_list = []
x_axis_list = []
labels_list = []

# Process each file
for file_name in txt_files:
    # Read the TXT file (no header row)
    data = pd.read_csv(file_name, delimiter='\t', header=None)  # Adjust delimiter if needed

    # Extract x-axis and spectra
    x_axis = data.iloc[:, 0]               # First column
    spectra = data.iloc[:, 1:columns]      # Known spectra
    uspectra = data.iloc[:, unknown_column]  # For unknown data

    # Only the first file's x-axis is exported
    if not x_axis_list:
        x_axis_list.append(x_axis)
    spectra_list.append(spectra)
    uspectra_list.append(uspectra)

    # Label prefix = file stem without its trailing "_<digits>" suffix.
    # Checking for the suffix (rather than slicing off two characters) also
    # handles multi-digit suffixes and stems that have no suffix at all.
    stem = Path(file_name).stem
    base, sep, suffix = stem.rpartition('_')
    fname = base if sep and suffix.isdigit() else stem
    # One label per known-spectrum column, in the same order as the columns
    labels_list.extend(f"{fname}_{i}" for i in range(1, columns))

# Combine all data column-wise
x_axis_df = pd.concat(x_axis_list, axis=1)
spectra_df = pd.concat(spectra_list, axis=1)
uspectra_df = pd.concat(uspectra_list, axis=1)
labels_df = pd.DataFrame(labels_list)  # single column, one label per row

# Save DataFrames to CSV files (create the folder first so to_csv cannot
# fail on a missing directory)
os.makedirs(output_dir, exist_ok=True)
x_axis_df.to_csv(f'{output_dir}/x_axis.csv', index=False, header=False)
spectra_df.to_csv(f'{output_dir}/known_spectra.csv', index=False, header=False)
uspectra_df.to_csv(f'{output_dir}/unknown_spectra.csv', index=False, header=False)
labels_df.to_csv(f'{output_dir}/labels.csv', index=False, header=False)


In [9]:
import numpy as np


# Read the exported x-axis and known-spectra tables. The spectra table is
# transposed right after loading, so rows and columns of the file are swapped.
xorg = np.genfromtxt("./csv_data2/x_axis.csv", delimiter=",")
Akn = np.genfromtxt("./csv_data2/known_spectra.csv", delimiter=",").T

# Index window 620..639: cut the same positions out of the axis and out of
# every row of the transposed spectra.
rng = np.arange(620, 640)
xc, Ac = xorg[rng], Akn[:, rng]

# Report the size of the full (uncut) spectra matrix, then its contents
print("shape : ", Akn.shape)
print(Akn)

shape :  (36, 1650)
[[462.8   464.4   464.    ... 462.2   460.6   460.8  ]
 [463.8   460.6   462.6   ... 461.6   458.2   455.8  ]
 [464.6   462.4   462.4   ... 460.2   457.6   457.4  ]
 ...
 [480.714 485.    479.857 ... 481.714 476.857 477.714]
 [488.571 488.286 491.    ... 473.286 470.143 471.571]
 [513.143 508.429 509.429 ... 487.143 486.857 483.   ]]


In [3]:
import os
import pandas as pd

# Path to the folder containing the .txt files, and where the result goes
folder_path = './9_Strains/setB'
output_dir = './csv_data3'

# List all .txt files in the folder.
# Note: os.listdir returns entries in arbitrary (filesystem) order, and the
# rows of the output follow that order.
txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]

# One transposed, labelled frame per input file
spectra_data = []

for txt_file in txt_files:
    file_path = os.path.join(folder_path, txt_file)
    # Whitespace-separated, no header row. sep=r'\s+' is the documented
    # replacement for delim_whitespace=True, which pandas 2.2 deprecated.
    df = pd.read_csv(file_path, sep=r'\s+', header=None)

    # Drop the first column (x-axis) and transpose so that every remaining
    # file column becomes one row
    spectrum = df.iloc[:, 1:].transpose()

    # Label every row with the file name minus its extension. splitext only
    # removes the final extension, whereas split('.txt') would cut the name at
    # the first '.txt' occurring anywhere in it.
    spectrum['label'] = os.path.splitext(txt_file)[0]

    spectra_data.append(spectrum)

# Stack the per-file frames into a single table with a fresh row index
final_df = pd.concat(spectra_data, ignore_index=True)

# Save the final DataFrame (create the folder first so to_csv cannot fail on
# a missing directory)
os.makedirs(output_dir, exist_ok=True)
final_df.to_csv(f'{output_dir}/test.csv', index=False)


In [8]:
import pandas as pd
import re

# Load the CSV file into a DataFrame
csv_file = './csv_data3/validate.csv'
df = pd.read_csv(csv_file)

# The last column holds the text labels (bacteria names)
last_column = df.iloc[:, -1]


def clean_bacteria_name(name):
    """Return `name` without a trailing "_<digits>" suffix.

    Only a purely numeric suffix at the very end is removed (e.g. "_1",
    "_12"); underscores elsewhere in the name are kept.
    """
    return re.sub(r'_\d+$', '', name)


# Apply the function to clean the bacteria names
cleaned_names = last_column.apply(clean_bacteria_name)

# Map each distinct name to an integer id.
# The ids are assigned from a sorted list rather than in order of first
# appearance, so the same name gets the same id no matter how the rows of the
# input file are ordered (otherwise separately encoded files could disagree).
# The case-insensitive key reproduces the mapping this cell previously printed
# for this dataset (S_abony last); the name itself breaks ties.
unique_names = sorted(cleaned_names.unique(), key=lambda s: (s.upper(), s))
bacteria_labels = {name: idx for idx, name in enumerate(unique_names)}

# Replace the text labels with their numeric ids. Assigning the whole column
# by name gives it an integer dtype instead of writing ints into the existing
# object column.
df[df.columns[-1]] = cleaned_names.map(bacteria_labels)

# Save the updated DataFrame to a new CSV file
df.to_csv('./csv_data3/BS_validate.csv', index=False)

# Optional: If you want to print the mapping
print("Bacteria to Numeric Label Mapping:")
for bacteria, label in bacteria_labels.items():
    print(f"{bacteria}: {label}")


Bacteria to Numeric Label Mapping:
Enterobacter_aerogenes: 0
Enterococcus_faecalis: 1
Eschericia_coli: 2
Listeria_innocua: 3
Listeria_monocytogenes: 4
Rhodococcus_equii: 5
Shigella_boydi: 6
Staphylococcus_aureus: 7
S_abony: 8
