In [1]:
import pandas as pd

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Raw MorFi export to read, and the CSV the formatted row is written to
file_path = 'MorFi.xlsx'
output_file = "formatted_data.csv"

# Positional offset (into the parsed sheet) at which the feature/value table
# starts (Row 13 in the standard MorFi file)
DATA_START_ROW = 13

# (regex, replacement) pairs that repair mis-encoded characters in feature
# names: the half-width katakana stand-ins are mapped back to µ, ² and °
ENCODING_FIXES = [
    (r'[ｵµ]', 'µ'),  # Fix µm
    (r'ｲ', '²'),     # Fix ²
    (r'ｰ', '°'),     # Fix °
]

# Histogram bins that are summed into a single aggregated feature
feature_groups = {
    "Fiber_Length_Below_556": ["[200-289]", "[289-378]", "[378-467]", "[467-556]"],
    "Fiber_Length_Above_556": ["[556-644]", "[644-733]", "[733-822]", "[822-911]", "[911-1000]"],
    "Fiber_width_Below_19": ["[5-12]", "[12-19]"],
    "Fiber_width_Above_19": ["[19-26]", "[26-33]", "[33-39]", "[39-46]", "[46-53]", "[53-60]", "[60-67]", "[67->]"]
}

# Required features, in the column order of the output CSV
final_feature_order = [
    "Number of analysed fibres",
    "Fibre content, millions/g of pulp",
    "Fiber_Length_Below_556",
    "Fiber_Length_Above_556",
    "[1000->]",
    "Mean fibre arithmetic length, µm",
    "Mean length-weighted fibre length, µm",
    "Fiber_width_Below_19",
    "Fiber_width_Above_19",
    "Mean fibre width, µm",
    "Mean fibre coarseness, mg/m",
    "Average kink number",
    "Average kink angle, °",
    "Kinked fibre content, %",
    "Mean fibre curl index, %",
    "MacroFibrillation index, %",
    "Broken fibre content, %",
    "Number of analysed fines",
    "Fines content, millions/g of pulp",
    "Fine content, % in area",
    "Fine content, % in length",
    "Fine content, % in length weighted in length",
    "Mean fine area, µm²",
    "Mean fine length, µm"
]


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def clean_feature_table(raw_sheet, first_data_row=DATA_START_ROW):
    """Turn the parsed MorFi sheet into a tidy two-column feature table.

    Parameters
    ----------
    raw_sheet : pandas.DataFrame
        The sheet as parsed by pandas; it must have exactly two columns.
    first_data_row : int
        Positional index of the first row that belongs to the data table.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Value``; rows with any missing cell are
        dropped and the mis-encoded characters in ``Feature`` are repaired.

    Raises
    ------
    ValueError
        If the sheet does not have exactly two columns (raised by pandas when
        the column names are assigned).
    """
    table = raw_sheet.iloc[first_data_row:].reset_index(drop=True)
    table.columns = ["Feature", "Value"]

    # Copy after dropping empty rows so the column assignment below works on
    # an independent frame rather than on a possible view.
    table = table.dropna().copy()

    for pattern, replacement in ENCODING_FIXES:
        table["Feature"] = table["Feature"].str.replace(pattern, replacement, regex=True)
    return table


def extract_features(feature_table, feature_order, groups):
    """Collect the requested features from a cleaned feature table.

    Parameters
    ----------
    feature_table : pandas.DataFrame
        Output of ``clean_feature_table`` (columns ``Feature`` and ``Value``).
    feature_order : list of str
        Names of every feature wanted in the output.
    groups : dict of str -> list of str
        Aggregated feature name mapped to the bin labels that are summed.

    Returns
    -------
    (dict, list)
        ``values`` maps feature name to its value: the first matching row for
        a plain feature, the numeric sum of all matching rows for a group.
        ``missing`` lists the requested names for which no row was found; a
        missing plain feature is left out of ``values``, a missing group is
        stored as 0.
    """
    values = {}
    missing = []

    # Plain features: take the first row whose name matches exactly.
    for feature in feature_order:
        if feature in groups:
            continue  # filled in by the group pass below
        matches = feature_table.loc[feature_table["Feature"] == feature, "Value"].values
        if len(matches) > 0:
            values[feature] = matches[0]
        else:
            missing.append(feature)

    # Grouped features: sum every bin that belongs to the group. The Value
    # column is object-typed, so coerce to numbers first; otherwise a text cell
    # would raise or be string-concatenated. Unparseable cells become NaN and
    # are skipped by sum().
    for new_feature, labels in groups.items():
        group_values = feature_table.loc[feature_table["Feature"].isin(labels), "Value"]
        if group_values.empty:
            missing.append(new_feature)
            values[new_feature] = 0
        else:
            values[new_feature] = pd.to_numeric(group_values, errors="coerce").sum()

    return values, missing


# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------
# The guard is true inside a notebook kernel, so the cell behaves as before;
# it only keeps the file I/O from running when the helpers are imported.
if __name__ == "__main__":
    # Load the first sheet; the context manager closes the workbook handle
    with pd.ExcelFile(file_path) as xls:
        df = xls.parse(xls.sheet_names[0])

    df_data = clean_feature_table(df)

    final_data, missing_features = extract_features(df_data, final_feature_order, feature_groups)
    if missing_features:
        # Surface gaps instead of silently writing empty/zero columns
        print(f"Warning: no rows found in {file_path} for: {missing_features}")

    # Convert dictionary to DataFrame in the correct order
    df_final = pd.DataFrame([final_data], columns=final_feature_order)

    # Save to CSV
    df_final.to_csv(output_file, index=False)

    print(f"Formatted data has been saved to {output_file}.")


Formatted data has been saved to formatted_data.csv.


Normalizing the data based on the training and validation dataset

In [2]:
import pandas as pd

def min_max_normalize(frame, reference):
    """Min-max scale ``frame`` using the column ranges of ``reference``.

    Both inputs are first converted to numeric, with unparseable cells turned
    into NaN, so the bounds are always numbers even when a column was read as
    text. Neither input is modified.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to scale; must contain every column of ``reference``.
    reference : pandas.DataFrame
        Rows whose per-column minimum and maximum define the scaling.

    Returns
    -------
    pandas.DataFrame
        A numeric copy of ``frame`` in which every column of ``reference`` is
        ``(x - min) / (max - min)``. A column whose reference minimum equals
        its maximum is set to 0.
    """
    scaled = frame.apply(pd.to_numeric, errors='coerce')
    bounds_source = reference.apply(pd.to_numeric, errors='coerce')

    for column in bounds_source.columns:
        min_value = bounds_source[column].min()
        max_value = bounds_source[column].max()

        # Avoid division by zero
        if max_value != min_value:
            scaled[column] = (scaled[column] - min_value) / (max_value - min_value)
        else:
            scaled[column] = 0  # If min == max, set normalized value to 0
    return scaled


# The guard is true inside a notebook kernel, so the cell behaves as before;
# it only keeps the file I/O from running when the helper is imported.
if __name__ == "__main__":
    # Load the training and validation dataset
    training_file = "training_and_validation_data.csv"
    df_train = pd.read_csv(training_file)

    # Load the formatted test data
    formatted_file = "formatted_data.csv"
    df_test = pd.read_csv(formatted_file)

    # Ensure both datasets have the same column order
    df_test = df_test[df_train.columns]

    # Normalize training and test rows together, using the ranges of the
    # training and validation data only
    df_combined = min_max_normalize(
        pd.concat([df_train, df_test], ignore_index=True),
        df_train,
    )

    # The test rows are everything after the training rows; slicing by
    # position keeps all of them, not just the last one
    df_normalized_test = df_combined.iloc[len(df_train):].reset_index(drop=True)

    # Save the normalized data to a new CSV file
    output_file = "normalized_final_data.csv"
    df_normalized_test.to_csv(output_file, index=False)

    print(f"Normalized final data has been saved to {output_file}")


Normalized final data has been saved to normalized_final_data.csv
