In [4]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from docx import Document
from docx.shared import Inches

def perform_pca(file_path, n_components=5, top_n=10):
    """Run PCA on one reflectance CSV and rank the wavelengths driving each component.

    The first CSV column is skipped (it is assumed to hold a sample identifier);
    every remaining column is used as a feature and its header text is used as the
    wavelength label. Rows containing any missing value are dropped first.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    n_components : int, default 5
        Maximum number of leading principal components to report. Fewer are
        returned when the data does not support that many components.
    top_n : int, default 10
        Maximum number of wavelengths to keep per component.

    Returns
    -------
    explained_variance : numpy.ndarray
        Explained-variance ratios of the reported components.
    pc_results : dict
        Maps ``"PC1"``, ``"PC2"``, ... to a list of ``(wavelength, loading)``
        tuples sorted by decreasing absolute loading.

    Raises
    ------
    ValueError
        If fewer than two complete rows or no feature columns remain, or if a
        feature column cannot be converted to float.
    """
    data = pd.read_csv(file_path)
    data_cleaned = data.dropna()

    # Everything after the first column is treated as a reflectance feature.
    features = data_cleaned.iloc[:, 1:]
    if features.shape[0] < 2 or features.shape[1] == 0:
        raise ValueError(
            f"Not enough data for PCA in {file_path!r}: "
            f"{features.shape[0]} complete row(s), {features.shape[1]} feature column(s)"
        )
    wavelengths = features.columns
    reflectance_matrix = features.to_numpy(dtype=float)

    # Standardize each column (mean 0, std 1). A constant column has std 0; dividing
    # by it would inject NaN and make PCA fail, so such columns are only centred.
    column_std = np.std(reflectance_matrix, axis=0)
    safe_std = np.where(column_std == 0, 1.0, column_std)
    reflectance_normalized = (reflectance_matrix - np.mean(reflectance_matrix, axis=0)) / safe_std

    pca = PCA()
    pca.fit(reflectance_normalized)

    explained_variance = pca.explained_variance_ratio_
    principal_components = pca.components_

    # PCA yields at most min(n_samples, n_features) components, so clamp the request
    # instead of indexing past the end of ``components_``.
    n_reported = max(0, min(int(n_components), principal_components.shape[0]))
    n_top = max(0, int(top_n))

    pc_results = {}
    for pc_index in range(n_reported):
        loading_scores = principal_components[pc_index]
        ranked = sorted(zip(wavelengths, loading_scores), key=lambda pair: abs(pair[1]), reverse=True)
        pc_results[f"PC{pc_index + 1}"] = ranked[:n_top]

    return explained_variance[:n_reported], pc_results

def generate_word_report(file_paths, output_docx):
    """Run ``perform_pca`` on each CSV and write the results to a Word document.

    The document contains one explained-variance table (one row per input file)
    followed by, for every file and every principal component, a two-column table
    of wavelength labels and their loadings.

    Parameters
    ----------
    file_paths : iterable
        CSV inputs, processed in order. Files are labelled "Day 1", "Day 2", ...
        by their position in this sequence, not by any date in the file name.
    output_docx : str
        Destination path passed to ``Document.save``.

    Returns
    -------
    None
        The report is written to ``output_docx`` and progress is printed.
    """
    report_pc_percentages = []
    report_pc_wavelengths = []

    for day, file_path in enumerate(file_paths, start=1):
        print(f"Processing Day {day}...")
        explained_variance, pc_results = perform_pca(file_path)

        report_pc_percentages.append(
            [f"Day {day}"] + [f"{variance * 100:.2f}%" for variance in explained_variance]
        )
        report_pc_wavelengths.append((f"Day {day}", pc_results))

    doc = Document()
    doc.add_heading("PCA Report: Day-by-Day Analysis", 0)

    # Explained-variance table. The layout keeps the usual five PC columns and only
    # widens if a day reports more; days with fewer PCs leave trailing cells blank
    # instead of failing on a column-count mismatch.
    doc.add_heading("Explained Variance Percentages", level=1)
    pc_column_count = max([5] + [len(row) - 1 for row in report_pc_percentages])
    header_titles = ["Day"] + [f"PC{number}" for number in range(1, pc_column_count + 1)]
    table = doc.add_table(rows=1, cols=len(header_titles))
    table.style = "Table Grid"
    for cell, title in zip(table.rows[0].cells, header_titles):
        cell.text = title
    for row in report_pc_percentages:
        row_cells = table.add_row().cells
        for i, value in enumerate(row):
            row_cells[i].text = value

    # One small table per (day, PC) listing the ranked wavelengths.
    doc.add_heading("Top 10 Wavelengths for Each Principal Component", level=1)
    for day_label, day_results in report_pc_wavelengths:
        doc.add_heading(f"{day_label}:", level=2)
        for pc, values in day_results.items():
            doc.add_heading(f"{pc}:", level=3)
            table = doc.add_table(rows=1, cols=2)
            table.style = "Table Grid"
            hdr_cells = table.rows[0].cells
            hdr_cells[0].text = "Wavelength"
            hdr_cells[1].text = "Contribution"
            for wavelength, score in values:
                row_cells = table.add_row().cells
                # Column labels are not guaranteed to be str; cell text must be.
                row_cells[0].text = str(wavelength)
                row_cells[1].text = f"{score:.4f}"

    doc.save(output_docx)
    print(f"Word report saved to {output_docx}")

# Example usage: green-apple measurements, one merged CSV per measurement date.
_green_dir = "C:\\Users\\sumat\\Desktop\\GSI Suman\\pravakar\\Filter data of Green apple\\Merged data"
_green_dates = ("09-07-24", "12-07-24", "23-07-24", "31-07-24", "06-09-24", "03-10-24")
file_paths = [
    f"{_green_dir}\\merged_wavelength_reflectance {date_tag}.csv"
    for date_tag in _green_dates
]
generate_word_report(file_paths, "pca_report_green.docx")

Processing Day 1...
Processing Day 2...
Processing Day 3...
Processing Day 4...
Processing Day 5...
Processing Day 6...
Word report saved to pca_report_green.docx


In [4]:
# Red-apple measurements, one merged CSV per measurement date.
_red_dir = "C:\\Users\\sumat\\Desktop\\GSI Suman\\pravakar\\filtered data of red apple\\merged data"
_red_files = (
    "merged_file09-07-24.csv",
    "merged_file12-07-24.csv",
    "merged_file23-07-24.csv",
    "merged_file31-07-24.csv",
    "merged_file06-09-24.csv",
    "merged_file 03-10-24.csv",  # this file name contains a space, unlike the others
)
file_paths = [_red_dir + "\\" + csv_name for csv_name in _red_files]
generate_word_report(file_paths, "pca_report_red.docx")

Processing Day 1...
Processing Day 2...
Processing Day 3...
Processing Day 4...
Processing Day 5...
Processing Day 6...
Word report saved to pca_report_red.docx


In [5]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from docx import Document
from docx.shared import Inches

def perform_pca(file_path, n_components=5, top_n=10):
    """Run PCA on one reflectance CSV and list the top wavelengths of each component.

    The first CSV column is skipped (it is assumed to hold a sample identifier);
    every remaining column is used as a feature and its header text is used as the
    wavelength label. Rows containing any missing value are dropped first.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    n_components : int, default 5
        Maximum number of leading principal components to report. Fewer are
        returned when the data does not support that many components.
    top_n : int, default 10
        Maximum number of wavelengths to keep per component.

    Returns
    -------
    explained_variance : numpy.ndarray
        Explained-variance ratios of the reported components.
    pc_results : dict
        Maps ``"PC1"``, ``"PC2"``, ... to a list of wavelength labels sorted by
        decreasing absolute loading (the loadings themselves are not returned).

    Raises
    ------
    ValueError
        If fewer than two complete rows or no feature columns remain, or if a
        feature column cannot be converted to float.
    """
    data = pd.read_csv(file_path)
    data_cleaned = data.dropna()

    # Everything after the first column is treated as a reflectance feature.
    features = data_cleaned.iloc[:, 1:]
    if features.shape[0] < 2 or features.shape[1] == 0:
        raise ValueError(
            f"Not enough data for PCA in {file_path!r}: "
            f"{features.shape[0]} complete row(s), {features.shape[1]} feature column(s)"
        )
    wavelengths = features.columns
    reflectance_matrix = features.to_numpy(dtype=float)

    # Standardize each column (mean 0, std 1). A constant column has std 0; dividing
    # by it would inject NaN and make PCA fail, so such columns are only centred.
    column_std = np.std(reflectance_matrix, axis=0)
    safe_std = np.where(column_std == 0, 1.0, column_std)
    reflectance_normalized = (reflectance_matrix - np.mean(reflectance_matrix, axis=0)) / safe_std

    pca = PCA()
    pca.fit(reflectance_normalized)

    explained_variance = pca.explained_variance_ratio_
    principal_components = pca.components_

    # PCA yields at most min(n_samples, n_features) components, so clamp the request
    # instead of indexing past the end of ``components_``.
    n_reported = max(0, min(int(n_components), principal_components.shape[0]))
    n_top = max(0, int(top_n))

    pc_results = {}
    for pc_index in range(n_reported):
        loading_scores = principal_components[pc_index]
        ranked = sorted(zip(wavelengths, loading_scores), key=lambda pair: abs(pair[1]), reverse=True)
        pc_results[f"PC{pc_index + 1}"] = [wavelength for wavelength, _ in ranked[:n_top]]

    return explained_variance[:n_reported], pc_results

def generate_word_report(file_paths, output_docx):
    """Run ``perform_pca`` on each CSV and write a day-by-day Word report.

    The document contains one explained-variance table (one row per input file)
    followed by one table per principal component (PC1-PC5) whose columns are the
    input files and whose rows are that file's top-ranked wavelength labels.

    Parameters
    ----------
    file_paths : iterable
        CSV inputs, processed in order. Files are labelled "Day 1", "Day 2", ...
        by their position in this sequence, not by any date in the file name.
    output_docx : str
        Destination path passed to ``Document.save``.

    Returns
    -------
    None
        The report is written to ``output_docx`` and progress is printed.
    """
    report_pc_percentages = []
    report_pc_wavelengths = []

    for day, file_path in enumerate(file_paths, start=1):
        print(f"Processing Day {day}...")
        explained_variance, pc_results = perform_pca(file_path)

        report_pc_percentages.append(
            [f"Day {day}"] + [f"{variance * 100:.2f}%" for variance in explained_variance]
        )
        report_pc_wavelengths.append((f"Day {day}", pc_results))

    doc = Document()
    doc.add_heading("PCA Report: Day-by-Day Analysis", 0)

    # Explained-variance table. The layout keeps the usual five PC columns and only
    # widens if a day reports more; days with fewer PCs leave trailing cells blank
    # instead of failing on a column-count mismatch.
    doc.add_heading("Explained Variance Percentages", level=1)
    pc_column_count = max([5] + [len(row) - 1 for row in report_pc_percentages])
    header_titles = ["Day"] + [f"PC{number}" for number in range(1, pc_column_count + 1)]
    table = doc.add_table(rows=1, cols=len(header_titles))
    table.style = "Table Grid"
    for cell, title in zip(table.rows[0].cells, header_titles):
        cell.text = title
    for row in report_pc_percentages:
        row_cells = table.add_row().cells
        for i, value in enumerate(row):
            row_cells[i].text = value

    doc.add_heading("Top 10 Wavelengths for Each Principal Component (Day-by-Day)", level=1)

    # One table per PC: header row of day labels plus ten wavelength rows.
    for pc_index in range(5):
        pc = f"PC{pc_index + 1}"
        doc.add_heading(f"{pc} Wavelengths", level=2)

        # Size by the days actually processed so the column count always matches.
        table = doc.add_table(rows=11, cols=len(report_pc_wavelengths))
        table.style = "Table Grid"

        for col_idx, (day_label, day_results) in enumerate(report_pc_wavelengths):
            table.cell(0, col_idx).text = day_label
            # A day may have fewer PCs or fewer ranked wavelengths than the table
            # has room for; leave those cells empty rather than raising.
            ranked = day_results.get(pc, [])
            for row_idx, wavelength in enumerate(ranked[:10], start=1):
                # Column labels are not guaranteed to be str; cell text must be.
                table.cell(row_idx, col_idx).text = str(wavelength)

    doc.save(output_docx)
    print(f"Word report saved to {output_docx}")

# Example usage: green-apple measurements, one merged CSV per measurement date.
_green_dir = "C:\\Users\\sumat\\Desktop\\GSI Suman\\pravakar\\Filter data of Green apple\\Merged data"
_green_dates = ("09-07-24", "12-07-24", "23-07-24", "31-07-24", "06-09-24", "03-10-24")
file_paths = [
    f"{_green_dir}\\merged_wavelength_reflectance {date_tag}.csv"
    for date_tag in _green_dates
]
generate_word_report(file_paths, "pca_report_green1.docx")

Processing Day 1...
Processing Day 2...
Processing Day 3...
Processing Day 4...
Processing Day 5...
Processing Day 6...
Word report saved to pca_report_green1.docx


In [6]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from docx import Document
from docx.shared import Inches

def perform_pca(file_path, n_components=5, top_n=10):
    """Run PCA on one reflectance CSV and list the top wavelengths of each component.

    The first CSV column is skipped (it is assumed to hold a sample identifier);
    every remaining column is used as a feature and its header text is used as the
    wavelength label. Rows containing any missing value are dropped first.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    n_components : int, default 5
        Maximum number of leading principal components to report. Fewer are
        returned when the data does not support that many components.
    top_n : int, default 10
        Maximum number of wavelengths to keep per component.

    Returns
    -------
    explained_variance : numpy.ndarray
        Explained-variance ratios of the reported components.
    pc_results : dict
        Maps ``"PC1"``, ``"PC2"``, ... to a list of wavelength labels sorted by
        decreasing absolute loading (the loadings themselves are not returned).

    Raises
    ------
    ValueError
        If fewer than two complete rows or no feature columns remain, or if a
        feature column cannot be converted to float.
    """
    data = pd.read_csv(file_path)
    data_cleaned = data.dropna()

    # Everything after the first column is treated as a reflectance feature.
    features = data_cleaned.iloc[:, 1:]
    if features.shape[0] < 2 or features.shape[1] == 0:
        raise ValueError(
            f"Not enough data for PCA in {file_path!r}: "
            f"{features.shape[0]} complete row(s), {features.shape[1]} feature column(s)"
        )
    wavelengths = features.columns
    reflectance_matrix = features.to_numpy(dtype=float)

    # Standardize each column (mean 0, std 1). A constant column has std 0; dividing
    # by it would inject NaN and make PCA fail, so such columns are only centred.
    column_std = np.std(reflectance_matrix, axis=0)
    safe_std = np.where(column_std == 0, 1.0, column_std)
    reflectance_normalized = (reflectance_matrix - np.mean(reflectance_matrix, axis=0)) / safe_std

    pca = PCA()
    pca.fit(reflectance_normalized)

    explained_variance = pca.explained_variance_ratio_
    principal_components = pca.components_

    # PCA yields at most min(n_samples, n_features) components, so clamp the request
    # instead of indexing past the end of ``components_``.
    n_reported = max(0, min(int(n_components), principal_components.shape[0]))
    n_top = max(0, int(top_n))

    pc_results = {}
    for pc_index in range(n_reported):
        loading_scores = principal_components[pc_index]
        ranked = sorted(zip(wavelengths, loading_scores), key=lambda pair: abs(pair[1]), reverse=True)
        pc_results[f"PC{pc_index + 1}"] = [wavelength for wavelength, _ in ranked[:n_top]]

    return explained_variance[:n_reported], pc_results

def generate_word_report(file_paths, output_docx):
    """Run ``perform_pca`` on each CSV and write a day-by-day Word report.

    The document contains one explained-variance table (one row per input file)
    followed by one table per principal component (PC1-PC5) whose columns are the
    input files and whose rows are that file's top-ranked wavelength labels.

    Parameters
    ----------
    file_paths : iterable
        CSV inputs, processed in order. Files are labelled "Day 1", "Day 2", ...
        by their position in this sequence, not by any date in the file name.
    output_docx : str
        Destination path passed to ``Document.save``.

    Returns
    -------
    None
        The report is written to ``output_docx`` and progress is printed.
    """
    report_pc_percentages = []
    report_pc_wavelengths = []

    for day, file_path in enumerate(file_paths, start=1):
        print(f"Processing Day {day}...")
        explained_variance, pc_results = perform_pca(file_path)

        report_pc_percentages.append(
            [f"Day {day}"] + [f"{variance * 100:.2f}%" for variance in explained_variance]
        )
        report_pc_wavelengths.append((f"Day {day}", pc_results))

    doc = Document()
    doc.add_heading("PCA Report: Day-by-Day Analysis", 0)

    # Explained-variance table. The layout keeps the usual five PC columns and only
    # widens if a day reports more; days with fewer PCs leave trailing cells blank
    # instead of failing on a column-count mismatch.
    doc.add_heading("Explained Variance Percentages", level=1)
    pc_column_count = max([5] + [len(row) - 1 for row in report_pc_percentages])
    header_titles = ["Day"] + [f"PC{number}" for number in range(1, pc_column_count + 1)]
    table = doc.add_table(rows=1, cols=len(header_titles))
    table.style = "Table Grid"
    for cell, title in zip(table.rows[0].cells, header_titles):
        cell.text = title
    for row in report_pc_percentages:
        row_cells = table.add_row().cells
        for i, value in enumerate(row):
            row_cells[i].text = value

    doc.add_heading("Top 10 Wavelengths for Each Principal Component (Day-by-Day)", level=1)

    # One table per PC: header row of day labels plus ten wavelength rows.
    for pc_index in range(5):
        pc = f"PC{pc_index + 1}"
        doc.add_heading(f"{pc} Wavelengths", level=2)

        # Size by the days actually processed so the column count always matches.
        table = doc.add_table(rows=11, cols=len(report_pc_wavelengths))
        table.style = "Table Grid"

        for col_idx, (day_label, day_results) in enumerate(report_pc_wavelengths):
            table.cell(0, col_idx).text = day_label
            # A day may have fewer PCs or fewer ranked wavelengths than the table
            # has room for; leave those cells empty rather than raising.
            ranked = day_results.get(pc, [])
            for row_idx, wavelength in enumerate(ranked[:10], start=1):
                # Column labels are not guaranteed to be str; cell text must be.
                table.cell(row_idx, col_idx).text = str(wavelength)

    doc.save(output_docx)
    print(f"Word report saved to {output_docx}")

# Red-apple measurements, one merged CSV per measurement date.
_red_dir = "C:\\Users\\sumat\\Desktop\\GSI Suman\\pravakar\\filtered data of red apple\\merged data"
_red_files = (
    "merged_file09-07-24.csv",
    "merged_file12-07-24.csv",
    "merged_file23-07-24.csv",
    "merged_file31-07-24.csv",
    "merged_file06-09-24.csv",
    "merged_file 03-10-24.csv",  # this file name contains a space, unlike the others
)
file_paths = [_red_dir + "\\" + csv_name for csv_name in _red_files]
generate_word_report(file_paths, "pca_report_red1.docx")

Processing Day 1...
Processing Day 2...
Processing Day 3...
Processing Day 4...
Processing Day 5...
Processing Day 6...
Word report saved to pca_report_red1.docx
