In [3]:
import pandas as pd
import os
import re
from pathlib import Path
from openpyxl.styles import Font, PatternFill
from typing import Optional, Dict, Any, List

In [4]:
class ProductRecallsExtractor:
    """
    Extracts product-recall data from Excel/CSV files and classifies the stated recall reasons.

    Every input file is read as a lookup table: the 'Element Name' column holds the key
    and the 'Fact Value' column holds the value. One summary row is produced per file
    that can be read.
    """

    def __init__(self) -> None:
        """Initializes the extractor with configuration settings."""
        # --- Main Element Configuration ---
        self.ELEMENT_NAME_COL = 'Element Name'
        self.VALUE_COL = 'Fact Value'
        self.COMPANY_NAME_ELEMENT = 'NameOfTheCompany'
        self.VOLUNTARY_RECALLS_ELEMENT = 'NumberOfVoluntaryRecalls'
        self.FORCED_RECALLS_ELEMENT = 'NumberOfForcedRecalls'
        self.VOLUNTARY_REASONS_ELEMENT = 'ReasonsForVoluntaryRecall'
        self.FORCED_REASONS_ELEMENT = 'ReasonsForForcedRecall'

        # --- Reason Classification Configuration ---
        # Maps an output column name to the keywords that flag that category.
        # Keywords are matched as case-insensitive substrings of the reason text.
        self.REASON_CATEGORIES = {
            'Contamination/Foreign Material': ['contamination', 'foreign material', 'foreign object', 'contaminated'],
            'Product Defect or Malfunction': ['defect', 'defective', 'malfunction', 'broken', 'faulty', 'failed', 'error'],
            'Labeling or Packaging Error': ['labeling', 'label', 'packaging', 'mislabeled', 'packaging error', 'incorrect label', 'undeclared allergen'],
            'Safety Standard Non-Compliance': ['safety', 'non-compliance', 'standard', 'regulation', 'unsafe', 'hazard'],
            'Quality, Purity, or Potency Issue': ['quality', 'purity', 'potency', 'ingredient', 'sub-standard', 'strength', 'impurity']
        }

        # One compiled alternation per category. Keywords are escaped so that a keyword
        # containing a regex metacharacter is still matched literally.
        self.PRECOMPILED_REASON_REGEX = {
            category: re.compile('|'.join(re.escape(keyword) for keyword in keywords), re.IGNORECASE)
            for category, keywords in self.REASON_CATEGORIES.items()
        }

    def _get_text_value(self, df: pd.DataFrame, element_name: str, default_value: Optional[str] = '0') -> Optional[str]:
        """
        Looks up an element name in a DataFrame and returns its value as a string.

        Args:
            df: Frame expected to contain the element-name and value columns.
            element_name: Exact element name to look for.
            default_value: Returned when either column is missing, the element is
                absent, or its first matching value is NaN.

        Returns:
            The first matching value converted with ``str``, or ``default_value``.
        """
        if self.ELEMENT_NAME_COL not in df.columns or self.VALUE_COL not in df.columns:
            return default_value
        series = df.loc[df[self.ELEMENT_NAME_COL] == element_name, self.VALUE_COL]
        if not series.empty and not pd.isna(series.iloc[0]):
            return str(series.iloc[0])
        return default_value

    def _process_single_file(self, file_path: Path) -> Optional[Dict[str, Any]]:
        """
        Processes a single file (Excel or CSV) and extracts recall data.

        Args:
            file_path: Path of the file to read.

        Returns:
            A dict with the company name, both recall counts (still as text) and the
            combined reasons, or ``None`` when the file type is unsupported or the
            file cannot be read.
        """
        # Compare the extension case-insensitively so 'REPORT.XLSX' / 'data.CSV' are not skipped.
        suffix = file_path.suffix.lower()
        try:
            if suffix == '.xlsx':
                df = pd.read_excel(file_path, engine='openpyxl')
            elif suffix == '.xls':
                # openpyxl cannot read the legacy .xls format; let pandas choose the engine.
                df = pd.read_excel(file_path)
            elif suffix == '.csv':
                df = pd.read_csv(file_path, on_bad_lines='skip', encoding_errors='ignore')
            else:
                return None  # Skip unsupported file types silently
        except Exception as e:
            # Best-effort batch job: report the unreadable file and carry on with the rest.
            print(f"Error reading file {file_path.name}: {e}")
            return None

        company_name = self._get_text_value(df, self.COMPANY_NAME_ELEMENT, "Unknown Company")
        voluntary_recalls = self._get_text_value(df, self.VOLUNTARY_RECALLS_ELEMENT)
        forced_recalls = self._get_text_value(df, self.FORCED_RECALLS_ELEMENT)
        voluntary_reasons = self._get_text_value(df, self.VOLUNTARY_REASONS_ELEMENT, default_value="")
        forced_reasons = self._get_text_value(df, self.FORCED_REASONS_ELEMENT, default_value="")

        # Keep only non-blank reason texts; voluntary first, then forced.
        reasons_list = [r for r in [voluntary_reasons, forced_reasons] if r and str(r).strip()]
        combined_reasons = " | ".join(reasons_list) if reasons_list else "Not Provided"

        data_entry = {
            'Company Name': company_name,
            'NumberOfVoluntaryRecalls': voluntary_recalls,
            'NumberOfForcedRecalls': forced_recalls,
            'Reasons': combined_reasons
        }

        print(f"--- Successfully extracted data for: {company_name}")
        return data_entry

    def _convert_df_to_numeric(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Converts recall count columns from text to integers.

        Args:
            df: Frame that may contain 'NumberOfVoluntaryRecalls' and/or
                'NumberOfForcedRecalls'.

        Returns:
            A new DataFrame with those columns converted; the input is not modified.
        """
        print("\n[INFO] Converting recall counts from text to numbers...")

        def clean_and_convert(value):
            """Turns one raw cell into an int; anything unparseable becomes 0."""
            if pd.isna(value):
                return 0
            value_str = str(value).strip().lower()
            if value_str in ['not applicable', 'n/a', '-', 'nil', 'none', '']:
                return 0
            # Accounting notation: "(5)" means -5.
            if value_str.startswith('(') and value_str.endswith(')'):
                value_str = '-' + value_str[1:-1]
            # Drop thousands separators, units and any other non-numeric characters.
            cleaned_str = re.sub(r'[^0-9.-]', '', value_str)
            if not cleaned_str or cleaned_str in ['-', '.']:
                return 0
            try:
                return int(float(cleaned_str))
            except (ValueError, TypeError, OverflowError):
                # OverflowError: a very long digit run parses to inf, which int() rejects.
                return 0

        # Work on a copy so the caller's frame is left untouched.
        result = df.copy()
        numeric_cols = ['NumberOfVoluntaryRecalls', 'NumberOfForcedRecalls']
        for col in numeric_cols:
            if col in result.columns:
                result[col] = result[col].apply(clean_and_convert)

        print("[INFO] Data conversion complete.")
        return result

    def _classify_reasons(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Adds one 0/1 indicator column per reason category.

        A category column is 1 when any of its keywords occurs (case-insensitively)
        in the row's 'Reasons' text, otherwise 0.

        Args:
            df: Frame containing a 'Reasons' column.

        Returns:
            A new DataFrame with the indicator columns added and 'Reasons' cast to
            ``str``. If 'Reasons' is missing, the input frame is returned unchanged.
        """
        print("[INFO] Classifying recall reasons into categories...")
        if 'Reasons' not in df.columns:
            print("[WARN] 'Reasons' column not found in the DataFrame. Skipping classification.")
            return df

        # Work on a copy so the caller's frame is left untouched.
        result = df.copy()
        # Ensure the 'Reasons' column is treated as a string to prevent errors
        result['Reasons'] = result['Reasons'].astype(str)

        for category, pattern in self.PRECOMPILED_REASON_REGEX.items():
            # Bind the pattern as a default argument so the lambda never depends on
            # the loop variable's later values.
            result[category] = result['Reasons'].apply(
                lambda text, rx=pattern: 1 if rx.search(text) else 0
            )

        print("[INFO] Reason classification complete.")
        return result

    def process_directory(self, directory: str) -> pd.DataFrame:
        """
        Processes all valid files in a directory, extracts data, cleans it, and classifies reasons.

        Args:
            directory: Folder to scan. Sub-directories are ignored.

        Returns:
            A summary DataFrame with one row per successfully processed file, or an
            empty DataFrame when the folder is missing/empty or nothing could be read.
        """
        source_path = Path(directory)
        if not source_path.is_dir() or not any(source_path.iterdir()):
            print(f"Error: The directory '{directory}' is empty or does not exist.")
            return pd.DataFrame()

        print(f"[INFO] Starting analysis of files in '{directory}'...\n")
        all_data: List[Dict[str, Any]] = []
        # iterdir() yields entries in arbitrary order; sort so row order is reproducible.
        for file_path in sorted(source_path.iterdir()):
            if file_path.is_dir():
                continue
            data_entry = self._process_single_file(file_path)
            if data_entry:
                all_data.append(data_entry)

        if not all_data:
            print("No valid data could be processed from the files.")
            return pd.DataFrame()

        summary_df = pd.DataFrame(all_data)

        # Convert recall counts to numbers
        summary_df = self._convert_df_to_numeric(summary_df)

        # Classify the recall reasons
        summary_df = self._classify_reasons(summary_df)

        # Fixed column order: identifying fields first, then one column per reason category.
        column_order = (['Company Name', 'NumberOfVoluntaryRecalls', 'NumberOfForcedRecalls', 'Reasons'] +
                        list(self.REASON_CATEGORIES.keys()))
        summary_df = summary_df.reindex(columns=column_order)

        return summary_df

    def export_to_excel(self, df: pd.DataFrame, output_file: str, max_column_width: int = 60) -> None:
        """
        Exports the main data to a formatted Excel file.

        The header row is styled (bold white text on a blue fill) and each column is
        sized to its longest value plus padding, capped at ``max_column_width``.

        Args:
            df: Data to write. Nothing is written when it is empty.
            output_file: Destination workbook path.
            max_column_width: Upper bound for any column width, for readability.
        """
        if df.empty:
            print("Cannot export empty DataFrame.")
            return

        print(f"\n[INFO] Exporting data to '{output_file}'...")
        try:
            with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
                df.to_excel(writer, sheet_name='Product Recall Data', index=False)

                ws = writer.sheets['Product Recall Data']
                header_font = Font(bold=True, color="FFFFFF")
                header_fill = PatternFill(start_color="4F81BD", end_color="4F81BD", fill_type="solid")

                for cell in ws[1]:
                    cell.font = header_font
                    cell.fill = header_fill

                for column_cells in ws.columns:
                    # default=0 keeps a column with no usable values from raising
                    # ValueError and aborting the whole export.
                    max_length = max(
                        (len(str(cell.value)) for cell in column_cells if cell.value is not None),
                        default=0,
                    )
                    new_width = min(max_length + 2, max_column_width)
                    ws.column_dimensions[column_cells[0].column_letter].width = new_width

            print(f"SUCCESS: Master Excel sheet '{output_file}' has been created.")

        except PermissionError:
            print(f"ERROR: Permission denied. Is '{output_file}' open? Please close it and try again.")
        except Exception as e:
            print(f"An error occurred while saving the Excel file: {e}")




In [5]:

# Input folder holding the company data files, and the workbook the summary is written to.
source_directory = 'excel_files'
output_excel_file = 'p9_recalls.xlsx'

# First run: create the input folder and ask the user to populate it.
if not Path(source_directory).exists():
    Path(source_directory).mkdir(parents=True)
    print(f"Created directory: {source_directory}")
    print(f"Please add your company data files to '{source_directory}' before running again.")

# Build the summary table from every readable file in the input folder.
extractor = ProductRecallsExtractor()
main_data_df = extractor.process_directory(source_directory)



[INFO] Starting analysis of files in 'excel_files'...

--- Successfully extracted data for: 360 One Wam Limited
--- Successfully extracted data for: 3I Infotech Limited
--- Successfully extracted data for: 3M India Limited
--- Successfully extracted data for: 5paisa Capital Limited
--- Successfully extracted data for: 63 Moons Technologies Limited
--- Successfully extracted data for: Aarti Drugs Limited
--- Successfully extracted data for: Aarti Industries Limited
--- Successfully extracted data for: Aarti Pharmalabs Limited
--- Successfully extracted data for: Aavas Financiers Limited
--- Successfully extracted data for: Abans Holdings Limited
--- Successfully extracted data for: ABB India Limited
--- Successfully extracted data for: Accelya Solutions India Limited
--- Successfully extracted data for: ACC Limited
--- Successfully extracted data for: Action Construction Equipment Limited
--- Successfully extracted data for: Adani Energy Solutions Limited
--- Successfully extracted data

In [6]:
# Display the summary table returned by process_directory (rich notebook output).
main_data_df

Unnamed: 0,Company Name,NumberOfVoluntaryRecalls,NumberOfForcedRecalls,Reasons,Contamination/Foreign Material,Product Defect or Malfunction,Labeling or Packaging Error,Safety Standard Non-Compliance,"Quality, Purity, or Potency Issue"
0,360 One Wam Limited,0,0,Not Provided,0,0,0,0,0
1,3I Infotech Limited,0,0,Not Provided,0,0,0,0,0
2,3M India Limited,0,0,Not Provided,0,0,0,0,0
3,5paisa Capital Limited,0,0,Not Applicable | Not Applicable,0,0,0,0,0
4,63 Moons Technologies Limited,0,0,Not Provided,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
1169,ZF Commercial Vehicle Control Systems India Li...,0,0,0 | 0,0,0,0,0,0
1170,Zomato Limited,0,0,Not applicable | Not applicable,0,0,0,0,0
1171,Zota Health Care Limited,0,0,0 | 0,0,0,0,0,0
1172,Zydus Lifesciences Limited,25,1,Market complaints – products out of specificat...,0,0,0,0,0


In [7]:
# Write the summary table to the configured output workbook.
extractor.export_to_excel(df=main_data_df, output_file=output_excel_file)


[INFO] Exporting data to 'p9_recalls.xlsx'...
SUCCESS: Master Excel sheet 'p9_recalls.xlsx' has been created.
