# In [28]:
import json
import re
import os
import tkinter as tk
from tkinter import filedialog, messagebox, ttk

# Directory that receives every output file written by this tool.
# NOTE(review): "YOUR_DIRECTORY" looks like a placeholder - set it to a real
# path before running. The processing methods create the directory with
# os.makedirs(..., exist_ok=True) if it does not exist yet.
DEFAULT_SAVE_DIR = r"YOUR_DIRECTORY"

class JSONProcessorGUI:
    def __init__(self, root):
        """Remember the root window, configure it and build the interface.

        Args:
            root: The Tk window that hosts the application.
        """
        self.root = root

        # Window caption, initial size and smallest size the user may shrink to.
        root.title("JSON Field Extractor & Number Parser")
        root.geometry("650x700")
        root.minsize(600, 650)

        # State shared between the callbacks.
        self.file_paths = []  # paths of all JSON files selected so far
        self.selected_field = tk.StringVar()  # name of the field to extract
        self.output_name = tk.StringVar(value="extracted_data")  # output base name

        self.create_widgets()
        
    def create_widgets(self):
        """Create and pack every widget of the main window.

        From top to bottom the window gets: a title label, the file selection
        group, the field selection group, the output name group, the two
        action buttons, the status box and a Close button.

        The widgets that the callbacks need later are stored on the instance:
        ``self.file_listbox``, ``self.field_listbox`` and ``self.status_text``.
        """
        # Main container
        main_frame = tk.Frame(self.root)
        main_frame.pack(fill="both", expand=True, padx=10, pady=10)
        
        # Title
        title_label = tk.Label(main_frame, text="JSON Field Extractor & Number Parser", 
                              font=("Arial", 16, "bold"))
        title_label.pack(pady=(0, 20))
        
        # File selection section
        file_frame = tk.LabelFrame(main_frame, text="1. Select JSON Files", font=("Arial", 12), pady=10)
        file_frame.pack(fill="x", pady=10)
        
        tk.Label(file_frame, text="Selected files:", font=("Arial", 10)).pack(anchor="w", padx=10, pady=5)
        # Filled by update_file_list() with the base names of the chosen files.
        self.file_listbox = tk.Listbox(file_frame, height=4, font=("Arial", 9))
        self.file_listbox.pack(fill="x", padx=10, pady=5)
        
        btn_frame = tk.Frame(file_frame)
        btn_frame.pack(pady=10)
        
        browse_btn = tk.Button(btn_frame, text="Browse for JSON Files", command=self.browse_files,
                              bg="#4CAF50", fg="white", font=("Arial", 10, "bold"))
        browse_btn.pack(side="left", padx=5)
        
        clear_btn = tk.Button(btn_frame, text="Clear List", command=self.clear_files,
                             bg="#ff9800", fg="white", font=("Arial", 10))
        clear_btn.pack(side="left", padx=5)
        
        # Field selection section
        field_frame = tk.LabelFrame(main_frame, text="2. Select Field to Extract", font=("Arial", 12), pady=5)
        field_frame.pack(fill="x", pady=5)
        
        # Container holding the field listbox together with its scrollbar
        field_container = tk.Frame(field_frame)
        field_container.pack(fill="x", padx=10, pady=5)
        
        # Listbox and scrollbar are wired in both directions so that either
        # one moving keeps the other in sync.
        self.field_listbox = tk.Listbox(field_container, height=8, font=("Arial", 10))
        scrollbar = tk.Scrollbar(field_container, orient="vertical")
        self.field_listbox.config(yscrollcommand=scrollbar.set)
        scrollbar.config(command=self.field_listbox.yview)
        
        self.field_listbox.pack(side="left", fill="both", expand=True)
        scrollbar.pack(side="right", fill="y")
        
        # Record the chosen field whenever the listbox selection changes.
        self.field_listbox.bind("<<ListboxSelect>>", self.on_field_select)
        
        # Output naming section
        output_frame = tk.LabelFrame(main_frame, text="3. Output File Name", font=("Arial", 12), pady=5)
        output_frame.pack(fill="x", pady=5)
        
        tk.Label(output_frame, text="Base filename (without .json):", font=("Arial", 10)).pack(anchor="w", padx=10, pady=2)
        # The entry edits self.output_name directly through textvariable.
        name_entry = tk.Entry(output_frame, textvariable=self.output_name, font=("Arial", 11))
        name_entry.pack(fill="x", padx=10, pady=2)
        
        tk.Label(output_frame, text=f"Files will be saved to: {DEFAULT_SAVE_DIR}", 
                wraplength=500, fg="gray", font=("Arial", 8)).pack(anchor="w", padx=10, pady=2)
        
        # PROCESS BUTTON - MAIN ACTION (one pair of output files per input file)
        process_button = tk.Button(main_frame, text="🚀 PROCESS JSON NOW!", 
                                  command=self.process_json,
                                  bg="#FF5722", fg="white", 
                                  font=("Arial", 12, "bold"), 
                                  height=2, width=30)
        process_button.pack(pady=15)
        
        # SPLIT BUTTON - FOR SINGLE FILE WITH MULTIPLE RUNS
        split_button = tk.Button(main_frame, text="🔀 SPLIT SINGLE JSON INTO RUNS", 
                                command=self.split_json_runs,
                                bg="#9C27B0", fg="white", 
                                font=("Arial", 12, "bold"), 
                                height=2, width=30)
        split_button.pack(pady=5)
        
        # Status section: log() appends its messages to this text box
        status_frame = tk.LabelFrame(main_frame, text="Status", font=("Arial", 12))
        status_frame.pack(fill="both", expand=True, pady=5)
        
        self.status_text = tk.Text(status_frame, height=6, wrap="word", font=("Arial", 9))
        self.status_text.pack(fill="both", expand=True, padx=10, pady=5)
        
        # Bottom buttons
        button_frame = tk.Frame(main_frame)
        button_frame.pack(fill="x", pady=10)
        
        # NOTE(review): root.quit stops the Tk event loop but does not itself
        # destroy the window - confirm that is the intended Close behaviour
        # when this is run from an interactive session.
        close_btn = tk.Button(button_frame, text="Close", command=self.root.quit,
                             bg="#f44336", fg="white", font=("Arial", 10), width=15)
        close_btn.pack(side="right")
        
    def log(self, message):
        """Append ``message`` as a new line to the status box.

        Args:
            message: Text to display; a trailing newline is added here.
        """
        status_box = self.status_text
        status_box.insert(tk.END, message + "\n")
        # Scroll to the end so the newest line is visible.
        status_box.see(tk.END)
        # Process pending Tk events so the new line is drawn right away.
        self.root.update()
        
    def browse_files(self):
        """Ask the user for JSON files and add them to the working list.

        Paths that are already listed are skipped, so picking the same file
        twice does not process it twice (which would only rewrite the same
        output files). When at least one path was chosen, the file listbox is
        redrawn and the field list is reloaded from the first file.
        """
        chosen_paths = filedialog.askopenfilenames(
            title="Select multiple JSON files",
            filetypes=[("JSON files", "*.json"), ("All files", "*.*")]
        )

        # An empty result means the dialog was cancelled.
        if not chosen_paths:
            return

        already_listed = set(self.file_paths)
        for path in chosen_paths:
            if path not in already_listed:
                already_listed.add(path)
                self.file_paths.append(path)

        self.update_file_list()
        # file_paths cannot be empty here, so a first file is always available.
        self.load_fields_from_first_file()
    
    def clear_files(self):
        """Forget every selected file and reset the UI state that depends on them.

        Besides emptying both listboxes this also resets the remembered field
        name. Without that, a field chosen for the old files would still pass
        the "field selected" check after new files are loaded, although
        nothing is highlighted in the field list any more.
        """
        self.file_paths = []
        self.update_file_list()
        self.field_listbox.delete(0, tk.END)
        # Drop the stale field selection together with the field list.
        self.selected_field.set("")
        self.log("📁 File list cleared")
    
    def update_file_list(self):
        """Redraw the file listbox from ``self.file_paths``.

        Only the base name of each path is shown. A summary line is logged
        when at least one file is selected.
        """
        listbox = self.file_listbox
        listbox.delete(0, tk.END)
        for shown_name in map(os.path.basename, self.file_paths):
            listbox.insert(tk.END, shown_name)

        if not self.file_paths:
            return
        self.log(f"📁 Selected {len(self.file_paths)} files")
            
    def load_fields_from_first_file(self):
        """Fill the field listbox with the keys found in the first selected file.

        A top-level JSON list contributes the keys of its first element (which
        must be an object); a top-level JSON object contributes its own keys.
        Any other structure, or a file that cannot be read or parsed, is
        reported in the status box and leaves the field list unchanged.

        If the previously chosen field is still available it is highlighted
        again; otherwise the remembered field is reset so that a name which is
        no longer offered cannot be used for processing.
        """
        if not self.file_paths:
            return

        try:
            # Decode explicitly instead of relying on the platform default
            # encoding; "utf-8-sig" also accepts a leading byte order mark.
            with open(self.file_paths[0], 'r', encoding='utf-8-sig') as f:
                data = json.load(f)
        except Exception as e:
            self.log(f"❌ Error loading first file: {str(e)}")
            return

        if isinstance(data, list) and data and isinstance(data[0], dict):
            fields = list(data[0])
        elif isinstance(data, dict):
            fields = list(data)
        else:
            self.log("Error: Could not identify fields in JSON structure")
            return

        self.field_listbox.delete(0, tk.END)
        for field in fields:
            self.field_listbox.insert(tk.END, field)

        # Rebuilding the listbox drops its highlight; keep it in step with the
        # remembered field name.
        previous_field = self.selected_field.get()
        if previous_field and previous_field in fields:
            self.field_listbox.selection_set(fields.index(previous_field))
        else:
            self.selected_field.set("")

        self.log(f"✅ Loaded {len(fields)} fields from first file")
            
    def on_field_select(self, event):
        """Remember the field highlighted in the field listbox and log it.

        Args:
            event: The Tk event that triggered the callback (not used).
        """
        picked = self.field_listbox.curselection()
        # Nothing highlighted: keep whatever was remembered before.
        if not picked:
            return

        first_index = picked[0]
        self.selected_field.set(self.field_listbox.get(first_index))
        self.log(f"📋 Selected field: {self.selected_field.get()}")
    
    def pre_clean_text(self, text):
        """STEP 1: drop unwanted characters and answer-scale phrases from ``text``.

        Non-string input is converted with ``str()`` first. Every character
        other than ASCII letters, digits, whitespace and ``. , ; : ! ? ' " -``
        is removed, then the agree/disagree phrases listed below are removed
        as whole phrases, ignoring case. Digits are kept.

        Args:
            text: The raw value to clean.

        Returns:
            The cleaned string with whitespace runs collapsed to one space and
            no leading or trailing whitespace.
        """
        raw = text if isinstance(text, str) else str(text)

        def squeeze(value):
            # Collapse every whitespace run to one space and trim both ends.
            return re.sub(r'\s+', ' ', value).strip()

        # Characters are filtered before the phrases are searched so that
        # stray symbols inside a phrase cannot hide it.
        result = squeeze(re.sub(r'[^a-zA-Z0-9\s.,;:!?\'"-]', '', raw))

        # Phrases are removed one after another, in this order.
        for phrase in (
            "neither agree nor disagree",
            "strongly disagree",
            "strongly agree",
            "somewhat agree",
            "somewhat disagree",
            "a little agree",
            "a little disagree",
        ):
            whole_phrase = r'\b' + re.escape(phrase) + r'\b'
            result = re.sub(whole_phrase, '', result, flags=re.IGNORECASE)

        return squeeze(result)
    
    def final_clean_text(self, text):
        """STEP 3: remove every digit from already pre-cleaned text.

        Args:
            text: The value to strip of digits; non-strings go through ``str()``.

        Returns:
            The text without digits, with whitespace runs collapsed to a single
            space and no leading or trailing whitespace.
        """
        raw = text if isinstance(text, str) else str(text)

        without_digits = re.sub(r'\d+', '', raw)
        # Removing digits can leave doubled or dangling spaces behind.
        single_spaced = re.sub(r'\s+', ' ', without_digits)
        return single_spaced.strip()
            
    def process_json(self):
        """Extract the selected field from every chosen file and save the results.

        For each input file two JSON files are written to ``DEFAULT_SAVE_DIR``:

        * ``<output>_<input name>.json`` maps "1".."N" to the cleaned text of
          each entry with all digits removed;
        * ``<output>_<input name>_numbers.json`` maps the same keys to the first
          integer found in each cleaned entry, or ``null`` when there is none.

        Only the first 42 entries of each file are exported. From a top-level
        list, the field is taken from every element that is an object holding
        it; from a top-level object, the field value itself is used (wrapped
        in a list when it is not one already).

        A file that fails is reported in the status box and skipped; the final
        dialog is a warning instead of a success message when any file failed.
        """
        max_entries = 42  # number of leading entries exported per input file

        if not self.file_paths:
            messagebox.showerror("Error", "Please select at least one JSON file!")
            return

        field_name = self.selected_field.get()
        if not field_name:
            messagebox.showerror("Error", "Please select a field to extract!")
            return

        output_base = self.output_name.get()
        if not output_base:
            messagebox.showerror("Error", "Please enter an output filename!")
            return

        total_files = len(self.file_paths)

        # Confirmation dialog: list at most five file names.
        file_list = "\n".join(f"• {os.path.basename(fp)}" for fp in self.file_paths[:5])
        if total_files > 5:
            file_list += f"\n... and {total_files - 5} more files"

        # The output line names the files that are really written (one pair
        # per input file); no merged file is produced.
        confirm_msg = (
            f"Ready to process {total_files} files:\n\n"
            f"{file_list}\n\n"
            f"📋 Field: {field_name}\n"
            f"💾 Output: {output_base}_<input name>.json and "
            f"{output_base}_<input name>_numbers.json\n"
            f"📂 Save to: {DEFAULT_SAVE_DIR}\n\n"
            "Do you want to proceed?"
        )

        if not messagebox.askyesno("🚀 Confirm Batch Processing", confirm_msg):
            return

        try:
            # Create output directory if it doesn't exist
            os.makedirs(DEFAULT_SAVE_DIR, exist_ok=True)

            processed_count = 0

            # Process each file separately
            for i, file_path in enumerate(self.file_paths, 1):
                filename = os.path.splitext(os.path.basename(file_path))[0]
                self.log(f"🔄 Processing file {i}/{total_files}: {filename}")

                try:
                    # Decode explicitly instead of relying on the platform
                    # default; "utf-8-sig" also accepts a byte order mark.
                    with open(file_path, 'r', encoding='utf-8-sig') as f:
                        data = json.load(f)

                    # Extract field data
                    if isinstance(data, list):
                        extracted_data = [
                            item[field_name]
                            for item in data
                            if isinstance(item, dict) and field_name in item
                        ]
                    elif isinstance(data, dict):
                        extracted_data = data.get(field_name, [])
                        if not isinstance(extracted_data, list):
                            extracted_data = [extracted_data]
                    else:
                        raise ValueError("top-level JSON value must be a list or an object")

                    # STEP 1: clean only the entries that will be exported
                    # (removes unwanted characters but keeps the digits).
                    pre_cleaned_data = [
                        self.pre_clean_text(item) for item in extracted_data[:max_entries]
                    ]

                    # STEP 2 + 3: per entry, take the first number from the
                    # cleaned text and build the digit-free text version.
                    numbered_data = {}
                    numbered_text_data = {}
                    for j, entry in enumerate(pre_cleaned_data, 1):
                        first_number = re.search(r'\d+', entry)
                        numbered_data[str(j)] = int(first_number.group()) if first_number else None
                        numbered_text_data[str(j)] = self.final_clean_text(entry)

                    # Save this file's cleaned text data (NO NUMBERS)
                    text_output_file = os.path.join(DEFAULT_SAVE_DIR, f"{output_base}_{filename}.json")
                    with open(text_output_file, 'w', encoding='utf-8') as f:
                        json.dump(numbered_text_data, f, indent=2)

                    # Save this file's numbered data
                    numbers_output_file = os.path.join(DEFAULT_SAVE_DIR, f"{output_base}_{filename}_numbers.json")
                    with open(numbers_output_file, 'w', encoding='utf-8') as f:
                        json.dump(numbered_data, f, indent=2)

                    valid_numbers = sum(1 for v in numbered_data.values() if v is not None)
                    self.log(f"   ✅ Saved {len(numbered_text_data)} text entries, {valid_numbers} with numbers")
                    processed_count += 1

                except Exception as e:
                    # One bad file must not stop the rest of the batch.
                    self.log(f"   ❌ Error processing {filename}: {str(e)}")
                    continue

            # Summary
            self.log("\n🎉 BATCH PROCESSING COMPLETE!")
            self.log(f"📁 Successfully processed {processed_count}/{total_files} files")
            self.log(f"📂 Each file saved separately with 1-{max_entries} numbering")
            self.log(f"💾 Files saved to: {DEFAULT_SAVE_DIR}")

            summary = (
                f"Batch processing complete!\n\n"
                f"Processed {processed_count}/{total_files} files\n"
                f"Each run saved as separate files\n\n"
                f"Files saved to:\n{DEFAULT_SAVE_DIR}"
            )
            if processed_count == total_files:
                messagebox.showinfo("🎉 Success!", summary)
            else:
                # Do not announce success when files were skipped.
                messagebox.showwarning(
                    "Completed with errors",
                    summary + "\n\nSee the status box for the files that failed."
                )

        except Exception as e:
            error_msg = f"❌ Error during batch processing: {str(e)}"
            self.log(error_msg)
            messagebox.showerror("Error", error_msg)

    def split_json_runs(self):
        """Split one JSON file holding several runs into per-run output files.

        Exactly one file must be selected. A modal dialog asks for the number
        of runs and the number of responses per run (both must be at least 1).
        The values of the selected field are then cut into consecutive slices
        of that size, and for every slice two files are written to
        ``DEFAULT_SAVE_DIR``:

        * ``<output>_run<k>.json`` maps "1".."N" to the cleaned, digit-free text;
        * ``<output>_run<k>_numbers.json`` maps the same keys to the first
          integer found in each cleaned entry, or ``null`` when there is none.

        Splitting stops early when the data runs out. The summary reports the
        number of runs actually written, and responses left over after the
        last requested run are mentioned in the status box.
        """
        if not self.file_paths:
            messagebox.showerror("Error", "Please select a JSON file first!")
            return

        if len(self.file_paths) > 1:
            messagebox.showerror("Error", "Please select only ONE JSON file for splitting!")
            return

        if not self.selected_field.get():
            messagebox.showerror("Error", "Please select a field to extract!")
            return

        if not self.output_name.get():
            messagebox.showerror("Error", "Please enter an output filename!")
            return

        # Modal dialog asking for the run layout of the file
        runs_dialog = tk.Toplevel(self.root)
        runs_dialog.title("Split Configuration")
        runs_dialog.geometry("400x200")
        runs_dialog.grab_set()

        tk.Label(runs_dialog, text="How many runs are in this file?", font=("Arial", 12)).pack(pady=10)

        runs_var = tk.StringVar(value="20")
        runs_entry = tk.Entry(runs_dialog, textvariable=runs_var, font=("Arial", 11), width=10)
        runs_entry.pack(pady=5)

        tk.Label(runs_dialog, text="How many responses per run?", font=("Arial", 12)).pack(pady=10)

        responses_var = tk.StringVar(value="42")
        responses_entry = tk.Entry(runs_dialog, textvariable=responses_var, font=("Arial", 11), width=10)
        responses_entry.pack(pady=5)

        # Filled in by the button callbacks below; read after the dialog closes.
        result = {"confirmed": False, "runs": 20, "responses": 42}

        def confirm_split():
            try:
                runs = int(runs_var.get())
                responses = int(responses_var.get())
            except ValueError:
                messagebox.showerror("Error", "Please enter valid numbers!")
                return
            # Zero or negative sizes would produce empty or nonsensical slices.
            if runs < 1 or responses < 1:
                messagebox.showerror("Error", "Both values must be at least 1!")
                return
            result["runs"] = runs
            result["responses"] = responses
            result["confirmed"] = True
            runs_dialog.destroy()

        def cancel_split():
            runs_dialog.destroy()

        btn_frame = tk.Frame(runs_dialog)
        btn_frame.pack(pady=20)

        tk.Button(btn_frame, text="Split", command=confirm_split, bg="#4CAF50", fg="white", font=("Arial", 10, "bold")).pack(side="left", padx=5)
        tk.Button(btn_frame, text="Cancel", command=cancel_split, bg="#f44336", fg="white", font=("Arial", 10)).pack(side="left", padx=5)

        # Block here until the dialog has been destroyed.
        runs_dialog.wait_window()

        if not result["confirmed"]:
            return

        try:
            # Create output directory if it doesn't exist
            os.makedirs(DEFAULT_SAVE_DIR, exist_ok=True)

            file_path = self.file_paths[0]
            field_name = self.selected_field.get()
            output_base = self.output_name.get()
            num_runs = result["runs"]
            responses_per_run = result["responses"]

            self.log(f"🔀 Splitting {os.path.basename(file_path)} into {num_runs} runs...")

            # Decode explicitly instead of relying on the platform default;
            # "utf-8-sig" also accepts a leading byte order mark.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                data = json.load(f)

            # Extract field data
            if isinstance(data, list):
                extracted_data = [
                    item[field_name]
                    for item in data
                    if isinstance(item, dict) and field_name in item
                ]
            elif isinstance(data, dict):
                extracted_data = data.get(field_name, [])
                if not isinstance(extracted_data, list):
                    extracted_data = [extracted_data]
            else:
                raise ValueError("top-level JSON value must be a list or an object")

            total_responses = len(extracted_data)
            self.log(f"📊 Found {total_responses} total responses")

            # Split into runs
            runs_written = 0
            for run_num in range(1, num_runs + 1):
                start_idx = (run_num - 1) * responses_per_run

                if start_idx >= total_responses:
                    self.log(f"⚠️ Run {run_num}: No more data available")
                    break

                run_data = extracted_data[start_idx:start_idx + responses_per_run]

                self.log(f"🔄 Processing Run {run_num}: {len(run_data)} responses")

                # STEP 1: Clean the data FIRST (removes unwanted characters but keeps digits)
                pre_cleaned_data = [self.pre_clean_text(item) for item in run_data]

                # STEP 2 + 3: per entry, take the first number from the
                # cleaned text and build the digit-free text version.
                # Keys restart at "1" for every run.
                numbered_data = {}
                numbered_text_data = {}
                for j, entry in enumerate(pre_cleaned_data, 1):
                    first_number = re.search(r'\d+', entry)
                    numbered_data[str(j)] = int(first_number.group()) if first_number else None
                    numbered_text_data[str(j)] = self.final_clean_text(entry)

                # Save this run's cleaned text data (NO NUMBERS)
                text_output_file = os.path.join(DEFAULT_SAVE_DIR, f"{output_base}_run{run_num}.json")
                with open(text_output_file, 'w', encoding='utf-8') as f:
                    json.dump(numbered_text_data, f, indent=2)

                # Save this run's numbered data
                numbers_output_file = os.path.join(DEFAULT_SAVE_DIR, f"{output_base}_run{run_num}_numbers.json")
                with open(numbers_output_file, 'w', encoding='utf-8') as f:
                    json.dump(numbered_data, f, indent=2)

                valid_numbers = sum(1 for v in numbered_data.values() if v is not None)
                self.log(f"   ✅ Run {run_num}: Saved {len(numbered_text_data)} text entries, {valid_numbers} with numbers")
                runs_written += 1

            if runs_written == 0:
                # Nothing was written, so do not report a successful split.
                empty_msg = f"No responses found for field '{field_name}'; nothing was written."
                self.log(f"⚠️ {empty_msg}")
                messagebox.showwarning("Nothing to split", empty_msg)
                return

            leftover = total_responses - num_runs * responses_per_run
            if leftover > 0:
                self.log(f"⚠️ {leftover} responses after run {num_runs} were not written")

            # Summary (counts the runs actually written, which can be fewer
            # than requested when the data ran out)
            self.log("\n🎉 SPLIT PROCESSING COMPLETE!")
            self.log(f"📁 Split into {runs_written} separate run files")
            self.log(f"💾 Files saved to: {DEFAULT_SAVE_DIR}")

            messagebox.showinfo("🎉 Success!", f"Split processing complete!\n\nSplit into {runs_written} run files\nEach run numbered 1-{responses_per_run}\n\nFiles saved to:\n{DEFAULT_SAVE_DIR}")

        except Exception as e:
            error_msg = f"❌ Error during split processing: {str(e)}"
            self.log(error_msg)
            messagebox.showerror("Error", error_msg)

# Script entry point: create the Tk root window, attach the application to it
# and hand control to the Tk event loop. mainloop() returns only once the
# event loop has been stopped (for example by the Close button).
if __name__ == "__main__":
    root = tk.Tk()
    app = JSONProcessorGUI(root)
    root.mainloop()