In [23]:
import requests
import json
import pandas as pd
import io # To represent DataFrame as string

# --- Configuration ---
# 1. Ollama/CodeLlama Endpoint Configuration
BASE_URL = "http://localhost:11434"  # Or your CodeLlama server URL
API_PATH = "/api/chat"              # Or "/api/generate" if you prefer
CODELAMA_MODEL = "codellama:7b"     # Your specific CodeLlama model

# 2. CSV File Path
#    Replace 'your_data.csv' with the actual path to your CSV file.
#    If the CSV is in the same directory as your notebook, just the filename is fine.
CSV_FILE_PATH = 'jumped_up_test.csv' # <--- !!! IMPORTANT: UPDATE THIS PATH !!!

# --- Helper Function to Prepare DataFrame Info for LLM ---
def get_dataframe_info_for_prompt(df, num_rows_to_show=5):
    """
    Generates a string representation of DataFrame info (header, dtypes, head)
    to be included in the LLM prompt.
    """
    header = ", ".join(df.columns.tolist())
    
    # Get dtypes as a string
    dtypes_buffer = io.StringIO()
    df.info(buf=dtypes_buffer)
    dtypes_str = dtypes_buffer.getvalue()

    # Get a sample of the data (first few rows) as a string
    sample_data_str = df.head(num_rows_to_show).to_string()

    info = f"CSV Columns (Header): {header}\n\n"
    info += f"Data Types and Non-Null Counts:\n{dtypes_str}\n\n"
    info += f"First {num_rows_to_show} rows of data:\n{sample_data_str}\n"
    return info

# --- Main Logic ---
try:
    # 1. Load the CSV into a pandas DataFrame
    print(f"Loading CSV file from: {CSV_FILE_PATH}")
    try:
        df = pd.read_csv(CSV_FILE_PATH)
        print("CSV loaded successfully.")
    except FileNotFoundError:
        print(f"ERROR: CSV file not found at '{CSV_FILE_PATH}'. Please check the path.")
        # Create a dummy DataFrame for demonstration if file not found,
        # so the rest of the script can run. Replace with your error handling.
        print("Using a dummy DataFrame for demonstration purposes.")
        data = {'col1': [1, 2, None, 4, '  '],
                'col2': ['a', 'b', 'c', ' d ', 'e'],
                'date_col': ['2023-01-01', '2023-01-02', '2023-01-03', '2023/01/04', 'invalid_date'],
                'numeric_col_as_str': ['100', '200.5', '300', 'Nan', '500']}
        df = pd.DataFrame(data)
        print("Dummy DataFrame created.")


    # 2. Get DataFrame information for the prompt
    dataframe_details = get_dataframe_info_for_prompt(df, num_rows_to_show=3) # Show 3 rows for brevity

    # 3. Construct the prompt for CodeLlama
    prompt_text = (
        f"I have a dataset with the following structure and a sample of the data:\n\n"
        f"{dataframe_details}\n\n"
        f"Based on these column names, data types, and sample values, please describe:\n"
        f"1. Potential structural issues.\n"
        f"2. Identify any mispaced column names.and state the column names\n"
        f"tate structural transformations required to make the data analytics ready."
    )

    # 4. Prepare the payload for the LLM
    if "chat" in API_PATH:
        payload = {
            "model": CODELAMA_MODEL,
            "messages": [
                {"role": "user", "content": prompt_text}
            ],
            "stream": False,
            # "options": {
            #     "temperature": 0.3, # Lower temperature for more factual/less creative cleaning suggestions
            #     "num_predict": 500  # Max tokens for the response
            # }
        }
    else: # For /api/generate
        payload = {
            "model": CODELAMA_MODEL,
            "prompt": prompt_text,
            "stream": False,
            # "options": { ... }
        }

    headers = {
        "Content-Type": "application/json"
    }

    ENDPOINT_URL = BASE_URL + API_PATH
    print(f"\nSending request to CodeLlama at: {ENDPOINT_URL}")
    # print(f"Payload: {json.dumps(payload, indent=2)}") # Uncomment to see the full payload

    # 5. Send the request to CodeLlama
    response = requests.post(ENDPOINT_URL, json=payload, headers=headers)
    response.raise_for_status()

    # 6. Process and display the response
    result = response.json()
    print("\n--- CodeLlama's Data Cleanup Suggestions ---")

    if "chat" in API_PATH:
        if result.get("message"):
            print(result.get("message", {}).get("content"))
        else: # Some Ollama versions might return the content directly
            print(result.get("content", "No content found in response message."))
    elif "generate" in API_PATH:
        print(result.get("response", "No response content found."))
    else:
        print(json.dumps(result, indent=2))

except FileNotFoundError as fnf_error:
    print(fnf_error) # Already handled above but good to have a catch-all
except pd.errors.EmptyDataError:
    print(f"ERROR: The CSV file '{CSV_FILE_PATH}' is empty.")
except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
    print(f"Response content: {response.content.decode()}")
except requests.exceptions.ConnectionError as conn_err:
    print(f"Connection error: {conn_err}")
    print("Please ensure your local CodeLlama endpoint is running and accessible at the specified URL.")
except requests.exceptions.RequestException as req_err:
    print(f"Request error occurred: {req_err}")
except json.JSONDecodeError:
    print("Failed to decode JSON response from CodeLlama.")
    print(f"Response content: {response.text}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Loading CSV file from: jumped_up_test.csv
CSV loaded successfully.

Sending request to CodeLlama at: http://localhost:11434/api/chat

--- CodeLlama's Data Cleanup Suggestions ---

Based on the provided information, here are some potential structural issues and suggested changes:

1. Potential structural issues:
* The data has a lot of missing values (7 non-null values out of 7 entries). This can make it difficult to analyze the data as there may be bias towards certain groups or values.
* The data has a lot of duplicated values, which can also cause bias in analysis.
* The data types for all columns are strings, which can limit the type of analysis that can be performed on them. For example, if the age column were to be numeric, it would be easier to analyze and compare values.
2. Mispaced column names:
* The column "Name" appears twice in the first row, which may cause confusion when reading the data. It is best to have consistent column names throughout the dataset.
3. Suggested stru

In [27]:
import requests
import json
import pandas as pd
import io # To represent DataFrame as string
import re

# --- Configuration ---
# 1. Ollama/CodeLlama Endpoint Configuration
BASE_URL = "http://localhost:11434"  # Or your CodeLlama server URL
API_PATH = "/api/chat"              # Using chat for more instruction-following capabilities
CODELAMA_MODEL = "codellama:7b-instruct" # Using an instruct model is often better for this
                                     # If you only have base codellama:7b, it might work but instruct is preferred

# 2. CSV File Path
CSV_FILE_PATH = 'example_messy_recipe.csv' # <--- !!! IMPORTANT: UPDATE THIS PATH !!!

# --- Helper Function to Prepare Raw Data Snippet for LLM ---
def get_raw_data_snippet(df, num_rows_to_show=15):
    """
    Generates a string representation of the first few rows of the DataFrame,
    meant for identifying headers in a headerless load.
    """
    if df is None or df.empty:
        return "No data or empty DataFrame loaded."

    # Represent the DataFrame's head as a string, keeping the default integer headers
    # to show the LLM exactly what pandas loaded.
    snippet = df.head(num_rows_to_show).to_string(index=True, header=True)
    return snippet

# --- Helper Function to Call CodeLlama ---
def call_codellama_for_headers(raw_data_snippet, model_name, base_url, api_path):
    """
    Sends a prompt to CodeLlama specifically to identify headers and return them
    as a comma-separated string.
    """
    endpoint_url = base_url + api_path

    prompt_text = (
        f"You are an expert data analysis assistant. I have loaded a CSV file into a pandas DataFrame "
        f"using `header=None`, so it currently has default integer column names (0, 1, 2, ...).\n"
        f"The actual column headers are likely present in one of the first few data rows.\n\n"
        f"Here are the first few rows of this DataFrame (including the current integer index and integer column names):\n"
        f"--- DATA SNIPPET ---\n"
        f"{raw_data_snippet}\n"
        f"--- END DATA SNIPPET ---\n\n"
        f"Your task is to carefully examine these rows and identify the row that most likely contains the actual column headers. "
        f"These headers might be jumbled, have extra spaces, or be in a row with some empty/NaN values in other cells that are not part of the header list.\n"
        f"Extract only the meaningful header names from that row.\n\n"
        f"**Your entire response MUST be ONLY a comma-separated list of these identified column names.**\n"
        f"For example: `ColumnID,ProductName,SaleAmount,TransactionDate`\n"
        f"Do NOT include any explanations, apologies, or any other text. Do not use markdown like backticks.\n"
        f"If you are highly confident you've found the headers, return them. "
        f"If you cannot confidently identify a clear header row or the data seems to have no discernible headers in the snippet, return the exact string: `NO_HEADERS_FOUND`"
    )

    payload = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt_text}],
        "stream": False,
        "options": {
            "temperature": 0.0, # Low temperature for deterministic output
            "num_predict": 150  # Max tokens for the header list
        }
    }
    headers = {"Content-Type": "application/json"}

    print(f"\nSending request to CodeLlama ({model_name}) to identify headers...")
    # print(f"Prompt Snippet: {prompt_text[:300]}...") # For debugging
    response = requests.post(endpoint_url, json=payload, headers=headers)
    response.raise_for_status()
    result = response.json()

    content = result.get("message", {}).get("content", "").strip()
    
    # Clean up common LLM artifacts if they slip through despite prompt
    content = re.sub(r'^["\']?(.*?)["\']?$', r'\1', content) # Remove leading/trailing quotes
    if content.lower().startswith("here are the identified headers:") or \
       content.lower().startswith("the identified headers are:"):
        content = content.split(":", 1)[-1].strip()
    if content.startswith("`") and content.endswith("`"):
        content = content[1:-1]


    if not content:
        print("Warning: Received empty content from CodeLlama for headers.")
        return "NO_HEADERS_FOUND"
    return content

# --- Main Logic ---
df_loaded = None
identified_headers = []
try:
    # 1. Load the CSV into a pandas DataFrame, ASSUMING NO HEADER
    print(f"Loading CSV file from: {CSV_FILE_PATH} (assuming no header row initially)")
    try:
        df_loaded = pd.read_csv(CSV_FILE_PATH, header=None, on_bad_lines='warn', keep_default_na=False, na_filter=False)
        # keep_default_na=False, na_filter=False ensures empty strings are read as empty strings, not NaN,
        # which can be important for header identification if headers have empty strings.
        print("CSV loaded successfully with default integer column names.")
        if df_loaded.empty:
            print("Warning: The loaded DataFrame is empty after initial load. Check your CSV file.")
    except FileNotFoundError:
        print(f"ERROR: CSV file not found at '{CSV_FILE_PATH}'. Please check the path.")
        print("Using a dummy DataFrame (with header in data) for demonstration purposes.")
        data_for_dummy_csv = [
            ["REPORT TITLE", "", "", "", ""],
            ["Date: 2025-05-22", "", "", "", ""],
            ["", "", "", "", ""], # Blank row
            ["  ITEM ID  ", " Product Name ", "  Category ", " Unit Price ", " Quantity Sold "], # Actual header row with spaces
            ["SKU001", "Widget Alpha", "Electronics", "  19.99 ", "150"],
            ["SKU002", "Gizmo Beta", "Gadgets", "35.00", "  200  "],
            ["SKU003", "Thingamajig Gamma", "Accessories", "7.50", "500"],
            ["", "TOTALS", "", "62.49", "850"] # Potentially confusing row
        ]
        df_loaded = pd.DataFrame(data_for_dummy_csv)
        print("Dummy DataFrame created.")
    except pd.errors.EmptyDataError:
        print(f"ERROR: The CSV file '{CSV_FILE_PATH}' is empty. Cannot proceed.")
        exit() # Or handle as appropriate

    if df_loaded is not None and not df_loaded.empty:
        # 2. Get a snippet of the raw DataFrame data
        # Show enough rows to give the LLM a good chance to find headers
        raw_data_snippet_for_llm = get_raw_data_snippet(df_loaded, num_rows_to_show=12)

        # 3. Call CodeLlama to identify headers
        llm_response_headers_str = call_codellama_for_headers(
            raw_data_snippet_for_llm, CODELAMA_MODEL, BASE_URL, API_PATH
        )

        print(f"\n--- Raw LLM Response for Headers ---\n'{llm_response_headers_str}'")

        if llm_response_headers_str.strip().upper() == "NO_HEADERS_FOUND" or not llm_response_headers_str.strip():
            print("\nLLM indicated no headers were confidently found or returned an empty response.")
        else:
            # Parse the comma-separated string into a list
            # Also strip whitespace from each potential header
            identified_headers = [header.strip() for header in llm_response_headers_str.split(',') if header.strip()]
            print(f"\n--- Identified Potential Headers (as List) ---")
            print(identified_headers)

            if not identified_headers:
                 print("Parsed header list is empty. LLM might have returned only commas or unexpected format.")
            # Next steps would be to use these identified_headers to find the row,
            # set it as header, and clean the DataFrame.
            # For example:
            # 1. Iterate through df_loaded.head(N).iterrows() to find which row matches these headers.
            # 2. If found, new_header_row_index = index
            # 3. df_loaded.columns = df_loaded.iloc[new_header_row_index]
            # 4. df_loaded = df_loaded[new_header_row_index+1:].reset_index(drop=True)
            # This logic is for a subsequent step.

    else:
        print("DataFrame was not loaded or is empty after initial load. Skipping LLM header identification.")


except FileNotFoundError as fnf_error:
    print(f"File error: {fnf_error}")
except pd.errors.EmptyDataError:
    print(f"Pandas EmptyDataError: The CSV file '{CSV_FILE_PATH}' might be empty or improperly formatted.")
except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
    if hasattr(http_err, 'response') and http_err.response is not None:
        print(f"Response content: {http_err.response.content.decode()}")
except requests.exceptions.ConnectionError as conn_err:
    print(f"Connection error: {conn_err}")
    print(f"Please ensure your local CodeLlama endpoint is running and accessible at {BASE_URL}{API_PATH}.")
except requests.exceptions.RequestException as req_err:
    print(f"Request error occurred: {req_err}")
except json.JSONDecodeError as json_err:
    print(f"Failed to decode JSON response from CodeLlama: {json_err}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    import traceback
    traceback.print_exc()

Loading CSV file from: example_messy_recipe.csv (assuming no header row initially)
CSV loaded successfully with default integer column names.

Sending request to CodeLlama (codellama:7b-instruct) to identify headers...

--- Raw LLM Response for Headers ---
'Based on the provided data snippet, it appears that the actual column headers are present in rows 10 and 11. These rows contain the following values:
```
Point    Density    Sugar   Ratio            X                Y
1  5020.6074  98.6872  0.9976          145                0
2  4976.0088  98.1275  0.9955       108.75                0
```
Therefore, the meaningful header names are:
```
Point, Density, Sugar, Ratio,'

--- Identified Potential Headers (as List) ---
['Based on the provided data snippet', 'it appears that the actual column headers are present in rows 10 and 11. These rows contain the following values:\n```\nPoint    Density    Sugar   Ratio            X                Y\n1  5020.6074  98.6872  0.9976          145      

In [30]:
import requests
import base64
import os
from PIL import Image
import io
import json

# --- Configuration ---
# 1. Ollama API Configuration
OLLAMA_API_URL = "http://localhost:11434/api/generate"  # Or /api/chat if your model prefers
# Ensure this is a multimodal model capable of image understanding
MULTIMODAL_MODEL_NAME = "llava:7b" # Replace with your specific model, e.g., "llava:13b"

# 2. Image File Path
#    Replace 'path/to/your/dataset_screenshot.png' with the actual path to your image.
IMAGE_FILE_PATH = 'screenshot.png' # <--- !!! IMPORTANT: UPDATE THIS PATH !!!

# --- Helper Function to Encode Image to Base64 ---
def encode_image_to_base64(image_path):
    """
    Opens an image, converts it to base64 string (without data URI prefix).
    Handles potential image conversion issues for formats like HEIC by converting to JPEG.
    """
    try:
        if not os.path.exists(image_path):
            print(f"Error: Image file not found at {image_path}")
            return None

        img = Image.open(image_path)
        output_buffer = io.BytesIO()

        # Convert to RGB if it's RGBA (to avoid issues with some formats like PNG transparency)
        if img.mode == 'RGBA' or img.mode == 'P': # P for palettized images
            img = img.convert('RGB')
        
        # Determine format and save to buffer
        # For simplicity, we'll save as JPEG, which is widely supported.
        # You could try to preserve the original format if preferred, but JPEG is safe.
        img.save(output_buffer, format="JPEG")
        
        byte_data = output_buffer.getvalue()
        base64_str = base64.b64encode(byte_data).decode('utf-8')
        return base64_str
    except FileNotFoundError:
        print(f"Error: Image file not found at {image_path}")
        return None
    except Exception as e:
        print(f"Error encoding image {image_path}: {e}")
        return None

# --- Main Analysis Logic ---
def analyze_dataset_screenshot(image_path, ollama_url, model_name):
    """
    Analyzes a dataset screenshot using a local multimodal LLM via Ollama.
    """
    print(f"Attempting to analyze image: {image_path}")
    base64_image_data = encode_image_to_base64(image_path)

    if not base64_image_data:
        print("Failed to encode image. Aborting analysis.")
        return

    prompt_text = (
        "This is an image of a dataset, likely a table or spreadsheet. Please analyze it carefully. Describe:\n"
        "1. What is this dataset generally about? What kind of information does it contain?\n"
        "2. What are the likely column headers or categories of data visible?\n"
        "3. What fields qualify as column names, list these?\n"
        "4. Are there any obvious patterns, special values (like N/A, NULL), or characteristics you can infer from the visible data?\n"
        "Provide a concise summary based on your observations of the image."
    )

    payload = {
        "model": model_name,
        "prompt": prompt_text,
        "images": [base64_image_data],  # Ollama expects an array of base64 strings
        "stream": False
        # You can add "options" here if needed, e.g., "options": {"temperature": 0.3}
    }

    print(f"\nSending request to Ollama API at {ollama_url} with model {model_name}...")
    try:
        response = requests.post(ollama_url, json=payload)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)

        response_data = response.json()

        if "response" in response_data:
            print("\n--- LLM Analysis Summary ---")
            print(response_data["response"].strip())
        elif "error" in response_data:
            print(f"\nAPI Error from Ollama: {response_data['error']}")
            if "model not found" in response_data['error'].lower():
                print(f"Please ensure the model '{model_name}' is pulled in Ollama (e.g., 'ollama pull {model_name}') and Ollama is running.")
        else:
            print("\nReceived an unexpected response structure from Ollama:")
            print(json.dumps(response_data, indent=2))

    except requests.exceptions.ConnectionError:
        print(f"\nConnection Error: Could not connect to Ollama at {ollama_url}.")
        print("Please ensure Ollama is running and accessible.")
    except requests.exceptions.HTTPError as http_err:
        print(f"\nHTTP Error: {http_err}")
        try:
            error_details = http_err.response.json()
            print(f"Error details from server: {json.dumps(error_details, indent=2)}")
        except json.JSONDecodeError:
            print(f"Raw error response from server: {http_err.response.text}")
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")

# --- Run the analysis ---
if __name__ == "__main__":
    # IMPORTANT: Update IMAGE_FILE_PATH before running!
    if IMAGE_FILE_PATH == 'path/to/your/dataset_screenshot.png' or not os.path.exists(IMAGE_FILE_PATH):
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print("!!! IMPORTANT: Please update the 'IMAGE_FILE_PATH' variable in the   !!!")
        print("!!! script with the correct path to your dataset screenshot image.   !!!")
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    else:
        analyze_dataset_screenshot(IMAGE_FILE_PATH, OLLAMA_API_URL, MULTIMODAL_MODEL_NAME)


Attempting to analyze image: screenshot.png

Sending request to Ollama API at http://localhost:11434/api/generate with model llava:7b...

--- LLM Analysis Summary ---
1. The dataset appears to be a spreadsheet with rows and columns organized in a structured manner, potentially for inventory management, tracking shipments, or similar activities given the context provided by the column headers. It contains information such as Item Number, Product Name, Description, Quantity in Stock, and various fields related to shipment status and tracking details like Truck ID, Ship Date/Time, and Temperature during transit.
2. The visible columns seem to be a mix of inventory management (Item Number, Product Name, Quantity) and shipment tracking (Truck ID, Ship Date/Time). There is also a column with Temperature readings for the goods in transit.
3. The column headers are: Item Number, Product Name, Description, Quantity, 2nd Truck ID, 2nd Ship Date/Time, 2nd Temperature, and others that are not full

In [6]:
import requests
import json
import pandas as pd
import io # To represent DataFrame info as string

# --- Configuration ---
# 1. Ollama/CodeLlama Endpoint Configuration
BASE_URL = "http://localhost:11434"  # Or your CodeLlama server URL
API_PATH = "/api/chat"              # Or "/api/generate" if you prefer
CODELAMA_MODEL = "codellama:7b"     # Your specific CodeLlama model

# 2. CSV File Path
#    Replace 'your_data.csv' with the actual path to your CSV file.
CSV_FILE_PATH = 'your_data.csv' # <--- !!! IMPORTANT: UPDATE THIS PATH !!!

# --- Helper Function to Prepare DataFrame Info for LLM ---
def get_dataframe_info_for_prompt(df, num_rows_to_show=5):
    """
    Generates a string representation of DataFrame info (header, dtypes, head)
    to be included in the LLM prompt.
    """
    if df is None:
        return "No DataFrame loaded."
        
    header = ", ".join(df.columns.tolist())
    
    dtypes_buffer = io.StringIO()
    df.info(buf=dtypes_buffer)
    dtypes_str = dtypes_buffer.getvalue()

    sample_data_str = df.head(num_rows_to_show).to_string()

    info = f"The DataFrame has the following columns (Header): {header}\n\n"
    info += f"Data Types and Non-Null Counts:\n{dtypes_str}\n\n"
    info += f"Here are the first {num_rows_to_show} rows of the data:\n{sample_data_str}\n"
    return info

# --- Helper Function to Call CodeLlama ---
def call_codellama(prompt_text, model_name, base_url, api_path):
    """Sends a prompt to CodeLlama and returns the response content."""
    endpoint_url = base_url + api_path
    
    if "chat" in api_path:
        payload = {
            "model": model_name,
            "messages": [{"role": "user", "content": prompt_text}],
            "stream": False,
            "options": {
                 "temperature": 0.2, # Lower temperature for more deterministic code/suggestions
                 # "num_predict": 1024 # Adjust as needed for longer responses
            }
        }
    else: # For /api/generate
        payload = {
            "model": model_name,
            "prompt": prompt_text,
            "stream": False,
            "options": {
                 "temperature": 0.2,
                 # "num_predict": 1024
            }
        }
    
    headers = {"Content-Type": "application/json"}
    
    print(f"\nSending request to CodeLlama ({model_name} at {endpoint_url})...")
    # print(f"Prompt snippet: {prompt_text[:200]}...") # For debugging

    response = requests.post(endpoint_url, json=payload, headers=headers)
    response.raise_for_status()
    result = response.json()

    if "chat" in api_path:
        content = result.get("message", {}).get("content", "")
    elif "generate" in api_path:
        content = result.get("response", "")
    else:
        content = json.dumps(result, indent=2)
    
    if not content.strip(): # Check if content is empty or just whitespace
        print("Warning: Received empty content from CodeLlama.")
        # print(f"Full response: {result}") # For debugging
    return content

# --- Main Logic ---
df_loaded = None
try:
    # 1. Load the CSV into a pandas DataFrame
    print(f"Loading CSV file from: {CSV_FILE_PATH}")
    try:
        df_loaded = pd.read_csv(CSV_FILE_PATH)
        print("CSV loaded successfully.")
    except FileNotFoundError:
        print(f"ERROR: CSV file not found at '{CSV_FILE_PATH}'. Please check the path.")
        # Create a dummy DataFrame for demonstration if file not found
        print("Using a dummy DataFrame for demonstration purposes.")
        data = {'col1': [1, 2, None, 4, '  ', 6],
                'col2': ['a', 'b', 'c', ' d ', 'e', 'a'],
                'date_col': ['2023-01-01', '2023-01-02', '2023-01-03', '2023/01/04', 'invalid_date', '2023-01-01'],
                'numeric_col_as_str': ['100', '200.5', '300', 'Nan', '500', '100'],
                'id': [1,2,3,4,5,1]
               }
        df_loaded = pd.DataFrame(data)
        print("Dummy DataFrame created.")
    except pd.errors.EmptyDataError:
        print(f"ERROR: The CSV file '{CSV_FILE_PATH}' is empty. Cannot proceed.")
        exit() # Or handle as appropriate

    # 2. Get DataFrame information for the prompt
    dataframe_details = get_dataframe_info_for_prompt(df_loaded, num_rows_to_show=3)

    # --- STEP 1: Get Data Cleanup Suggestions ---
    prompt_for_suggestions = (
        f"I have a pandas DataFrame, loaded from a CSV, with the following structure and sample data:\n\n"
        f"{dataframe_details}\n\n"
        f"Based on these column names, data types, and sample values, please describe in bullet points:\n"
        f"1. Potential data quality issues or inconsistencies you observe.\n"
        f"2. Specific data cleaning steps you would recommend to prepare this data for analysis (e.g., handling missing values, correcting data types, removing duplicates, standardizing formats, fixing inconsistencies).\n"
        f"Focus on actionable recommendations."
    )
    
    print("\n--- Requesting Data Cleanup Suggestions from CodeLlama ---")
    cleanup_suggestions = call_codellama(prompt_for_suggestions, CODELAMA_MODEL, BASE_URL, API_PATH)
    
    if not cleanup_suggestions.strip():
        print("Failed to get cleanup suggestions from CodeLlama. Exiting.")
        exit()

    print("\n--- CodeLlama's Data Cleanup Suggestions ---")
    print(cleanup_suggestions)

    # --- STEP 2: Generate Python Code for Cleanup ---
    prompt_for_code = (
        f"You are an expert Python programmer specializing in data cleaning with pandas.\n"
        f"I have a pandas DataFrame named `df` with the following structure and sample data:\n\n"
        f"{dataframe_details}\n\n"
        f"Based on the following data cleaning suggestions that were previously provided:\n"
        f"--- SUGGESTIONS ---\n"
        f"{cleanup_suggestions}\n"
        f"--- END SUGGESTIONS ---\n\n"
        f"Please write a Python script using the pandas library to perform these cleaning operations on the DataFrame `df`.\n"
        f"Assume the DataFrame `df` is already loaded.\n"
        f"Provide only the Python code, preferably in a single block. Include comments in the code to explain each step.\n"
        f"If a suggestion is too vague or cannot be directly translated to code, you can make a reasonable assumption or note it in a comment.\n"
        f"For example, if a suggestion is 'Handle missing values in col1', you might fill with a mean, median, or a constant, and state your choice in a comment."
    )

    print("\n--- Requesting Python Cleanup Code from CodeLlama ---")
    generated_python_code = call_codellama(prompt_for_code, CODELAMA_MODEL, BASE_URL, API_PATH)

    print("\n--- CodeLlama's Generated Python Code for Cleanup ---")
    # LLMs often wrap code in markdown backticks, try to extract if present
    if "```python" in generated_python_code:
        generated_python_code = generated_python_code.split("```python\n")[1].split("\n```")[0]
    elif "```" in generated_python_code: # Simpler extraction if just ```
         generated_python_code = generated_python_code.split("```\n")[1].split("\n```")[0]
    print(generated_python_code)

    # Note: The generated code is printed here.
    # For actual execution, you would typically copy this code into a new cell,
    # review it carefully, and then run it.
    # DO NOT EXECUTE UNTRUSTED CODE AUTOMATICALLY.
    print("\n--- IMPORTANT ---")
    print("The Python code above was generated by an LLM. REVIEW IT CAREFULLY before execution.")
    print("You would typically copy this code into a new cell to run it on your DataFrame.")


except FileNotFoundError as fnf_error:
    # This specific error is handled during df loading, but good to have a general catch
    print(f"File error: {fnf_error}")
except pd.errors.EmptyDataError:
    print(f"Pandas EmptyDataError: The CSV file '{CSV_FILE_PATH}' might be empty or improperly formatted.")
except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
    if hasattr(http_err, 'response') and http_err.response is not None:
        print(f"Response content: {http_err.response.content.decode()}")
except requests.exceptions.ConnectionError as conn_err:
    print(f"Connection error: {conn_err}")
    print(f"Please ensure your local CodeLlama endpoint is running and accessible at {BASE_URL}{API_PATH}.")
except requests.exceptions.RequestException as req_err:
    print(f"Request error occurred: {req_err}")
except json.JSONDecodeError as json_err:
    print(f"Failed to decode JSON response from CodeLlama: {json_err}")
    # Potentially print response.text if available and not too large
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    import traceback
    traceback.print_exc()

Loading CSV file from: your_data.csv
ERROR: CSV file not found at 'your_data.csv'. Please check the path.
Using a dummy DataFrame for demonstration purposes.
Dummy DataFrame created.

--- Requesting Data Cleanup Suggestions from CodeLlama ---

Sending request to CodeLlama (codellama:7b at http://localhost:11434/api/chat)...

--- CodeLlama's Data Cleanup Suggestions ---

1. Potential data quality issues or inconsistencies:
	* Missing values in the "col1" column (5 rows with None)
	* Inconsistent data types for the "numeric_col_as_str" column (some values are strings, while others are floats)
2. Recommended data cleaning steps:
	* Handling missing values: Replace any missing values in the "col1" column with a specific value (e.g., 0 or "NA") to avoid errors during analysis.
	* Correcting data types: Convert the "numeric_col_as_str" column to a numeric data type (e.g., float) to ensure consistent data types throughout the DataFrame.
	* Removing duplicates: Check for any duplicate rows in 

In [None]:
import requests
import json

# 1. Define your local CodeLlama endpoint
# If Ollama is running on the default port:
BASE_URL = "http://localhost:11434"
# API_PATH = "/api/generate" # For text generation
API_PATH = "/api/chat"   # For chat-style completions

# Or if you are using a different server (e.g., TGI on port 8080):
# BASE_URL = "http://localhost:8080"
# API_PATH = "/generate" # Or whatever your TGI endpoint path is

ENDPOINT_URL = BASE_URL + API_PATH

# 2. Prepare your payload (this depends on your CodeLlama endpoint's expected format)

# Example for Ollama's /api/chat endpoint:
payload = {
    "model": "codellama:7b",  # Replace with your specific CodeLlama model name if different
    "messages": [
        {"role": "user", "content": "write code to clean."}
    ],
    "stream": False, # Set to True if your endpoint supports it and you want to handle streaming
    # "options": { # Optional parameters
    #     "temperature": 0.7,
    #     "num_predict": 200 # Max tokens
    # }
}

# Example for Ollama's /api/generate endpoint (older style):
# payload = {
#     "model": "codellama:7b",
#     "prompt": "def python_function_to_add_two_numbers(a, b):\n",
#     "stream": False
# }

headers = {
    "Content-Type": "application/json"
}

print(f"Sending request to: {ENDPOINT_URL}")
print(f"Payload: {json.dumps(payload, indent=2)}")

try:
    response = requests.post(ENDPOINT_URL, json=payload, headers=headers)
    response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)

    # Process the response
    result = response.json()
    print("\nResponse from CodeLlama:")

    if "generate" in API_PATH: # Typical for Ollama generate
        print(result.get("response"))
    elif "chat" in API_PATH: # Typical for Ollama chat
        if result.get("message"):
            print(result.get("message", {}).get("content"))
        else: # Sometimes the full response for non-streaming chat is just the content
            print(result)
    else: # Adjust based on your specific endpoint's response structure
        print(json.dumps(result, indent=2))


except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
    print(f"Response content: {response.content.decode()}")
except requests.exceptions.ConnectionError as conn_err:
    print(f"Connection error: {conn_err}")
    print("Please ensure your local CodeLlama endpoint is running and accessible at the specified URL.")
except requests.exceptions.RequestException as req_err:
    print(f"Request error occurred: {req_err}")
except json.JSONDecodeError:
    print("Failed to decode JSON response.")
    print(f"Response content: {response.text}")

Sending request to: http://localhost:11434/api/chat
Payload: {
  "model": "codellama:7b",
  "messages": [
    {
      "role": "user",
      "content": "Write a Python function that calculates the factorial of a number."
    }
  ],
  "stream": false
}

Response from CodeLlama:
[PYTHON]
def calculate_factorial(n):
    if n == 0:
        return 1
    else:
        return n * calculate_factorial(n-1)
[/PYTHON]
[TESTS]
# Test case 1:
assert calculate_factorial(0) == 1
# Test case 2:
assert calculate_factorial(1) == 1
# Test case 3:
assert calculate_factorial(5) == 120
# Test case 4:
assert calculate_factorial(10) == 3628800
[/TESTS]

