# In [None]:
import easyocr
import pandas as pd
import os
import logging
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

# Initialize OCR
# A single English-language EasyOCR reader, built once at module level so that
# every extract_text() call reuses the same instance.
reader = easyocr.Reader(['en'])

# Logging
# Configure the root logger: INFO and above, formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def extract_text(image_path):
    """Return the text EasyOCR recognizes in one image as a single string.

    Args:
        image_path: Image location, passed unchanged to ``reader.readtext``.

    Returns:
        The recognized fragments joined with single spaces. If OCR (or the
        join) raises, the error is logged and an empty string is returned,
        so one bad image never aborts a batch.
    """
    try:
        # detail=0 requests EasyOCR's simple output, joined below as strings.
        fragments = reader.readtext(image_path, detail=0)
        joined = " ".join(fragments)
    except Exception as e:
        logging.error(f"Error extracting text from {image_path}: {str(e)}")
        joined = ""
    return joined

def clean_text(text):
    """Normalize raw OCR output for downstream use.

    Every character other than ASCII letters, ASCII digits, '.', ',' and the
    space is dropped (not replaced), then surrounding whitespace is trimmed
    and the result is lower-cased.

    Args:
        text: Raw OCR string.

    Returns:
        The filtered, trimmed, lower-case string (possibly empty).
    """
    noise_pattern = re.compile(r'[^0-9a-zA-Z., ]')
    filtered = noise_pattern.sub('', text)
    trimmed = filtered.strip()
    return trimmed.lower()

def preprocess_images(image_folder, dataset):
    """OCR the image referenced by each dataset row and return cleaned text.

    Args:
        image_folder: Directory holding the image files.
        dataset: DataFrame with an 'image_link' column; only the part after
            the last '/' is used as the file name inside ``image_folder``.

    Returns:
        A list with one entry per row, in row order: the cleaned OCR text,
        or "" when the image file does not exist (the miss is logged).
    """

    def _text_for(link):
        # Resolve the link's final path segment inside image_folder.
        image_path = os.path.join(image_folder, link.split('/')[-1])
        if os.path.exists(image_path):
            return clean_text(extract_text(image_path))
        logging.error(f"Image not found: {image_path}")
        return ""

    return [_text_for(record['image_link']) for _, record in dataset.iterrows()]

def classify_with_knn(X_train, y_train, X_test):
    """Fit a 3-nearest-neighbour classifier and label the test samples.

    Args:
        X_train: Training feature rows.
        y_train: Training labels, one per row of ``X_train``.
        X_test: Feature rows to classify.

    Returns:
        The labels predicted for ``X_test``.
    """
    # fit() returns the estimator itself, so training and prediction chain.
    return KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train).predict(X_test)

def format_prediction(pred, unit):
    """Format a predicted value and its unit as "<value> <unit>".

    Missing values produce an empty string. "Missing" means ``None``, a float
    NaN, or an empty/whitespace-only string. A numeric zero is a real value
    and is formatted normally (a plain truthiness test would drop it).

    Args:
        pred: The predicted value (string or number), or a missing marker.
        unit: The unit / entity label appended after the value.

    Returns:
        "<pred> <unit>", or "" when ``pred`` is missing.
    """
    if pred is None:
        return ""
    # NaN is the only float that is not equal to itself.
    if isinstance(pred, float) and pred != pred:
        return ""
    if isinstance(pred, str) and not pred.strip():
        return ""
    return f"{pred} {unit}"

# End-to-end pipeline: OCR a small training sample, fit a KNN that maps
# OCR-text length to entity name, OCR the test images, and write
# "<ocr text> <entity name>" predictions to dataset/test_out.csv.
try:
    # Step 1: Load only the first 10 rows of the training set.
    # nrows stops parsing after 10 rows instead of reading the whole CSV and
    # discarding the rest.
    logging.info("Loading dataset and extracting the first 10 rows")
    train_df = pd.read_csv("dataset/train.csv", nrows=10)

    # Step 2: Preprocess images and extract text
    logging.info("Applying OCR on images and preprocessing text")
    train_predictions = preprocess_images("images", train_df)
    train_df['predictions'] = train_predictions

    # Step 3: Encode entity names as integer class ids
    le = LabelEncoder()
    train_df['entity_cluster'] = le.fit_transform(train_df['entity_name'])

    # Prepare features (OCR text) and labels (entity clusters) for KNN
    texts = train_df['predictions'].values
    labels = train_df['entity_cluster'].values

    # The only feature is the length of the cleaned OCR text (one column).
    X_train = [[len(text)] for text in texts]

    # Step 4: Train KNN classifier
    logging.info("Training KNN classifier")
    knn_model = KNeighborsClassifier(n_neighbors=3)
    knn_model.fit(X_train, labels)

    # Step 5: Load test set for predictions
    test_df = pd.read_csv("dataset/test.csv")

    logging.info("Applying OCR and generating predictions for test set")
    test_predictions = preprocess_images("images_test", test_df)
    test_df['predictions'] = test_predictions

    # Same single feature as in training: cleaned OCR text length.
    X_test = [[len(text)] for text in test_predictions]

    # Step 6: Predict with KNN
    test_df['entity_cluster'] = knn_model.predict(X_test)

    # Step 7: Format the test predictions (example: 21.9 foot)
    # NOTE(review): the "value" part is the full cleaned OCR text, not a
    # parsed number; extracting the numeric value is still to be done.
    logging.info("Formatting test predictions")
    # Decode all predicted class ids in one call rather than once per row.
    entity_names = le.inverse_transform(test_df['entity_cluster'])
    final_test_predictions = [
        format_prediction(pred_value, entity_name)
        for pred_value, entity_name in zip(test_df['predictions'], entity_names)
    ]

    # Step 8: Save test_out.csv (requires an 'index' column in test.csv)
    test_df['prediction'] = final_test_predictions
    test_df[['index', 'prediction']].to_csv("dataset/test_out.csv", index=False)
    logging.info("Test predictions saved to test_out.csv")

except Exception as e:
    # Top-level boundary: keep the run from crashing, but record the full
    # traceback so the failing step can be identified.
    logging.exception(f"An error occurred: {str(e)}")