<a href="https://colab.research.google.com/github/vibhorjoshi/kaggle-challenge/blob/main/amazon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow opencv-python pillow numpy pandas


In [None]:
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
from PIL import Image, UnidentifiedImageError
from io import BytesIO
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input

def is_valid_url(url, timeout=10):
    """Check whether a URL answers an HTTP HEAD request with status 200.

    Args:
        url: Address to probe. Anything that is not a non-empty string is
            reported as unreachable without touching the network.
        timeout: Seconds to wait for the server before giving up, so a single
            unresponsive host cannot stall the caller indefinitely.

    Returns:
        True if the final response (after redirects) has status code 200;
        False for any other status or any ``requests`` error.
    """
    if not isinstance(url, str) or not url.strip():
        return False
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
        return False
    return response.status_code == 200

def load_image(img_link, img_size=(224, 224)):
    """Load an image from a URL or local path and preprocess it for VGG16.

    Args:
        img_link: ``http://``/``https://`` URL or local file path of the image.
        img_size: Target ``(height, width)`` of the returned array.

    Returns:
        Array of shape ``(img_size[0], img_size[1], 3)``: the RGB image after
        ``preprocess_input`` on success, or all zeros when the link is not a
        usable string, the download fails, or the file is not a readable
        JPEG/PNG. Failures are printed, never raised.
    """
    height, width = img_size[0], img_size[1]
    # float32 so placeholders stack with real images without upcasting
    # (assumes img_to_array keeps its default float32 output -- TODO confirm).
    blank = np.zeros((height, width, 3), dtype="float32")

    # pandas yields NaN (a float) for missing links; reject those up front.
    if not isinstance(img_link, str) or not img_link.strip():
        print(f"Error loading image {img_link}: not a valid URL or file path")
        return blank

    try:
        if img_link.startswith(('http://', 'https://')):
            # Fetch directly with a timeout. A failed reachability probe must
            # not fall through to opening the URL as if it were a local file.
            response = requests.get(img_link, timeout=10)
            response.raise_for_status()
            source = BytesIO(response.content)
        else:
            source = img_link

        with Image.open(source) as img:
            if img.format not in ("JPEG", "PNG"):
                raise ValueError(f"Unsupported image format: {img.format}")
            # PIL's resize expects (width, height), the reverse of img_size.
            rgb = img.convert('RGB').resize((width, height))

        return preprocess_input(img_to_array(rgb))
    except (requests.RequestException, UnidentifiedImageError, ValueError, OSError) as e:
        # OSError also covers missing local files and truncated image data.
        print(f"Error loading image {img_link}: {e}")
        return blank

def load_images_pil(df, image_column, img_size=(224, 224), retry_count=3, top_n=1000):
    """Load and preprocess the first ``top_n`` images listed in a DataFrame column.

    Args:
        df: DataFrame holding the image links.
        image_column: Name of the column with URLs or file paths.
        img_size: ``(height, width)`` forwarded to ``load_image``.
        retry_count: Maximum number of load attempts per image; values below 1
            are treated as 1 so every row still yields an array.
        top_n: Number of leading rows to load (``None`` loads every row).

    Returns:
        Array with one entry per selected row, in row order. Images that never
        load stay as the all-zero placeholder returned by ``load_image`` so the
        result remains positionally aligned with ``df``.
    """
    attempts = max(1, retry_count)
    links = df[image_column][:top_n]

    images = []
    for img_link in tqdm(links, desc=f"Downloading and processing top {top_n} images"):
        for attempt in range(1, attempts + 1):
            img_array = load_image(img_link, img_size)
            if np.any(img_array):  # An all-zero array signals a failed load
                break
            # Only announce a retry when another attempt will actually follow.
            if attempt < attempts:
                print(f"Retrying {img_link}, attempt {attempt}")
        images.append(img_array)

    if not images:
        # Keep the 4-D layout even when nothing was selected.
        return np.zeros((0, img_size[0], img_size[1], 3))
    return np.array(images)

# Example usage
train_df = pd.read_csv('/content/train.csv')

# Download and preprocess the first 1000 training images once.
train_images = load_images_pil(train_df, 'image_link', top_n=1000)
print("Loaded train images shape:", train_images.shape)

# Later cells refer to the same data as top_1000_images; reuse the array
# instead of downloading and processing the identical 1000 images again.
top_1000_images = train_images

# Check shape of the loaded images
print("Loaded top 1000 train images shape:", top_1000_images.shape)



In [None]:
# Step 2: Preprocess the entity_value (target) column (e.g., for weight)
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder from sklearn.preprocessing
from sklearn.model_selection import train_test_split # Import train_test_split
from tensorflow.keras.applications import VGG16 # Import VGG16
from tensorflow.keras.layers import Flatten, Dense # Import Flatten and Dense layers
from tensorflow.keras.models import Model # Import Model

label_encoder = LabelEncoder()

# Encode labels for exactly as many rows as images were loaded, so features
# and targets stay aligned even if the number of loaded images changes.
train_labels = label_encoder.fit_transform(train_df['entity_value'].iloc[:len(top_1000_images)])

# Split the dataset for training and validation
# Use top_1000_images which contains the loaded image samples
X_train, X_val, y_train, y_val = train_test_split(top_1000_images, train_labels, test_size=0.2, random_state=42)

# Step 3: Train VGG16 Model
def build_vgg16_model(input_shape):
    """Build and compile a VGG16-based model with a single linear output.

    Args:
        input_shape: Shape of one input image, passed to ``VGG16``.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, mean-squared-error loss)
        whose VGG16 layers are frozen and whose head is
        Flatten -> Dense(128, relu) -> Dense(1, linear).
    """
    # ImageNet-pretrained convolutional base, without its classifier layers
    backbone = VGG16(weights='imagenet', include_top=False, input_shape=input_shape)

    # Keep the pretrained weights fixed; only the new head is trained
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False

    # Custom head stacked on the convolutional features
    features = Flatten()(backbone.output)
    hidden = Dense(128, activation='relu')(features)
    prediction = Dense(1, activation='linear')(hidden)  # Single continuous output

    regressor = Model(inputs=backbone.input, outputs=prediction)
    regressor.compile(optimizer='adam', loss='mean_squared_error')
    return regressor

# VGG16 input: 224x224 images with 3 (RGB) channels
input_shape = (224, 224, 3)

# Instantiate the network
model = build_vgg16_model(input_shape)

# Step 4: Fit on the training split, validating on the held-out split
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
)
# After training, the model is run on the test data (sample_test.csv) in the next cell

In [None]:
# step 5:
# Load test images
test_df = pd.read_csv('/content/sample_test.csv')
# Load one image per test row so predictions line up with test_df even when
# it has more rows than the loader's default top_n.
test_images = load_images_pil(test_df, 'image_link', top_n=len(test_df))

# Make predictions
predictions = model.predict(test_images)

# The model emits one continuous value per image (shape (N, 1)), so argmax over
# axis 1 would always be 0. Round to the nearest encoded label instead and clip
# into the range the encoder was fitted on so inverse_transform cannot fail.
predicted_labels = np.clip(
    np.rint(predictions).astype(int).ravel(),
    0,
    len(label_encoder.classes_) - 1,
)
predicted_values = label_encoder.inverse_transform(predicted_labels)

# Format predictions
def format_predictions(predictions):
    """Return the string form of every prediction, in the original order.

    Each item is rendered with an f-string; no unit parsing or validation is
    performed.
    """
    return [f"{value}" for value in predictions]

test_df['prediction'] = format_predictions(predicted_values)

# Save predictions to CSV.
# A plain relative file name is used: a non-raw Windows path literal turns
# sequences such as backslash-a into control characters and would not name a
# usable file. This is the same file name the later prediction cell writes.
test_df[['index', 'prediction']].to_csv('test_out.csv', index=False)



In [None]:
# step-6
import pandas as pd

def _blank_if_missing(value):
    """Map None/NaN (how pandas represents empty CSV cells) to ""."""
    return "" if pd.isna(value) else value


def evaluate_predictions(ground_truth_df, predictions_df):
    """Score predictions against ground truth and return the F1 score.

    Rows are matched on the ``index`` column; predictions whose index has no
    ground-truth row are skipped. Missing values (NaN/None) count as empty.

    Counting rules:
        * true positive  -- non-empty prediction equal to the ground truth
        * false positive -- non-empty prediction that differs from the ground
          truth (including an empty ground truth)
        * false negative -- empty prediction while a ground truth exists
        * both empty     -- not counted

    Args:
        ground_truth_df: DataFrame with ``index`` and ``entity_value`` columns.
        predictions_df: DataFrame with ``index`` and ``prediction`` columns.

    Returns:
        The F1 score, or 0 when precision and recall are both zero. Counts and
        scores are also printed.
    """
    # Build the lookup once instead of filtering the frame for every row.
    # setdefault keeps the first ground-truth row for a duplicated index.
    truth_by_index = {}
    for key, value in zip(ground_truth_df['index'], ground_truth_df['entity_value']):
        truth_by_index.setdefault(key, value)

    tp = fp = fn = 0
    for key, raw_pred in zip(predictions_df['index'], predictions_df['prediction']):
        # If ground truth is missing for some reason, skip evaluation
        if key not in truth_by_index:
            continue

        gt = _blank_if_missing(truth_by_index[key])
        out = _blank_if_missing(raw_pred)

        if out == "":
            # No prediction: a miss only if there was something to predict.
            if gt != "":
                fn += 1
        elif out == gt:
            tp += 1
        else:
            fp += 1

    # Precision, Recall, and F1 score calculations
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    # Print evaluation results
    print(f"True Positives: {tp}, False Positives: {fp}, False Negatives: {fn}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")

    return f1_score



In [None]:
import pandas as pd

In [None]:
# Load the test data
test_df = pd.read_csv('/content/sample_test.csv')

# Load one image per test row (not just the loader's default top_n) so the
# prediction column has the same length as test_df['index'].
test_images = load_images_pil(test_df, 'image_link', top_n=len(test_df))

# Make predictions
predictions = model.predict(test_images)

# Convert predictions back to entity values (e.g., '34 gram').
# The raw output is a float array of shape (N, 1): round to the nearest encoded
# label, flatten to 1-D, and clip into the range the encoder was fitted on so
# inverse_transform never receives an unseen label.
encoded_labels = np.clip(
    np.rint(predictions).astype(int).ravel(),
    0,
    len(label_encoder.classes_) - 1,
)
predicted_labels = label_encoder.inverse_transform(encoded_labels)

# Create the output DataFrame
output_df = pd.DataFrame({'index': test_df['index'], 'prediction': predicted_labels})

# Save the output CSV
output_df.to_csv('test_out.csv', index=False)

# Verify the format using sanity.py (if necessary)


In [None]:
def save_predictions_to_csv(predictions_df, output_file_path):
    """Save the ``index`` and ``prediction`` columns of a DataFrame to CSV.

    Args:
        predictions_df: DataFrame containing at least ``index`` and
            ``prediction`` columns; any other columns are not written.
        output_file_path: Destination path. Missing parent directories are
            created first so the write does not fail on a fresh machine.
    """
    import os

    # Only path-like targets have a directory to create; anything else is
    # passed straight through to to_csv unchanged.
    if isinstance(output_file_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_file_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    predictions_df[['index', 'prediction']].to_csv(output_file_path, index=False)
    print(f"Predictions saved to {output_file_path}")

# Example Usage
# Load the ground truth and predictions CSV files
ground_truth_df = pd.read_csv('/content/sample_test_out.csv')  # Replace with actual ground truth
# Read the predictions the notebook writes (test_out.csv); the raw test.csv
# input is not a predictions file.
predictions_df = pd.read_csv('test_out.csv')

# Evaluate the predictions
f1_score = evaluate_predictions(ground_truth_df, predictions_df)

# Save predictions to CSV in the required format
output_file_path = '/mnt/data/final_predictions.csv'
save_predictions_to_csv(predictions_df, output_file_path)

# Example of the expected prediction records (illustrative only; not used above)
test_predictions = [
    {"index": 1, "prediction": "34 gram"},
    {"index": 2, "prediction": "15 centimetre"},
    {"index": 3, "prediction": ""},
    # Add other predictions here...
]

