In [None]:
# Install TensorFlow
!pip install tensorflow

# Install EasyOCR for text extraction
!pip install easyocr

# Install pandas for data manipulation and CSV handling
!pip install pandas

# Install numpy for array manipulation
!pip install numpy


In [None]:
import tensorflow as tf

from tensorflow.keras import layers, models, applications
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, LSTM, Bidirectional, Masking

import pandas as pd
import numpy as np
import easyocr  # For text extraction
import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [None]:
# Step 1: Extract text from images using EasyOCR

# Cache of already-constructed EasyOCR readers, keyed by their language tuple.
# Creating a Reader loads its models, so it is done once per language set
# instead of once per image.
_OCR_READERS = {}


def _get_ocr_reader(languages):
    """Return the cached ``easyocr.Reader`` for ``languages``, creating it on first use."""
    key = tuple(languages)
    if key not in _OCR_READERS:
        _OCR_READERS[key] = easyocr.Reader(list(key))
    return _OCR_READERS[key]


def extract_text_from_image(image_path, languages=('en', 'bn')):
    """Run OCR on one image and return all recognised text as a single string.

    Args:
        image_path: Path of the image, passed straight to ``Reader.readtext``.
        languages: Language codes for the OCR reader. Defaults to English and
            Bengali, which is what this function always used.

    Returns:
        The recognised text fragments joined by single spaces, in the order
        ``readtext`` returned them; an empty string when nothing was found.
    """
    reader = _get_ocr_reader(languages)
    result = reader.readtext(image_path)
    # Each result item is indexed at position 1 to obtain the recognised string.
    return " ".join(item[1] for item in result)



In [None]:
# Step 2: Preprocess and tokenize the text data (you can adjust this based on your needs)

def preprocess_text(text):
    """Normalise OCR text: lower-case it, then trim leading and trailing whitespace.

    Whitespace inside the string is left untouched.
    """
    lowered = text.lower()
    return lowered.strip()


In [None]:
# Step 3: Define the model for image and text classification


def create_model(word_index, embedding_matrix):
    """Build and compile the two-input (image + OCR text) classifier.

    The image branch runs a frozen, ImageNet-initialised VGG16 (without its
    top) over a 150x150 RGB input and projects the flattened features to 512
    units. The text branch embeds a length-100 sequence of integer word ids
    with the supplied, frozen embedding matrix, applies three Conv1D layers
    and two bidirectional LSTMs, and projects to 512 units. The two branches
    are concatenated and fed to a dense head ending in a 3-way softmax.

    Args:
        word_index: Mapping from word to integer id. Only its length is used,
            to size the embedding vocabulary as ``len(word_index) + 1``.
        embedding_matrix: 2-D array of shape ``(len(word_index) + 1, dim)``
            used as the fixed embedding weights.

    Returns:
        A compiled Keras ``Model`` whose inputs are ``[image, token_ids]``.

    Raises:
        ValueError: If ``embedding_matrix`` is not 2-D or its number of rows
            differs from ``len(word_index) + 1``.
    """
    vocab_size = len(word_index) + 1
    embedding_matrix = np.asarray(embedding_matrix)
    if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] != vocab_size:
        raise ValueError(
            "embedding_matrix must have shape (len(word_index) + 1, dim); "
            f"expected {vocab_size} rows, got shape {embedding_matrix.shape}"
        )
    # Take the vector width from the matrix instead of assuming 300.
    embedding_dim = embedding_matrix.shape[1]

    # Image input (VGG16)
    input_img = layers.Input(shape=(150, 150, 3))
    model_img = applications.VGG16(weights='imagenet', include_top=False, input_shape=(150, 150, 3))
    # Freeze the convolutional base so only the layers added below are trainable.
    for layer in model_img.layers:
        layer.trainable = False
    x_img = model_img(input_img)
    flatten_img = layers.Flatten()(x_img)
    flatten_img = layers.Dense(1024, activation='relu')(flatten_img)
    flatten_img = layers.Dense(512, activation='relu')(flatten_img)

    # Text input (Embedding, Conv1D stack, bidirectional LSTMs)
    input_txt = layers.Input(shape=(100,), dtype='int32')
    # NOTE(review): Masking is applied to the integer ids before the Embedding;
    # whether any mask actually reaches the Conv1D/LSTM layers should be
    # confirmed. The layer is kept so the architecture is unchanged.
    txt = layers.Masking(mask_value=0)(input_txt)
    # `input_length` is omitted: the sequence length is already fixed by
    # `input_txt`, and the argument is deprecated in recent Keras releases.
    txt = layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(txt)
    txt = layers.Conv1D(32, 5)(txt)
    txt = layers.Conv1D(60, 4)(txt)
    txt = layers.Conv1D(100, 3)(txt)
    text_lstm = layers.Bidirectional(layers.LSTM(30, return_sequences=True))(txt)
    text_lstm = layers.Bidirectional(layers.LSTM(30, return_sequences=False))(text_lstm)
    text_lstm = layers.Dense(512, activation='relu')(text_lstm)

    # Merge image and text features
    merged = layers.concatenate([text_lstm, flatten_img], axis=1)

    # Final dense layers
    dense = layers.Dense(1024, activation='relu')(merged)
    dense = layers.Dropout(0.1)(dense)
    dense = layers.Dense(512, activation='relu')(dense)
    dense = layers.Dense(256, activation='relu')(dense)
    dense = layers.Dense(128, activation='relu')(dense)
    output = layers.Dense(3, activation='softmax')(dense)

    model = Model(inputs=[input_img, input_txt], outputs=output)

    # Compile model. `learning_rate` is the supported keyword; the old `lr`
    # alias is deprecated and rejected by current Keras optimizers.
    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        metrics=["accuracy"],
    )

    return model

In [None]:
# Step 4: Process images and texts from the folder and save the results


def process_and_predict(image_folder, model, word_index, embedding_matrix,
                        output_csv="memes_predictions.csv", max_len=100,
                        image_size=(150, 150)):
    """Classify every image in a folder and save the results to a CSV file.

    For each ``.jpg``/``.jpeg``/``.png`` file (case-insensitive) the text is
    extracted with ``extract_text_from_image``, normalised with
    ``preprocess_text``, encoded with ``word_index`` and padded to ``max_len``;
    the image is resized to ``image_size`` and scaled to ``[0, 1]``. Both are
    passed to ``model.predict`` and the arg-max class is recorded.

    Args:
        image_folder: Directory containing the images.
        model: Object exposing ``predict([image_batch, token_batch])``.
        word_index: Mapping from word to integer id used to encode the text.
            Words missing from the mapping are dropped.
        embedding_matrix: Not used here; accepted so existing callers that
            pass it keep working.
        output_csv: Destination of the CSV file.
        max_len: Length every token sequence is padded or truncated to.
        image_size: ``(height, width)`` the images are resized to.

    Returns:
        A ``pandas.DataFrame`` with the columns ``image_path``,
        ``extracted_text`` and ``predicted_label`` - one row per image, in
        sorted filename order. The same table is written to ``output_csv``.
    """
    records = []

    # Sort so the row order is reproducible; os.listdir returns entries in
    # arbitrary order.
    for filename in sorted(os.listdir(image_folder)):
        if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
            continue
        image_path = os.path.join(image_folder, filename)

        # Step 4.1: Extract text from image
        text = extract_text_from_image(image_path)
        processed_text = preprocess_text(text)

        # Step 4.2: Encode the text with the vocabulary passed in.
        # Fitting a new tokenizer on each image would assign ids that are
        # unrelated to `word_index` and may fall outside the embedding table.
        tokens = tf.keras.preprocessing.text.text_to_word_sequence(processed_text)
        token_ids = [word_index[token] for token in tokens if token in word_index]
        padded_sequences = pad_sequences([token_ids], maxlen=max_len)

        # Step 4.3: Prepare the image data
        image = load_img(image_path, target_size=image_size)
        image = img_to_array(image) / 255.0

        # Step 4.4: Make a prediction
        prediction = model.predict([np.array([image]), padded_sequences])
        predicted_label = int(np.argmax(prediction, axis=1)[0])  # Label with highest probability

        # Collect the results
        records.append({
            "image_path": image_path,
            "extracted_text": text,
            "predicted_label": predicted_label,
        })

    # Step 5: Save results to a CSV file
    # Explicit columns keep the header present even when no image was found.
    df = pd.DataFrame(records, columns=["image_path", "extracted_text", "predicted_label"])

    df.to_csv(output_csv, index=False)
    print(f"Predictions saved to {output_csv}")

    return df

In [None]:
# Step 6: Process the images and save results
def run(image_folder, model=None, word_index=None, embedding_matrix=None):
    """Run the prediction pipeline on ``image_folder``.

    Any argument left as ``None`` falls back to the placeholder this function
    used previously: a three-word vocabulary, a random 300-dimensional
    embedding matrix and a freshly created (untrained) model. Predictions made
    with those placeholders are not meaningful, so pass real values for actual
    use - for example a model loaded with ``tf.keras.models.load_model``.

    Args:
        image_folder: Directory containing the images to classify.
        model: Model to predict with; built with ``create_model`` when omitted.
        word_index: Mapping from word to integer id.
        embedding_matrix: Array of shape ``(len(word_index) + 1, dim)``.

    Returns:
        Whatever ``process_and_predict`` returns.
    """
    if word_index is None:
        # Example vocabulary (replace with the word_index of your own tokenizer)
        word_index = {'word1': 1, 'word2': 2, 'word3': 3}

    if embedding_matrix is None:
        # Dummy embedding matrix (replace with actual pre-trained embeddings)
        embedding_matrix = np.random.random((len(word_index) + 1, 300))

    if model is None:
        # No model supplied: build a new, untrained one.
        model = create_model(word_index, embedding_matrix)

    # Process images and make predictions
    return process_and_predict(image_folder, model, word_index, embedding_matrix)

In [None]:
# Step 7: Run the whole pipeline on the folder that holds the memes

MEMES_FOLDER = "memes"  # Replace with the path to your "memes" folder
run(MEMES_FOLDER)

# Gemini

In [None]:
import os

from google import genai

# Read the API key from the environment instead of embedding it in the notebook.
# The key that was previously hard-coded in this cell has been exposed and
# should be revoked. Set GEMINI_API_KEY before running; a missing variable
# raises KeyError here rather than failing later inside the request.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

# Minimal text-only request to confirm the client works.
response = client.models.generate_content(
    model="gemini-1.5-flash", contents="Explain how AI works in a few words"
)

print(response.text)

Learning patterns from data to make predictions.



In [None]:
import os

from google import genai

# Read the API key from the environment instead of embedding it in the notebook.
# The key that was previously hard-coded in this cell has been exposed and
# should be revoked. Set GEMINI_API_KEY before running.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

# Print the description of every model returned for this API key.
for model_info in client.models.list():
    print(model_info)

name='models/embedding-gecko-001' display_name='Embedding Gecko' description='Obtain a distributed representation of a text.' version='001' endpoints=None labels=None tuned_model_info=TunedModelInfo() input_token_limit=1024 output_token_limit=1 supported_actions=['embedText', 'countTextTokens'] default_checkpoint_id=None checkpoints=None
name='models/gemini-1.0-pro-vision-latest' display_name='Gemini 1.0 Pro Vision' description='The original Gemini 1.0 Pro Vision model version which was optimized for image understanding. Gemini 1.0 Pro Vision was deprecated on July 12, 2024. Move to a newer Gemini version.' version='001' endpoints=None labels=None tuned_model_info=TunedModelInfo() input_token_limit=12288 output_token_limit=4096 supported_actions=['generateContent', 'countTokens'] default_checkpoint_id=None checkpoints=None
name='models/gemini-pro-vision' display_name='Gemini 1.0 Pro Vision' description='The original Gemini 1.0 Pro Vision model version which was optimized for image unde

In [None]:
import os

from google import genai
from PIL import Image

# Initialize the client with an API key read from the environment.
# The key that was previously hard-coded in this cell has been exposed and
# should be revoked. Set GEMINI_API_KEY before running.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

# Load the image
IMAGE_PATH = "/content/bamboo-vaiya (15).jpg"  # Replace with your image path
img = Image.open(IMAGE_PATH)



# Instruction text sent to the model together with the image. It asks for every
# piece of text in the image, emitted as "[speaker]: text" lines, with
# "[caption]" used when no speaker can be identified.
prompt = """
Please extract all the text from the image. The first task is to **extract everything** from the image, including all dialogue parts. If the image already contains a **caption** or **character labels** (like **[Me]**, **[He]**, etc.), **do not reassign or override** the existing labels. If the labels are already present in the text, **keep them as they are**.

After the text is extracted:
1. **Categorize the dialogue** by identifying who is speaking in the image, only for parts that do not already have a character label. For example:
   - If the text is spoken by the **father**, label it as **[father]**.
   - If the text is spoken by the **son**, label it as **[son]**.
   - If the dialogue is between a **boyfriend** and **girlfriend**, label the parts as **[boyfriend]** and **[girlfriend]**.

2. If there are multiple characters speaking in sequence, maintain the **correct order**. For example, if **Girlfriend** speaks first and **Boyfriend** responds, output them in that order.
3. **Do not repeat** the dialogue for any character. Each part should be uniquely attributed to one character.
4. If the model is unable to categorize the character, label the dialogue as **[caption]** instead of any character. For example, if there is no clear character to attribute, output the text as **[caption]** followed by the extracted text.
5. Only output the dialogue in the following format:
   - **[character]: [text]**
   - If the label already exists in the image, **do not modify** it.
6. **Avoid including any social media handles** if present in the image.
7. The model should base its categorization on **visual context** and **dialogue cues** from the image.

Example:
If the image contains a conversation like:
- [Me]: "Hey, what's up?"
- [He]: "Not much, you?"
- [Me]: "Just chilling!"
- [He]: "Cool!"

The output should be:
[Me]: Hey, what's up?
[He]: Not much, you?
[Me]: Just chilling!
[He]: Cool!

If the model cannot categorize the character, it should label the dialogue as **[caption]**:
[caption]: "This is an ambiguous part of the text with no clear character."

Make sure the output only contains **dialogue** in the specified format, with **no preamble or explanation**.
"""









# Submit the image together with the extraction prompt to a multimodal model

VISION_MODEL = "gemini-1.5-flash-latest"  # Or another multimodal model like gemini-1.5-pro-latest
response = client.models.generate_content(model=VISION_MODEL, contents=[img, prompt])

# Show the text the model returned
print(response.text)
