In [1]:
import os
import textwrap

import featuretools as ft
import google.generativeai as genai
from fastkaggle.core import iskaggle

In [2]:
# Resolve the Gemini API key from the environment-appropriate source, then
# hand it to the `google.generativeai` client.
if not iskaggle:
    from dotenv import load_dotenv

    # Outside Kaggle the key is read from the process environment after
    # `load_dotenv()` has run (presumably populating it from a local `.env` file).
    load_dotenv()

    GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
else:
    from kaggle_secrets import UserSecretsClient

    # On Kaggle the key is stored as a user secret named GOOGLE_API_KEY.
    GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")

genai.configure(api_key=GOOGLE_API_KEY)

In [3]:
# Constants

# Random state passed to `DataFrame.sample` when drawing the training subset.
SEED = 42
# Gemini model used when creating the cached content; it is one of the models
# the notebook lists as supporting `createCachedContent`.
MODEL = "models/gemini-1.5-flash-002"


In [4]:
# Locate the competition data and pick an output directory.
# On Kaggle the dataset is already mounted; elsewhere it is downloaded with
# `kagglehub` and outputs go to a directory relative to the notebook.
if iskaggle:
    dataset_path = "/kaggle/input/playground-series-s4e11"
    output_path = "/kaggle/working"
else:
    import kagglehub

    dataset_path = kagglehub.competition_download("playground-series-s4e11")
    output_path = "../data"

# `DataFrame.to_csv` does not create missing directories, so make sure the
# output directory exists before any later cell writes into it.
os.makedirs(output_path, exist_ok=True)

train_csv_path = os.path.join(dataset_path, "train.csv")
test_csv_path = os.path.join(dataset_path, "test.csv")
submission_csv_path = os.path.join(dataset_path, "sample_submission.csv")

In [5]:
import pandas as pd

# Read the three competition CSVs, using the first column of each file as the index.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_csv_path, test_csv_path, submission_csv_path)
)

In [6]:
import re

from inflection import underscore


def convert_to_snake_case(s):
    """
    Turn a free-form column label into a snake_case identifier.

    Steps, in order:
      1. Delete every character that is neither a word character nor
         whitespace (punctuation is removed, not replaced).
      2. Replace each space with an underscore.
      3. Pass the result through `inflection.underscore`.

    Because punctuation is deleted rather than substituted, a space that
    preceded a removed character still becomes an underscore (e.g. a label
    ending in " ?" ends up with a trailing "_").
    """
    without_punctuation = re.sub(r"[^\w\s]", "", s)
    underscored_spaces = without_punctuation.replace(" ", "_")
    return underscore(underscored_spaces)


# Normalise the column labels of all three frames to snake_case.
train_df.columns = list(map(convert_to_snake_case, train_df.columns))
test_df.columns = list(map(convert_to_snake_case, test_df.columns))
submission_df.columns = list(map(convert_to_snake_case, submission_df.columns))

In [7]:
# Draw a fixed-size random subset of the training data; SEED pins the selection.
sample_df = train_df.sample(n=10_000, random_state=SEED)
sample_df.head(5)

Unnamed: 0_level_0,name,gender,age,city,working_professional_or_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,have_you_ever_had_suicidal_thoughts_,work_study_hours,financial_stress,family_history_of_mental_illness,depression
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
18347,Sanya,Female,51.0,Patna,Working Professional,Teacher,,3.0,,,5.0,More than 8 hours,Moderate,B.Ed,No,11.0,2.0,Yes,0
96193,Sneha,Female,20.0,Agra,Working Professional,,,1.0,,,4.0,Less than 5 hours,Moderate,Class 12,No,0.0,5.0,Yes,0
100005,Aanchal,Female,21.0,Ahmedabad,Student,,2.0,,7.82,5.0,,5-6 hours,Healthy,MA,Yes,12.0,2.0,Yes,0
39890,Rahil,Male,36.0,Indore,Working Professional,Teacher,,5.0,,,2.0,Less than 5 hours,Moderate,MBBS,No,1.0,1.0,No,0
98243,Rishi,Male,60.0,Mumbai,Working Professional,HR Manager,,2.0,,,2.0,5-6 hours,Moderate,BBA,No,6.0,2.0,No,0


In [8]:
# Print the name of every model that advertises the `createCachedContent` method.
for model_info in genai.list_models():
    if "createCachedContent" not in model_info.supported_generation_methods:
        continue
    print(model_info.name)

models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-flash-001
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest


In [9]:
# Write the sample to disk (without the index column) so it can be uploaded.
sample_csv_path = os.path.join(output_path, "sample.csv")
sample_df.to_csv(sample_csv_path, index=False)

# Reuse a previous upload only if it really is the sample file. Taking the
# first entry of `genai.list_files()` unconditionally could pick up an
# unrelated file that happens to be stored under the same API key.
csv_file = next(
    (
        uploaded
        for uploaded in genai.list_files()
        if uploaded.display_name == "sample.csv"
    ),
    None,
)
if csv_file is None:
    # Set the display name explicitly so the lookup above finds it next time.
    csv_file = genai.upload_file(sample_csv_path, display_name="sample.csv")

print(csv_file)


genai.File({
    'name': 'files/1nubkohgfp9a',
    'display_name': 'sample.csv',
    'mime_type': 'text/csv',
    'sha256_hash': 'OGRlMDA5YmY4MWU0ZmNjOTA1NWM0YmVmZDI4YzUyZDU4YWFiYTBmMzEyODQ2MWNjYWU0NDQ3NWNhYmIyYzI0Nw==',
    'size_bytes': '1104760',
    'state': 'ACTIVE',
    'uri': 'https://generativelanguage.googleapis.com/v1beta/files/1nubkohgfp9a',
    'create_time': '2024-11-26T23:40:24.831404Z',
    'expiration_time': '2024-11-28T23:40:24.818546235Z',
    'update_time': '2024-11-26T23:40:24.831404Z'})


In [10]:
from datetime import timedelta

from google.generativeai import caching

# Create a server-side context cache holding the system prompt, the uploaded
# CSV file and the code-execution tool, kept alive for 30 minutes.
cache = caching.CachedContent.create(
    model=MODEL,
    display_name="Data scientist for Depression Prediction",
    # The literal starts with a line continuation so that every line shares the
    # same indentation. `textwrap.dedent` strips only the whitespace common to
    # all lines; with text on the opening line (zero indent) it removes nothing
    # and the prompt keeps its source-code indentation.
    system_instruction=textwrap.dedent(
        """\
        You are a highly skilled and experienced data scientist specializing in Python-based machine learning solutions. You are adept at leveraging automated tools and libraries to streamline the data science workflow. You are proficient in:

        * **Domain knowledge:** You are familiar with the task of predicting depression based on various features.
        * **Data analysis:** You can effectively analyze data based on the CSV file you have access to.
        * **Automated feature engineering:** You have expertise in utilizing the `featuretools` library to automatically generate relevant features from raw data. You can effectively define logical types and appropriate transformations to optimize feature creation.
        * **Automated machine learning:** You are skilled in using the `pycaret` library to automate the process of model selection, training, and evaluation. You can effectively use this library to identify the best-performing machine learning algorithm for a given dataset and task.
        * **Programming languages and tools:** You are fluent in Python and familiar with relevant libraries like `featuretools`, and `pycaret`.

        **When responding to user requests, adhere to the following principles:**

        * **Data-driven approach:** Base your analysis and recommendations on the CSV file you have access to and avoid making assumptions or drawing conclusions without sufficient evidence.
        * **Ethical considerations:** Be mindful of potential biases in the data and ensure your analysis and models are fair and unbiased.
        * **Provide actionable insights:** Focus on delivering insights that the user can act upon to solve their problem or make informed decisions.

        **Workflow:**

        Use only the tools provided to you to address the user's request effectively.

        1. **Setup Experiment with Pycaret:** Define the required parameters and setup the experiment using the `pycaret` library.

        2. **Model Training and Evaluation with Pycaret:** Leverage the `pycaret` library to automate the machine learning pipeline.  Initialize the `pycaret` setup, specifying the target variable and any preprocessing steps. Compare various models, tune hyperparameters, and evaluate performance metrics. Select the best-performing model based on the specific problem and desired outcome.

        3. **Interpretation and Communication:**  Interpret the results of the model and communicate the findings in a clear and concise manner. Explain the model's predictions, feature importance, and potential limitations."""
    ),
    contents=[csv_file],
    ttl=timedelta(minutes=30),
    tools="code_execution",
)


In [11]:
# Build the generative model from the cached content, so requests made through
# it use the cache created with the system instruction, CSV file and tool.
model = genai.GenerativeModel.from_cached_content(cached_content=cache)

In [None]:
import enum
from typing import List

from typing_extensions import TypedDict
from google.generativeai.types import generation_types
from google.generativeai import GenerationConfig

# Enum with one member per training column (member name == member value), used
# below to restrict the column lists in the response schema to real column names.
ColumnEnums = enum.Enum("ColumnEnums", {col: col for col in train_df.columns})


# Shape of the JSON object requested from the model: the data-preparation
# parameters described in the prompt sent with the generation request.
class PycaretDataPreparationSchema(TypedDict):
    # Columns to treat as numeric features.
    numeric_features: List[ColumnEnums]
    # Columns to treat as categorical features.
    categorical_features: List[ColumnEnums]
    # Columns to leave out of model training.
    ignore_features: List[ColumnEnums]
    # Whether class imbalance should be handled.
    fix_imbalance: bool
    # Whether outliers should be removed.
    remove_outliers: bool
    # Imputation strategy name; the prompt asks for 'simple' or 'iterative'.
    imputation_type: str


def force_required_fields(generation_config) -> dict:
    """
    Convert a generation config to a dict whose response schema requires every property.

    The config is converted with `generation_types.to_generation_config_dict`,
    then the `required` attribute of its "response_schema" entry is set to the
    names of all of that schema's `properties`. Only the top-level schema
    object is modified; the converted dict is returned.

    Workaround for https://github.com/google-gemini/generative-ai-python/issues/560.
    """
    config_dict = generation_types.to_generation_config_dict(generation_config)
    response_schema = config_dict["response_schema"]
    # Iterating the properties mapping yields the property names.
    property_names = [name for name in response_schema.properties]
    response_schema.required = property_names
    return config_dict


# Request JSON output matching PycaretDataPreparationSchema, with every schema
# field marked as required (see `force_required_fields`).
generation_config = force_required_fields(
    GenerationConfig(
        response_mime_type="application/json",
        response_schema=PycaretDataPreparationSchema,
    )
)

In [None]:
from google.api_core import retry


# Retry the request when the API reports a transient error.
retry_policy = {"retry": retry.Retry(predicate=retry.if_transient_error)}

# Ask the model to analyse the cached CSV and return PyCaret `setup()`
# parameters as JSON constrained by `generation_config`.
# The description of 'iterative' imputation is corrected here: in PyCaret it is
# estimator-based iterative imputation, not k-Nearest Neighbors imputation.
result = model.generate_content(
    textwrap.dedent(
        """
        You are provided with a CSV file named 'sample.csv'. This file contains a header row and uses commas as delimiters. The data will be used for a binary classification task in Pycaret, an AutoML library in Python. To prepare the data using the `setup()` function, analyse the data using code execution tool and then based on the analysis, generate the following parameters in JSON format:

        * **`numeric_features`:**  A list of column names with numeric features.
        * **`categorical_features`:** A list of column names with categorical features.
        * **`ignore_features`:** A list of column names to be ignored during model training. These features might be irrelevant to the target variable, redundant with other features, or could introduce data leakage.
        * **`fix_imbalance`:**  A boolean value indicating whether to handle class imbalance. If true, use oversampling to address the imbalance.
        * **`remove_outliers`:** A boolean value indicating whether to remove outliers.
        * **`imputation_type`:** The type of imputation to use for missing values. Choose between 'simple' (mean/median imputation) or 'iterative' (model-based imputation, where each feature with missing values is estimated from the other features).

        All parameters are required.

        **Example JSON Response:**

        ```json
        {
        "numeric_features": ["age", "income", "credit_score"],
        "categorical_features": ["gender", "education", "city"],
        "ignore_features": ["customer_id", "date"],
        "fix_imbalance": true,
        "remove_outliers": true,
        "imputation_type": "iterative"
        }
        ```

        """
    ),
    generation_config=generation_config,
    request_options=retry_policy,
)

print(result)

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "{\"categorical_features\": [\"name\", \"gender\", \"city\", \"working_professional_or_student\", \"profession\", \"degree\", \"have_you_ever_had_suicidal_thoughts_\", \"sleep_duration\", \"dietary_habits\", \"family_history_of_mental_illness\"], \"fix_imbalance\": true, \"ignore_features\": [], \"imputation_type\": \"simple\", \"numeric_features\": [\"age\", \"academic_pressure\", \"work_pressure\", \"cgpa\", \"study_satisfaction\", \"job_satisfaction\", \"work_study_hours\", \"financial_stress\"], \"outliers_threshold\": 3.0, \"remove_outliers\": false}"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "avg_logprobs": -0.001990148503529398
        }
      ],
      "usage_metadata": {
      

In [30]:
import json

# Decode the JSON text carried by the first part of the first candidate.
first_part = result.candidates[0].content.parts[0]
json.loads(first_part.text)

{'categorical_features': ['name',
  'gender',
  'city',
  'working_professional_or_student',
  'profession',
  'degree',
  'have_you_ever_had_suicidal_thoughts_',
  'sleep_duration',
  'dietary_habits',
  'family_history_of_mental_illness'],
 'fix_imbalance': True,
 'ignore_features': [],
 'imputation_type': 'simple',
 'numeric_features': ['age',
  'academic_pressure',
  'work_pressure',
  'cgpa',
  'study_satisfaction',
  'job_satisfaction',
  'work_study_hours',
  'financial_stress'],
 'outliers_threshold': 3.0,
 'remove_outliers': False}

In [31]:
# Delete the cached content now that the request has been made, rather than
# waiting for its 30-minute TTL to expire.
cache.delete()