# Intro

...

## Setup

In [1]:
import os
import textwrap

import google.generativeai as genai
from fastkaggle.core import iskaggle

In [2]:
if iskaggle:
    from kaggle_secrets import UserSecretsClient

    GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
else:
    from dotenv import load_dotenv

    load_dotenv()

    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

genai.configure(api_key=GOOGLE_API_KEY)

In [3]:
# Constants

SEED = 42
MODEL = "models/gemini-1.5-flash-002"


In [4]:
from pathlib import Path

dataset_path = Path("/kaggle/input/playground-series-s4e11")
output_path = Path("/kaggle/working")

if not iskaggle:
    import kagglehub

    dataset_path = kagglehub.competition_download("playground-series-s4e11")
    dataset_path = Path(dataset_path)
    output_path = Path(dataset_path)

train_csv_path = dataset_path / "train.csv"
test_csv_path = dataset_path / "test.csv"
submission_csv_path = dataset_path / "sample_submission.csv"

In [5]:
import pandas as pd

train_df = pd.read_csv(train_csv_path, index_col=0)
test_df = pd.read_csv(test_csv_path, index_col=0)
submission_df = pd.read_csv(submission_csv_path, index_col=0)

In [None]:
import re

from inflection import underscore


def convert_to_snake_case(s):
    """
    Convert a string to snake_case.
    """

    s = re.sub(r"[^\w\s]", "", s)
    s = s.replace(" ", "_")
    return underscore(s.strip())


train_df.columns = [convert_to_snake_case(col) for col in train_df.columns]
test_df.columns = [convert_to_snake_case(col) for col in test_df.columns]
submission_df.columns = [convert_to_snake_case(col) for col in submission_df.columns]

In [7]:
sample_df = train_df.sample(5_000, random_state=SEED)
sample_df.head()

Unnamed: 0_level_0,name,gender,age,city,working_professional_or_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,have_you_ever_had_suicidal_thoughts_,work_study_hours,financial_stress,family_history_of_mental_illness,depression
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
18347,Sanya,Female,51.0,Patna,Working Professional,Teacher,,3.0,,,5.0,More than 8 hours,Moderate,B.Ed,No,11.0,2.0,Yes,0
96193,Sneha,Female,20.0,Agra,Working Professional,,,1.0,,,4.0,Less than 5 hours,Moderate,Class 12,No,0.0,5.0,Yes,0
100005,Aanchal,Female,21.0,Ahmedabad,Student,,2.0,,7.82,5.0,,5-6 hours,Healthy,MA,Yes,12.0,2.0,Yes,0
39890,Rahil,Male,36.0,Indore,Working Professional,Teacher,,5.0,,,2.0,Less than 5 hours,Moderate,MBBS,No,1.0,1.0,No,0
98243,Rishi,Male,60.0,Mumbai,Working Professional,HR Manager,,2.0,,,2.0,5-6 hours,Moderate,BBA,No,6.0,2.0,No,0


In [None]:
import enum
from typing import List

from typing_extensions import TypedDict, NotRequired

ColumnEnums = enum.Enum("ColumnEnums", {col: col for col in train_df.columns})


# Temporary Fix for TypedDict issue in genai library: https://github.com/google-gemini/generative-ai-python/issues/560
def get_dict_schema(response_schema: type) -> dict:
    config = genai.GenerationConfig(response_schema=response_schema)
    config = genai.types.generation_types.to_generation_config_dict(config)
    schema = config["response_schema"]
    schema.required = list(response_schema.__required_keys__)
    return schema


class DataPreparationSchema(TypedDict):
    numeric_features: List[ColumnEnums]  # type: ignore
    categorical_features: List[ColumnEnums]  # type: ignore
    ignore_features: List[ColumnEnums]  # type: ignore
    fix_imbalance: bool
    remove_outliers: bool
    imputation_type: str


class ScaleAndTransformSchema(TypedDict):
    normalize: bool
    transformation: bool


class FeatureEngineeringSchema(TypedDict):
    polynomial_features: bool
    polynomial_degree: int
    group_features: NotRequired[List[ColumnEnums]]  # type: ignore
    bin_numeric_features: NotRequired[List[ColumnEnums]]  # type: ignore
    rare_to_value: NotRequired[float]


class FeatureSelectionSchema(TypedDict):
    feature_selection: bool
    n_features_to_select: NotRequired[float]
    remove_multicollinearity: NotRequired[bool]
    low_variance_threshold: NotRequired[float]

In [9]:
sample_df.to_csv(output_path / "sample.csv", index=False)

if files := [f for f in genai.list_files()]:
    csv_file = files[0]
else:
    csv_file = genai.upload_file(output_path / "sample.csv")

print(f"{csv_file.display_name} uploaded successfully")


sample.csv uploaded successfully


In [10]:
from datetime import timedelta

from google.generativeai import caching

cache = caching.CachedContent.create(
    model=MODEL,
    display_name="Data scientist for Depression Prediction",
    system_instruction=textwrap.dedent(
        """You are a highly skilled and experienced data scientist specializing in Python-based machine learning solutions. You are adept at leveraging automated tools and libraries to streamline the data science workflow. You are proficient in:

            * **Domain knowledge:** You are familiar with the task of predicting depression based on various features.
            * **Data analysis:** You can effectively analyze databased on the CSV file you have access to.
            * **Automated feature engineering:** You have expertise in utilizing the `pycaret` library to automatically generate relevant features from raw data.
            * **Automated machine learning:** You are skilled in using the `pycaret` library to automate the process of model selection, training, and evaluation. You can effectively use this library to identify the best-performing machine learning algorithm for a given dataset and task.
            * **Programming languages and tools:** You are fluent in Python and familiar with relevant libraries like `pycaret`. 

            **When responding to user requests, adhere to the following principles:**

            * **Data-driven approach:** Base your analysis and recommendations CSV file you have access to and avoid making assumptions or drawing conclusions without sufficient evidence.
            * **Ethical considerations:** Be mindful of potential biases in the data and ensure your analysis and models are fair and unbiased.
            * **Provide actionable insights:** Focus on delivering insights that the user can act upon to solve their problem or make informed decisions.

            **Workflow:**
            
            1. **Understand the Problem:** Use the provided CSV and run analysis to understand the problem of predicting depression based on various features.

            2. **Setup Experiment with Pycaret:** Define the required parameters and setup the experiment using the `pycaret` library.

            3. **Model Training and Evaluation with Pycaret:** Leverage the `pycaret` library to automate the machine learning pipeline.  Initialize the `pycaret` setup, specifying the target variable and any preprocessing steps. Compare various models, tune hyperparameters, and evaluate performance metrics. Select the best-performing model based on the specific problem and desired outcome.

            4. **Interpretation and Communication:**  Interpret the results of the model and communicate the findings in a clear and concise manner. Explain the model's predictions, feature importance, and potential limitations.  """
    ),
    contents=[csv_file],
    ttl=timedelta(minutes=30),
    tools="code_execution",
)


In [None]:
from google.api_core import retry


retry_policy = {"retry": retry.Retry(predicate=retry.if_transient_error)}

model = genai.GenerativeModel.from_cached_content(cached_content=cache)

chat = model.start_chat()

AttributeError: module 'google.generativeai' has no attribute 'generation_types'

In [None]:
result = chat.send_message(
    textwrap.dedent(
        f"""
        You are provided with a CSV file {csv_file.name}. This file contains a header row and uses commas as delimiters. The data will be used for a binary classification task in Pycaret, an AutoML library in Python. To prepare the data using the `setup()` function, analyse the data using code execution tool and then based on the analysis, generate the following parameters in JSON format:
        
        Remember that performing a binary classification to predict depression target variable based on various features. The parameters to generate are as follows:

        * **`numeric_features`:**  A list of column names with numeric features.
        * **`categorical_features`:** A list of column names with categorical features.
        * **`ignore_features`:** A list of column names to be ignored during model training. These features might be irrelevant to the target variable in this case 'depression' column, redundant with other features, or could introduce data leakage.
        * **`fix_imbalance`:**  A boolean value indicating whether to handle class imbalance. If true, use oversampling to address the imbalance.
        * **`remove_outliers`:** A boolean value indicating whether to remove outliers.
        * **`imputation_type`:** The type of imputation to use for missing values. Choose between 'simple' (mean/median imputation) or 'iterative' (k-Nearest Neighbors imputation).

        All parameters are required.

        **Example JSON Response:**

        ```json
        {{
        "numeric_features": ["age", "income", "credit_score"],
        "categorical_features": ["gender", "education", "city"],
        "ignore_features": ["customer_id", "date"],
        "fix_imbalance": true,
        "remove_outliers": true,
        "imputation_type": "iterative" 
        }}
        ```

        """
    ),
    generation_config=genai.GenerationConfig(
        response_schema=get_dict_schema(DataPreparationSchema),
        response_mime_type="application/json",
    ),
    request_options=retry_policy,
)

print(result)

In [None]:
result = chat.send_message(
    textwrap.dedent(
        f"""
        You are provided with a CSV file {csv_file.name}. This file contains a header row and uses commas as delimiters. The data will be used for a binary classification task in Pycaret, an AutoML library in Python. To prepare the data using the `setup()` function, analyse the data using code execution tool and then based on the analysis, generate the following parameters in JSON format:
        
        Remember that performing a binary classification to predict depression target variable based on various features. The parameters to generate are as follows:

        * normalize: A boolean value indicating whether to normalize the data. If true, the data will be scaled to have a mean of 0 and a standard deviation of 1.
        * transformation: A boolean value indicating whether to apply a transformation to the data. If true, the data will be transformed using a power transformation.

        All parameters are required.

        **Example JSON Response:**

        ```json
        {{
       "normalize": true,
        "transformation": true
        }}
        ```

        """
    ),
    generation_config=genai.GenerationConfig(
        response_schema=get_dict_schema(ScaleAndTransformSchema),
        response_mime_type="application/json",
    ),
    request_options=retry_policy,
)

print(result)


In [None]:
result = chat.send_message(
    textwrap.dedent(
        f"""
        You are provided with a CSV file {csv_file.name}. This file contains a header row and uses commas as delimiters. The data will be used for a binary classification task in Pycaret, an AutoML library in Python. To prepare the data using the `setup()` function, analyse the data using code execution tool and then based on the analysis, generate the following parameters in JSON format:
        
        Remember that performing a binary classification to predict depression target variable based on various features. The parameters to generate are as follows:

        * polynomial_features: A boolean value indicating whether to generate polynomial features. If true, polynomial features will be created based on the specified degree.
        * polynomial_degree: An integer specifying the degree of polynomial features to generate. This parameter is required if polynomial_features is set to true.
        * group_features: A list of column names to group together for feature engineering. This parameter is optional. If provided, the features in the list will be grouped together for feature engineering.
        * bin_numeric_features: A list of column names with numeric features to bin into discrete intervals. This parameter is optional. If provided, the numeric features will be binned into discrete intervals.
        * rare_to_value: A float value specifying the threshold for rare categories. Categories with a frequency less than this threshold will be replaced with a specified value. This parameter is optional and only applicable to categorical features.
        

        All parameters are required.

        **Example JSON Response:**

        ```json
        {{
            polynomial_features: true,
            polynomial_degree: 2,
            group_features: ["age", "income"],
            bin_numeric_features: ["credit_score"],
            rare_to_value: 0.01
        }}
        ```

        """
    ),
    generation_config=genai.GenerationConfig(
        response_schema=get_dict_schema(FeatureEngineeringSchema),
        response_mime_type="application/json",
    ),
    request_options=retry_policy,
)

print(result)


In [None]:
result = chat.send_message(
    textwrap.dedent(
        f"""
        You are provided with a CSV file {csv_file.name}. This file contains a header row and uses commas as delimiters. The data will be used for a binary classification task in Pycaret, an AutoML library in Python. To prepare the data using the `setup()` function, analyse the data using code execution tool and then based on the analysis, generate the following parameters in JSON format:
        
        Remember that performing a binary classification to predict depression target variable based on various features. The parameters to generate are as follows:

        * feature_selection: A boolean value indicating whether to perform feature selection. If true, feature selection will be performed based on the specified criteria.
        * n_features_to_select: A float value specifying the number of features to select. This parameter is optional and only applicable if feature_selection is set to true.
        * remove_multicollinearity: A boolean value indicating whether to remove multicollinear features. If true, multicollinear features will be removed.
        * low_variance_threshold: A float value specifying the threshold for low variance features. Features with a variance less than this threshold will be removed. This parameter is optional
        

        All parameters are required.

        **Example JSON Response:**

        ```json
        {{
            feature_selection: true,
            n_features_to_select: 0.5,
            remove_multicollinearity: true,
            low_variance_threshold: 0.01
        }}
        ```

        """
    ),
    generation_config=genai.GenerationConfig(
        response_schema=get_dict_schema(FeatureSelectionSchema),
        response_mime_type="application/json",
    ),
    request_options=retry_policy,
)

print(result)
