In [20]:
import os

import google.generativeai as genai
from google.generativeai import caching

from fastkaggle.core import iskaggle

In [None]:
# Resolve the Gemini API key from whichever secret store this environment offers,
# then hand it to the SDK.
if not iskaggle:
    # Outside Kaggle: load a .env file into the process environment, then read the key.
    from dotenv import load_dotenv

    load_dotenv()

    GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
else:
    # On Kaggle: read the key through the kaggle_secrets client.
    from kaggle_secrets import UserSecretsClient

    GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")

genai.configure(api_key=GOOGLE_API_KEY)


In [32]:
# Notebook-wide configuration values.

MODEL = "models/gemini-1.5-flash-002"  # model id passed when creating the cached content
SEED = 42  # random_state used when sampling the training data

In [None]:
# Locate the competition files: on Kaggle they are read from the fixed input
# directory; elsewhere the directory comes from kagglehub.competition_download.
if iskaggle:
    dataset_path = "/kaggle/input/playground-series-s4e11"
else:
    import kagglehub

    dataset_path = kagglehub.competition_download("playground-series-s4e11")

# Full paths to the three CSV files inside the dataset directory.
train_csv_path, test_csv_path, submission_csv_path = (
    os.path.join(dataset_path, csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [10]:
import pandas as pd

# Read the three competition CSVs, using the first column of each file as the index.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_csv_path, test_csv_path, submission_csv_path)
)

In [11]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0_level_0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [17]:
# Draw a reproducible 10,000-row random subset of the training data (seeded with SEED)
# and preview it.
sample_df = train_df.sample(n=10_000, random_state=SEED)
sample_df.head()

Unnamed: 0_level_0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
18347,Sanya,Female,51.0,Patna,Working Professional,Teacher,,3.0,,,5.0,More than 8 hours,Moderate,B.Ed,No,11.0,2.0,Yes,0
96193,Sneha,Female,20.0,Agra,Working Professional,,,1.0,,,4.0,Less than 5 hours,Moderate,Class 12,No,0.0,5.0,Yes,0
100005,Aanchal,Female,21.0,Ahmedabad,Student,,2.0,,7.82,5.0,,5-6 hours,Healthy,MA,Yes,12.0,2.0,Yes,0
39890,Rahil,Male,36.0,Indore,Working Professional,Teacher,,5.0,,,2.0,Less than 5 hours,Moderate,MBBS,No,1.0,1.0,No,0
98243,Rishi,Male,60.0,Mumbai,Working Professional,HR Manager,,2.0,,,2.0,5-6 hours,Moderate,BBA,No,6.0,2.0,No,0


In [None]:
# Write the sample to disk (without the id index) so it can be sent to the File API.
sample_df.to_csv("sample.csv", index=False)

# Reuse an earlier upload only when it is actually our sample: match on the
# display name instead of taking whichever file happens to be listed first,
# which could be an unrelated upload.
files = list(genai.list_files())
csv_file = next((f for f in files if f.display_name == "sample.csv"), None)
if csv_file is None:
    # Set the display name explicitly so the lookup above finds this upload next time.
    csv_file = genai.upload_file("sample.csv", display_name="sample.csv")

print(csv_file)

genai.File({
    'name': 'files/ptkdzcd6rvmo',
    'display_name': 'sample.csv',
    'mime_type': 'text/csv',
    'sha256_hash': 'NjZjZTlkYmExYTA2OWZmYWViZTc1ZTgzNDNkMTMwZWRlOGRkMTliNzE3Yjc2MTc3Y2YzMzc1OWIzMmI2OTY0NA==',
    'size_bytes': '1104761',
    'state': 'ACTIVE',
    'uri': 'https://generativelanguage.googleapis.com/v1beta/files/ptkdzcd6rvmo',
    'create_time': '2024-11-20T03:41:24.344539Z',
    'expiration_time': '2024-11-22T03:41:24.324680511Z',
    'update_time': '2024-11-20T03:41:24.344539Z'})


[genai.File({
     'name': 'files/ptkdzcd6rvmo',
     'display_name': 'sample.csv',
     'mime_type': 'text/csv',
     'sha256_hash': 'NjZjZTlkYmExYTA2OWZmYWViZTc1ZTgzNDNkMTMwZWRlOGRkMTliNzE3Yjc2MTc3Y2YzMzc1OWIzMmI2OTY0NA==',
     'size_bytes': '1104761',
     'state': 'ACTIVE',
     'uri': 'https://generativelanguage.googleapis.com/v1beta/files/ptkdzcd6rvmo',
     'create_time': '2024-11-20T03:41:24.344539Z',
     'expiration_time': '2024-11-22T03:41:24.324680511Z',
     'update_time': '2024-11-20T03:41:24.344539Z'})]

In [None]:
from typing import Literal


def describe_data() -> str:
    """Return summary statistics of the training data as plain text.

    Calls ``train_df.describe()`` and renders the resulting table with
    ``to_string()``.
    """
    summary = train_df.describe()
    return summary.to_string()


# Canned model-comparison table returned by train_models(), one CSV record per
# entry. Every field is fully quoted, including the last field of the last row.
_MODEL_COMPARISON_ROWS = (
    '"runtime","Object","Model","Accuracy","AUC","Recall","Prec.","F1","Kappa","MCC","TT (Sec)"',
    '"1174.97","LGBMClassifier","Light Gradient Boosting Machine","0.9369","0.9735","0.8102","0.8374","0.8235","0.7851","0.7854","117.365"',
    '"1301.98","GradientBoostingClassifier","Gradient Boosting Classifier","0.9367","0.973","0.8005","0.8433","0.8212","0.7828","0.7832","130.046"',
    '"1184.52","AdaBoostClassifier","Ada Boost Classifier","0.9355","0.9721","0.8129","0.829","0.8207","0.7814","0.7816","118.312"',
    '"1185.82","LogisticRegression","Logistic Regression","0.9352","0.972","0.8069","0.8317","0.819","0.7796","0.7798","118.427"',
    '"1158.1","XGBClassifier","Extreme Gradient Boosting","0.935","0.9716","0.8037","0.8328","0.8179","0.7783","0.7785","115.677"',
    '"1172.69","ExtraTreesClassifier","Extra Trees Classifier","0.932","0.9679","0.7705","0.842","0.8045","0.7635","0.7647","117.133"',
    '"1149.69","RidgeClassifier","Ridge Classifier","0.9215","0.9646","0.7493","0.8053","0.7762","0.7287","0.7295","114.823"',
    '"1167.81","KNeighborsClassifier","K Neighbors Classifier","0.9167","0.9323","0.75","0.7825","0.7658","0.7152","0.7154","116.653"',
    '"1148.14","DecisionTreeClassifier","Decision Tree Classifier","0.902","0.8354","0.7307","0.7305","0.7305","0.6707","0.6707","114.677"',
    '"1120.2","CatBoostClassifier","CatBoost Classifier","0.844","0.8768","0.7307","0.7566","0.7434","0.7093","0.7095","111.883"',
    '"1141.24","RandomForestClassifier","Random Forest Classifier","0.8409","0.8726","0.705","0.7601","0.7314","0.6957","0.6965","113.994"',
    '"1119.64","SGDClassifier","SVM - Linear Kernel","0.8334","0.8722","0.7085","0.7318","0.7149","0.6746","0.6783","111.832"',
    '"1069.41","LinearDiscriminantAnalysis","Linear Discriminant Analysis","0.8265","0.868","0.7416","0.676","0.7072","0.6619","0.6631","106.81"',
    '"1161.83","DummyClassifier","Dummy Classifier","0.8183","0.5","0","0","0","0","0","116.04"',
    '"977.18","GaussianNB","Naive Bayes","0.6956","0.7647","0.689","0.5065","0.5631","0.5012","0.526","97.586"',
    '"1146.41","QuadraticDiscriminantAnalysis","Quadratic Discriminant Analysis","0.3082","0.8284","0.9709","0.2316","0.359","0.0857","0.1111","114.506"',
)


def train_models(
    imputation_type: Literal["simple", "iterative"],
    normalize: bool = False,
    normalize_method: Literal["minmax", "zscore"] = "minmax",
):
    """Return a model-comparison table as CSV text.

    This is a stub: the arguments are only echoed to stdout and the returned
    metrics are a fixed, pre-recorded table, so every call returns the same text.

    Args:
        imputation_type: Requested imputation strategy ("simple" or "iterative").
        normalize: Whether normalization was requested.
        normalize_method: Requested normalization method ("minmax" or "zscore").

    Returns:
        A CSV string with one header line followed by one line per model, with
        columns runtime, Object, Model, Accuracy, AUC, Recall, Prec., F1, Kappa,
        MCC and TT (Sec). There is no trailing newline.
    """
    print("Training models")
    print(
        f"Args: imputation_type={imputation_type}, normalize={normalize}, normalize_method={normalize_method}"
    )
    # Joining per-row literals keeps the final field's closing quote intact; a
    # triple-quoted literal ending in `"114.506"""` would swallow it.
    return "\n".join(_MODEL_COMPARISON_ROWS)


def visualize_results(model: str) -> str:
    """Placeholder visualization tool: log the requested model and return a fixed message.

    Args:
        model: Name of the model whose results should be visualized; it is only
            echoed to stdout.

    Returns:
        The constant string ``"Visualizing data"``.
    """
    status = "Visualizing data"
    print(f"Visualizing data for model: {model}")
    return status

In [None]:
from datetime import timedelta

# Create cached content holding the uploaded sample, the system prompt and the
# tool declarations, so later requests can reuse it.
# NOTE(review): the cache lives for only 5 minutes. The recorded runs of later
# cells failed with "403 CachedContent not found", which is presumably this
# expiry -- re-run this cell (or raise the ttl) before using `cache` again.
cache = caching.CachedContent.create(
    model=MODEL,
    display_name="Depression Data",
    # Adjacent string literals are concatenated verbatim, so every fragment must
    # carry its own trailing space or newline.
    system_instruction=(
        "You are an expert data scientist working for a mental health organization. "
        "You have been tasked with predicting the likelihood of depression in individuals "
        "based on a dataset of survey responses. "
        "You can analyze data, train models using the provided tools, and interpret the results. "
        "Use the provided sample dataset to get a general idea of the data and use the following tools "
        "to analyze the data, train models, and interpret the results.\n"
        "\n"
        "# Tools provided with description:\n"
        "- describe_data: Provides summary statistics of the dataset (pandas DataFrame.describe() output).\n"
        "- train_models: Trains a model using pycaret and returns all the model metrics.\n"
        "- visualize_results: Visualizes the results of the trained models."
    ),
    contents=[csv_file],
    ttl=timedelta(minutes=5),
    tools=[describe_data, train_models, visualize_results],
)

In [41]:
# Build a GenerativeModel from the cached content created above.
# NOTE(review): the next cell repeats this exact assignment, so this cell looks redundant.
model = genai.GenerativeModel.from_cached_content(cached_content=cache)

In [None]:
from google.api_core import retry

# Request options that retry calls failing with errors google.api_core
# classifies as transient.
retry_policy = dict(retry=retry.Retry(predicate=retry.if_transient_error))

# Open a chat session on the cached-content model; tool calls requested by the
# model are executed automatically.
model = genai.GenerativeModel.from_cached_content(cached_content=cache)
chat = model.start_chat(enable_automatic_function_calling=True)

In [None]:
# Ask for a column-by-column overview of the data. The retry policy defined
# earlier is passed as request options so transient API errors are retried
# instead of failing the cell.
response = chat.send_message(
    "Based on the data, what are the key insights of each column? and what analysis can be done on the data?",
    request_options=retry_policy,
)
print(response.text)

PermissionDenied: 403 CachedContent not found (or permission denied)

In [50]:
# Show the token accounting of the last response (prompt, candidates, total and
# cached-content token counts).
print(response.usage_metadata)

prompt_token_count: 523459
candidates_token_count: 130
total_token_count: 523589
cached_content_token_count: 523436



In [51]:
# Display the full response object for inspection.
response

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "Based on the data description,  it's difficult to definitively recommend a single best model without actually training and comparing several. The choice depends on factors like the dataset size, the presence of non-linear relationships between features, and the desired balance between model complexity and interpretability.  \n\nTo determine the best model for your depression prediction task, I recommend using the `train_models` function in the `default_api` with different imputation and normalization strategies. The function will return comprehensive model metrics, enabling a comparison of various algorithms.  Then, use the `visualize_results` function to gain further insights into the models' performance.\n"
              }
            ],
            "role"

In [52]:
# Delete the cached content now that it is no longer needed.
# NOTE(review): the recorded run raised "403 CachedContent not found" here --
# presumably the cache had already expired (ttl is 5 minutes); confirm.
cache.delete()

PermissionDenied: 403 CachedContent not found (or permission denied)