In [None]:
#!pip install autogluon.tabular  > /dev/null 2>&1
#!pip install -U ipywidgets  > /dev/null 2>&1
#!pip install sweetviz > /dev/null 2>&1
#!pip install optuna-integration[sklearn] > /dev/null 2>&1
#!pip install langchain-core > /dev/null 2>&1
#!pip install langchain-openai  > /dev/null 2>&1
#!pip install catboost > /dev/null 2>&1

In [None]:
# Import libraries

# LLM Libraries
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# General Purpose Libraries
import json
import logging
import numpy as np
import pandas as pd
import sweetviz as sv
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product
import warnings
from IPython.display import Markdown, display
#from kaggle_secrets import UserSecretsClient
from scipy.stats import ttest_ind, stats

In [3]:
# Function to classify columns into continuous and categorical
def classify_columns(df, categorical_threshold=15):
    """Split the columns of ``df`` into continuous and categorical groups.

    A column is categorical when its dtype is object, string or
    ``category``, or when it has fewer than ``categorical_threshold``
    distinct non-null values. Every other column is continuous.

    Args:
        df: DataFrame whose columns are classified.
        categorical_threshold: Columns with fewer distinct values than this
            are treated as categorical regardless of dtype. Defaults to 15.

    Returns:
        A ``(continuous_cols, categorical_cols)`` tuple of lists of column
        labels, each in the order the columns appear in ``df``.
    """
    continuous_cols = []
    categorical_cols = []
    for column in df.columns:
        series = df[column]
        dtype = series.dtype
        # Label-like dtypes are categorical no matter how many distinct
        # values they hold; a high-cardinality `category` or string column
        # must not end up in the continuous group.
        is_label_dtype = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if is_label_dtype or series.nunique() < categorical_threshold:
            categorical_cols.append(column)
        else:
            continuous_cols.append(column)
    return continuous_cols, categorical_cols

# Function to perform basic visualizations for continuous and categorical features
def eda_visualizations(df, target=None):
    """Draw one figure per column of ``df``.

    Columns are grouped by ``classify_columns``. Continuous columns get a
    histogram with a KDE overlay; categorical columns get a count plot.

    Args:
        df: DataFrame to visualize.
        target: Optional column name passed as ``hue`` to the count plots.
            It is not used for the continuous histograms.

    Returns:
        None. Each figure is shown with ``plt.show()``.
    """
    numeric_features, discrete_features = classify_columns(df)
    figure_size = (10, 4)

    def _label_axes(title, x_label, y_label):
        # Apply the title and both axis labels to the current figure.
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)

    # Histograms (with KDE) for the continuous features
    for feature in numeric_features:
        plt.figure(figsize=figure_size)
        sns.histplot(df[feature], kde=True)
        _label_axes(f'Distribution of {feature}', feature, 'Frequency')
        plt.show()

    # Count plots for the categorical features, optionally split by target
    for feature in discrete_features:
        plt.figure(figsize=figure_size)
        sns.countplot(data=df, x=feature, hue=target)
        _label_axes(f'Count plot for {feature}', feature, 'Count')
        plt.xticks(rotation=45)
        plt.show()

# Function to compare train and test datasets
def compare_train_test(train, test):
    """Plot train-vs-test distributions for every column the two frames share.

    Columns are grouped by ``classify_columns(train)``. Continuous columns
    are compared with overlaid KDE curves; categorical columns with overlaid
    bar charts of per-category proportions. Columns that are missing from
    ``test`` (for example a target column) are skipped.

    Args:
        train: Training DataFrame; it decides which columns are compared
            and how each one is classified.
        test: Test DataFrame to compare against ``train``.

    Returns:
        None. Each figure is shown with ``plt.show()``.
    """
    continuous_cols, categorical_cols = classify_columns(train)

    # Compare continuous columns
    for col in continuous_cols:
        if col not in test.columns:  # Ensure the column exists in the test dataset
            continue
        plt.figure(figsize=(10, 4))
        # `fill=True` replaces the deprecated `shade=True` keyword.
        sns.kdeplot(train[col], label='Train', fill=True)
        sns.kdeplot(test[col], label='Test', fill=True)
        plt.title(f'Comparison of {col} Distribution in Train vs Test')
        plt.xlabel(col)
        plt.ylabel('Density')
        plt.legend()
        plt.show()

    # Compare categorical columns
    for col in categorical_cols:
        if col not in test.columns:  # Ensure the column exists in the test dataset
            continue
        train_counts = train[col].value_counts(normalize=True)
        test_counts = test[col].value_counts(normalize=True)
        # value_counts orders each series by its own frequencies, and bar
        # plots are positional. Align both on a shared category index so a
        # train bar and the test bar drawn over it describe the same
        # category; categories absent from one side get proportion 0.
        proportions = pd.concat(
            [train_counts, test_counts], axis=1, keys=['Train', 'Test'], sort=False
        ).fillna(0)

        plt.figure(figsize=(10, 4))
        ax = plt.gca()
        proportions['Train'].plot(kind='bar', ax=ax, alpha=0.5, label='Train', color='blue')
        proportions['Test'].plot(kind='bar', ax=ax, alpha=0.5, label='Test', color='red')
        plt.title(f'Comparison of {col} Proportions in Train vs Test')
        plt.xlabel(col)
        plt.ylabel('Proportion')
        plt.legend()
        plt.xticks(rotation=45)
        plt.show()
            
# Function to create key statistics for a dataset
def eda_summary(df):
    """Collect key descriptive statistics of ``df`` into a nested dict.

    Args:
        df: DataFrame to summarize.

    Returns:
        A dict with the following keys:

        * ``general``: row/column counts and overall missing-value totals.
        * ``data_types``: dtype of each column.
        * ``missing_values``: per-column missing count and percentage.
        * ``numerical_summary``: per numeric column, its mean, median, std,
          min and max.
        * ``categorical_summary``: distinct-value count of each
          object/category column.
        * ``skewness_kurtosis``: skewness and kurtosis of each numeric column.
        * ``correlations``: pairwise correlations of the numeric columns, or
          an explanatory string if they cannot be computed.
        * ``outlier_summary``: per numeric column, the count and percentage
          of rows outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``.
    """
    summary = {}
    num_rows = df.shape[0]
    missing_mask = df.isnull()
    numeric_df = df.select_dtypes(include=[np.number])

    # General Info
    summary['general'] = {
        'num_rows': num_rows,
        'num_columns': df.shape[1],
        # Plain int so that JSON serialization keeps it numeric.
        'num_missing_values': int(missing_mask.sum().sum()),
        'percent_missing_values': missing_mask.mean().mean() * 100
    }

    # Column Data Types
    summary['data_types'] = df.dtypes.to_dict()

    # Missing Value Summary (per column)
    summary['missing_values'] = (
        missing_mask
        .sum()
        .to_frame(name='missing_count')
        .assign(percent_missing=lambda x: (x['missing_count'] / num_rows) * 100)
        .to_dict(orient='index')
    )

    # Numerical Summary (Mean, Median, Std, Min, Max)
    if numeric_df.shape[1] > 0:
        # describe() puts the statistics on the index and the features on
        # the columns; transpose so the statistics can be selected by name
        # and the result is keyed by feature.
        describe_df = numeric_df.describe().T
        wanted_stats = ['mean', '50%', 'std', 'min', 'max']
        available_stats = [stat for stat in wanted_stats if stat in describe_df.columns]
        summary['numerical_summary'] = (
            describe_df[available_stats]
            .rename(columns={'50%': 'median'})
            .to_dict(orient='index')
        )
    else:
        # describe() cannot summarize a frame without numeric columns here.
        summary['numerical_summary'] = {}

    # Unique Counts for Categorical Columns
    summary['categorical_summary'] = (
        df.select_dtypes(include=['object', 'category'])
        .nunique()
        .to_frame(name='unique_counts')
        .to_dict(orient='index')
    )

    # Skewness and Kurtosis
    summary['skewness_kurtosis'] = {
        column: {
            'skewness': df[column].skew(),
            'kurtosis': df[column].kurt()
        } for column in numeric_df.columns
    }

    # Correlations
    try:
        summary['correlations'] = df.corr(numeric_only=True).to_dict()
    except ValueError:
        summary['correlations'] = "Unable to calculate correlations due to data type issues."

    # Outlier Count based on IQR
    outlier_summary = {}
    for column in numeric_df.columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        is_outlier = (values < (q1 - 1.5 * iqr)) | (values > (q3 + 1.5 * iqr))
        outlier_count = int(is_outlier.sum())
        outlier_summary[column] = {
            'outlier_count': outlier_count,
            # Guard against a zero-row frame instead of dividing by zero.
            'percent_outliers': (outlier_count / num_rows) * 100 if num_rows else 0.0
        }
    summary['outlier_summary'] = outlier_summary

    return summary

# Automated EDA

In [4]:
# Load the dataset from 'diabetes.csv' (path is relative to the working directory)
data = pd.read_csv('diabetes.csv')

In [9]:
# Preview the first rows of the loaded dataset
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Use one-line Exploratory Data Analysis, e.g. [ydata-profiling](https://pypi.org/project/ydata-profiling/) (formerly pandas-profiling)

In [9]:
# Build the nested dict of EDA statistics for the dataset
summary = eda_summary(data)

# Convert summary to JSON format
# default=str stringifies values json cannot encode natively (the summary
# stores pandas dtype objects under 'data_types', for example).
summary_json = json.dumps(summary, indent=4, default=str)

In [10]:
# Define the prompt template for LangChain
# The template has a single placeholder, {context}, which is filled with the
# JSON EDA summary when the chain is invoked below.
template = """Provide an analysis of the following EDA summary:
{context}

Key insights and observations:
"""

prompt = ChatPromptTemplate.from_template(template)

# Define the LLM model using LangChain
# temperature=0 is presumably chosen to keep the analysis as repeatable as
# possible.
# NOTE(review): no api_key is passed; presumably the client reads it from the
# environment (e.g. OPENAI_API_KEY) — confirm before running.
model = ChatOpenAI(
    model='gpt-4o-2024-05-13',
    temperature=0
    #api_key=OPENAI_API_KEY
)

# Create a chain to pass the summary to the model:
# prompt -> chat model -> plain-string output
chain = prompt | model | StrOutputParser()

# Invoke the chain to analyze the EDA summary
# The JSON string is passed directly as the chain input.
result = chain.invoke(summary_json)

# Render the model's reply as Markdown in the notebook
display(Markdown(result))

Based on the provided Exploratory Data Analysis (EDA) summary, here are some key insights and observations:

### General Overview
- **Number of Rows**: 768
- **Number of Columns**: 9
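- **Missing Values**: No values are recorded as missing (NaN), so no imputation is needed for explicitly missing entries. Note, however, that `Glucose`, `BloodPressure`, `SkinThickness`, `Insulin`, and `BMI` all have a minimum of 0 in the `data.describe()` output above; such zeros are not physiologically plausible and likely encode missing measurements that may still need cleaning.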

### Data Types
- The dataset consists of both integer (`int64`) and float (`float64`) data types.
- **Integer Columns**: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, Age, Outcome
- **Float Columns**: BMI, DiabetesPedigreeFunction

### Missing Values
- As mentioned, there are no missing values in any of the columns, which simplifies the preprocessing steps.

### Skewness and Kurtosis
- **Pregnancies**: Positively skewed (0.90) with a slight positive kurtosis (0.16).
- **Glucose**: Slightly positively skewed (0.17) with a moderate kurtosis (0.64).
- **BloodPressure**: Highly negatively skewed (-1.84) with a high kurtosis (5.18), indicating a heavy tail on the left side.
- **SkinThickness**: Nearly symmetric (0.11) with a slight negative kurtosis (-0.52).
- **Insulin**: Highly positively skewed (2.27) with a high kurtosis (7.21), indicating a heavy tail on the right side.
- **BMI**: Slightly negatively skewed (-0.43) with a moderate kurtosis (3.29).
- **DiabetesPedigreeFunction**: Highly positively skewed (1.92) with a high kurtosis (5.59).
- **Age**: Positively skewed (1.13) with a moderate kurtosis (0.64).
- **Outcome**: Positively skewed (0.64) with a negative kurtosis (-1.60).

### Correlations
- **Pregnancies**: Positively correlated with Age (0.54) and Outcome (0.22).
- **Glucose**: Strongly correlated with Outcome (0.47), indicating that higher glucose levels are associated with diabetes.
- **BloodPressure**: Moderately correlated with BMI (0.28) and SkinThickness (0.21).
- **SkinThickness**: Moderately correlated with Insulin (0.44) and BMI (0.39).
- **Insulin**: Moderately correlated with Glucose (0.33) and SkinThickness (0.44).
- **BMI**: Moderately correlated with Outcome (0.29) and BloodPressure (0.28).
- **DiabetesPedigreeFunction**: Moderately correlated with Outcome (0.17).
- **Age**: Moderately correlated with Pregnancies (0.54) and Outcome (0.24).
- **Outcome**: Strongly correlated with Glucose (0.47) and moderately correlated with BMI (0.29) and Age (0.24).

### Outlier Summary
- **Pregnancies**: 4 outliers (0.52% of data).
- **Glucose**: 5 outliers (0.65% of data).
- **BloodPressure**: 45 outliers (5.86% of data), indicating a significant number of outliers.
- **SkinThickness**: 1 outlier (0.13% of data).
- **Insulin**: 34 outliers (4.43% of data), indicating a significant number of outliers.
- **BMI**: 19 outliers (2.47% of data).
- **DiabetesPedigreeFunction**: 29 outliers (3.78% of data).
- **Age**: 9 outliers (1.17% of data).
- **Outcome**: No outliers.

### Key Insights
1. **No Missing Values**: The dataset is complete with no missing values, which is advantageous for analysis.
2. **Skewness and Kurtosis**: Several features exhibit significant skewness and kurtosis, particularly Insulin, BloodPressure, and DiabetesPedigreeFunction. This suggests that these features may benefit from transformations to normalize their distributions.
3. **Correlations**: Glucose shows a strong positive correlation with the Outcome, indicating its importance in predicting diabetes. Other features like BMI and Age also show moderate correlations with the Outcome.
4. **Outliers**: BloodPressure and Insulin have a notable number of outliers, which may need to be addressed through techniques like capping, transformation, or robust statistical methods.

### Recommendations
- **Normalization/Transformation**: Consider normalizing or transforming highly skewed features to improve model performance.
- **Outlier Treatment**: Address outliers in BloodPressure and Insulin to prevent them from skewing the analysis and model training.
- **Feature Engineering**: Given the correlations, features like Glucose, BMI, and Age should be prioritized in predictive modeling for diabetes.

This analysis provides a comprehensive understanding of the dataset's structure, distribution, and relationships, which is crucial for further modeling and analysis.

In [11]:
# Define the prompt template for LangChain
# This prompt asks for feature-engineering advice. The prediction target is
# named explicitly (the diabetes `Outcome` column of this dataset) so the
# model's recommendations refer to the right problem domain.
template_features = """Provide an analysis of the following EDA summary and offer advice on feature engineering to improve predictions of diabetes (the `Outcome` column):
{context}

Feature Engineering Recommendations for tree-based models:
"""

# Rebuild the prompt and chain around the new template, reusing the `model`
# defined in the earlier cell, then send the same JSON EDA summary.
prompt = ChatPromptTemplate.from_template(template_features)
chain = prompt | model | StrOutputParser()
result = chain.invoke(summary_json)

# Render the model's reply as Markdown in the notebook
display(Markdown(result))

Based on the provided Exploratory Data Analysis (EDA) summary, here are some feature engineering recommendations to improve predictions of diabetes outcomes using tree-based models:

### 1. Handling Skewness and Outliers
Tree-based models are generally robust to outliers, but handling extreme skewness can still improve model performance.

- **Log Transformation**: Apply log transformation to highly skewed features such as `Insulin`, `DiabetesPedigreeFunction`, and `Age`. This can help in reducing the impact of extreme values.
  ```python
  import numpy as np
  df['Insulin_log'] = np.log1p(df['Insulin'])
  df['DiabetesPedigreeFunction_log'] = np.log1p(df['DiabetesPedigreeFunction'])
  df['Age_log'] = np.log1p(df['Age'])
  ```

- **Binning**: For features like `Pregnancies` and `Age`, which have a high skewness, consider binning them into categorical bins.
  ```python
  df['Pregnancies_binned'] = pd.cut(df['Pregnancies'], bins=[0, 2, 5, 10, np.inf], labels=['0-2', '3-5', '6-10', '11+'], include_lowest=True)
  df['Age_binned'] = pd.cut(df['Age'], bins=[20, 30, 40, 50, 60, np.inf], labels=['20-30', '30-40', '40-50', '50-60', '60+'])
  ```

### 2. Interaction Features
Tree-based models can benefit from interaction features, especially when there are moderate correlations between features.

- **Interaction Terms**: Create interaction terms for features that have moderate correlations with each other and with the target variable `Outcome`.
  ```python
  df['Glucose_BMI'] = df['Glucose'] * df['BMI']
  df['Age_Glucose'] = df['Age'] * df['Glucose']
  df['Pregnancies_Age'] = df['Pregnancies'] * df['Age']
  ```

### 3. Polynomial Features
Adding polynomial features can help capture non-linear relationships.

- **Polynomial Features**: Create polynomial features for `Glucose`, `BMI`, and `Age` as they have significant correlations with `Outcome`.
  ```python
  from sklearn.preprocessing import PolynomialFeatures
  poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
  poly_features = poly.fit_transform(df[['Glucose', 'BMI', 'Age']])
  poly_feature_names = poly.get_feature_names_out(['Glucose', 'BMI', 'Age'])
  df_poly = pd.DataFrame(poly_features, columns=poly_feature_names)
  df = pd.concat([df, df_poly], axis=1)
  ```

### 4. Feature Scaling
Tree-based models split on feature thresholds and are therefore insensitive to monotonic feature scaling, so this step is optional for them. It is only worthwhile if the same features will also be fed to scale-sensitive models (e.g. linear models or k-nearest neighbours).

- **Standardization**: Standardize features like `Glucose`, `BloodPressure`, `SkinThickness`, `Insulin`, and `BMI`.
  ```python
  from sklearn.preprocessing import StandardScaler
  scaler = StandardScaler()
  df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = scaler.fit_transform(df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']])
  ```

### 5. Feature Selection
Given the correlations and the importance of certain features, it might be useful to perform feature selection to reduce dimensionality and improve model performance.

- **Feature Importance**: Use feature importance from a preliminary tree-based model to select the most important features.
  ```python
  from sklearn.ensemble import RandomForestClassifier
  model = RandomForestClassifier()
  model.fit(X_train, y_train)
  feature_importances = pd.Series(model.feature_importances_, index=X_train.columns)
  important_features = feature_importances[feature_importances > 0.01].index
  X_train_selected = X_train[important_features]
  X_test_selected = X_test[important_features]
  ```

### 6. Encoding Categorical Variables
If you have created any categorical variables (e.g., binned features), ensure they are properly encoded.

- **One-Hot Encoding**: Apply one-hot encoding to categorical features.
  ```python
  df = pd.get_dummies(df, columns=['Pregnancies_binned', 'Age_binned'], drop_first=True)
  ```

### Summary
By addressing skewness, creating interaction and polynomial features, standardizing numerical features, performing feature selection, and encoding categorical variables, you can enhance the predictive power of your tree-based models for diabetes outcomes. These steps will help in capturing complex relationships and improving model performance.