In [15]:
import pandas as pd
import numpy as np

In [16]:
"""
Determines target column by the ratio of unique values in the column
A binary column is more likely to be a target column
"""

def cardinality_heuristic(df):
    scores = {}
    n_rows = len(df)
    for col in df.columns:
        unique_count = df[col].nunique()
            
        if n_rows == 0:
            cardinality_ratio = 0
        else:
            cardinality_ratio = unique_count / n_rows
            
        if unique_count == 2:  # Binary classification
            scores[col] = 0.8
        elif 2 < unique_count <= 10:  # Multi-class classification
            scores[col] = 0.7
        elif 10 < unique_count <= 50:  # Could be regression 
            scores[col] = 0.4
        elif cardinality_ratio > 0.9:  # Likely unique identifiers
            scores[col] = 0.1
        else:  # Moderate cardinality
            scores[col] = 0.5
    return scores

In [17]:
#List of keywords that are very likely to belong to a target column
target_names = ['outcome', 'result', 'prediction', 'predict',
            'response', 'dependent', 'y', 'price', 'amount', 'value', 'score',
            'rating', 'category', 'type', 'status', 'diagnosis', 'churn',
            'fraud', 'risk', 'success', 'failure', 'survived', 'approved']

In [18]:
#List of keywords that are very likely to belong to a feature column
feature_names = ['id', 'index', 'key', 'name', 'description', 'comment', 'note',
            'created', 'updated', 'timestamp', 'date', 'time', 'uuid']

In [19]:
def column_name_heuristic(df):
    scores = {}
    for col in df.columns:
        score = 0
        col_lower = col.lower()
        for t_name in target_names:
            if t_name in col_lower:
                score += 0.8 #High score if name is a likely target column name
        if col_lower in ['target', 'label', 'class']:
            score += 1 #These are almost certain to be target column names
        for f_name in feature_names:
            if f_name in col_lower:
                score -= 0.5 #Low score if name is a likely feature column name
        score = max(0, min(score, 1))
        scores[col] = score
    return scores

In [20]:
#Target column is expected to have a lower number of null values
def null_value_heuristic(df):
    scores = {}
    if df.isnull().sum().sum() == 0:
        for col in df.columns:
            scores[col] = 0.5
        return scores
    for col in df.columns:
        null_ratio = df[col].isnull().sum()/len(df)
        if null_ratio == 0:
            scores[col] = 1
        elif null_ratio <= 0.05:
            scores[col] = 0.8
        elif null_ratio <= 0.1:
            scores[col] = 0.6
        elif null_ratio <= 0.3:
            scores[col] = 0.4
        else:
            scores[col] = 0.2
    return scores

In [21]:
"""
The target column is usually moderately correlated to the feature columns
A column with a very low correlation with other columns would not be a suitable target column and is therefore awarded a lower score
A column with a very high correlation is likely to be a redundant column that can be explained using the other columns in the dataset 
"""
def correlation_heuristic(df):
    scores = {}
    numeric_df = df.select_dtypes(include = 'number')
    if len(numeric_df.columns)<2:
        return {col: 0.5 for col in df.columns}
        
    corr_matrix = numeric_df.corr().abs()
    for col in df.columns:
        if col in numeric_df.columns:
            col_corrs = corr_matrix[col].drop(col).dropna()
            if len(col_corrs) == 0:
                scores[col] = 0.5
                continue

            max_corr = col_corrs.max()
            mean_corr = col_corrs.mean()

            if 0.3 <= max_corr <= 0.7 and 0.1 <= mean_corr <= 0.4:
                    scores[col] = 0.7
            elif max_corr > 0.95:  
                    scores[col] = 0.2
            elif max_corr < 0.1:
                    scores[col] = 0.3
            else:
                    scores[col] = 0.5
        else:
                scores[col] = 0.5
                
    return scores

In [22]:
#All the scores from the four heuristics are combined to generate the final prediction
def detect_target_variable(df, weights=None):
    if weights is None:
        weights = {
            'cardinality': 0.25,
            'name': 0.3,
            'null': 0.15,
            'correlation': 0.2
        }
    cardinality_scores = cardinality_heuristic(df)
    name_scores = column_name_heuristic(df)
    null_scores = null_value_heuristic(df)
    correlation_scores = correlation_heuristic(df)
    
    final_scores = {}
    detailed_scores = {}
    
    for col in df.columns:
        individual_scores = {
            'cardinality': cardinality_scores.get(col, 0),
            'name': name_scores.get(col, 0),
            'null': null_scores.get(col, 0),
            'correlation': correlation_scores.get(col, 0)
        }
        
        final_score = sum(individual_scores[metric] * weights[metric] 
                         for metric in weights.keys())
        
        final_scores[col] = final_score
        detailed_scores[col] = individual_scores
    
    ranked_columns = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)
    
    if len(ranked_columns) >= 2:
        confidence = ranked_columns[0][1] - ranked_columns[1][1]
    else:
        confidence = ranked_columns[0][1] if ranked_columns else 0
    
    return {
        'predicted_target': ranked_columns[0][0] if ranked_columns else None,
        'confidence': confidence,
        'all_scores': final_scores,
        'detailed_scores': detailed_scores,
        'ranking': ranked_columns
    }

In [24]:
import seaborn as sns
from sklearn.datasets import load_iris

df_titanic = sns.load_dataset('titanic')

iris = load_iris(as_frame=True)
df_iris = iris.frame.rename(columns={"target": "species_type"})

In [25]:
datasets = {
    "Titanic": df_titanic,
    "Iris": df_iris
}
df_iris.rename(columns={"target": "species_type"}, inplace=True)
for name, df in datasets.items():
    print(f"\nDataset: {name}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    results = detect_target_variable(df)
    
    print(f"Predicted Target: {results['predicted_target']}")
    print(f"Confidence Score: {results['confidence']:.3f}")
    
    print("-" * 50)


Dataset: Titanic
Shape: (891, 15)
Columns: ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']
Predicted Target: survived
Confidence Score: 0.005
--------------------------------------------------

Dataset: Iris
Shape: (150, 5)
Columns: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'species_type']
Predicted Target: species_type
Confidence Score: 0.275
--------------------------------------------------
