In [1]:
import pandas as pd
import numpy as np
from collections import Counter

def calculate_entropy(target_column):
    """
    Return the Shannon entropy, in bits, of the values in ``target_column``.

    Every distinct value is tallied with ``Counter``; a value with relative
    frequency ``p`` contributes ``-p * log2(p)`` to the result. An empty
    input has no tallies to sum and therefore yields 0.
    """
    sample_count = len(target_column)
    return sum(
        -(occurrences / sample_count) * np.log2(occurrences / sample_count)
        for occurrences in Counter(target_column).values()
    )

def calculate_information_gain(data, feature_column, target_column):
    """
    Calculate the information gain obtained by splitting ``data`` on a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain ``feature_column`` and ``target_column``.
    feature_column : str
        Name of the column whose distinct values define the partitions.
    target_column : str
        Name of the label column.

    Returns
    -------
    number
        Entropy of the target over all rows minus the size-weighted entropy
        of the target inside each partition.
    """
    # Entropy has to be measured on the label *values*. Passing the column
    # name itself would measure the entropy of the characters in that string.
    total_entropy = calculate_entropy(data[target_column])

    total_rows = len(data)
    weighted_entropy = 0
    # Note: rows whose feature value is NaN never satisfy `== value`, so they
    # do not contribute to the weighted entropy.
    for value in data[feature_column].unique():
        subset = data[data[feature_column] == value]
        subset_entropy = calculate_entropy(subset[target_column])
        weighted_entropy += (len(subset) / total_rows) * subset_entropy

    return total_entropy - weighted_entropy

def detect_root_node(data, features, target_column):
    """
    Detect the feature for the root node of the Decision Tree using Information Gain.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset passed through to ``calculate_information_gain``.
    features : iterable of str
        Candidate feature column names, evaluated in order.
    target_column : str
        Name of the label column.

    Returns
    -------
    The feature with the highest information gain (the first one wins ties),
    or ``None`` when ``features`` is empty.
    """
    # Start from -inf rather than -1: a fixed sentinel would silently reject
    # every feature whose score happens to be at or below it and return None.
    max_information_gain = float("-inf")
    root_node = None

    for feature in features:
        information_gain = calculate_information_gain(data, feature, target_column)
        if information_gain > max_information_gain:
            max_information_gain = information_gain
            root_node = feature

    return root_node

# Column names: candidate split features and the label to predict
target_column = 'Role'
features = ['Functional Area', 'Industry']

# Location of the CSV file to analyse
dataset_path = "C:\\Users\\SACHIN.R\\OneDrive\\Desktop\\jobss.csv"

# Read the CSV into a DataFrame
data = pd.read_csv(dataset_path)

# Pick the feature with the highest information gain and report it
root_node = detect_root_node(data, features, target_column)
print("Root Node:", root_node)


Root Node: Functional Area


In [None]:
import pandas as pd
import numpy as np
from collections import Counter

def calculate_entropy(target_column):
    """
    Compute the entropy (base-2 logarithm) of the values in ``target_column``.

    The relative frequency of each distinct value is derived from a
    ``Counter`` tally, and ``p * log2(p)`` is subtracted from a running
    total for each one. With no values the running total stays at 0.
    """
    n_samples = len(target_column)
    tallies = Counter(target_column)
    probabilities = [tally / n_samples for tally in tallies.values()]

    result = 0
    for p in probabilities:
        result = result - p * np.log2(p)
    return result

def calculate_information_gain(data, feature_column, target_column):
    """
    Calculate the information gain for a given feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain ``feature_column`` and ``target_column``.
    feature_column : str
        Column whose distinct values partition the rows.
    target_column : str
        Column holding the labels.

    Returns
    -------
    number
        Entropy of the labels over all rows minus the sum, over partitions,
        of (partition size / total rows) * entropy of the partition's labels.
    """
    labels = data[target_column]
    feature_values = data[feature_column]

    # Measure the entropy of the label values, not of the column-name string
    # (iterating a str would tally its individual characters).
    total_entropy = calculate_entropy(labels)

    row_count = len(data)
    weighted_entropy = 0
    # Note: NaN feature values never compare equal to themselves, so rows with
    # a missing feature value fall into no partition.
    for value in feature_values.unique():
        partition = data[feature_values == value]
        weight = len(partition) / row_count
        weighted_entropy += weight * calculate_entropy(partition[target_column])

    return total_entropy - weighted_entropy

def detect_root_node(data, features, target_column):
    """
    Detect the feature for the root node of the Decision Tree using Information Gain.

    Each name in ``features`` is scored with ``calculate_information_gain``
    and the best-scoring one is returned. When several features share the
    best score, the one that appears first in ``features`` is kept.

    Returns ``None`` only when ``features`` is empty.
    """
    root_node = None
    # -inf (instead of a fixed value such as -1) guarantees that the first
    # scored feature is accepted however low its score is.
    best_gain = float("-inf")

    for candidate in features:
        gain = calculate_information_gain(data, candidate, target_column)
        if gain > best_gain:
            best_gain, root_node = gain, candidate

    return root_node

def bin_continuous_feature(data, feature_column, binning_type='equal_width', num_bins=5):
    """
    Bin a continuous-valued feature into integer-coded bins.

    The bin codes are written to a new column named
    ``feature_column + '_binned'`` on ``data`` itself, so callers that ignore
    the return value still see the new column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature; modified in place.
    feature_column : str
        Name of the numeric column to bin.
    binning_type : str
        ``'equal_width'`` uses ``pd.cut``; ``'frequency'`` (or its alias
        ``'equal_frequency'``) uses ``pd.qcut`` with ``duplicates='drop'``.
    num_bins : int
        Number of bins (``pd.cut``) or quantiles (``pd.qcut``) requested.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenience.

    Raises
    ------
    ValueError
        If ``binning_type`` is not one of the supported values.
    TypeError
        If ``feature_column`` is not numeric.
    """
    if binning_type not in ('equal_width', 'frequency', 'equal_frequency'):
        raise ValueError("Invalid binning type. Please choose 'equal_width' or 'frequency'.")

    column = data[feature_column]
    if not pd.api.types.is_numeric_dtype(column):
        raise TypeError(
            f"Column {feature_column!r} must be numeric to be binned (dtype is {column.dtype})."
        )

    # pd.cut refuses an integer bin count when the data contains infinity.
    # Mask infinite values as NaN so those rows get a NaN bin code instead of
    # being filtered into a temporary copy that the caller never sees.
    finite_values = column.replace([np.inf, -np.inf], np.nan)

    if binning_type == 'equal_width':
        bin_codes = pd.cut(finite_values, bins=num_bins, labels=False)
    else:
        bin_codes = pd.qcut(finite_values, q=num_bins, labels=False, duplicates='drop')

    data[feature_column + '_binned'] = bin_codes
    return data



# Load the dataset
dataset_path = "C:\\Users\\SACHIN.R\\OneDrive\\Desktop\\jobss.csv"
data = pd.read_csv(dataset_path)

# Handle missing values in the 'Functional Area' column.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the chained in-place form acts on a temporary object and is
# not guaranteed to update `data`.
data['Functional Area'] = data['Functional Area'].fillna('Unknown')

# Handle infinity values in the 'Functional Area' column by turning them into NaN
data['Functional Area'] = data['Functional Area'].replace([np.inf, -np.inf], np.nan)

# Drop rows with NaN values in the 'Functional Area' column; take an explicit
# copy so that columns added later are written to an independent frame.
data = data.dropna(subset=['Functional Area']).copy()

# Define your list of features and target column
features = ['Functional Area', 'Industry']
target_column = 'Role'

# Binning the continuous feature 'Functional Area' with equal width binning and 3 bins
# NOTE(review): the fill value above is the string 'Unknown', so this column
# holds text wherever values were missing; binning needs numeric data -
# confirm that 'Functional Area' is really the column meant to be binned.
bin_continuous_feature(data, 'Functional Area', binning_type='equal_width', num_bins=3)

# Detect the root node among the original features plus the binned column
root_node = detect_root_node(data, features + ['Functional Area_binned'], target_column)
print("Root Node:", root_node)



In [None]:
import pandas as pd
import numpy as np

def bin_continuous_feature(data, feature_column, binning_type='equal_width', num_bins=3):
    """
    Add an integer-coded ``<feature_column>_binned`` column to ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature; the new column is added in place.
    feature_column : str
        Name of the numeric column to bin.
    binning_type : str
        ``'equal_width'`` bins with ``pd.cut``; ``'equal_frequency'`` (or the
        alias ``'frequency'``) bins with ``pd.qcut`` and ``duplicates='drop'``.
    num_bins : int
        Number of bins / quantiles requested.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.

    Raises
    ------
    ValueError
        If ``binning_type`` is not recognised (previously this was a silent
        no-op that left ``data`` without the binned column).
    TypeError
        If ``feature_column`` is not numeric.
    """
    equal_frequency_names = ('equal_frequency', 'frequency')
    if binning_type != 'equal_width' and binning_type not in equal_frequency_names:
        raise ValueError(
            f"Invalid binning type {binning_type!r}. "
            "Please choose 'equal_width' or 'equal_frequency'."
        )

    values = data[feature_column]
    if not pd.api.types.is_numeric_dtype(values):
        raise TypeError(
            f"Column {feature_column!r} must be numeric to be binned (dtype is {values.dtype})."
        )

    # Infinite values make pd.cut fail when an integer bin count is given;
    # treat them as missing so the affected rows receive a NaN bin code.
    values = values.replace([np.inf, -np.inf], np.nan)

    if binning_type == 'equal_width':
        data[feature_column + '_binned'] = pd.cut(values, bins=num_bins, labels=False)
    else:
        data[feature_column + '_binned'] = pd.qcut(values, q=num_bins, labels=False, duplicates='drop')
    return data

def detect_root_node(data, features, target_column):
    """
    Find the (feature, threshold) pair whose binary split has the lowest Gini index.

    For every feature, each distinct value is tried as a threshold: rows with
    ``feature <= threshold`` form the left subset and rows with
    ``feature > threshold`` form the right subset. The split is scored with
    ``calculate_gini_index``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split.
    features : iterable of str
        Candidate feature column names.
    target_column : str
        Name of the label column, passed through to ``calculate_gini_index``.

    Returns
    -------
    tuple
        ``(best_feature, best_threshold)``; ``(None, None)`` when no candidate
        threshold separates the rows into two non-empty subsets.
    """
    best_feature = None
    best_threshold = None
    best_gini = float("inf")  # any real score is lower than this

    for feature in features:
        column = data[feature]
        for value in column.unique():
            goes_left = column <= value
            goes_right = column > value
            # A threshold that leaves one side empty (e.g. the column maximum,
            # or NaN) does not split anything; choosing it would make a
            # recursive tree builder recurse on the very same rows.
            if not goes_left.any() or not goes_right.any():
                continue
            gini = calculate_gini_index(data[goes_left], data[goes_right], target_column)
            if gini < best_gini:
                best_gini = gini
                best_feature = feature
                best_threshold = value

    return best_feature, best_threshold

def calculate_gini_index(left_subset, right_subset, target_column):
    """
    Return the size-weighted Gini impurity of a binary split.

    For each subset the impurity is ``1 - sum(p_k ** 2)`` over the relative
    frequencies ``p_k`` of the distinct ``target_column`` values; the two
    impurities are averaged with weights proportional to the subset sizes.

    Parameters
    ----------
    left_subset, right_subset : pandas.DataFrame
        The two sides of the split; either may be empty.
    target_column : str
        Name of the label column.

    Returns
    -------
    float
        0.0 when both sides are pure (or both are empty); larger values mean
        more mixed labels.
    """
    total_rows = len(left_subset) + len(right_subset)
    if total_rows == 0:
        return 0.0

    weighted_impurity = 0.0
    for subset in (left_subset, right_subset):
        subset_rows = len(subset)
        if subset_rows == 0:
            continue  # an empty side contributes nothing
        # value_counts ignores missing labels by default.
        proportions = subset[target_column].value_counts(normalize=True)
        impurity = 1.0 - float((proportions ** 2).sum())
        weighted_impurity += (subset_rows / total_rows) * impurity

    return weighted_impurity

# Columns used for splitting and the label column to predict
target_column = 'Role'
features = ['Functional Area', 'Industry']

# Read the jobs CSV into a DataFrame
dataset_path = "C:\\Users\\SACHIN.R\\OneDrive\\Desktop\\jobss.csv"
data = pd.read_csv(dataset_path)

# Bin 'Functional Area' into 3 equal-width bins
bin_continuous_feature(data, feature_column='Functional Area', binning_type='equal_width', num_bins=3)

# Build the Decision Tree
class Node:
    """Internal decision-tree node: a split on ``feature`` at ``threshold`` with two children."""

    def __init__(self, feature, threshold, left=None, right=None):
        # The split rule stored at this node
        self.feature, self.threshold = feature, threshold
        # Child subtrees; both default to None
        self.left, self.right = left, right

class _LeafNode:
    """
    Terminal node of the decision tree.

    ``size`` is the number of rows that reached the leaf. ``prediction`` is
    the most frequent target value among them (the first value returned by
    ``Series.mode`` when several are tied), or ``None`` when there is no
    non-missing target value.
    """

    def __init__(self, data, target_column):
        self.size = len(data)
        modes = data[target_column].mode()
        self.prediction = modes.iloc[0] if not modes.empty else None


def _stopping_condition_met(data, target_column):
    """Return True when ``data`` is empty or holds at most one distinct target value."""
    return len(data) == 0 or data[target_column].nunique(dropna=False) <= 1


def build_decision_tree(data, features, target_column):
    """
    Recursively build a binary decision tree.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows reaching the current node.
    features : iterable of str
        Candidate feature columns, passed to ``detect_root_node``.
    target_column : str
        Name of the label column.

    Returns
    -------
    Node or _LeafNode
        A ``Node`` holding the chosen feature, threshold and the two
        subtrees, or a ``_LeafNode`` when the rows cannot or need not be
        split further.
    """
    # The stopping rule and the leaf type are private helpers defined with
    # this function so that it runs without any further definitions.
    if _stopping_condition_met(data, target_column):
        return _LeafNode(data, target_column)

    best_feature, best_threshold = detect_root_node(data, features, target_column)
    if best_feature is None:
        # No usable split was found.
        return _LeafNode(data, target_column)

    left_subset = data[data[best_feature] <= best_threshold]
    right_subset = data[data[best_feature] > best_threshold]

    # A split that leaves one side empty makes no progress; recursing on it
    # would never terminate, so end the branch here.
    if left_subset.empty or right_subset.empty:
        return _LeafNode(data, target_column)

    left_tree = build_decision_tree(left_subset, features, target_column)
    right_tree = build_decision_tree(right_subset, features, target_column)

    return Node(best_feature, best_threshold, left_tree, right_tree)

# TODO: if build_decision_tree refers to `stopping_condition_met` and `LeafNode`,
# define them before this call; neither name is defined in the visible part of
# this notebook, and calling an undefined name raises NameError.


# Build the tree from the loaded data, the chosen features and the target column
decision_tree = build_decision_tree(data, features, target_column)
