In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from sklearn.model_selection import GridSearchCV
import time
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, make_scorer
)
from sklearn.feature_selection import RFE
from Preprocessing_functions import *

# pandas max columns display
pd.set_option('display.max_columns', None)

## Import Dataset

In [3]:
train_data = pd.read_csv('train_data.csv', index_col='Claim Identifier')
test_data = pd.read_csv('test_data.csv', index_col='Claim Identifier')

  train_data = pd.read_csv('train_data.csv', index_col='Claim Identifier')


In [4]:
train_data = train_data[~(train_data.drop(columns=['Assembly Date']).isna().all(axis=1) & train_data['Assembly Date'].notna())]

In [5]:
X = train_data.drop(columns=['Claim Injury Type', 'WCB Decision', 'Agreement Reached','OIICS Nature of Injury Description'])
y = train_data['Claim Injury Type']

test_data = test_data.drop(columns=['OIICS Nature of Injury Description'])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

____

# Auxiliary Functions


## Preprocess Functions

In [6]:
CODE_COLUMNS = ['Industry Code', 'WCIO Cause of Injury Code',
       'WCIO Nature of Injury Code', 'WCIO Part Of Body Code']

DESCRIPTION_COLUMNS = ['WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description','Industry Code Description']

BOOLEAN_COLUMNS = ['Alternative Dispute Resolution', 'Attorney/Representative','COVID-19 Indicator']

date_order = ['Accident Date', 'C-2 Date','C-3 Date','Assembly Date', 'First Hearing Date']

numerical_columns = [
    'Accident Date', 
    'Age at Injury', 
    'Assembly Date', 
    'Average Weekly Wage', 
    'Birth Year', 
    'C-2 Date', 
    'C-3 Date', 
    'First Hearing Date', 
    'IME-4 Count', 
]

outliers_columns = [
    'Accident Date', 
    'Age at Injury', 
    'Assembly Date', 
    'Average Weekly Wage', 
    'Birth Year',
    'IME-4 Count', 
]

categorical_features = ['Alternative Dispute Resolution',
 'Attorney/Representative',
 'Carrier Name',
 'Carrier Type',
 'County of Injury',
 'COVID-19 Indicator',
 'District Name',
 'Gender',
 'Industry Code',
 'Medical Fee Region',
 'WCIO Cause of Injury Code',
 'WCIO Nature of Injury Code',
 'WCIO Part Of Body Code',
 'Zip Code']


columns_to_scale = ['Accident Date',
                'Assembly Date',
                'Average Weekly Wage',
                'Age at Injury',
                'Birth Year', 
                'Number of Dependents',
                'IME-4 Count']

columns_to_drop = ['C-2 Date', 'C-3 Date', 'First Hearing Date']

low_cardinality_cols = [col for col in categorical_features if X[col].nunique() < 10]
high_cardinality_cols = [col for col in categorical_features if X[col].nunique() > 10]


In [7]:
def preprocessing_scaling_encoding_dum(X_train, X_val):
    X_train, X_val = type_coversion_categorical(X_train, X_val,categorical_features)
    X_train, X_val = drop_description_columns(X_train, X_val)
    X_train, X_val = convert_to_timestamp(X_train, X_val, date_order)
    X_train, X_val = convert_to_bool(X_train, X_val, col_names=BOOLEAN_COLUMNS)
    X_train, X_val = impute_mean_numerical(X_train, X_val, numerical_columns)
    X_train, X_val = fill_missing_with_mode(X_train, X_val)
    X_train, X_val = feature_creation_has_Cdate(X_train, X_val)
    X_train, X_val = drop_unwanted_columns(X_train, X_val, columns_to_drop)
    X_train, X_val = winsorize_outliers(X_train, X_val, outliers_columns)
    X_train, X_val = scaling_robust(X_train, X_val, columns_to_scale)
    X_train, X_val = encoding_onehot(X_train, X_val, low_cardinality_cols)
    X_train, X_val = encoding_frequency1(X_train, X_val, high_cardinality_cols)

    return X_train, X_val

## Feature Selection


In [8]:
X_enconded, X_val = preprocessing_scaling_encoding_dum(X, X_val)
y_encoded, y_val = encoding_label(y, y_val) 

NameError: name 'type_coversion_categorical' is not defined

In [None]:
 # Initialize RandomForest model
rf_model = RandomForestClassifier(random_state=42)

# Set up cross-validation strategy
cv_strategy = StratifiedKFold(n_splits=5)

# Set up RFECV with RandomForest and cross-validation
rfecv = RFECV(estimator=rf_model, step=1, cv=cv_strategy, scoring='f1_macro') 

# Fit RFECV
rfecv.fit(X_enconded, y_encoded)

#Get the selected features
selected_features_RF = X_enconded.columns[rfecv.support_].tolist()
optimal_num_features = rfecv.n_features_

print("Optimal number of features:", optimal_num_features)
print("Selected Features:", selected_features_RF)

## GridSearchCV Performance Evaluation


In [None]:
def create_predifined_split(X, y, preprocess_steps, n_splits = 5):
    """
    Creates a PredefinedSplit object to be used in cross-validation, more specifically in GridSearchCV.

    Steps:
    - Defines the number of splits
    - Splits the data into training and validation sets
    - Applies the preprocessing steps to the training and validation sets
    - Returns the PredefinedSplit object and the preprocessed data
    """

    X_combined_list = []
    y_combined_list = []

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    test_data = np.zeros(len(X), dtype=int) - 1

    for fold_idx, (_, test_idx) in enumerate(kf.split(X, y)):
        test_data[test_idx] = fold_idx

    ps = PredefinedSplit(test_fold=test_data)

    for train_index, test_index in ps.split():

        # Get fold
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        # Preprocess and encode data    
        X_train, X_val = preprocess_steps(X_train, X_val)
        y_train, y_val = encoding_label(y_train, y_val)

        X_combined_list.append(X_train)
        y_combined_list.append(y_train)

    X_combined = pd.concat(X_combined_list, axis=0)
    y_combined = np.concatenate(y_combined_list, axis=0)

    return ps, X_combined, y_combined


        


In [None]:
def get_best_parameters(X, y, model, param_grid, preprocess_steps, n_splits=5):
    """
    Finds the best hyperparameters for a given model using GridSearchCV.

    Steps:
    - Creates a PredefinedSplit object
    - Creates a GridSearchCV object
    - Fits the GridSearchCV object
    - Returns the best hyperparameters and the best score
    """
    predefined_split, X_combined, y_combined = create_predifined_split(X, y, preprocess_steps, n_splits=n_splits)

    scoring = make_scorer(f1_score, average='macro')

    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=predefined_split,
        scoring=scoring,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_combined, y_combined)

    print("Best Parameters:", grid_search.best_params_)
    print("Best F1-macro Score:", grid_search.best_score_)

    return grid_search.best_params_, grid_search.best_score_

______

# Model Assessment

## Logistic Regression

## Random Forest

## NN

# XGBoost

## Knn

## Ensamble Models
look for last year project


# Final Model