# Import Dataset

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_selection import RFE
from Preprocessing_functions import *



In [2]:
train_data = pd.read_csv('train_data.csv', index_col='Claim Identifier')
test_data = pd.read_csv('test_data.csv', index_col='Claim Identifier')

  train_data = pd.read_csv('train_data.csv', index_col='Claim Identifier')


In [3]:
train_data.isna().sum()

Accident Date                          23134
Age at Injury                          19445
Alternative Dispute Resolution         19445
Assembly Date                              0
Attorney/Representative                19445
Average Weekly Wage                    48096
Birth Year                             48523
C-2 Date                               34005
C-3 Date                              406226
Carrier Name                           19445
Carrier Type                           19445
Claim Injury Type                      19445
County of Injury                       19445
COVID-19 Indicator                     19445
District Name                          19445
First Hearing Date                    442673
Gender                                 19445
IME-4 Count                           460668
Industry Code                          29403
Industry Code Description              29403
Medical Fee Region                     19445
OIICS Nature of Injury Description    593471
WCIO Cause

# Preprocessing

In [4]:
train_data = train_data[~(train_data.drop(columns=['Assembly Date']).isna().all(axis=1) & train_data['Assembly Date'].notna())]

In [5]:
X = train_data.drop(columns=['Claim Injury Type', 'WCB Decision', 'Agreement Reached'])
y = train_data['Claim Injury Type']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

In [6]:
y.isna().sum()

np.int64(0)

In [7]:
CODE_COLUMNS = ['Industry Code', 'WCIO Cause of Injury Code',
       'WCIO Nature of Injury Code', 'WCIO Part Of Body Code']

DESCRIPTION_COLUMNS = ['WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description','Industry Code Description']

BOOLEAN_COLUMNS = ['Alternative Dispute Resolution', 'Attorney/Representative','COVID-19 Indicator']

date_order = ['Accident Date', 'C-2 Date','C-3 Date','Assembly Date', 'First Hearing Date']


In [8]:
numerical_columns = [
    'Accident Date', 
    'Age at Injury', 
    'Assembly Date', 
    'Average Weekly Wage', 
    'Birth Year', 
    'C-2 Date', 
    'C-3 Date', 
    'First Hearing Date', 
    'IME-4 Count', 
]

categorical_features = ['Alternative Dispute Resolution',
 'Attorney/Representative',
 'Carrier Name',
 'Carrier Type',
 'County of Injury',
 'COVID-19 Indicator',
 'District Name',
 'Gender',
 'Industry Code',
 'Medical Fee Region',
 'WCIO Cause of Injury Code',
 'WCIO Nature of Injury Code',
 'WCIO Part Of Body Code',
 'Zip Code']

col_minmax = ['Age at Injury',
               'Birth Year', 
               'Number of Dependents']

col_standart = ['Accident Date',
                'Assembly Date',
                'Average Weekly Wage',
                ]

low_cardinality_cols = [col for col in categorical_features if X_train[col].nunique() < 10]
high_cardinality_cols = [col for col in categorical_features if X_train[col].nunique() > 10]




In [9]:
high_cardinality_cols

['Carrier Name',
 'County of Injury',
 'Industry Code',
 'WCIO Cause of Injury Code',
 'WCIO Nature of Injury Code',
 'WCIO Part Of Body Code',
 'Zip Code']

In [10]:
X_train[categorical_features] = X_train[categorical_features].astype(str)
X_val[categorical_features] = X_val[categorical_features].astype(str)

In [11]:
def drop_description_columns(X_train, X_val):
    """
    Drop all columns in X_train and X_val that contain the word 'description' in their names (case-insensitive).
    """
    description_columns = X_train.columns[X_train.columns.str.contains('description', case=False, na=False)]
    

    X_train = X_train.drop(description_columns, axis=1)
    X_val = X_val.drop(description_columns, axis=1)
    
    return X_train, X_val

X_train ,X_val = drop_description_columns(X_train, X_val)

In [12]:
def preprocessing_dum(X_train, X_val):
    drop_description_columns(X_train, X_val)
    convert_to_timestamp(X_train, X_val, date_order)
    convert_to_bool(X_train, X_val, col_names=BOOLEAN_COLUMNS)
    impute_mean_numerical(X_train, X_val, numerical_columns)
    fill_missing_with_mode(X_train, X_val)
    feature_creation_has_Cdate(X_train, X_val)


    return X_train, X_val



def scaling_encoding(X_train, X_val):
    scaling_minmax(X_train, X_val, col_minmax)
    scaling_standard(X_train, X_val, col_standart)
    X_train, X_val = encoding_onehot(X_train, X_val, low_cardinality_cols)
    X_train, X_val = encoding_frequency1(X_train, X_val, high_cardinality_cols)


    return X_train, X_val

X_train, X_val = preprocessing_dum(X_train, X_val)

X_train, X_val = scaling_encoding(X_train, X_val)

X_train.info()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mean_value, inplace=True)


<class 'pandas.core.frame.DataFrame'>
Index: 459220 entries, 5785935 to 6027959
Data columns (total 52 columns):
 #   Column                                                    Non-Null Count   Dtype  
---  ------                                                    --------------   -----  
 0   Accident Date                                             459220 non-null  float64
 1   Age at Injury                                             459220 non-null  float64
 2   Assembly Date                                             459220 non-null  float64
 3   Average Weekly Wage                                       459220 non-null  float64
 4   Birth Year                                                459220 non-null  float64
 5   C-2 Date                                                  459220 non-null  float64
 6   C-3 Date                                                  459220 non-null  float64
 7   First Hearing Date                                        459220 non-null  float64
 8   IM

In [13]:
X_train

Unnamed: 0_level_0,Accident Date,Age at Injury,Assembly Date,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,First Hearing Date,IME-4 Count,Number of Dependents,...,Medical Fee Region_III,Medical Fee Region_IV,Medical Fee Region_UK,Carrier Name,County of Injury,Industry Code,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,1.441914e-01,0.418803,0.067364,0.037644,0.976710,1.628554e+09,1.630282e+09,1.647671e+09,2.000000,1.000000,...,0.0,1.0,0.0,0.003846,0.101546,0.017623,0.051032,0.063040,0.090142,0.002110
6090033,7.385627e-01,0.478632,1.359460,0.052969,0.974232,1.663718e+09,1.664237e+09,1.684195e+09,7.000000,0.833333,...,0.0,1.0,0.0,0.000825,0.101546,0.065049,0.029082,0.267362,0.010407,0.001084
6136197,8.178122e-01,0.410256,1.543591,-0.074992,0.978196,1.668730e+09,1.627336e+09,1.647671e+09,3.209525,0.333333,...,1.0,0.0,0.0,0.029792,0.015914,0.065049,0.018939,0.192385,0.003456,0.001387
6019545,-3.503843e-01,0.470085,1.067389,-0.074992,0.000000,1.655770e+09,1.647389e+09,1.647671e+09,3.209525,0.333333,...,0.0,0.0,1.0,0.022259,0.044920,0.053748,0.057702,0.004880,0.032782,0.000026
5792247,1.559321e-01,0.452991,0.092762,0.026142,0.975223,1.629245e+09,1.638230e+09,1.695600e+09,3.209525,1.000000,...,0.0,0.0,0.0,0.193350,0.023838,0.075874,0.051032,0.192385,0.023951,0.000416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5837651,2.483898e-01,0.222222,0.289592,0.113410,0.988603,1.634602e+09,1.636502e+09,1.642982e+09,1.000000,1.000000,...,1.0,0.0,0.0,0.015796,0.044920,0.198935,0.007617,0.004878,0.014165,0.000867
5781926,1.324507e-01,0.470085,0.048316,0.071735,0.934868,1.628035e+09,1.629331e+09,1.646611e+09,7.000000,0.166667,...,0.0,1.0,0.0,0.000762,0.101546,0.198935,0.064603,0.192385,0.039574,0.001792
5890060,2.880146e-01,0.487179,0.534043,0.045218,0.973241,1.641254e+09,1.627336e+09,1.647671e+09,3.209525,0.833333,...,0.0,0.0,0.0,0.000684,0.053676,0.023061,0.020744,0.267362,0.083744,0.001653
6148528,4.049759e-15,0.000000,1.603910,-0.074992,0.989098,1.670803e+09,1.670198e+09,1.647671e+09,3.209525,0.666667,...,0.0,1.0,0.0,0.002365,0.092931,0.046185,0.057702,0.063040,0.073102,0.003175


In [195]:

"""
def preprocessing_complex(X_train, X_val):
    convert_to_timestamp(X_train, X_val, date_order)
    convert_to_bool(X_train, X_val, col_names=BOOLEAN_COLUMNS)
    impute_mean_numerical(X_train, X_val, ['C-2 Date'])
    fillna_zip_code(X_train, X_val)
    fillnan_accident_date(X_train,X_val)
    fillnan_birth_year(X_train,X_val)
    feature_creation_has_Cdate (X_train, X_val)
    fill_missing_with_mode(X_train, X_val)
    drop_description_columns(X_train, X_val)
    scaling_standard(X_train, X_val, columns)
    encoding_onehot(X_train, X_val, columns)
    return X_train, X_val
preprocessing_complex(X_train, X_val)

X_train.info()
"""





"\ndef preprocessing_complex(X_train, X_val):\n    convert_to_timestamp(X_train, X_val, date_order)\n    convert_to_bool(X_train, X_val, col_names=BOOLEAN_COLUMNS)\n    impute_mean_numerical(X_train, X_val, ['C-2 Date'])\n    fillna_zip_code(X_train, X_val)\n    fillnan_accident_date(X_train,X_val)\n    fillnan_birth_year(X_train,X_val)\n    feature_creation_has_Cdate (X_train, X_val)\n    fill_missing_with_mode(X_train, X_val)\n    drop_description_columns(X_train, X_val)\n    scaling_standard(X_train, X_val, columns)\n    encoding_onehot(X_train, X_val, columns)\n    return X_train, X_val\npreprocessing_complex(X_train, X_val)\n\nX_train.info()\n"

# Model Training

In [None]:
X = X.reset_index(drop=True) 
y = y.reset_index(drop=True)


k_range = range(1, 10)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

mean_f1_scores = []

for k in k_range:
    f1_scores = []
    for train_index, test_index in kf.split(X):

        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        
        X_train, X_val = drop_description_columns(X_train, X_val)
        X_train, X_val = preprocessing_dum(X_train, X_val)
        X_train, X_val = scaling_encoding(X_train, X_val)
        
        X_train_selected, selected_features, feature_ranking = feature_selection_rfe(X_train, y_train, 10, LogisticRegression())
        
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train_selected, y_train)
        
        y_pred = knn.predict(X_val[selected_features])
        f1 = f1_score(y_val, y_pred, average='macro')
        f1_scores.append(f1)

    mean_f1_scores.append(np.mean(f1_scores))


optimal_k = k_range[np.argmax(mean_f1_scores)]
print(f"The optimal number of neighbors is {optimal_k}.")

plt.plot(k_range, mean_f1_scores)
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Mean F1 Score')
plt.title('Optimal k Selection using K-Fold Cross-Validation')
plt.show()

X_preprocessed, _ = preprocessing_dum(X, X)
X_scaled, _ = scaling_encoding(X_preprocessed, X_preprocessed)
selector = RFE(estimator=LogisticRegression(), n_features_to_select=10)
X_final = selector.fit_transform(X_scaled, y)
final_knn = KNeighborsClassifier(n_neighbors=optimal_k)
final_knn.fit(X_final, y)

print(f"Model trained with optimal k={optimal_k}.")
  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  def impute_mode_categorical(X_train, X_val, columns):
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  """
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preproce

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Range of k values to test (for XGBoost, we can try different hyperparameters)
learning_rate_range = [0.01, 0.1, 0.3, 0.5, 0.7]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# To store mean accuracy for each learning rate
mean_accuracies = []

# Perform K-Fold for each learning rate
for learning_rate in learning_rate_range:
    accuracies = []
    for train_index, test_index in kf.split(X):
        # Split the data into training and test sets for this fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the XGBoost model
        model = XGBClassifier(learning_rate=learning_rate, use_label_encoder=False, eval_metric='mlogloss')
        model.fit(X_train, y_train)
        
        # Evaluate on the test set
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
    
    # Compute the mean accuracy for this learning rate
    mean_accuracies.append(np.mean(accuracies))

# Find the optimal learning rate
optimal_lr = learning_rate_range[np.argmax(mean_accuracies)]
print(f"The optimal learning rate is {optimal_lr}.")

# Plot the results
plt.plot(learning_rate_range, mean_accuracies)
plt.xlabel('Learning Rate')
plt.ylabel('Mean Accuracy')
plt.title('Optimal Learning Rate Selection using K-Fold Cross-Validation (XGBoost)')
plt.show()

# Final training with optimal learning rate
final_model = XGBClassifier(learning_rate=optimal_lr, use_label_encoder=False, eval_metric='mlogloss')
final_model.fit(X, y)
print(f"Model trained with optimal learning rate={optimal_lr}.")
