# Import Dataset

In [392]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from sklearn.svm import SVC
from Preprocessing_functions import *
from sklearn.model_selection import cross_val_score, train_test_split


In [393]:
train_data = pd.read_csv('train_data.csv', index_col='Claim Identifier')
test_data = pd.read_csv('test_data.csv', index_col='Claim Identifier')

  train_data = pd.read_csv('train_data.csv', index_col='Claim Identifier')


# Preprocessing

In [394]:
#Split the data into training and validation sets
train_data = train_data[~(train_data.drop(columns=['Assembly Date']).isna().all(axis=1) & train_data['Assembly Date'].notna())] 
X = train_data.drop(columns=['Claim Injury Type', 'WCB Decision', 'Agreement Reached'])
y = train_data['Claim Injury Type']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)


In [395]:
CODE_COLUMNS = ['Industry Code', 'WCIO Cause of Injury Code',
       'WCIO Nature of Injury Code', 'WCIO Part Of Body Code']

DESCRIPTION_COLUMNS = ['WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description','Industry Code Description']

BOOLEAN_COLUMNS = ['Alternative Dispute Resolution', 'Attorney/Representative','COVID-19 Indicator']

date_order = ['Accident Date', 'C-2 Date','C-3 Date','Assembly Date', 'First Hearing Date']


In [396]:
numerical_columns = [
    'Accident Date', 
    'Age at Injury', 
    'Assembly Date', 
    'Average Weekly Wage', 
    'Birth Year', 
    'C-2 Date', 
    'C-3 Date', 
    'First Hearing Date', 
    'IME-4 Count', 
]

categorical_features = ['Alternative Dispute Resolution',
 'Attorney/Representative',
 'Carrier Name',
 'Carrier Type',
 'County of Injury',
 'COVID-19 Indicator',
 'District Name',
 'Gender',
 'Industry Code',
 'Medical Fee Region',
 'WCIO Cause of Injury Code',
 'WCIO Nature of Injury Code',
 'WCIO Part Of Body Code',
 'Zip Code']

col_minmax = ['Age at Injury',
               'Birth Year', 
               'Number of Dependents']

col_standart = ['Accident Date',
                'Assembly Date',
                'Average Weekly Wage',
                ]

low_cardinality_cols = [col for col in categorical_features if X_train[col].nunique() < 10]
high_cardinality_cols = [col for col in categorical_features if X_train[col].nunique() > 10]




In [397]:
high_cardinality_cols

['Carrier Name',
 'County of Injury',
 'Industry Code',
 'WCIO Cause of Injury Code',
 'WCIO Nature of Injury Code',
 'WCIO Part Of Body Code',
 'Zip Code']

In [398]:
X_train[categorical_features] = X_train[categorical_features].astype(str)
X_val[categorical_features] = X_val[categorical_features].astype(str)

In [399]:
def drop_description_columns(X_train, X_val):
    """
    Drop all columns in X_train and X_val that contain the word 'description' in their names (case-insensitive).
    """
    description_columns = X_train.columns[X_train.columns.str.contains('description', case=False, na=False)]
    

    X_train = X_train.drop(description_columns, axis=1)
    X_val = X_val.drop(description_columns, axis=1)
    
    return X_train, X_val

X_train ,X_val = drop_description_columns(X_train, X_val)

In [400]:
def drop_description_columns_Test(X_test):
    """
    Drop all columns in X_train and X_val that contain the word 'description' in their names (case-insensitive).
    """
    description_columns = X_test.columns[X_test.columns.str.contains('description', case=False, na=False)]
    

    X_test = X_test.drop(description_columns, axis=1)
    
    return X_test


In [401]:
def preprocessing_dum(X_train, X_val):
    drop_description_columns(X_train, X_val)
    convert_to_timestamp(X_train, X_val, date_order)
    convert_to_bool(X_train, X_val, col_names=BOOLEAN_COLUMNS)
    impute_mean_numerical(X_train, X_val, numerical_columns)
    fill_missing_with_mode(X_train, X_val)
    feature_creation_has_Cdate(X_train, X_val)
    # columns_to_drop = ['C-2 Date', 'C-3 Date', 'First Hearing Date']
    # X_train = X_train.drop(columns=columns_to_drop)
    # X_val = X_val.drop(columns=columns_to_drop)


    return X_train, X_val

def preprocessing_dum_test(X_test):
    convert_to_timestamp_test(X_test, date_order)
    convert_to_bool_test(X_test, col_names=BOOLEAN_COLUMNS)
    impute_mean_numerical_test(X_test, numerical_columns)
    fill_missing_with_mode_test(X_test)
    feature_creation_has_Cdate_test(X_test)
    return X_test

def scaling_encoding(X_train, X_val):
    #scaling_minmax(X_train, X_val, col_minmax)
    #scaling_standard(X_train, X_val, col_standart)
    robust_scaling(X_train, X_val, numerical_columns)
    X_train, X_val = encoding_onehot(X_train, X_val, low_cardinality_cols)
    X_train, X_val = encoding_frequency1(X_train, X_val, high_cardinality_cols)


    return X_train, X_val

def scaling_encoding_test(X_test):
    #scaling_minmax_test(X_test, col_minmax)
    #scaling_standard_test(X_test, col_standart)
    X_test= encoding_onehot_test(X_test, low_cardinality_cols)
    X_test = encoding_frequency1_test(X_test, high_cardinality_cols)


    return X_test




# Model Training

In [402]:
# from sklearn.svm import SVC
# from sklearn.metrics import f1_score
# from sklearn.model_selection import KFold
# from sklearn.feature_selection import RFE
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.linear_model import LogisticRegression

# # Garantir que os índices de X e y estejam alinhados
# X = X.reset_index(drop=True)
# y = y.reset_index(drop=True)

# # Faixa de valores para o parâmetro C
# c_range = np.logspace(-3, 2, 10)  # Exemplo de valores de 0.001 a 100
# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# mean_f1_scores = []

# for c in c_range:
#     f1_scores = []
#     for train_index, test_index in kf.split(X):
#         # Dividir o dataset em treino e validação
#         X_train, X_val = X.iloc[train_index], X.iloc[test_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        
#         # Pré-processamento
#         X_train, X_val = drop_description_columns(X_train, X_val)
#         X_train, X_val = preprocessing_dum(X_train, X_val)
#         X_train, X_val = scaling_encoding(X_train, X_val)
#         y_train,y_val=encoding_label(y_train,y_val)
        
#         # Seleção de features com RFE
#         X_train_selected, selected_features, feature_ranking = feature_selection_rfe(
#             X_train, y_train, 10, LogisticRegression()
#         )
        
#         # Treinar o SVM com o valor atual de C
#         svm = SVC(C=c, kernel='linear', random_state=42)
#         svm.fit(X_train_selected, y_train)
        
#         # Fazer previsões e calcular o F1 score
#         y_pred = svm.predict(X_val[selected_features])
#         f1 = f1_score(y_val, y_pred, average='macro')
#         f1_scores.append(f1)

#     # Armazenar a média dos F1 scores para o valor de C atual
#     mean_f1_scores.append(np.mean(f1_scores))

# # Determinar o valor ótimo de C
# optimal_c = c_range[np.argmax(mean_f1_scores)]
# print(f"The optimal value of C is {optimal_c}.")

# # Plotar os F1 scores médios para cada valor de C
# plt.plot(c_range, mean_f1_scores)
# plt.xscale('log')  # Escala logarítmica para melhor visualização
# plt.xlabel('C (Regularization Parameter)')
# plt.ylabel('Mean F1 Score')
# plt.title('Optimal C Selection using K-Fold Cross-Validation')
# plt.show()

# # Treinar o modelo final usando todo o conjunto de dados
# X_preprocessed, _ = preprocessing_dum(X, X)
# X_scaled, _ = scaling_encoding(X_preprocessed, X_preprocessed)
# selector = RFE(estimator=LogisticRegression(), n_features_to_select=10)
# X_final = selector.fit_transform(X_scaled, y)
# final_svm = SVC(C=optimal_c, kernel='linear', random_state=42)
# final_svm.fit(X_final, y)

# print(f"Model trained with optimal C={optimal_c}.")


In [403]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from xgboost import XGBClassifier
# from sklearn.metrics import accuracy_score, f1_score
# import matplotlib.pyplot as plt
# from sklearn.linear_model import LogisticRegression

# # Garantir que os índices de X e y estejam alinhados
# X = X.reset_index(drop=True)
# y = y.reset_index(drop=True)

# # Definir o learning rate único
# learning_rate = 0.5  # Você pode alterar este valor conforme necessário

# # Pré-processamento
# print("Realizando o pré-processamento...")
# X_train, X_val = drop_description_columns(X_train, X_val)
# X_train, X_val = preprocessing_dum(X_train, X_val)
# X_train, X_val = scaling_encoding(X_train, X_val)
# y_train, y_val = encoding_label(y_train, y_val)

# # Seleção de features com RFE
# print("Selecionando features com RFE...")
# X_train_selected, selected_features, feature_ranking = feature_selection_rfe(
#     X_train, y_train, 35, LogisticRegression()
# )

# # Treinamento do modelo
# print("Treinando o modelo...")
# model = XGBClassifier(learning_rate=learning_rate, use_label_encoder=False, eval_metric='mlogloss')
# model.fit(X_train_selected, y_train)

# # Avaliação no conjunto de validação
# print("Avaliando no conjunto de validação...")
# X_val_selected = X_val[selected_features]
# y_pred_val = model.predict(X_val_selected)
# val_accuracy = accuracy_score(y_val, y_pred_val)
# val_f1 = f1_score(y_val, y_pred_val, average='weighted')  # Use "weighted" para classes desbalanceadas

# # Resultados
# print(f"Validation Accuracy: {val_accuracy:.4f}")
# print(f"Validation F1 Score: {val_f1:.4f}")




In [404]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression



X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

learning_rate = 0.5  

# preprocessing
print("preprocessing...")
X_train, X_val = preprocessing_dum(X_train, X_val)
X_train, X_val = scaling_encoding(X_train, X_val)
X_train,X_val=outliers_iqr(X_train,X_val,X_train.columns)

#preprocessing test data
X_test = test_data[~(test_data.drop(columns=['Assembly Date']).isna().all(axis=1) & test_data['Assembly Date'].notna())] 
X_test= drop_description_columns_Test(X_test)
X_test= preprocessing_dum_test(X_test)
X_test = scaling_encoding_test(X_test)

enc2 = LabelEncoder()
enc2.fit(y_train)



# Codifica os valores de y_train e y_val
y_train_encoded = enc2.transform(y_train)
y_val_encoded = enc2.transform(y_val)

# RFECV
print("Selecting features RFECV...")
# model_for_rfe = LogisticRegression(max_iter=1000)  # Modelo base para RFECV
# cv_strategy = StratifiedKFold(n_splits=5)  # Estratégia de validação cruzada

# rfecv = RFECV(estimator=model_for_rfe, step=1, cv=cv_strategy, scoring='accuracy', n_jobs=-1)
# rfecv.fit(X_train, y_train)

#selected_features = X_train.columns[rfecv.support_]
selected_features = ['Average Weekly Wage', 'C-2 Date', 'C-3 Date', 'First Hearing Date', 
                     'IME-4 Count', 'Attorney/Representative_False', 'Attorney/Representative_True', 
                     'Carrier Type_1A. PRIVATE', 'Carrier Type_2A. SIF']

X_train_selected = X_train[selected_features]
X_val_selected = X_val[selected_features]
X_test_selected = X_test[selected_features]

#print(f"Número de features selecionadas: {len(selected_features)}")
#print("Features selecionadas:", selected_features.tolist())

# Treinamento do modelo com as features selecionadas
print("Treinando o modelo...")
model = XGBClassifier(learning_rate=learning_rate, use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train_selected, y_train_encoded)

# Avaliação no conjunto de validação
print("Avaliando no conjunto de validação...")
y_pred_val = model.predict(X_val_selected)
val_accuracy = accuracy_score(y_val_encoded, y_pred_val)
val_f1 = f1_score(y_val_encoded, y_pred_val, average='macro')  

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation F1 Score: {val_f1:.4f}")


y_pred_test = model.predict(X_test_selected)
# Verificar as classes ajustadas
print("Classes ajustadas no encoder:", enc2.classes_)

# Garantir que os valores previstos estão dentro do domínio das classes ajustadas
print("Valores únicos previstos no teste:", np.unique(y_pred_test))

# Decodificar os valores previstos
try:
    y_pred_test_decoded = enc2.inverse_transform(y_pred_test)
    print("Valores decodificados:", y_pred_test_decoded[:10])  # Exibir os primeiros 10 valores
except ValueError as e:
    print(f"Erro durante a decodificação: {e}")



preprocessing...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

Selecting features RFECV...
Treinando o modelo...


Parameters: { "use_label_encoder" } are not used.



Avaliando no conjunto de validação...
Validation Accuracy: 0.7522
Validation F1 Score: 0.3002
Classes ajustadas no encoder: ['1. CANCELLED' '2. NON-COMP' '3. MED ONLY' '4. TEMPORARY'
 '5. PPD SCH LOSS' '6. PPD NSL' '7. PTD' '8. DEATH']
Valores únicos previstos no teste: [1 2 3 4]
Valores decodificados: ['3. MED ONLY' '3. MED ONLY' '2. NON-COMP' '2. NON-COMP' '2. NON-COMP'
 '2. NON-COMP' '4. TEMPORARY' '2. NON-COMP' '2. NON-COMP'
 '5. PPD SCH LOSS']


In [405]:
# val_f1 = f1_score(y_val, y_pred_val, average='macro')  
# val_f1


  warnings.warn(smsg, UserWarning)
Número de features selecionadas: 9
Features selecionadas: ['Average Weekly Wage', 'C-2 Date', 'C-3 Date', 'First Hearing Date', 'IME-4 Count', 'Attorney/Representative_False', 'Attorney/Representative_True', 'Carrier Type_1A. PRIVATE', 'Carrier Type_2A. SIF']
Treinando o modelo...
Avaliando no conjunto de validação...
Validation Accuracy: 0.7765
Validation F1 Score: 0.7295


In [406]:
test = enc2.inverse_transform(y_pred_test)
test

array(['3. MED ONLY', '3. MED ONLY', '2. NON-COMP', ..., '2. NON-COMP',
       '2. NON-COMP', '2. NON-COMP'], dtype=object)

In [407]:
## formating the submission file
X_test['Claim Injury Type'] = test
sample_submission = X_test[['Claim Injury Type']].set_index(X_test.index)
sample_submission.to_csv('submission_xgboost.csv')

In [408]:
# from catboost import CatBoostClassifier
# from sklearn.feature_selection import RFECV
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import accuracy_score, f1_score
# import matplotlib.pyplot as plt
# import pandas as pd

# # Preprocessing
# print("Preprocessing...")
# X_train, X_val = preprocessing_dum(X_train, X_val)
# X_train, X_val = scaling_encoding(X_train, X_val)
# X_train, X_val = outliers_iqr(X_train, X_val, X_train.columns)
# y_train, y_val = encoding_label(y_train, y_val)

# # Preprocessing test data
# X_test = test_data[~(test_data.drop(columns=['Assembly Date']).isna().all(axis=1) & test_data['Assembly Date'].notna())]
# X_test = drop_description_columns_Test(X_test)
# X_test = preprocessing_dum_test(X_test)
# X_test = scaling_encoding_test(X_test)

# enc2 = LabelEncoder()
# enc2.fit(y_train)

# # Seleção de features (substituindo a execução real para economizar tempo)
# print("Selecionando features...")
# selected_features = ['Average Weekly Wage', 'C-2 Date', 'C-3 Date', 'First Hearing Date',
#                      'IME-4 Count', 'Attorney/Representative_False', 'Attorney/Representative_True',
#                      'Carrier Type_1A. PRIVATE', 'Carrier Type_2A. SIF']

# X_train_selected = X_train[selected_features]
# X_val_selected = X_val[selected_features]
# X_test_selected = X_test[selected_features]

# # Treinamento com CatBoost
# print("Treinando o modelo com CatBoost...")
# model = CatBoostClassifier(
#     learning_rate=0.5,
#     iterations=500,
#     depth=6,
#     eval_metric='TotalF1',
#     verbose=100,  # Exibe progresso a cada 100 iterações
#     random_seed=42
# )

# model.fit(
#     X_train_selected,
#     y_train,
#     eval_set=(X_val_selected, y_val),
#     use_best_model=True
# )

# # Avaliação no conjunto de validação
# print("Avaliando no conjunto de validação...")
# y_pred_val = model.predict(X_val_selected)
# val_accuracy = accuracy_score(y_val, y_pred_val)
# val_f1 = f1_score(y_val, y_pred_val, average='macro')

# print(f"Validation Accuracy: {val_accuracy:.4f}")
# print(f"Validation F1 Score: {val_f1:.4f}")

# # Predição no conjunto de teste
# y_pred_test = model.predict(X_test_selected)

# # Resultados
# print("Predições no conjunto de teste feitas!")
