# Import Dataset

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import importlib
import time



# Importing the functions defined in Preprocessing_functions.py
from Preprocessing_functions import *
import importlib
imported_module = importlib.import_module("Preprocessing_functions")
importlib.reload(imported_module)

<module 'Preprocessing_functions' from 'c:\\Users\\timst\\OneDrive\\Desktop\\NOVA IMS\\Semester 1\\MachineLearning\\Project\\ML_Group36\\src\\Preprocessing_functions.py'>

In [6]:
# Load the train and test splits, both indexed by the 'Claim Identifier' column.
train_data, test_data = (
    pd.read_csv(csv_path, index_col='Claim Identifier')
    for csv_path in ('train_data.csv', 'test_data.csv')
)

  train_data = pd.read_csv('train_data.csv', index_col='Claim Identifier')


In [3]:
# Preview the first rows, transposed so that every column name becomes a row.
train_data.head().T

Claim Identifier,5393875,5393091,5393889,957648180,5393887
Accident Date,2019-12-30,2019-08-30,2019-12-06,,2019-12-30
Age at Injury,31.0,46.0,40.0,,61.0
Alternative Dispute Resolution,N,N,N,,N
Assembly Date,2020-01-01,2020-01-01,2020-01-01,2020-01-01,2020-01-01
Attorney/Representative,N,Y,N,,N
Average Weekly Wage,0.0,1745.93,1434.8,,
Birth Year,1988.0,1973.0,1979.0,,1958.0
C-2 Date,2019-12-31,2020-01-01,2020-01-01,,2019-12-31
C-3 Date,,2020-01-14,,,
Carrier Name,NEW HAMPSHIRE INSURANCE CO,ZURICH AMERICAN INSURANCE CO,INDEMNITY INSURANCE CO OF,,STATE INSURANCE FUND


# Preprocessing

In [7]:
# Remove the empty column from both splits (in place).
train_data.drop('OIICS Nature of Injury Description', axis=1, inplace=True)
test_data.drop('OIICS Nature of Injury Description', axis=1, inplace=True)

In [None]:
# Feature groups used by preprocessing(); every name must be a column of the
# feature matrix X that is handed to the preprocessing helpers.

# Numerical features
numerical = ['Age at Injury', 'Average Weekly Wage', 'Birth Year', 'IME-4 Count', 'Number of Dependents']

# Binary features
binary = ['Alternative Dispute Resolution', 'Attorney/Representative', 'COVID-19 Indicator']

# Categorical features.
# 'OIICS Nature of Injury Description' is not listed because that column is
# dropped from both data sets before preprocessing, and 'Claim Injury Type' is
# not listed because it is the target (it is split off into y and label-encoded
# separately), so neither exists in X.
categorical = ['Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'Carrier Name',
               'Carrier Type', 'County of Injury', 'District Name',
               'First Hearing Date', 'Gender', 'Industry Code Description', 'Medical Fee Region',
               'WCIO Cause of Injury Description', 'WCIO Nature of Injury Description',
               'WCIO Part Of Body Description']

# Checking the target variable distribution (number of claims per class)
train_data['Claim Injury Type'].value_counts()

Claim Injury Type
2. NON-COMP        291078
4. TEMPORARY       148507
3. MED ONLY         68906
5. PPD SCH LOSS     48280
1. CANCELLED        12477
6. PPD NSL           4211
8. DEATH              470
7. PTD                 97
Name: count, dtype: int64

In [None]:
def preprocessing(X_train, X_val, y_train, y_val):
    """
    Run the preprocessing helpers, in order, on one train/validation split.

    The steps are: data cleaning, imputation, outlier handling, scaling,
    encoding and feature selection. The column groups are read from the
    module-level lists `numerical`, `binary` and `categorical`.

    Returns a 7-tuple:
    X_train, X_val, y_train, y_val, selected_features, feature_ranking, optimal_num_features
    """

    # Data Cleaning
    X_train, X_val = drop_empty_rows(X_train, X_val)
    X_train, X_val = convert_to_bool(X_train, X_val)

    # Imputing missing values
    X_train, X_val = impute_mean_numerical(X_train, X_val, numerical)
    # the binary columns go through the same mean-imputation helper
    X_train, X_val = impute_mean_numerical(X_train, X_val, binary)
    X_train, X_val = impute_mode_categorical(X_train, X_val, categorical)

    # Sanity check: report how many missing values are left in the training data
    print(f'Missing values in training data after preprocessing: {X_train.isnull().sum().sum()}')

    #Handling outliers
    X_train, X_val = outliers_iqr(X_train, X_val, numerical)

    # Scaling
    X_train, X_val = scaling_standard(X_train, X_val, numerical)

    # Encoding
    X_train, X_val = encoding_onehot(X_train, X_val, categorical)
    y_train, y_val = encoding_label(y_train, y_val)

    # Feature Selection
    # NOTE(review): only X_train is passed to (and returned by) the feature selection;
    # X_val is returned as it came out of the encoding step - confirm that its columns
    # still match X_train before scoring a model on it.
    X_train, selected_features, feature_ranking, optimal_num_features = feature_selection_rfecv(X_train, y_train, LogisticRegression(), scoring='f1_weighted') # using f1_weighted as scoring metric because of class imbalance

    return X_train, X_val, y_train, y_val, selected_features, feature_ranking, optimal_num_features


# Model Training

### 1. Run (simple)

In [None]:
# Keep a row if it has at least one value outside 'Assembly Date', or if
# 'Assembly Date' itself is missing; this removes the rows whose only
# filled-in field is 'Assembly Date'.
train_data = train_data.loc[
    train_data.drop(columns=['Assembly Date']).notna().any(axis=1)
    | train_data['Assembly Date'].isna()
]

In [None]:
# Target: the claim injury type.
y = train_data['Claim Injury Type']
# Features: everything except the target, 'WCB Decision' and 'Agreement Reached'.
X = train_data.drop(['Claim Injury Type', 'WCB Decision', 'Agreement Reached'], axis=1)



In [None]:
def avg_score(model):
    """
    Cross-validate `model` with a 10-fold StratifiedKFold on the module-level X and y.

    For every fold the split is run through preprocessing(), the model is fitted
    on the processed training part (the fit is timed) and scored on both the
    training and the validation part.

    `model` must provide fit(), score() and, after fitting, the attribute n_iter_.

    Returns a tuple of four strings formatted as 'mean+/-std':
    fit time, train score, validation score, number of iterations.
    """
    def _summary(values, avg_digits, std_digits):
        # 'mean+/-std' of the per-fold values, rounded to the given digits
        return str(round(np.mean(values), avg_digits)) + '+/-' + str(round(np.std(values), std_digits))

    # apply kfold
    skf = StratifiedKFold(n_splits=10)
    # lists to store the per-fold results
    score_train = []
    score_val = []
    timer = []
    n_iter = []

    for train_index, val_index in skf.split(X, y):
        # get the observations assigned to each partition
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # preprocessing() needs the targets as well (it encodes them) and returns
        # seven values; only the four transformed splits are used here.
        X_train, X_val, y_train, y_val, *_ = preprocessing(X_train, X_val, y_train, y_val)

        # time only the fit itself
        begin = time.perf_counter()
        model.fit(X_train, y_train)
        end = time.perf_counter()

        # model.score() on the train and on the validation part
        score_train.append(model.score(X_train, y_train))
        score_val.append(model.score(X_val, y_val))
        timer.append(end - begin)
        n_iter.append(model.n_iter_)

    # average and std for each measure (time, train score, validation score, iterations)
    return (
        _summary(timer, 3, 2),
        _summary(score_train, 3, 2),
        _summary(score_val, 3, 2),
        _summary(n_iter, 1, 1),
    )

def show_results(df, *args):
    """
    Fill the rows of `df` with the avg_score results of the given models.

    The i-th model passed in `args` is evaluated with avg_score and its four
    result strings are written to the i-th row of `df` (by position).
    The same `df` object is returned.
    """
    for row, model in enumerate(args):
        # evaluate the model and unpack its four summary strings
        fit_time, train_score, val_score, iterations = avg_score(model)
        # write them to the row that belongs to this model
        df.iloc[row] = fit_time, train_score, val_score, iterations
    return df