# Import Dataset

In [56]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import importlib
import time



# Import the project helpers from Preprocessing_functions.py.
# The module is reloaded BEFORE the star import: `from ... import *` copies the
# names into the notebook namespace, so reloading afterwards would leave those
# names bound to the stale (pre-edit) function objects until the cell is run twice.
import importlib
imported_module = importlib.import_module("Preprocessing_functions")
importlib.reload(imported_module)
from Preprocessing_functions import *

<module 'Preprocessing_functions' from 'c:\\Users\\timst\\OneDrive\\Desktop\\NOVA IMS\\Semester 1\\MachineLearning\\Project\\ML_Group36\\src\\Preprocessing_functions.py'>

In [57]:
# Read the train and test CSVs (in that order), indexing both by 'Claim Identifier'.
train_data, test_data = (
    pd.read_csv(csv_path, index_col='Claim Identifier')
    for csv_path in ('train_data.csv', 'test_data.csv')
)

  train_data = pd.read_csv('train_data.csv', index_col='Claim Identifier')


In [58]:
# Show the first five claims transposed, so each column name becomes a row.
train_data.head(n=5).transpose()

Claim Identifier,5393875,5393091,5393889,957648180,5393887
Accident Date,2019-12-30,2019-08-30,2019-12-06,,2019-12-30
Age at Injury,31.0,46.0,40.0,,61.0
Alternative Dispute Resolution,N,N,N,,N
Assembly Date,2020-01-01,2020-01-01,2020-01-01,2020-01-01,2020-01-01
Attorney/Representative,N,Y,N,,N
Average Weekly Wage,0.0,1745.93,1434.8,,
Birth Year,1988.0,1973.0,1979.0,,1958.0
C-2 Date,2019-12-31,2020-01-01,2020-01-01,,2019-12-31
C-3 Date,,2020-01-14,,,
Carrier Name,NEW HAMPSHIRE INSURANCE CO,ZURICH AMERICAN INSURANCE CO,INDEMNITY INSURANCE CO OF,,STATE INSURANCE FUND


# Preprocessing

In [59]:
# Drop the 'OIICS Nature of Injury Description' column (described as empty by the
# original author) from both data sets.
# The frames are rebound instead of mutated with inplace=True, and errors='ignore'
# is passed, so re-running this cell after the column is already gone is a no-op
# rather than a KeyError.
train_data = train_data.drop(columns=['OIICS Nature of Injury Description'], errors='ignore')
test_data = test_data.drop(columns=['OIICS Nature of Injury Description'], errors='ignore')

In [60]:
# Column groups consumed by preprocessing(): each list is handed to the matching
# imputation / outlier / scaling / encoding helper.

# Columns treated as numerical.
numerical = [
    'Age at Injury',
    'Average Weekly Wage',
    'Birth Year',
    'IME-4 Count',
    'Number of Dependents',
]

# Columns treated as binary flags.
binary = [
    'Alternative Dispute Resolution',
    'Attorney/Representative',
    'COVID-19 Indicator',
]

# Columns treated as categorical.
# NOTE(review): the date columns, 'Carrier Name' and 'Zip Code' are listed here, so
# preprocessing() passes them to the mode-imputation and one-hot helpers — confirm
# that this is intended.
categorical = [
    'Accident Date',
    'Assembly Date',
    'C-2 Date',
    'C-3 Date',
    'Carrier Name',
    'Carrier Type',
    'County of Injury',
    'District Name',
    'First Hearing Date',
    'Gender',
    'Industry Code Description',
    'Medical Fee Region',
    'WCIO Cause of Injury Description',
    'WCIO Nature of Injury Description',
    'WCIO Part Of Body Description',
    'Industry Code',
    'WCIO Cause of Injury Code',
    'WCIO Nature of Injury Code',
    'WCIO Part Of Body Code',
    'Zip Code',
]

# Inspect the dtypes of the numerical columns.
train_data.loc[:, numerical].dtypes

Age at Injury           float64
Average Weekly Wage     float64
Birth Year              float64
IME-4 Count             float64
Number of Dependents    float64
dtype: object

In [None]:
def preprocessing(X_train, X_val, y_train, y_val):
    """Run the preprocessing pipeline on one train/validation split.

    The steps are applied in this order, each through a helper that is not defined
    in this notebook (presumably provided by the Preprocessing_functions star
    import): drop empty rows, convert to bool, impute (numerical, binary,
    categorical), handle outliers, min-max scale, one-hot encode the features,
    label-encode the target, and finally run RFECV feature selection on the
    training split.

    Parameters
    ----------
    X_train, X_val : feature tables for the training and validation partitions.
    y_train, y_val : targets for the training and validation partitions.

    Returns
    -------
    tuple
        (X_train, X_val, y_train, y_val, selected_features, feature_ranking,
        optimal_num_features), where the last three values are returned by
        feature_selection_rfecv.

    Notes
    -----
    Relies on the notebook-level lists ``numerical``, ``binary`` and ``categorical``.
    NOTE(review): drop_empty_rows only receives the feature tables; if it removes
    rows, y_train / y_val are not filtered here — confirm against the helper.
    """
    # Data Cleaning
    X_train, X_val = drop_empty_rows(X_train, X_val)
    print('Done dropping empty rows')
    X_train, X_val = convert_to_bool(X_train, X_val)
    print('Done converting to bool')

    # Imputing missing values
    X_train, X_val = impute_mean_numerical(X_train, X_val, numerical)
    print('Done imputing mean numerical')
    # The binary columns go through the same mean-imputation helper as the numerical ones.
    X_train, X_val = impute_mean_numerical(X_train, X_val, binary)
    print('Done imputing mean binary')
    X_train, X_val = impute_mode_categorical(X_train, X_val, categorical)
    print('Done imputing mode categorical')

    # Report what is still missing in the training split (count per column computed once).
    missing_per_column = X_train.isnull().sum()
    print(f'Missing values in training data after preprocessing: {missing_per_column.sum()}')
    print(missing_per_column[missing_per_column > 0])

    # Handling outliers
    X_train, X_val = outliers_iqr(X_train, X_val, numerical)
    print('Done handling outliers')

    # Scaling
    X_train, X_val = scaling_minmax(X_train, X_val, numerical)
    print('Done scaling')

    # Encoding
    X_train, X_val = encoding_onehot(X_train, X_val, categorical)
    print('Done encoding onehot')
    y_train, y_val = encoding_label(y_train, y_val)
    print('Done encoding label')

    # Feature Selection
    # f1_weighted is used as the scoring metric because of class imbalance.
    # max_iter is raised above scikit-learn's default because the default budget
    # stopped early with "TOTAL NO. of ITERATIONS REACHED LIMIT" in previous runs.
    X_train, selected_features, feature_ranking, optimal_num_features = feature_selection_rfecv(
        X_train, y_train, LogisticRegression(max_iter=1000), scoring='f1_weighted'
    )
    print('Done feature selection')

    # feature_selection_rfecv only sees the training split, so the validation split
    # is aligned to the training columns here; otherwise a model fitted on the
    # reduced X_train cannot score X_val.
    # Assumes the helper returns X_train as a DataFrame restricted to the selected
    # columns — TODO confirm. If it returns something without `.columns`, X_val is
    # left untouched.
    if hasattr(X_train, 'columns') and hasattr(X_val, 'reindex'):
        X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

    return X_train, X_val, y_train, y_val, selected_features, feature_ranking, optimal_num_features


# Model Training

### Stratified KFold function

In [62]:
# Remove rows whose only populated field is 'Assembly Date'.
# A row is kept when 'Assembly Date' is missing, or when at least one of the other
# columns holds a value.
train_data = train_data.loc[
    lambda frame: frame['Assembly Date'].isna()
    | frame.drop(columns=['Assembly Date']).notna().any(axis=1)
]

In [63]:
# Features: every column except the target 'Claim Injury Type' and the two columns
# 'WCB Decision' and 'Agreement Reached', which are excluded together with it.
X = train_data.drop(columns=['Claim Injury Type', 'WCB Decision', 'Agreement Reached'])
y = train_data['Claim Injury Type']

# Work on a reproducible random subsample of at most 50,000 claims.
# The sample is drawn once and y is aligned through the sampled index (the original
# drew the identical sample a second time just to obtain that index). min() keeps
# the cell from raising when fewer than 50,000 rows are available.
X = X.sample(n=min(50000, len(X)), random_state=42)
y = y.loc[X.index]

In [64]:
def avg_score(model):
    """Evaluate ``model`` with 10-fold stratified cross-validation.

    Uses the notebook-level ``X`` and ``y``. For every fold the split is passed
    through ``preprocessing``, the model's ``hidden_layer_sizes`` is set from the
    number of selected features (75% and 50% of it), the model is fitted and timed,
    and its ``score`` on the training and validation parts is recorded together
    with ``n_iter_``.

    Returns
    -------
    tuple of str
        Four ``'<mean>+/-<std>'`` strings, in this order: fit time, training
        score, validation score, number of iterations.
    """

    def _mean_pm_std(samples, mean_digits, std_digits):
        # Format a list of measurements as '<rounded mean>+/-<rounded std>'.
        centre = round(np.mean(samples), mean_digits)
        spread = round(np.std(samples), std_digits)
        return str(centre) + '+/-' + str(spread)

    # Per-fold measurements.
    fit_seconds, train_scores, val_scores, iterations = [], [], [], []

    splitter = StratifiedKFold(n_splits=10)
    for fit_idx, holdout_idx in splitter.split(X, y):
        # Slice the fold and run it through the preprocessing pipeline.
        X_fit, X_holdout, y_fit, y_holdout, chosen, _ranking, _n_optimal = preprocessing(
            X.iloc[fit_idx], X.iloc[holdout_idx], y.iloc[fit_idx], y.iloc[holdout_idx]
        )

        # Size the two hidden layers relative to the number of selected features.
        n_chosen = len(chosen)
        model.hidden_layer_sizes = (int(0.75 * n_chosen), int(0.5 * n_chosen))

        # Time only the fit itself.
        started = time.perf_counter()
        model.fit(X_fit, y_fit)
        stopped = time.perf_counter()

        train_scores.append(model.score(X_fit, y_fit))
        val_scores.append(model.score(X_holdout, y_holdout))
        fit_seconds.append(stopped - started)
        iterations.append(model.n_iter_)

    return (
        _mean_pm_std(fit_seconds, 3, 2),
        _mean_pm_std(train_scores, 3, 2),
        _mean_pm_std(val_scores, 3, 2),
        _mean_pm_std(iterations, 1, 1),
    )

def show_results(df, *args):
    """
    Fill a results table with the cross-validation summary of each model.

    ``df`` must already have one row per model; the i-th model passed in ``args``
    is evaluated with ``avg_score`` and its four summary strings are written to the
    i-th row (by position). The same ``df`` object is returned.
    """
    for row_pos, estimator in enumerate(args):
        # avg_score yields (time, train score, validation score, iterations).
        fit_time, train_acc, val_acc, n_iterations = avg_score(estimator)
        df.iloc[row_pos] = fit_time, train_acc, val_acc, n_iterations
    return df

### 1. Run (simple preprocessing)

In [65]:
# Two MLPs that differ only in their iteration budget (max_iter).
# Each classifier is bound to exactly one name. The original chained assignment
# (`model_iterincrease =  model = MLPClassifier(...)`) rebound `model` to the second
# classifier, so both rows of the results table were produced by the same
# max_iter=2000 model.
model = MLPClassifier(
        activation='relu',
        solver='adam',
        learning_rate_init=0.01,
        max_iter=1000,  # Increase if needed
        random_state=42
    )

model_iterincrease = MLPClassifier(
        activation='relu',
        solver='adam',
        learning_rate_init=0.01,
        max_iter=2000,  # Increase if needed
        random_state=42
    )

# One row per model, in the same order as the models are passed to show_results.
df = pd.DataFrame(columns = ['Time','Train','Test', 'Iterations'], index = ['1000 iter', '2000 iter'])
df = show_results(df, model, model_iterincrease)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

Done dropping empty rows
Done converting to bool
Done imputing mean numerical
Done imputing mean binary
Done imputing mode categorical
Missing values in training data after preprocessing: 0
Series([], dtype: int64)
Done handling outliers
Unique values in Age at Injury:
[65.  51.  42.  58.  33.  31.  54.  37.  39.  27.  24.  44.  57.  22.
 34.  43.  62.  32.  18.  17.  21.  28.  36.  20.  55.  30.  48.  46.
 38.  53.  50.  52.  60.  19.  59.  49.  47.  35.  45.  40.  23.  41.
 29.  25.  74.  26.  61.  80.  56.  63.  78.  70.   0.  67.  73.  71.
 66.  68.  64.  72.  75.  69.  79.  85.  77.  84.  16.  82.  88.5 83.
 76.  14.  88.  81.  87.  15.  86.  13. ]
Unique values in Average Weekly Wage:
[ 760.      1997.94375 1654.39    ... 1554.75    1205.3     1296.68   ]
Unique values in Birth Year:
[1954.  1968.  1977.  1961.  1989.  1966.  1984.  1925.5 1992.  1976.
 1997.  1986.  1962.  1987.  2002.  2003.  1999.  1988.  1990.  1978.
 1993.  1983.  1994.  1975.  1982.  1979.  1971.  1969.  19

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KeyboardInterrupt: 