In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, Imputer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

## Preprocess and Train Model

1. Preprocess the data with an imputer, scaler, label encoder and polynomial transformer
2. Fit an AdaBoost classifier on the training part of a 0.8/0.2 train/test split and check its score on the held-out part
3. Refit the model on the entire dataset

In [12]:
# Load the raw training table.
file = 'training.csv'
ins = pd.read_csv(file)

# Keep only the columns that are non-null in at least 75% of the rows
# (equivalently: discard columns whose NaN share exceeds 0.25).
n = ins['Id'].shape[0]
ins = ins.dropna(axis = 'columns', thresh = 0.75 * n)

# divide features: comma-separated column-name lists, one per feature family
nom_str = 'Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7, Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25, Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30, Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36, Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41'
num_str = 'Product_Info_4, Ins_Age, Ht, Wt, BMI, Employment_Info_1, Employment_Info_4, Employment_Info_6, Insurance_History_5, Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5'
dis_str = 'Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32'

res = ['Response']

# `num` gathers the names listed in num_str and dis_str; `cat` gathers the names
# listed in nom_str plus Medical_Keyword_1 .. Medical_Keyword_48. In both cases
# only names that are actual columns of `ins` are retained, in listing order.
num = list(filter(lambda x: x in ins.columns.values,
                  num_str.strip().split(', ') + dis_str.strip().split(', ')))
cat = list(filter(lambda x: x in ins.columns.values,
                  nom_str.strip().split(', ') + ['Medical_Keyword_%s' % i for i in range(1, 49)]))

# Transformers shared by the preprocessing helpers.
# The scaler and the polynomial expander are only constructed here; the median
# imputer and the label encoder are fitted right away on the training frame.
scaler = StandardScaler()
pf = PolynomialFeatures(degree = 3)
imputer = Imputer(strategy = 'median').fit(ins[num])
encoder = LabelEncoder().fit(ins['Product_Info_2'].astype(str))

# Define preprocessing functions
def get_keep_col(df, res = 'Response', corr = 0.15):
    """Select the columns whose correlation with the target is strong enough.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the candidate features together with the target column.
    res : str
        Name of the target column; it is never part of the result.
    corr : float
        Minimum absolute correlation with `res` for a column to be kept.

    Returns
    -------
    list
        Column names, in their original `df` order, for which
        ``abs(correlation with res) >= corr``.

    Notes
    -----
    Non-numeric columns are skipped rather than causing a failure. Columns
    whose correlation is NaN are not kept, because ``NaN >= corr`` is False.
    """
    # Restrict to numeric/bool columns explicitly: depending on the pandas
    # version, DataFrame.corr() either raises on string columns or silently
    # omits them, in which case the per-column lookup below raised KeyError.
    corr_with_res = df.select_dtypes(include = ['number', 'bool']).corr()[res]
    return [col for col in df.columns.values
            if col != res
            and col in corr_with_res.index
            and abs(corr_with_res[col]) >= corr]

def preprocess(df, num = num, cat = cat, imputer = imputer, encoder = encoder):
    """Impute, encode and fill a raw frame; `df` is modified in place and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the `num` and `cat` columns and 'Product_Info_2'.
    num : list
        Columns passed through `imputer.transform`.
    cat : list
        Columns whose missing values are replaced by -1.
    imputer : fitted transformer
        Applied to ``df[num]``.
    encoder : fitted encoder
        Applied to 'Product_Info_2' after casting it to str.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    tr = imputer.transform(df[num])
    # Re-attach the caller's index: a frame built with the default RangeIndex is
    # aligned by label on assignment, which yields NaN / shifted rows whenever
    # `df` is not indexed 0..n-1 (e.g. after filtering or splitting).
    df[num] = pd.DataFrame(tr, columns = num, index = df.index)
    df['Product_Info_2'] = encoder.transform(df['Product_Info_2'].astype(str))
    df[cat] = df[cat].fillna(-1)
    return df

def generate_train_data(df, pf = pf, scaler = scaler, num = num, cat = cat):
    """Turn the raw training frame into the model's design matrix.

    Steps: preprocess `df`, fit the scaler on the `num` columns and scale
    them, select columns with get_keep_col() (its default target and
    threshold), then fit the polynomial expander on the selected columns.
    `df` is modified in place along the way.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training frame; must contain the column get_keep_col() uses as
        target by default.
    pf : transformer
        Fitted here on the selected columns and used to expand them.
    scaler : transformer
        Fitted here on ``df[num]``.
    num, cat : list
        Column lists forwarded to preprocess(); `num` is also what gets scaled.

    Returns
    -------
    tuple
        ``(X, keep_col, scaler, X_df)``: the expanded feature matrix, the names
        of the selected columns, the fitted scaler, and a copy of the selected
        columns before expansion.
    """
    # Forward the column lists so that a caller-supplied num/cat is honoured by
    # the preprocessing step too, not only by the scaling below.
    df = preprocess(df, num = num, cat = cat)
    df[num] = scaler.fit_transform(df[num])
    keep_col = get_keep_col(df)
    X_df = df[keep_col].copy()
    X = pf.fit_transform(X_df)
    return X, keep_col, scaler, X_df

# Build the training matrix and keep the by-products later cells rely on:
# the selected column names, the fitted scaler and the pre-expansion frame.
X, keep_col, scaler, X_df = generate_train_data(ins)
print(X)
y = ins['Response']

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state = 0,
    test_size = 0.2,
)

def fit_model(model, X_train, X_test, y_train, y_test):
    """Train `model` and evaluate it on the held-out data.

    The model is fitted on (X_train, y_train); afterwards it predicts X_test
    and is scored with its own ``score`` method on (X_test, y_test).

    Returns a ``(score, predictions)`` pair.
    """
    model.fit(X_train, y_train)
    held_out_predictions = model.predict(X_test)
    held_out_score = model.score(X_test, y_test)
    return held_out_score, held_out_predictions

# Fit an AdaBoostClassifier (note: the variable is named `rfc`, but the
# estimator built here is AdaBoost) and score it on the held-out split
rfc = AdaBoostClassifier(random_state = 0)
rfc_score, rfc_y = fit_model(rfc, X_train, X_test, np.ravel(y_train), y_test)
# Refit the same estimator on all rows; rfc_score above was computed before
# this refit, i.e. with the model trained on X_train only.
rfc.fit(X, y)

# Last expression of the cell: displays the held-out score
rfc_score


[[ 1.         -0.33338014 -0.39777496 ...,  0.          0.          0.        ]
 [ 1.          0.59460177 -1.41365327 ...,  0.          0.          0.        ]
 [ 1.         -0.89016929  0.3055254  ...,  0.          0.          1.        ]
 ..., 
 [ 1.         -0.14778376  1.08697025 ...,  0.          0.          1.        ]
 [ 1.         -0.33338014 -0.00705254 ...,  0.          0.          0.        ]
 [ 1.         -0.33338014 -0.00705254 ...,  0.          0.          0.        ]]


0.46074999999999999

In [16]:
# Number of columns in the expanded feature matrix
print(X.shape[1])

# Feature importances rescaled so that the largest one equals 1
fi = rfc.feature_importances_ / rfc.feature_importances_.max()
print(fi)

165
[ 0.          0.33333333  0.          0.          0.33333333  0.          0.
  0.          0.          0.          0.33333333  0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.33333333  0.33333333  0.          0.          0.          0.33333333
  0.33333333  0.          0.          0.          0.33333333  0.33333333
  0.33333333  0.          0.          0.          0.33333333  0.          0.
  0.          0.          0.          0.          0.          0.
  0.33333333  0.          0.          0.          0.33333333  0.          0.
  0.          0.          0.33333333  0.33333333  0.          0.          0.
  0.33333333  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.33333333  0.
  0.33333333  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.33333333  0.          0.
  0.33333333  0.          0.          0

## Predict Test Data
1. Preprocess the test data the same way as in the previous section
2. Predict the response with the model and generate the output file

In [3]:
# Load the test set that predictions are generated for further down
testfile = 'testing.csv'
test = pd.read_csv(testfile)

def adjust_columns(testX, X = X_df):
    """Give `testX` exactly the columns of `X`, in `X`'s column order.

    Parameters
    ----------
    testX : pandas.DataFrame
        Frame to conform.
    X : pandas.DataFrame
        Reference frame whose columns define the target layout.

    Returns
    -------
    pandas.DataFrame
        A new frame: columns of `testX` that `X` does not have are dropped,
        columns of `X` that `testX` lacks are added and filled with 0.
    """
    # reindex drops the extras and zero-fills the missing columns in one step
    # and, unlike appending missing columns at the end, places every column at
    # the position it has in the reference frame, so positional consumers
    # (fitted transformers, the model) see features in the order they were fit on.
    return testX.reindex(columns = X.columns, fill_value = 0)

def generate_test_data(df, pf = pf, cat = cat, keep_col = keep_col, scaler = scaler, num = num):
    """Apply the already-fitted training pipeline to a new frame.

    Steps: preprocess `df`, scale the `num` columns with the fitted scaler,
    select `keep_col`, conform the columns with adjust_columns(), then expand
    them with the fitted polynomial transformer. `df` is modified in place by
    the first two steps.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame to transform.
    pf : fitted transformer
        Used via ``transform`` only.
    cat : list
        Column list forwarded to preprocess().
    keep_col : list
        Columns to select; ``df[keep_col]`` raises KeyError if one is absent.
    scaler : fitted transformer
        Used via ``transform`` only.
    num : list
        Columns that are imputed by preprocess() and then scaled.

    Returns
    -------
    The output of ``pf.transform`` on the selected, conformed columns.
    """
    # Forward cat/num instead of relying on preprocess()'s own defaults, so the
    # lists given to this function are the ones actually used throughout.
    df = preprocess(df, num = num, cat = cat)
    df[num] = scaler.transform(df[num])
    X = df[keep_col]
    X = adjust_columns(X)
    X = pf.transform(X)
    return X

# Transform the test rows and predict with the fitted `rfc` model
testX = generate_test_data(test)
y_pred = rfc.predict(testX)

# Assemble the submission: the test Id column next to the predicted Response
# (predictions stored as strings)
Id = test[['Id']]
response = pd.DataFrame(y_pred, dtype = str, columns = ['Response'])

result = pd.concat([Id, response], axis = 'columns')
result.to_csv('solution.csv', index = False)
