##### Dependencies

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# SVM?
# Random Forest?
# XGBoost?

In [2]:
data = pd.read_csv('data/train.csv', index_col="id")

In [3]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

##### Preprocessing

In [4]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    df.Region_Code = df.Region_Code.astype("int8")

    df.Previously_Insured = df.Previously_Insured.astype("int8")

    df.Driving_License = df.Driving_License.astype("int8")

    df.Gender = df.Gender.map({"Male": 1, "Female": 0})

    df.Vehicle_Damage = df.Vehicle_Damage.map({"Yes": 1, "No": 0})

    vehicle_age = pd.get_dummies(df.Vehicle_Age, drop_first=True)
    df[["< 1 Year", "> 2 Years"]] = vehicle_age
    df.drop(columns="Vehicle_Age", inplace=True)
    
    # region_dummies = pd.get_dummies(df.Region_Code, drop_first=True)
    # regions = pd.get_dummies(df.Region_Code, drop_first=True)
    # df.drop(columns="Region_Code", inplace=True)
    # df = pd.concat([df, regions], axis=1)
    return df

In [5]:
train = clean_data(train)

In [16]:
independent_vars = [col for col in train.columns if col not in ["Previously_Insured", "Annual_Premium", "Vintage", "Response", "Region_Code"]]
features = train[independent_vars]
response = train.Response
# features.columns = features.columns.astype(str)

In [7]:
features.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Vehicle_Damage,Policy_Sales_Channel,< 1 Year,> 2 Years
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2405201,0,41,1,33,1,124.0,False,False
7069408,1,24,1,49,1,152.0,True,False
1044712,1,39,1,11,1,124.0,False,False
6286832,1,52,1,8,0,26.0,False,False
5807645,0,69,1,41,0,124.0,False,False


In [17]:
# Standardise Features
# Region_Code to be dummified, Annual_premium, Policy_sales_Channel, Vintage
policy_scaler = StandardScaler()
scaled_policy = policy_scaler.fit_transform(features.Policy_Sales_Channel.values.reshape(-1, 1))
features.Policy_Sales_Channel = scaled_policy

age_scaler = StandardScaler()
scaled_age = age_scaler.fit_transform(features.Age.values.reshape(-1, 1))
features.Age = scaled_age

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.Policy_Sales_Channel = scaled_policy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.Age = scaled_age


In [18]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(features, response)


In [None]:
logreg = LogisticRegression()
logreg.fit(X_resampled, y_resampled)

In [None]:
import pickle
with open("logreg_no_region_code.pkl", "wb") as f:
    pickle.dump(logreg, f, protocol=5)

In [10]:
pipeline = Pipeline([("scaler", premium_scaler), ("logreg", logreg)])

NameError: name 'premium_scaler' is not defined

In [None]:
test = clean_data(test)

In [None]:
scaled_policy = policy_scaler.transform(test.Policy_Sales_Channel.values.reshape(-1, 1))
test.Policy_Sales_Channel = scaled_policy

scaled_age = age_scaler.transform(test.Age.values.reshape(-1, 1))
test.Age = scaled_age

In [None]:
# pipeline.transform(test).score()
# scaled_premium = premium_scaler.transform(test.Annual_Premium.values.reshape(-1,1))
# test.Annual_Premium = scaled_premium

In [None]:
y_pred = logreg.predict(test[independent_vars])

In [None]:
roc_auc_score(test["Response"], y_pred)

0.7662060577954461

In [None]:
# ROC AUC with resampled classes to avoid imbalance = 0.77
# ROC AUC without resmapling = 0.5, as bad as it gets

In [None]:
logreg.coef_

array([[ 0.08458402, -0.437915  ,  1.06531874,  4.02148074, -0.16459027,
        -1.31185652,  0.28715447]])