In [58]:
import pandas as pd
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import plotly.graph_objects as go
from plotly.missing_ipywidgets import FigureWidget
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.figure_factory as ff
import plotly.offline as py

In [59]:
with open('../data/random_forest_hyper_params.json','r') as file:
    rf_hyper_params = json.load(file)

In [60]:
X_train = pd.read_csv('../data/x_train_clean.csv', sep='$', decimal=".", engine='python') 
y_train = pd.read_csv('../data/y_train_clean.csv', sep='$', decimal=".", engine='python')   

del X_train['Unnamed: 0']
del y_train['Unnamed: 0']

In [61]:
df = pd.read_csv('../data/test.csv', sep='[,$]' , decimal=".", engine='python')

df.columns = df.columns.str.lower()

df.rename(columns={
    'id': 'id',
    'gender': 'gender',
    'age': 'age',
    'driving_license': 'driving_license',
    'region_code': 'region_code',
    'previously_insured': 'previously_insured',
    'vehicle_age': 'vehicle_age',
    'vehicle__damage': 'vehicle_damage',
    'annual__premium': 'annual_premium',
    'policy_sales_channel': 'policy_sales_channel',
    'vintage': 'vintage'
},
    inplace=True)



df_one_hot_encoded_data = pd.get_dummies(df, columns = ['driving_license', 'gender', 'vehicle_age', 'vehicle_damage','previously_insured'])

df_one_hot_encoded_data.rename(columns={
    'id': 'id',
    'gender': 'gender',
    'age': 'age',
    'driving_license_0': 'driving_license_No',
    'driving_license_1': 'driving_license_Yes',
    'region_code': 'region_code',
    'previously_insured_0': 'previously_insured_No',
    'previously_insured_1': 'previously_insured_Yes',
    'vehicle_age': 'vehicle_age',
    'vehicle_damage': 'vehicle_damage',
    'annual_premium': 'annual_premium',
    'policy_sales_channel': 'policy_sales_channel',
    'vintage': 'vintage'
},
    inplace=True)

df_one_hot_encoded_data.head()

Unnamed: 0,id,age,region_code,annual_premium,policy_sales_channel,vintage,driving_license_No,driving_license_Yes,gender_Female,gender_Male,vehicle_age_1-2 Year,vehicle_age_< 1 Year,vehicle_age_> 2 Years,vehicle_damage_No,vehicle_damage_Yes,previously_insured_No,previously_insured_Yes
0,381000,68,28.0,53066.0,12.0,195,0,1,0,1,1,0,0,1,0,0,1
1,381001,78,8.0,28301.0,124.0,195,0,1,0,1,1,0,0,0,1,1,0
2,381002,22,28.0,2630.0,153.0,170,0,1,0,1,0,1,0,1,0,0,1
3,381003,20,28.0,38627.0,124.0,62,0,1,1,0,1,0,0,0,1,1,0
4,381004,44,28.0,24984.0,152.0,255,0,1,1,0,1,0,0,1,0,0,1


In [62]:
rfc = RandomForestClassifier(**rf_hyper_params, n_jobs=-1, random_state=42)

X = df_one_hot_encoded_data.drop('id', axis=1)

random_forest_classifier = rfc.fit(X_train, y_train['response'])

y_pred = rfc.predict(X)

In [63]:
df_y_pred = pd.DataFrame(y_pred, columns=['y_pred'])

res = df_y_pred.set_index(df['id']).reset_index()
res['y_pred'].value_counts(normalize=True)

0    0.995564
1    0.004436
Name: y_pred, dtype: float64

In [64]:
df_pred = pd.merge(df, res, on='id')

df_positive = df_pred[df_pred['y_pred'] == 1]
df_positive.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,564.0,445776.157801,36029.195454,381187.0,414270.25,446148.0,476599.75,507634.0
age,564.0,40.437943,12.273151,25.0,33.0,36.0,42.25,78.0
driving_license,564.0,0.819149,0.385236,0.0,1.0,1.0,1.0,1.0
region_code,564.0,22.526596,14.811102,1.0,7.0,28.0,33.0,50.0
previously_insured,564.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annual_premium,564.0,26175.558511,13515.012565,2630.0,22423.5,27510.0,31855.75,91164.0
policy_sales_channel,564.0,129.514184,48.324666,3.0,124.0,157.0,163.0,163.0
vintage,564.0,168.304965,93.342182,10.0,90.0,177.5,254.0,299.0
y_pred,564.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [65]:
# Prozentuale Verteilung je Kategorie in region_code absteigend nach 
# den höchsten Werten in einem df ablegen
df_region_code = round(df_positive['region_code'].value_counts(normalize=True).to_frame().sort_index(), 4)

trace = [
    # Barchart für die Verteilung in der Variable region_code insgesamt erstellen
    go.Bar(x=df_region_code.index,
           y=df_region_code['region_code'],
           opacity=0.8,
           name="total",
           hoverinfo="y",
           marker=dict(
               color=df_region_code['region_code'],
               colorscale='ice',
               reversescale=True,
               showscale=True)
           ),
]

# Layout des Plots definieren
layout = go.Layout(title=dict(text='Region Code Verteilung', y=.95),
                   plot_bgcolor='rgb(240, 240, 240)',
                   autosize=True,
                   xaxis=dict(title="Region Code",
                              titlefont=dict(size=16),
                              tickmode="linear"),
                   yaxis=dict(title="%",
                              titlefont=dict(size=17)),
                   )

# Plot erstellen
fig = dict(data=trace, layout=layout)
py.iplot(fig)

In [66]:
# Prozentuale Verteilung je Kategorie in vehicle age absteigend nach 
# den höchsten Werten in einem df ablegen
df_vehicle_age = round(df_positive['vehicle_age'].value_counts(normalize=True).to_frame().sort_index(), 4)

# Hier spezifizieren wir für gesamt, male und female jeweils einen Bar Chart
trace = [
    #Bar Chart für gesamt
    go.Bar(x=df_vehicle_age.index,
           y=df_vehicle_age['vehicle_age'],
           opacity=0.8,
           name="total",
           hoverinfo="y",
           marker=dict(
               color=df_vehicle_age['vehicle_age'],
               colorscale='ice',
               reversescale=True,
               showscale=True)
           )
]

# Layout konfigurieren
layout = go.Layout(title=dict(text='Vehicle Age', y=.95),
                   plot_bgcolor='rgb(240, 240, 240)',
                   autosize=True,
                   xaxis=dict(title="vehicle_age",
                              titlefont=dict(size=15),
                              tickmode="linear"),
                   yaxis=dict(title="%",
                              titlefont=dict(size=20)),
                   )

# Plot erzeugen
fig = dict(data=trace, layout=layout)
py.iplot(fig)

In [67]:
# Absolute Anzahl von Male und Female in dff abspeichern
dff = df_positive['gender'].value_counts()[:10] 

# Label für den Pie chart festlegen
label = dff.index

# Summe der Ausprägungen je Label
size = dff.values

# Farben definieren und pie chart erzeugen
colors = ['rgb(20,29,67)', 'rgb(18,116,117)']
trace = go.Pie(labels=label, values=size, marker=dict(colors=colors), hole=.2)

data = [trace]

# Titel hinzufügen
layout = go.Layout(
    title='Geschlechtsverteilung'
)

# Plot erzeugen mit den zuvor definierten Spezifikationen
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [80]:
# Prozentuale Verteilung je Kategorie in policy_sales_channel absteigend nach 
# den höchsten Werten in einem df ablegen
df_policy_sales_channel = pd.DataFrame(df_positive['policy_sales_channel'])
df_policy_sales_channel['percentage'] = round(df_positive['policy_sales_channel'].value_counts(normalize=True).to_frame().sort_index(), 4)
df_policy_sales_channel_index = df_policy_sales_channel[df_policy_sales_channel['policy_sales_channel'] < 0.1].index

df_policy_sales_channel = df_policy_sales_channel.drop(df_policy_sales_channel_index)

trace = [
    # Barchart für die Verteilung insgesamt
    go.Bar(x=df_policy_sales_channel['policy_sales_channel'].index,
           y=df_policy_sales_channel['percentage'],
           opacity=0.8,
           name="total",
           hoverinfo="y",
           marker=dict(
               color=df_policy_sales_channel['policy_sales_channel'],
               colorscale='ice',
               reversescale=True,
               showscale=True)
           )


]

# Layout des Plots definieren
layout = go.Layout(title=dict(text='Policy Sales Channel', y=.95),
                   plot_bgcolor='rgb(240, 240, 240)',
                   autosize=True,
                   xaxis=dict(title="Policy Sales Channel",
                              titlefont=dict(size=16),
                              tickmode="linear"),
                   yaxis=dict(title="%",
                              titlefont=dict(size=17)),
                   )


fig = dict(data=trace, layout=layout)
py.iplot(fig)

In [81]:
df_policy_sales_channel

Unnamed: 0,policy_sales_channel,percentage
187,124.0,
934,122.0,
1542,163.0,
1782,124.0,
1880,163.0,
...,...,...
125564,124.0,
125662,124.0,
125793,124.0,
125797,124.0,
