Diverse Machine Learning Algorithms:
*   Linear Regression
*   Classfication
*   Clustering
*   Hidden Markov Models

Here we use a classification algorithm to predict classes

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
train_df = pd.read_csv(r'/content/train_data.csv')
test_df = pd.read_csv(r'/content/test_data.csv')
submission = pd.read_csv(r'/content/sample_submission.csv')

In [3]:
train_df.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0


In [4]:
train_df.fillna(train_df.mean(), inplace= True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer Id         7160 non-null   object 
 1   YearOfObservation   7160 non-null   int64  
 2   Insured_Period      7160 non-null   float64
 3   Residential         7160 non-null   int64  
 4   Building_Painted    7160 non-null   object 
 5   Building_Fenced     7160 non-null   object 
 6   Garden              7153 non-null   object 
 7   Settlement          7160 non-null   object 
 8   Building Dimension  7160 non-null   float64
 9   Building_Type       7160 non-null   int64  
 10  Date_of_Occupancy   7160 non-null   float64
 11  NumberOfWindows     7160 non-null   object 
 12  Geo_Code            7058 non-null   object 
 13  Claim               7160 non-null   int64  
dtypes: float64(3), int64(4), object(7)
memory usage: 783.2+ KB


In [5]:
train_df.fillna(train_df.mode(), inplace= True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer Id         7160 non-null   object 
 1   YearOfObservation   7160 non-null   int64  
 2   Insured_Period      7160 non-null   float64
 3   Residential         7160 non-null   int64  
 4   Building_Painted    7160 non-null   object 
 5   Building_Fenced     7160 non-null   object 
 6   Garden              7153 non-null   object 
 7   Settlement          7160 non-null   object 
 8   Building Dimension  7160 non-null   float64
 9   Building_Type       7160 non-null   int64  
 10  Date_of_Occupancy   7160 non-null   float64
 11  NumberOfWindows     7160 non-null   object 
 12  Geo_Code            7058 non-null   object 
 13  Claim               7160 non-null   int64  
dtypes: float64(3), int64(4), object(7)
memory usage: 783.2+ KB


In [6]:
train_df.dropna(axis=0, inplace= True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7051 entries, 0 to 7057
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer Id         7051 non-null   object 
 1   YearOfObservation   7051 non-null   int64  
 2   Insured_Period      7051 non-null   float64
 3   Residential         7051 non-null   int64  
 4   Building_Painted    7051 non-null   object 
 5   Building_Fenced     7051 non-null   object 
 6   Garden              7051 non-null   object 
 7   Settlement          7051 non-null   object 
 8   Building Dimension  7051 non-null   float64
 9   Building_Type       7051 non-null   int64  
 10  Date_of_Occupancy   7051 non-null   float64
 11  NumberOfWindows     7051 non-null   object 
 12  Geo_Code            7051 non-null   object 
 13  Claim               7051 non-null   int64  
dtypes: float64(3), int64(4), object(7)
memory usage: 826.3+ KB


In [7]:
train_df.rename(columns = {'Building Dimension': 'Building_Dimension'}, inplace=True)

In [8]:
# Define the feature columns

categorical_columns = ['Building_Painted', 'Building_Fenced', 'Garden', 'Settlement', 'NumberOfWindows', 'Geo_Code']
numerical_columns = ['YearOfObservation', 'Insured_Period', 'Residential', 'Building_Dimension', 'Building_Type', 'Date_of_Occupancy']

my_feature_columns = []

for feature_name in categorical_columns:
  vocabulary = train_df[feature_name].unique()
  my_feature_columns.append(tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary, default_value=-1)))
  

for key in train_df[numerical_columns].keys():
  my_feature_columns.append(tf.feature_column.numeric_column(key= key))

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X = train_df.drop(['Customer Id', 'Claim'], axis=1)
test_set = test_df.drop(['Customer Id'], axis=1, inplace = True)
y = train_df['Claim']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
def make_input_fn(train_df, label_df, training = True, batch_size=256):
  #Convert the inputs into a data
  ds = tf.data.Dataset.from_tensor_slices((dict(train_df), label_df))
  
  #Shuffle the dataset when in training mode
  if training:
    ds = ds.shuffle(1000).repeat()
  return ds.batch(batch_size)

Building a Model

In [13]:
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    hidden_units=[30, 10],
    n_classes=2
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpyizvcc3z', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


Training the Model

In [14]:
classifier.train(
    input_fn= lambda: make_input_fn(X_train, y_train, training=True),
    steps=1000
)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpyizvcc3z/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 5.288925, step = 0
INFO:tensorflow:global_step/sec: 74.071
INFO:tensorflow:loss = 0.539824, step = 100 (1.353 sec)
INFO:tensorflow:global_step/sec: 88.9644
INFO:tensorflow:loss = 0.48629576, step = 200 (1.126 sec)
INFO:tensorflow:global_step/sec: 

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7fe1a85b2c50>

In [15]:
eval_result = classifier.evaluate(input_fn= lambda: make_input_fn(X_test, y_test, training = False))
print('\nTest set accuracy: {accuracy: 0.3f}\n'.format(**eval_result))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2022-01-31T10:02:31
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpyizvcc3z/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 1.00861s
INFO:tensorflow:Finished evaluation at 2022-01-31-10:02:32
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.7760454, accuracy_baseline = 0.77179307, auc = 0.6882347, auc_precision_recall = 0.4018186, average_loss = 0.5026751, global_step = 1000, label/mean = 0.22820695, loss = 0.51422065, precision = 0.54545456, prediction/mean = 0.2187615, recall = 0.111801244
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: /tmp/tmpyizvcc3z/model.ckpt-1000

Test set accuracy:  0.776



Making Predictions