## Feature Selection Number 2: test mapping vehicle ID to vehicle year (might more accurately reflect our testing data)
- Change the dataset
- Run the feature test again (also try different feature selection techniques such as `SelectKBest`)

In [52]:
# Import basic necessities
import pandas as pd
import os
import numpy as np

import tensorflow as tf
from tensorflow import keras

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [53]:
# same testing data + import vehcle data
drivers_df = pd.read_csv("ready_db/drviers.csv")
vehicles_df = pd.read_csv("ready_db/vehicles.csv")
vehicles_df.head()

Unnamed: 0,VEHICLE_ID,YEAR,MAKE,MODEL
0,1000030,2017,MERCEDES-BENZ,S CLASS MAYBACH 650
1,2005191,2012,PORSCHE,911 CARRERA CARRERA 4/CARRERA 2
2,2005195,2012,PORSCHE,911 CARRERA 4/CARRERA 2
3,2005211,2012,PORSCHE,911 CARRERA S/CARRERA GTS
4,2005213,2012,PORSCHE,911 CARRERA CARRERA S/CARRERA 4S


In [54]:
# Merge Year to the Test table
D_V_df = drivers_df.merge(vehicles_df, how='left', on='VEHICLE_ID')
# Drop Make, model for now
D_V_df = D_V_df.drop('MAKE', 1)
D_V_df = D_V_df.drop('MODEL', 1)
D_V_df.head()

Unnamed: 0,DRIVER_ID,GENDER,CREDIT_SCORE,AGE,VEHICLE_ID,YEAR
0,1,2.0,824,27.0,7420963,2013.0
1,2,1.0,824,28.0,7420963,2013.0
2,3,2.0,666,51.0,7396448,
3,4,2.0,666,34.0,6208988,2014.0
4,5,2.0,824,88.0,6078430,2013.0


In [55]:
# Drop missing values
D_V_df = D_V_df.dropna()
D_V_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 371 entries, 0 to 735
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   DRIVER_ID     371 non-null    int64  
 1   GENDER        371 non-null    float64
 2   CREDIT_SCORE  371 non-null    int64  
 3   AGE           371 non-null    float64
 4   VEHICLE_ID    371 non-null    int64  
 5   YEAR          371 non-null    float64
dtypes: float64(3), int64(3)
memory usage: 20.3 KB


In [56]:
# Remove creadit score outcome target from fetures data
y = D_V_df['CREDIT_SCORE']
X = D_V_df.drop(columns='CREDIT_SCORE')

In [57]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 

In [58]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_train)

In [59]:
# The PCA algorithem is going to standardize the input data frame, calculate teh covariance matrix of the features
# eigenvector - covariance matrix of the features can be avaraged by generating another vector (pointer)
# distance form one point to another (pointer length) eigenvalues
# PCA shows : What number of principal components we need.
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
fit = pca.fit(X_scaled)
print(f"Explained Variance: {fit.explained_variance_ratio_}")
print(fit.components_)

Explained Variance: [0.52180874 0.1731221  0.15543164 0.08983638 0.05980113]
[[ 0.01440759 -0.99758991  0.04136058 -0.04281861  0.0325985 ]
 [ 0.8862104   0.0171567   0.20209753 -0.18892298 -0.37121622]
 [ 0.39518184  0.0281956   0.10836255  0.2485777   0.87721321]
 [-0.20585773  0.02375322  0.9478566   0.22520539 -0.08893128]
 [-0.12600579  0.0562211   0.21741967 -0.92193619  0.28935114]]


In [60]:
# The purpose of the ExtraTreesClassifier is to fit a number of randomized decision trees to the data, from of ensemble learning
# Particularly, random splits of all observations are carried out to ensure that the model does not overfit the data.
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y)
# The importans of a feature is computed as the (normalized) Total reduction of the criterion brough by that feature.
print(model.feature_importances_)

[0.27574314 0.03659368 0.26230074 0.22707882 0.19828362]


In [61]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
z = SelectKBest(score_func=f_regression, k = 5).fit_transform(X, y)
print(z)

[[1.000000e+00 2.000000e+00 2.700000e+01 7.420963e+06 2.013000e+03]
 [2.000000e+00 1.000000e+00 2.800000e+01 7.420963e+06 2.013000e+03]
 [4.000000e+00 2.000000e+00 3.400000e+01 6.208988e+06 2.014000e+03]
 ...
 [7.340000e+02 2.000000e+00 8.700000e+01 6.129954e+06 2.015000e+03]
 [7.350000e+02 1.000000e+00 2.500000e+01 6.820645e+06 2.012000e+03]
 [7.360000e+02 1.000000e+00 3.800000e+01 6.820645e+06 2.012000e+03]]


In [63]:
# Random Forest, when imported from the sklearn library, provides a method where you can get the feature importance 
# of each of the variables. This is a good method to gauge the feature importance on datasets where Random Forest fits 
# the data with high accuracy.
from sklearn.ensemble import RandomForestClassifier as RClf

model = RClf(n_estimators = 100)
model.fit(X, y)
importances = model.feature_importances_
std = np.std([importances for tree in model.estimators_], axis = 0)

indices = np.argsort(importances)[::-1]

print('Feature Ranking:')

for f in range(X.shape[1]):
	print('%d. features %d (%f)'% (f+1, indices[f], importances[indices[f]]))

Feature Ranking:
1. features 0 (0.297217)
2. features 2 (0.260932)
3. features 3 (0.247085)
4. features 4 (0.169348)
5. features 1 (0.025417)
