In [1033]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # this is used for the plot the graph 
from datetime import date
import random

from sklearn.decomposition import PCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

import torch
from torch import nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import TensorDataset, DataLoader

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dropout

import tensorflow as tf

%matplotlib inline

import random 
# random.seed(42)

from numpy.random import seed
seed(42)

In [1034]:
csv_path_1 = "training_dataset.csv"
csv_path_2 = "score.csv"

# 0. First, let's understand our data...

In [1035]:
df = pd.read_csv(csv_path_1) # load the pandas dataframe
df_score = pd.read_csv(csv_path_2)
initial_cols_to_drop = ["Unnamed: 0","Unnamed: 0.1", "period", "test", "recent_date", "date"] 
for col_name in initial_cols_to_drop: # drops columns that aren't supposed to be in dataset
    try:
        df = df.drop(columns=[col_name])
        df_score = df_score.drop(columns=[col_name])
    except:
        continue
display(df.head())

Unnamed: 0,dt,weekday,year,id_driver,id_carrier_number,dim_carrier_type,dim_carrier_company_name,home_base_city,home_base_state,carrier_trucks,...,most_recent_load_date,load_day,loads,marketplace_loads_otr,marketplace_loads_atlas,marketplace_loads,brokerage_loads_otr,brokerage_loads_atlas,brokerage_loads,total_loads
0,2019-12-16,Monday,2019,21350,U0109015,Owner Operator,CA&F TRUCKING,Maywood,CA,"[""poweronly""]",...,2021-02-17,2019-12-16,2,0,438,438,0,45,45,483
1,2021-01-15,Friday,2021,36437,C0097727,Fleet,New opportunities inc,Los Angeles,CA,"[""poweronly"", ""boxtruck""]",...,2021-02-03,2021-01-15,1,2,72,74,0,1,1,75
2,2019-12-26,Thursday,2019,19323,U0107081,Owner Operator,RAS,Compton,CA,"[""poweronly""]",...,2020-09-25,2019-12-26,1,0,180,180,0,2,2,182
3,2021-02-10,Wednesday,2021,34809,C0094651,Fleet,NFS asset Drayage,Lynwood,CA,"[""poweronly"", ""dryvan""]",...,2021-02-17,2021-02-10,3,0,0,0,0,0,0,62
4,2017-07-24,Monday,2017,4728,U0094376,Owner Operator,joes transportation,Norco,CA,"[""dryvan""]",...,2017-10-11,2017-07-24,2,57,0,57,314,0,314,371


In [1036]:
df.describe()

Unnamed: 0,year,id_driver,num_trucks,days_signup_to_approval,loads,marketplace_loads_otr,marketplace_loads_atlas,marketplace_loads,brokerage_loads_otr,brokerage_loads_atlas,brokerage_loads,total_loads
count,83414.0,83414.0,83344.0,71124.0,83414.0,83414.0,83414.0,83414.0,83414.0,83414.0,83414.0,83414.0
mean,2018.96093,18222.414954,22.582921,298.752489,2.07627,29.477762,71.579675,101.057436,148.258422,13.073021,161.331443,266.502661
std,1.359343,11667.704926,48.829719,390.345107,2.672163,88.17194,194.532776,214.502147,415.97806,42.241592,413.792137,448.806175
min,2015.0,20.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,2018.0,7890.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5.0,37.0
50%,2019.0,16299.0,4.0,61.0,1.0,2.0,0.0,13.0,15.0,0.0,37.0,110.0
75%,2020.0,28974.0,14.0,497.0,2.0,23.0,18.0,94.0,112.0,1.0,135.0,325.0
max,2021.0,38125.0,195.0,1653.0,129.0,902.0,1324.0,1348.0,4266.0,371.0,4266.0,4266.0


In [1037]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83414 entries, 0 to 83413
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   dt                        83414 non-null  object 
 1   weekday                   83414 non-null  object 
 2   year                      83414 non-null  int64  
 3   id_driver                 83414 non-null  int64  
 4   id_carrier_number         83414 non-null  object 
 5   dim_carrier_type          83414 non-null  object 
 6   dim_carrier_company_name  83365 non-null  object 
 7   home_base_city            83369 non-null  object 
 8   home_base_state           83369 non-null  object 
 9   carrier_trucks            83414 non-null  object 
 10  num_trucks                83344 non-null  float64
 11  interested_in_drayage     83414 non-null  object 
 12  port_qualified            83414 non-null  object 
 13  signup_source             83414 non-null  object 
 14  ts_sig

# 1. Generate Labels

In [1038]:
# converts date from csv to a python datetime object making it easier to work with
def convert_dates(df):
    dates_columns = ['most_recent_load_date', 'first_load_date', 'load_day', 'dt']
    for col_name in dates_columns:
        try:
            df[col_name] = pd.to_datetime(df[col_name], format='%Y-%m-%d')
        except:
            continue

convert_dates(df)
convert_dates(df_score)

In [1039]:
df['most_recent_load_date'].head()

0   2021-02-17
1   2021-02-03
2   2020-09-25
3   2021-02-17
4   2017-10-11
Name: most_recent_load_date, dtype: datetime64[ns]

In [1040]:
total_loads75 = df.total_loads.quantile(0.75) # finds 75th percentile of loads
most_recent_load_date75 = df.most_recent_load_date.quantile(0.75) # finds 75th percentile of most recent load date

print(total_loads75)
print(most_recent_load_date75)
# Manual Check
# sorted_dts = sorted(list(df.most_recent_load_date))
# quartile_estimate_index = int(len(sorted_dts)*0.75)
# print("SORTED INDEX", sorted_dts[quartile_estimate_index])

325.0
2021-02-14 00:00:00


In [1041]:
# new_labels = {"label": {}}
num_days_worked_dict = {}

for index, row in df.iterrows(): # changes the labels in the label columns
    # checks if the load and most recent load date are in the 75th percentile
    if row["total_loads"] >= total_loads75 and row["most_recent_load_date"] >= most_recent_load_date75:
        df.at[index, "label"] = 1
    else:
        df.at[index, "label"] = 0

In [1042]:
uniqueValues, occurCount = np.unique(np.array(df["label"]), return_counts=True)
print("Unique Values : " , uniqueValues)
print("Occurrence Count : ", occurCount)

Unique Values :  [0. 1.]
Occurrence Count :  [73021 10393]


In [1043]:
loc_arr = []
for index, row in df.iterrows():
    if pd.isnull(row["home_base_city"]) != True and pd.isnull(row["home_base_state"]) != True:
        if (row["home_base_city"], row["home_base_state"]) not in loc_arr:
            loc_arr.append((row["home_base_city"], row["home_base_state"]))

In [1044]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83414 entries, 0 to 83413
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   dt                        83414 non-null  datetime64[ns]
 1   weekday                   83414 non-null  object        
 2   year                      83414 non-null  int64         
 3   id_driver                 83414 non-null  int64         
 4   id_carrier_number         83414 non-null  object        
 5   dim_carrier_type          83414 non-null  object        
 6   dim_carrier_company_name  83365 non-null  object        
 7   home_base_city            83369 non-null  object        
 8   home_base_state           83369 non-null  object        
 9   carrier_trucks            83414 non-null  object        
 10  num_trucks                83344 non-null  float64       
 11  interested_in_drayage     83414 non-null  object        
 12  port_qualified    

In [1045]:
groups = df.groupby("id_driver")

In [1046]:
# NO: dt, weekday, year, id_carrier_number, dim_preferred_lanes, load_day, loads
new_arr = []
for key, group in groups:
    group.sort_values(by="load_day", ascending=False, inplace=True)
    temp_arr = []
    temp_arr.append(key)
    
    if group["dim_carrier_type"].nunique() == 2:
        temp_arr.append("Both")
    elif group["dim_carrier_type"].nunique() == 0:
        temp_arr.append(None)
    else:
        temp_arr.append((group["dim_carrier_type"].iloc[0]))
    
    
    idxmax_cols = ["dim_carrier_company_name", 
                   "carrier_trucks", "signup_source", "ts_signup", "ts_first_approved",
                  "days_signup_to_approval"] #"home_base_city", "home_base_state",
    
    for col in idxmax_cols:
        try:
            temp_arr.append(group[col].value_counts().dropna(how="any").idxmax())
        except:
            
            temp_arr.append(None)
    
    try:
        x = len(group["home_base_city"].dropna(how="any").iloc[0])
        x = len(group["home_base_state"].dropna(how="any").iloc[0])
        temp_arr.append(group["home_base_city"].dropna(how="any").iloc[0])
        temp_arr.append(group["home_base_state"].dropna(how="any").iloc[0])
    except:
        index_num = random.randint(0, len(loc_arr)-1)
        temp_arr.append(loc_arr[index_num][0])
        temp_arr.append(loc_arr[index_num][1])
        
    
    
    try:
        temp_arr.append(group["num_trucks"].dropna(how="any").mean())
    except:
        temp_arr.append(None)
        
    iloc_cols = ["interested_in_drayage", "port_qualified", "driver_with_twic", 
                 "first_load_date", "most_recent_load_date", "marketplace_loads_otr", 
                 "marketplace_loads_atlas", "marketplace_loads", "brokerage_loads_otr",
                 "brokerage_loads_atlas", "brokerage_loads", "total_loads"]
    for col in iloc_cols:
        try:
            temp_arr.append(group[col].dropna(how="any").iloc[0])
        except:
            temp_arr.append(None)
    
    temp_arr.append(group.shape[0])
    
    temp_arr.append(((pd.to_datetime(date.today()) - group["most_recent_load_date"].dropna(how="any").iloc[0]).days) * (-1))
    
    temp_arr.append(group["label"].value_counts().dropna(how="any").idxmax())
    
    new_arr.append(np.array(temp_arr))

In [1047]:
column_names = ["id_driver", "dim_carrier_type", "dim_carrier_company_name", 
                "carrier_trucks", "signup_source", "ts_signup", "ts_first_approved",
                "days_signup_to_approval", "home_base_city", "home_base_state", "num_trucks", 
                "interested_in_drayage", "port_qualified", "driver_with_twic", 
                "first_load_date", "most_recent_load_date", "marketplace_loads_otr", 
                "marketplace_loads_atlas", "marketplace_loads", "brokerage_loads_otr",
                "brokerage_loads_atlas", "brokerage_loads", "total_loads", "num_trips_made", 
                "days_since_last_load", "label"]

df = pd.DataFrame(np.array(new_arr), columns=column_names)

In [1048]:
for col in column_names:
    try:
        df[col] = df[col].convert_dtypes()
    except:
        continue

In [1049]:
# # NO: dt, weekday, year, id_carrier_number, dim_preferred_lanes, load_day, loads
new_dict = {}
label_dict = {}
most_recent_date_arr = []
num_trips_arr = []
for key, group in groups:
    group.sort_values(by="load_day", ascending=False, inplace=True)
    if key not in new_dict:
        try:
            new_dict[key] = (group["load_day"].dropna(how="any").iloc[0], group.shape[0])
        except:
            new_dict[key] = None
        
        try:
            label_dict[key] = group["label"].dropna(how="any").iloc[0]
        except:
            label_dict[key] = 1

In [1050]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5291 entries, 0 to 5290
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id_driver                 5291 non-null   Int64         
 1   dim_carrier_type          5291 non-null   string        
 2   dim_carrier_company_name  5284 non-null   string        
 3   carrier_trucks            5291 non-null   string        
 4   signup_source             5291 non-null   string        
 5   ts_signup                 5291 non-null   string        
 6   ts_first_approved         3962 non-null   string        
 7   days_signup_to_approval   3962 non-null   Int64         
 8   home_base_city            5291 non-null   string        
 9   home_base_state           5291 non-null   string        
 10  num_trucks                5249 non-null   float64       
 11  interested_in_drayage     5291 non-null   string        
 12  port_qualified      

In [1051]:
df["label"].describe()

count    5291.000000
mean        0.010395
std         0.101434
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: label, dtype: float64

In [1052]:
groups = df_score.groupby("id_driver")

In [1053]:
# # NO: dt, weekday, year, id_carrier_number, dim_preferred_lanes, load_day, loads
new_dict = {}
most_recent_date_arr = []
num_trips_arr = []
score_labels = []
days_since_last_work_arr = []
trips_dict = {}
for key, group in groups:
    group.sort_values(by="load_day", ascending=False, inplace=True)
    if key not in new_dict:
        try:
            new_dict[key] = (group["load_day"].dropna(how="any").iloc[0], group.shape[0])
        except:
            new_dict[key] = None
    if key not in trips_dict:
        trips_dict[key] = group.shape[0]

for index, row in df_score.iterrows():
    most_recent_date_arr.append(new_dict[row["id_driver"]][0])
    num_trips_arr.append(new_dict[row["id_driver"]][1])
    if row["id_driver"] in label_dict:
        score_labels.append(label_dict[row["id_driver"]])
    else:
        print("Not here: ", row["id_driver"])
        rand_num = random.randint(0, 1000)
        if rand_num <= 125:
            score_labels.append(1)
            label_dict[row["id_driver"]] = 1
        else:
            score_labels.append(0)
            label_dict[row["id_driver"]] = 0
            
    try:
        x = len(row["home_base_city"])
        x = len(row["home_base_state"])
    except:
        index_num = random.randint(0, len(loc_arr)-1)
        df_score.at[index, "home_base_city"] = loc_arr[index_num][0]
        df_score.at[index, "home_base_state"] = loc_arr[index_num][1]
        
    days_since_last_work_arr.append(((pd.to_datetime(date.today()) - row["load_day"]).days) * (-1))

score_labels = np.array(score_labels)
df_score["most_recent_load_date"] = np.array(most_recent_date_arr)
df_score["num_trips_made"] = np.array(num_trips_arr)
df_score["days_since_last_load"] = np.array(days_since_last_work_arr)

Not here:  13711
Not here:  31557
Not here:  16642
Not here:  13761
Not here:  8205
Not here:  35572
Not here:  7623
Not here:  31873
Not here:  12429
Not here:  1288
Not here:  29653
Not here:  20357
Not here:  15073
Not here:  29070
Not here:  10359
Not here:  11514
Not here:  35563
Not here:  7594
Not here:  26545


In [1054]:
column_names = ["id_driver", "dim_carrier_type", "dim_carrier_company_name", 
                "carrier_trucks", "signup_source", "ts_signup", "ts_first_approved",
                "days_signup_to_approval", "home_base_city", "home_base_state", "num_trucks", 
                "interested_in_drayage", "port_qualified", "driver_with_twic", 
                "first_load_date", "most_recent_load_date", "marketplace_loads_otr", 
                "marketplace_loads_atlas", "marketplace_loads", "brokerage_loads_otr",
                "brokerage_loads_atlas", "brokerage_loads", "num_trips_made", 
                "days_since_last_load"]

df_temp = df_score.drop(columns = column_names)
to_drop = list(df_temp.columns.values)
df_score = df_score.drop(columns=to_drop)
df_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id_driver                 1000 non-null   int64         
 1   dim_carrier_type          1000 non-null   object        
 2   dim_carrier_company_name  1000 non-null   object        
 3   home_base_city            1000 non-null   object        
 4   home_base_state           1000 non-null   object        
 5   carrier_trucks            1000 non-null   object        
 6   num_trucks                1000 non-null   float64       
 7   interested_in_drayage     1000 non-null   object        
 8   port_qualified            1000 non-null   object        
 9   signup_source             1000 non-null   object        
 10  ts_signup                 1000 non-null   object        
 11  ts_first_approved         854 non-null    object        
 12  days_signup_to_approv

In [1055]:
for col in column_names:
    try:
        df_score[col] = df_score[col].convert_dtypes()
    except:
        continue

In [1056]:
display(df_score["num_trips_made"])

0      4
1      1
2      1
3      3
4      1
      ..
995    1
996    3
997    1
998    1
999    1
Name: num_trips_made, Length: 1000, dtype: Int64

In [1057]:
df_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id_driver                 1000 non-null   Int64         
 1   dim_carrier_type          1000 non-null   string        
 2   dim_carrier_company_name  1000 non-null   string        
 3   home_base_city            1000 non-null   string        
 4   home_base_state           1000 non-null   string        
 5   carrier_trucks            1000 non-null   string        
 6   num_trucks                1000 non-null   Int64         
 7   interested_in_drayage     1000 non-null   string        
 8   port_qualified            1000 non-null   string        
 9   signup_source             1000 non-null   string        
 10  ts_signup                 1000 non-null   string        
 11  ts_first_approved         854 non-null    string        
 12  days_signup_to_approv

In [1058]:
drop_cols = ["dt", "weekday", "year", "id_carrier_number", "dim_preferred_lanes", "load_day", "loads"]
for col in drop_cols:
    try:
        df_score = df_score.drop(columns=[col])
    except:
        continue

for col in drop_cols:
    try:
        df = df.drop(columns=[col])
    except:
        continue

In [1059]:
df_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id_driver                 1000 non-null   Int64         
 1   dim_carrier_type          1000 non-null   string        
 2   dim_carrier_company_name  1000 non-null   string        
 3   home_base_city            1000 non-null   string        
 4   home_base_state           1000 non-null   string        
 5   carrier_trucks            1000 non-null   string        
 6   num_trucks                1000 non-null   Int64         
 7   interested_in_drayage     1000 non-null   string        
 8   port_qualified            1000 non-null   string        
 9   signup_source             1000 non-null   string        
 10  ts_signup                 1000 non-null   string        
 11  ts_first_approved         854 non-null    string        
 12  days_signup_to_approv

# 3. Basic Statistics

In [1060]:
corr_matrix = df.corr()
for col_name in (list(df.columns.values)): # prints all the correlation matrices corresponding to each feature
    try:
        print(col_name)
        display(corr_matrix[col_name].sort_values(ascending=False))
        print('---------------------------------------------------------------------')
    except:
        print("{} is not of type integer".format(col_name))
        print('---------------------------------------------------------------------')

id_driver


id_driver                  1.000000
days_since_last_load       0.839446
num_trucks                 0.169567
brokerage_loads_atlas      0.145307
marketplace_loads_atlas    0.138116
marketplace_loads          0.123344
label                      0.075683
total_loads                0.052722
num_trips_made             0.018007
marketplace_loads_otr     -0.012441
brokerage_loads           -0.037352
brokerage_loads_otr       -0.063795
days_signup_to_approval   -0.734778
Name: id_driver, dtype: float64

---------------------------------------------------------------------
dim_carrier_type
dim_carrier_type is not of type integer
---------------------------------------------------------------------
dim_carrier_company_name
dim_carrier_company_name is not of type integer
---------------------------------------------------------------------
carrier_trucks
carrier_trucks is not of type integer
---------------------------------------------------------------------
signup_source
signup_source is not of type integer
---------------------------------------------------------------------
ts_signup
ts_signup is not of type integer
---------------------------------------------------------------------
ts_first_approved
ts_first_approved is not of type integer
---------------------------------------------------------------------
days_signup_to_approval


days_signup_to_approval    1.000000
brokerage_loads_otr        0.048116
brokerage_loads            0.027929
num_trips_made             0.010462
marketplace_loads_otr      0.007436
total_loads               -0.043765
num_trucks                -0.053498
label                     -0.064084
marketplace_loads         -0.089268
brokerage_loads_atlas     -0.092088
marketplace_loads_atlas   -0.099058
days_since_last_load      -0.628732
id_driver                 -0.734778
Name: days_signup_to_approval, dtype: float64

---------------------------------------------------------------------
home_base_city
home_base_city is not of type integer
---------------------------------------------------------------------
home_base_state
home_base_state is not of type integer
---------------------------------------------------------------------
num_trucks


num_trucks                 1.000000
id_driver                  0.169567
days_since_last_load       0.163449
num_trips_made             0.059784
total_loads                0.017664
brokerage_loads_otr        0.001285
brokerage_loads           -0.006155
label                     -0.035292
brokerage_loads_atlas     -0.042167
marketplace_loads_atlas   -0.050920
days_signup_to_approval   -0.053498
marketplace_loads_otr     -0.058752
marketplace_loads         -0.065660
Name: num_trucks, dtype: float64

---------------------------------------------------------------------
interested_in_drayage
interested_in_drayage is not of type integer
---------------------------------------------------------------------
port_qualified
port_qualified is not of type integer
---------------------------------------------------------------------
driver_with_twic
driver_with_twic is not of type integer
---------------------------------------------------------------------
first_load_date
first_load_date is not of type integer
---------------------------------------------------------------------
most_recent_load_date
most_recent_load_date is not of type integer
---------------------------------------------------------------------
marketplace_loads_otr


marketplace_loads_otr      1.000000
num_trips_made             0.511867
marketplace_loads          0.395290
total_loads                0.268972
label                      0.163324
days_since_last_load       0.157085
marketplace_loads_atlas    0.083147
brokerage_loads_otr        0.080873
brokerage_loads            0.080687
days_signup_to_approval    0.007436
brokerage_loads_atlas      0.005160
id_driver                 -0.012441
num_trucks                -0.058752
Name: marketplace_loads_otr, dtype: float64

---------------------------------------------------------------------
marketplace_loads_atlas


marketplace_loads_atlas    1.000000
marketplace_loads          0.948243
label                      0.577596
total_loads                0.541759
num_trips_made             0.445315
brokerage_loads_atlas      0.410271
days_since_last_load       0.202243
id_driver                  0.138116
marketplace_loads_otr      0.083147
brokerage_loads            0.059796
brokerage_loads_otr       -0.012599
num_trucks                -0.050920
days_signup_to_approval   -0.099058
Name: marketplace_loads_atlas, dtype: float64

---------------------------------------------------------------------
marketplace_loads


marketplace_loads          1.000000
marketplace_loads_atlas    0.948243
total_loads                0.585073
label                      0.584442
num_trips_made             0.573574
marketplace_loads_otr      0.395290
brokerage_loads_atlas      0.379811
days_since_last_load       0.236472
id_driver                  0.123344
brokerage_loads            0.080827
brokerage_loads_otr        0.014158
num_trucks                -0.065660
days_signup_to_approval   -0.089268
Name: marketplace_loads, dtype: float64

---------------------------------------------------------------------
brokerage_loads_otr


brokerage_loads_otr        1.000000
brokerage_loads            0.984385
total_loads                0.803391
num_trips_made             0.572785
label                      0.239032
marketplace_loads_otr      0.080873
days_since_last_load       0.061265
days_signup_to_approval    0.048116
marketplace_loads          0.014158
num_trucks                 0.001285
brokerage_loads_atlas     -0.011804
marketplace_loads_atlas   -0.012599
id_driver                 -0.063795
Name: brokerage_loads_otr, dtype: float64

---------------------------------------------------------------------
brokerage_loads_atlas


brokerage_loads_atlas      1.000000
marketplace_loads_atlas    0.410271
marketplace_loads          0.379811
total_loads                0.329874
label                      0.327819
num_trips_made             0.257713
days_since_last_load       0.179735
brokerage_loads            0.164395
id_driver                  0.145307
marketplace_loads_otr      0.005160
brokerage_loads_otr       -0.011804
num_trucks                -0.042167
days_signup_to_approval   -0.092088
Name: brokerage_loads_atlas, dtype: float64

---------------------------------------------------------------------
brokerage_loads


brokerage_loads            1.000000
brokerage_loads_otr        0.984385
total_loads                0.850586
num_trips_made             0.610399
label                      0.293505
brokerage_loads_atlas      0.164395
days_since_last_load       0.092076
marketplace_loads          0.080827
marketplace_loads_otr      0.080687
marketplace_loads_atlas    0.059796
days_signup_to_approval    0.027929
num_trucks                -0.006155
id_driver                 -0.037352
Name: brokerage_loads, dtype: float64

---------------------------------------------------------------------
total_loads


total_loads                1.000000
brokerage_loads            0.850586
brokerage_loads_otr        0.803391
num_trips_made             0.802860
marketplace_loads          0.585073
label                      0.541811
marketplace_loads_atlas    0.541759
brokerage_loads_atlas      0.329874
marketplace_loads_otr      0.268972
days_since_last_load       0.216076
id_driver                  0.052722
num_trucks                 0.017664
days_signup_to_approval   -0.043765
Name: total_loads, dtype: float64

---------------------------------------------------------------------
num_trips_made


num_trips_made             1.000000
total_loads                0.802860
brokerage_loads            0.610399
marketplace_loads          0.573574
brokerage_loads_otr        0.572785
marketplace_loads_otr      0.511867
label                      0.481593
marketplace_loads_atlas    0.445315
days_since_last_load       0.284921
brokerage_loads_atlas      0.257713
num_trucks                 0.059784
id_driver                  0.018007
days_signup_to_approval    0.010462
Name: num_trips_made, dtype: float64

---------------------------------------------------------------------
days_since_last_load


days_since_last_load       1.000000
id_driver                  0.839446
num_trips_made             0.284921
marketplace_loads          0.236472
total_loads                0.216076
marketplace_loads_atlas    0.202243
brokerage_loads_atlas      0.179735
num_trucks                 0.163449
marketplace_loads_otr      0.157085
label                      0.142288
brokerage_loads            0.092076
brokerage_loads_otr        0.061265
days_signup_to_approval   -0.628732
Name: days_since_last_load, dtype: float64

---------------------------------------------------------------------
label


label                      1.000000
marketplace_loads          0.584442
marketplace_loads_atlas    0.577596
total_loads                0.541811
num_trips_made             0.481593
brokerage_loads_atlas      0.327819
brokerage_loads            0.293505
brokerage_loads_otr        0.239032
marketplace_loads_otr      0.163324
days_since_last_load       0.142288
id_driver                  0.075683
num_trucks                -0.035292
days_signup_to_approval   -0.064084
Name: label, dtype: float64

---------------------------------------------------------------------


 Also year and TODO_FIND_COLUMN_NAME_2 and year are highly correlated and have a similar impact on label, so we could drop one? 

Is there really a need for brokerage_loads when it is so highly correlated to brokerage_loads_otr due to the vast majority of shipments being delivered over-the-road as compared to via ATLAS? 

I have the same question about total_loads due to the vast majority of loads being brokerage loads...

What's the point of having both year and date?

We can remove the id_carrier_number column from this dataset as it is not relevant to predicting a label of 0 or 1 (When trying to find high performing drivers, we need to know their carrier number, so we can extract the id_carrier_number column for now...)

We could one-hot-encode sign-up source and see its effect on labels.

We can remove the ts_first_approved column because the date of approval shouldn't matter that much but instead the days_signup_to_approval matter.

dim_preferred_lanes only has a few values so we can either remove the column or impute values.

Also first_load_date, most_recent_load_date and load_day shouldn't matter much. Instead we can have values such as: number of days doing the job = most_recent_load_date - first_load_date
AND
days_from_last_load_to_today = todays_date - most_recent_load_date

There are also a couple other features we need to impute.

Also, only people that are port qualified can provide drayage services, so we should create a field called qualified_and_interest_in_drayage which is only 1 (yes) when interested_in_drayage = "yes" and port_qualified = "yes". We can also cross these features...

# 4. Data Feature Extraction Plan and Pipeline

In [1061]:
def drayage_feature_cross(df):
    loc_cross = list(zip(df["home_base_city"], df["home_base_state"]))# feature cross to get (city, state) tuple
    # feature cross for interested in drayage and port qualified
    drayage_cross = list(zip(df["interested_in_drayage"], df["port_qualified"]))

    drayage_arr = []
    for list_item in drayage_cross:
        if list_item[0] == "yes" and list_item[1] == "yes":
            drayage_arr.append("000001")
        if list_item[0] == "yes" and list_item[1] == "no":
            drayage_arr.append("000010")
        if list_item[0] == "no" and list_item[1] == "yes":
            drayage_arr.append("000100")
        if list_item[0] == "no" and list_item[1] == "no":
            drayage_arr.append("001000")
        if list_item[0] == "not specified" and list_item[1] == "yes":
            drayage_arr.append("010000")
        if list_item[0] == "not specified" and list_item[1] == "no":
            drayage_arr.append("100000")

    df["drayage_interested_port_qualified"] = np.array(drayage_arr)
    display(df["drayage_interested_port_qualified"])

drayage_feature_cross(df)
drayage_feature_cross(df_score)

0       100000
1       100000
2       100000
3       100000
4       100000
         ...  
5286    100000
5287    000001
5288    000001
5289    000001
5290    000001
Name: drayage_interested_port_qualified, Length: 5291, dtype: object

0      100000
1      000010
2      100000
3      000010
4      000010
        ...  
995    100000
996    100000
997    100000
998    100000
999    000010
Name: drayage_interested_port_qualified, Length: 1000, dtype: object

In [1062]:
id_driver_number_col = np.array(df["id_driver"]) # extract id_driver column
id_driver_number_col_score = np.array(df_score["id_driver"]) # extract id_driver column

drop_cols = ["id_driver", "interested_in_drayage", "port_qualified", 
             "ts_signup", "ts_first_approved"] #"home_base_city","home_base_state",
for col in drop_cols:
    try:
        df_score = df_score.drop(columns=[col])
    except:
        continue

for col in drop_cols:
    try:
        df = df.drop(columns=[col])
    except:
        continue

In [1063]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5291 entries, 0 to 5290
Data columns (total 22 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   dim_carrier_type                   5291 non-null   string        
 1   dim_carrier_company_name           5284 non-null   string        
 2   carrier_trucks                     5291 non-null   string        
 3   signup_source                      5291 non-null   string        
 4   days_signup_to_approval            3962 non-null   Int64         
 5   home_base_city                     5291 non-null   string        
 6   home_base_state                    5291 non-null   string        
 7   num_trucks                         5249 non-null   float64       
 8   driver_with_twic                   5291 non-null   string        
 9   first_load_date                    5291 non-null   datetime64[ns]
 10  most_recent_load_date              5

In [1064]:
names = {}
for index, row in df.iterrows():
    names[row["dim_carrier_company_name"]] = int(names.get(row["dim_carrier_company_name"], 0) + 1)
listo = list(names.items())
listo.sort(reverse=True, key=lambda x: int(x[1]))
#listo[:50]

In [1065]:
listo = listo[:50]
count_50 = sum([x[1] for x in listo])
print("# 50: ", count_50)
print("Percentage 50: ", count_50/len(names))

names_arr = [tuples[0] for tuples in listo]
print(names_arr)

# 50:  1290
Percentage 50:  0.5176565008025682
['NFS asset Drayage', 'MC Express Trucking LLC', 'Roadrunner Transportation', 'Dong Fang Marketing Inc', 'ROADMOND LOGISTICS INC.', 'Consistent Trucking Inc', 'BLUE FREIGHT TRANSPORT INC', 'Convoy Express', 'Mega Fleet', 'iDC Drayage', 'Cross World Logistics', 'USA Diamonds Trucking', 'pointdirect', 'Chaidez Trucking', 'Saia LTL Freight', 'American Better Choice Corporation', 'American Freightways Lp.', 'Fastrucking', 'MERIDIAN LOGISTICS INC', 'Carlos Flores', 'J&G Transportation Group Inc', 'KLF transport inc', 'Star Rain LLC', 'Starco Logistics Inc.', '664 Transport', 'FTS EXPRESS INC', 'IDC OTR', "Luna's Transportation group", 'AGRAMONT TRANSPORT INC', 'BGood VirtueT Inc', 'Geber Freight', 'R&Y Castellanos Trucking Inc.', 'JC Transport', 'Great Qin Transportation LLC', 'JM Express Inc', 'MT Brothers Groups', 'cbt trucking', 'Road Eagle Logistics Corp', 'O.A. EXPRESS INC', 'AMPAK Logistics INC.', 'Kuang Trucking Inc.', 'nolan transportat

In [1066]:
def bucketize(df):
    days_worked = []
    for index, row in df.iterrows(): # bucketize the most frequent dim_carrier_company names, 
                                     # put less frequent names in a single bucket
        try:
            if row["dim_carrier_company_name"] not in names_arr:
                df.at[index, "dim_carrier_company_name"] = "Other"
        except:
            df.at[index, "dim_carrier_company_name"] = "Other"

        # find number of days driver has worked
        if row["most_recent_load_date"] != np.nan and row["first_load_date"] != np.nan:
            days_worked.append((row["most_recent_load_date"] - row["first_load_date"]).days)
        else:
            days_worked.append(None)
    df["days_tenured"] = np.array(days_worked)

bucketize(df)
bucketize(df_score)

In [1067]:
drop_cols = ["most_recent_load_date", "first_load_date", "weekday", "load_day", "total_loads"]
for col in drop_cols:
    try:
        df_score = df_score.drop(columns=[col])
    except:
        continue

for col in drop_cols:
    try:
        df = df.drop(columns=[col])
    except:
        continue

In [1068]:
df_unlabeled = df.drop(columns=["label"])
labels = df["label"].copy()

In [1069]:
uniqueValues, occurCount = np.unique(labels, return_counts=True)
print("Unique Values : " , uniqueValues)
print("Occurrence Count : ", occurCount)

Unique Values :  [0 1]
Occurrence Count :  [5236   55]


In [1070]:
df_unlabeled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5291 entries, 0 to 5290
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   dim_carrier_type                   5291 non-null   string 
 1   dim_carrier_company_name           5291 non-null   string 
 2   carrier_trucks                     5291 non-null   string 
 3   signup_source                      5291 non-null   string 
 4   days_signup_to_approval            3962 non-null   Int64  
 5   home_base_city                     5291 non-null   string 
 6   home_base_state                    5291 non-null   string 
 7   num_trucks                         5249 non-null   float64
 8   driver_with_twic                   5291 non-null   string 
 9   marketplace_loads_otr              5291 non-null   Int64  
 10  marketplace_loads_atlas            5291 non-null   Int64  
 11  marketplace_loads                  5291 non-null   Int64

In [1071]:
df_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   dim_carrier_type                   1000 non-null   string
 1   dim_carrier_company_name           1000 non-null   string
 2   home_base_city                     1000 non-null   string
 3   home_base_state                    1000 non-null   string
 4   carrier_trucks                     1000 non-null   string
 5   num_trucks                         1000 non-null   Int64 
 6   signup_source                      1000 non-null   string
 7   days_signup_to_approval            854 non-null    Int64 
 8   driver_with_twic                   1000 non-null   string
 9   marketplace_loads_otr              1000 non-null   Int64 
 10  marketplace_loads_atlas            1000 non-null   Int64 
 11  marketplace_loads                  1000 non-null   Int64 
 12  brokera

In [1072]:
cols = list(df_unlabeled.columns.values)
print(cols)

['dim_carrier_type', 'dim_carrier_company_name', 'carrier_trucks', 'signup_source', 'days_signup_to_approval', 'home_base_city', 'home_base_state', 'num_trucks', 'driver_with_twic', 'marketplace_loads_otr', 'marketplace_loads_atlas', 'marketplace_loads', 'brokerage_loads_otr', 'brokerage_loads_atlas', 'brokerage_loads', 'num_trips_made', 'days_since_last_load', 'drayage_interested_port_qualified', 'days_tenured']


In [1073]:
df_score = df_score[cols]

df_score['num_trucks'] = pd.to_numeric(df_score['num_trucks'], errors='coerce')
df_score['days_signup_to_approval'] = pd.to_numeric(df_score['days_signup_to_approval'], errors='coerce')

df_unlabeled['num_trucks'] = pd.to_numeric(df_unlabeled['num_trucks'], errors='coerce')
df_unlabeled['days_signup_to_approval'] = pd.to_numeric(df_unlabeled['days_signup_to_approval'], errors='coerce')

convert = ["id_driver", "days_signup_to_approval", "marketplace_loads_otr", 
               "marketplace_loads_atlas", "marketplace_loads", "brokerage_loads_otr",
               "brokerage_loads_atlas", "brokerage_loads", "num_trips_made",
               "num_trucks", "dim_carrier_type", "dim_carrier_company_name",
               "interested_in_drayage", "port_qualified", "signup_source", "driver_with_twic",
          "home_base_city","home_base_state"] #"home_base_city","home_base_state"
for col in convert:
    try:
        df_score[col] = df_score[col].convert_dtypes()
    except:
        continue

for col in convert:
    try:
        df_unlabeled[col] = df_unlabeled[col].convert_dtypes()
    except:
        continue

In [1074]:
df_concat = pd.concat([df_unlabeled, df_score], ignore_index=True)
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6291 entries, 0 to 6290
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   dim_carrier_type                   6291 non-null   string
 1   dim_carrier_company_name           6291 non-null   string
 2   carrier_trucks                     6291 non-null   string
 3   signup_source                      6291 non-null   string
 4   days_signup_to_approval            4816 non-null   Int64 
 5   home_base_city                     6291 non-null   string
 6   home_base_state                    6291 non-null   string
 7   num_trucks                         6249 non-null   object
 8   driver_with_twic                   6291 non-null   string
 9   marketplace_loads_otr              6291 non-null   Int64 
 10  marketplace_loads_atlas            6291 non-null   Int64 
 11  marketplace_loads                  6291 non-null   Int64 
 12  broker

In [1075]:
for col in convert:
    try:
        df_concat[col] = df_concat[col].convert_dtypes()
    except:
        continue

In [1076]:
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6291 entries, 0 to 6290
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   dim_carrier_type                   6291 non-null   string 
 1   dim_carrier_company_name           6291 non-null   string 
 2   carrier_trucks                     6291 non-null   string 
 3   signup_source                      6291 non-null   string 
 4   days_signup_to_approval            4816 non-null   Int64  
 5   home_base_city                     6291 non-null   string 
 6   home_base_state                    6291 non-null   string 
 7   num_trucks                         6249 non-null   float64
 8   driver_with_twic                   6291 non-null   string 
 9   marketplace_loads_otr              6291 non-null   Int64  
 10  marketplace_loads_atlas            6291 non-null   Int64  
 11  marketplace_loads                  6291 non-null   Int64

In [1077]:
imputer = IterativeImputer()
categorical_features_one_hot = ["dim_carrier_type", "dim_carrier_company_name", "carrier_trucks", 
                                "signup_source", "driver_with_twic", "home_base_city","home_base_state"] #"home_base_city","home_base_state"

df_num = df_concat.drop(columns=categorical_features_one_hot)
numerical_features = list(df_num)

num_pipeline = Pipeline([
        ('imputer', imputer),
        ('std_scaler', StandardScaler()),
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, numerical_features),
        ("cat", OneHotEncoder(sparse=False), categorical_features_one_hot), #sparse=False
    ])
df_prepared = full_pipeline.fit_transform(df_concat)

In [1078]:
display(df_prepared)

array([[ 1.85116313, -0.39965744, -0.20109732, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.87262531, -0.39965744, -0.20109732, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.88612658, -0.39965744, -0.20109732, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.07263803, -0.39965744,  0.23165021, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.75297732,  1.95624695, -0.20109732, ...,  0.        ,
         0.        ,  0.        ],
       [-1.00674481, -0.18762604, -0.10999258, ...,  0.        ,
         0.        ,  0.        ]])

In [1079]:
X = df_prepared[:5291]
y = labels
X_test_score = df_prepared[5291:]
y_test_score = score_labels
y_test_score = np.array(y_test_score)
y = y.astype('int')
X, y = SMOTE().fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1080]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)
print(X_test_score.shape, y_test_score.shape)

(8377, 647) (2095, 647)
(8377,) (2095,)
(1000, 647) (1000,)


# 5. Linear Regression

In [1081]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
lr_predicted = lin_reg.predict(X_test)

In [1082]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, lr_predicted)
rmse = np.sqrt(mse)
rmse

20480946.84227338

In [1083]:
from sklearn.metrics import r2_score

r2_score(y_test, lr_predicted)

-1678155469327497.0

In [1084]:
import statsmodels.api as sm

X_new = sm.add_constant(X_train)
toyregr_sm = sm.OLS(y_train.astype(float), X_new.astype(float))
results_sm = toyregr_sm.fit()

print(results_sm.summary())

                            OLS Regression Results                            
Dep. Variable:                  label   R-squared:                       0.904
Model:                            OLS   Adj. R-squared:                  0.897
Method:                 Least Squares   F-statistic:                     137.9
Date:                Thu, 18 Mar 2021   Prob (F-statistic):               0.00
Time:                        16:18:02   Log-Likelihood:                 3724.1
No. Observations:                8377   AIC:                            -6378.
Df Residuals:                    7842   BIC:                            -2616.
Df Model:                         534                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0002      0.025     -0.008      0.9

In [1085]:
from sklearn.linear_model import LogisticRegression

# y_temp = y_train.astype('int')

log_reg = LogisticRegression(max_iter=1000, multi_class='ovr')
log_reg.fit(X_train, y_train)
log_predicted = log_reg.predict(X_test)

In [1086]:
mse = mean_squared_error(y_test, log_predicted)
rmse = np.sqrt(mse)
rmse

0.06554344147787575

In [1087]:
r2_score(y_test, log_predicted)

0.9828133744852215

In [1088]:
from sklearn.metrics import confusion_matrix

accuracy = (accuracy_score(y_test, log_predicted))
print("accuracy = {}%".format(accuracy))
print("roc_auc_score = {}".format(roc_auc_score(y_test, log_predicted)))
print("F1-score = {}".format(f1_score(y_test, log_predicted)))

confusion_matrix(y_test, log_predicted)

accuracy = 0.9957040572792363%
roc_auc_score = 0.9957587181903864
F1-score = 0.9956668271545499


array([[1052,    9],
       [   0, 1034]])

# 6. PCA

In [1089]:
pca = PCA(n_components=0.95) # Create an instance of PCA model
pca.fit(X_train) # Fit X_train to PCA
X_train = pca.transform(X_train) # transform training data
X_test = pca.transform(X_test) # transform test data
X_test_score_new = pca.transform(X_test_score) # transform score test data
print(pca.explained_variance_)
print(pca.n_components_)

[23.02403235 14.8292382   4.7524796   3.43137662  1.34095534  0.91074499
  0.64874155  0.4554143 ]
8


In [1090]:
print(X_train)

[[-4.56072222 -0.57757824 -0.18739078 ...  0.37283851 -0.48076528
   0.21533659]
 [ 5.37456543  6.39873191 -6.11833163 ...  1.09087228  0.66087737
  -0.20263969]
 [ 2.88971566  8.39506031 -0.90284581 ... -2.09916304 -1.37083757
   0.9057837 ]
 ...
 [ 1.63578445 -1.82888682 -0.4267057  ... -0.13569799 -0.05801183
  -0.32058679]
 [-4.19701409  0.18815678 -0.38575999 ...  0.15592229 -0.03875752
  -0.03139817]
 [ 3.8415452  -1.82514819  9.13115583 ...  0.3777867   0.6996488
  -0.26273237]]


In [1091]:
print(X_test)

[[ 2.81515529 -2.39867401  5.66542471 ...  0.36900056 -0.1246603
   0.07862968]
 [ 2.84188191 -1.89420948  0.10792595 ... -0.28184803 -0.80357005
   0.30222688]
 [-4.6407357  -0.56059659 -0.21440874 ...  0.35191414  0.08350152
   0.46859202]
 ...
 [-3.60017128  0.77364615 -0.59388333 ... -0.22186177 -0.07029444
  -0.15169715]
 [ 3.66627559 -2.98059592  2.46828587 ...  0.43545875 -0.28098249
   0.13599173]
 [ 2.7714826   4.07534974 -3.73233382 ...  2.28801345  0.48994732
  -0.22419151]]


In [1092]:
print(X_test_score_new)

[[-3.8845819   0.65643712 -0.0213289  ...  0.28356137 -0.1871374
   0.17833302]
 [-4.04754212 -0.57698529  0.0609847  ...  0.64010924  0.20487844
  -0.65435265]
 [-3.62914434 -0.32193063 -0.75460868 ...  0.45249467 -0.42689509
   0.50712167]
 ...
 [-4.35941262 -0.51837053 -0.45934686 ...  0.53352694 -0.59794997
   0.33805611]
 [-4.59801611  0.39598001 -0.19308512 ... -0.29584143  2.24377335
   1.09539471]
 [-3.83071435 -0.20358795  0.15808783 ...  0.07165636  0.26965833
  -0.68485615]]


In [1093]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)
print(X_test_score_new.shape, y_test_score.shape)

(8377, 8) (2095, 8)
(8377,) (2095,)
(1000, 8) (1000,)


# 7. Ensemble

In [1094]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score

# assuming we have X_train,X_test,y_train,y_test at this time
# I first run Random Forest using random hard coded settings to get a baseline
rf = RandomForestClassifier(n_estimators=80,max_depth=7,max_features=3)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
test_score = r2_score(y_test,y_pred)
print(test_score)
accuracy = (accuracy_score(y_test, y_pred))
print("accuracy = {}%".format(accuracy))
print("roc_auc_score = {}".format(roc_auc_score(y_test, y_pred)))
print("F1-score = {}".format(f1_score(y_test, y_pred)))

confusion_matrix(y_test, y_pred)

0.9809037494280239
accuracy = 0.9952267303102625%
roc_auc_score = 0.9952874646559849
F1-score = 0.9951876804619826


array([[1051,   10],
       [   0, 1034]])

In [1107]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
# I then use RandomizedSearchCV to find the optimal hyperparameters
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1500, num = 30)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(5, 100, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 7, 10]
min_samples_leaf = [2, 5, 7, 10]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

rf = RandomForestClassifier(class_weight='balanced', random_state=42)

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'n_estimators': 148,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 57}

In [1108]:
rf = RandomForestClassifier(n_estimators=148,max_depth=57,max_features='sqrt',min_samples_split=5,min_samples_leaf=2,bootstrap=True)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy = (accuracy_score(y_test, y_pred))
print("accuracy = {}%".format(accuracy))

accuracy = 0.9971360381861575%


In [1109]:
# we then use the hyperparameters we found from the RandomizedSearchCV to do a second more thorough check around that range
from sklearn.model_selection import GridSearchCV
param_grid = {
    'bootstrap': [True],
    'max_depth': [65, 68, 71],
    'max_features': ['auto'],
    'min_samples_leaf': [2, 4, 6],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [375, 411, 500]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 81 candidates, totalling 405 fits


{'bootstrap': True,
 'max_depth': 65,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 375}

In [1110]:
# we then output the results using the optimal hyperparameters to check that our model has improved
rf = RandomForestClassifier(n_estimators=375,max_depth=65,max_features='auto',min_samples_split=5,min_samples_leaf=2,bootstrap=True)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
from sklearn.metrics import r2_score
test_score = r2_score(y_test,y_pred)
test_score

0.9885422496568144

# 8. Neural Network Classifier

In [1095]:
# Neural network
def create_model():
    model = Sequential()
    model.add(Dense(16, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(12, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', keras.metrics.Precision(),
                                                                        keras.metrics.Recall(), 
                                                                         keras.metrics.FalsePositives()])
    return model

model = create_model()

model.fit(X_train, y_train, batch_size=20, epochs=50)

y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)
accuracy = (accuracy_score(y_test, y_pred))
print("accuracy = {}%".format(accuracy))
print("roc_auc_score = {}".format(roc_auc_score(y_test, y_pred)))
print("F1-score = {}".format(f1_score(y_test, y_pred)))
confusion_matrix(y_test, y_pred)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50


Epoch 48/50
Epoch 49/50
Epoch 50/50
accuracy = 0.9966587112171837%
roc_auc_score = 0.9967012252591895
F1-score = 0.9966265060240964


array([[1054,    7],
       [   0, 1034]])

In [1096]:
y_pred = model.predict(X_test_score_new)
y_pred = (y_pred > 0.5)
print(len(y_pred))
accuracy = (accuracy_score(y_test_score, y_pred))
print("accuracy = {}%".format(accuracy))
print("roc_auc_score = {}".format(roc_auc_score(y_test_score, y_pred)))
print("F1-score = {}".format(f1_score(y_test_score, y_pred)))
confusion_matrix(y_test_score, y_pred)

1000
accuracy = 0.915%
roc_auc_score = 0.7200713783497539
F1-score = 0.5812807881773399


array([[856,  15],
       [ 70,  59]])

# 9. Cross-Validate

In [1097]:
pca = PCA(n_components=0.95) # Create an instance of PCA model
pca.fit(X) # Fit X_train to PCA
X_pca = pca.transform(X) # transform training data

In [1098]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from keras.wrappers.scikit_learn import KerasClassifier

kfold = KFold(n_splits=10, random_state=42, shuffle=True)

rf_model_kfold = RandomForestClassifier(n_estimators=80,max_depth=7,max_features=3)

rf_results_kfold = cross_val_score(rf_model_kfold, X_pca, y, cv=kfold)

print("RF Accuracy: %.2f%%" % (rf_results_kfold.mean()*100.0)) 

nn = KerasClassifier(build_fn=create_model, epochs=50, batch_size=20, verbose=0)

nn_results_kfold = cross_val_score(nn, X_pca, y, cv=kfold)

print("NN Accuracy: %.2f%%" % (nn_results_kfold.mean()*100.0))

RF Accuracy: 99.56%
NN Accuracy: 99.72%


# 10. Custom Model

In [1099]:
# Neural network
def create_custom_model(train_data, d1, d2):
    model = Sequential()
    model.add(Dense(25, input_dim=train_data.shape[1], activation='relu'))
    model.add(Dropout(d1))
    model.add(Dense(12, activation='selu'))
    model.add(Dropout(d2))
    model.add(Dense(4, activation='relu'))
#     model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', keras.metrics.Precision(),
                                                                        keras.metrics.Recall(), 
                                                                         keras.metrics.FalsePositives()])
    return model

In [1100]:
model = create_custom_model(X_train, 0.2, 0.1)

model.fit(X_train, y_train, batch_size=20, epochs=75)

y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)
accuracy = (accuracy_score(y_test, y_pred))
print("accuracy = {}%".format(accuracy))
print("roc_auc_score = {}".format(roc_auc_score(y_test, y_pred)))
print("F1-score = {}".format(f1_score(y_test, y_pred)))
confusion_matrix(y_test, y_pred)

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75


Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75
accuracy = 0.9976133651551312%
roc_auc_score = 0.9976437323279925
F1-score = 0.9975880366618427


array([[1056,    5],
       [   0, 1034]])

In [1101]:
# Neural network
def create_custom_model_cv():
    model = Sequential()
    model.add(Dense(25, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(12, activation='selu'))
    model.add(Dropout(0.1))
    model.add(Dense(4, activation='relu'))
#     model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', keras.metrics.Precision(),
                                                                        keras.metrics.Recall(), 
                                                                         keras.metrics.FalsePositives()])
    return model

In [1102]:
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

custom_nn = KerasClassifier(build_fn=create_custom_model_cv, epochs=50, batch_size=20, verbose=0)

custom_nn_results_kfold = cross_val_score(custom_nn, X_pca, y, cv=kfold)

print("Custom NN Accuracy: %.2f%%" % (custom_nn_results_kfold.mean()*100.0))

Custom NN Accuracy: 99.66%


In [1103]:
pca = PCA(n_components=0.95) # Create an instance of PCA model
pca.fit(X) # Fit X_train to PCA
X_pca = pca.transform(X)
X_test_score_kaggle = pca.transform(X_test_score) # transform score test data

In [1104]:
y_pred = [0 for x in range(1000)]

params = [(75, 0.2, 0.1), (10, 0.5, 0.3), (28, 0.1, 0.4), (120, 0.6, 0.1), (50, 0.2, 0.5), (90, 0.2, 0.05),
         (42, 0.25, 0.1), (72, 0.3, 0.15), (25, 0.2, 0.1), (150, 0.2, 0.35)]

for epochs, d1, d2 in params:
    kaggle_model = create_custom_model(X_pca, d1, d2)
    kaggle_model.fit(X_pca, y, batch_size=20, epochs=epochs)

    y_pred_temp = kaggle_model.predict(X_test_score_kaggle)
    y_temp = []
    for i in range(len(y_pred_temp)):
        if y_pred_temp[i] > 0.5:
            y_temp.append(1)
        else:
            y_temp.append(0)
    for i in range(len(y_temp)):
        y_pred[i] += y_temp[i]

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75


Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/28
Epoch 2/28
Epoch 3/28
Epoch 4/28
Epoch 5/28
Epoch 6/28
Epoch 7/28


Epoch 8/28
Epoch 9/28
Epoch 10/28
Epoch 11/28
Epoch 12/28
Epoch 13/28
Epoch 14/28
Epoch 15/28
Epoch 16/28
Epoch 17/28
Epoch 18/28
Epoch 19/28
Epoch 20/28
Epoch 21/28
Epoch 22/28
Epoch 23/28
Epoch 24/28
Epoch 25/28
Epoch 26/28
Epoch 27/28
Epoch 28/28
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120


Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120


Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78/120
Epoch 79/120
Epoch 80/120
Epoch 81/120
Epoch 82/120
Epoch 83/120
Epoch 84/120
Epoch 85/120
Epoch 86/120
Epoch 87/120
Epoch 88/120
Epoch 89/120
Epoch 90/120
Epoch 91/120
Epoch 92/120
Epoch 93/120
Epoch 94/120
Epoch 95/120
Epoch 96/120
Epoch 97/120
Epoch 98/120
Epoch 99/120
Epoch 100/120
Epoch 101/120
Epoch 102/120
Epoch 103/120
Epoch 104/120
Epoch 105/120
Epoch 106/120
Epoch 107/120
Epoch 108/120
Epoch 109/120
Epoch 110/120
Epoch 111/120
Epoch 112/120
Epoch 113/120
Epoch 114/120


Epoch 115/120
Epoch 116/120
Epoch 117/120
Epoch 118/120
Epoch 119/120
Epoch 120/120
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50


Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90
Epoch 34/90


Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90
Epoch 39/90
Epoch 40/90
Epoch 41/90
Epoch 42/90
Epoch 43/90
Epoch 44/90
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90
Epoch 49/90
Epoch 50/90
Epoch 51/90
Epoch 52/90
Epoch 53/90
Epoch 54/90
Epoch 55/90
Epoch 56/90
Epoch 57/90
Epoch 58/90
Epoch 59/90
Epoch 60/90
Epoch 61/90
Epoch 62/90
Epoch 63/90
Epoch 64/90
Epoch 65/90
Epoch 66/90
Epoch 67/90
Epoch 68/90
Epoch 69/90
Epoch 70/90
Epoch 71/90
Epoch 72/90
Epoch 73/90
Epoch 74/90
Epoch 75/90
Epoch 76/90
Epoch 77/90
Epoch 78/90
Epoch 79/90


Epoch 80/90
Epoch 81/90
Epoch 82/90
Epoch 83/90
Epoch 84/90
Epoch 85/90
Epoch 86/90
Epoch 87/90
Epoch 88/90
Epoch 89/90
Epoch 90/90
Epoch 1/42
Epoch 2/42
Epoch 3/42
Epoch 4/42
Epoch 5/42
Epoch 6/42
Epoch 7/42
Epoch 8/42
Epoch 9/42
Epoch 10/42
Epoch 11/42
Epoch 12/42
Epoch 13/42
Epoch 14/42
Epoch 15/42
Epoch 16/42
Epoch 17/42
Epoch 18/42
Epoch 19/42
Epoch 20/42
Epoch 21/42
Epoch 22/42
Epoch 23/42
Epoch 24/42
Epoch 25/42
Epoch 26/42
Epoch 27/42
Epoch 28/42
Epoch 29/42
Epoch 30/42
Epoch 31/42
Epoch 32/42
Epoch 33/42
Epoch 34/42
Epoch 35/42


Epoch 36/42
Epoch 37/42
Epoch 38/42
Epoch 39/42
Epoch 40/42
Epoch 41/42
Epoch 42/42
Epoch 1/72
Epoch 2/72
Epoch 3/72
Epoch 4/72
Epoch 5/72
Epoch 6/72
Epoch 7/72
Epoch 8/72
Epoch 9/72
Epoch 10/72
Epoch 11/72
Epoch 12/72
Epoch 13/72
Epoch 14/72
Epoch 15/72
Epoch 16/72
Epoch 17/72
Epoch 18/72
Epoch 19/72
Epoch 20/72
Epoch 21/72
Epoch 22/72
Epoch 23/72
Epoch 24/72
Epoch 25/72
Epoch 26/72
Epoch 27/72
Epoch 28/72
Epoch 29/72
Epoch 30/72
Epoch 31/72
Epoch 32/72
Epoch 33/72
Epoch 34/72
Epoch 35/72
Epoch 36/72
Epoch 37/72
Epoch 38/72


Epoch 39/72
Epoch 40/72
Epoch 41/72
Epoch 42/72
Epoch 43/72
Epoch 44/72
Epoch 45/72
Epoch 46/72
Epoch 47/72
Epoch 48/72
Epoch 49/72
Epoch 50/72
Epoch 51/72
Epoch 52/72
Epoch 53/72
Epoch 54/72
Epoch 55/72
Epoch 56/72
Epoch 57/72
Epoch 58/72
Epoch 59/72
Epoch 60/72
Epoch 61/72
Epoch 62/72
Epoch 63/72
Epoch 64/72
Epoch 65/72
Epoch 66/72
Epoch 67/72
Epoch 68/72
Epoch 69/72
Epoch 70/72
Epoch 71/72
Epoch 72/72
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25


Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150


Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150


Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150


Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


In [1105]:
for i in range(len(y_pred)):
    y_pred[i] = y_pred[i]/10
print(y_pred)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.4, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.4, 0.0, 1.0, 0.0, 0.6, 0.0, 0.0, 0.0, 0.3, 0.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9, 0.0, 0.2, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.0, 1.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.3, 0.6, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.3, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,

In [1106]:
index = np.array([int(x) for x in range(83414, 84414)])
kaggle_df = pd.DataFrame(index, columns=['ID'])

y_pred_temp = []
for i in range(len(y_pred)):
    if y_pred[i] > 0.5:
        y_pred_temp.append(1)
    else:
        y_pred_temp.append(0)
y_pred_temp = np.array(y_pred_temp)
kaggle_df["Predicted"] = y_pred_temp

print(np.mean(y_pred_temp))

kaggle_df

0.098


Unnamed: 0,ID,Predicted
0,83414,0
1,83415,0
2,83416,0
3,83417,0
4,83418,0
...,...,...
995,84409,0
996,84410,0
997,84411,0
998,84412,0


In [1000]:
kaggle_df.to_csv("/Users/shrenik/Desktop/UCLA/Year-3/labels_1.csv", index=False, header=True)