In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # this is used for the plot the graph 
from datetime import date

from sklearn.decomposition import PCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import TensorDataset, DataLoader

%matplotlib inline

import random 
random.seed(42)

In [None]:
csv_path_1 = "training_dataset.csv"
csv_path_2 = "score.csv"

# 0. First, let's understand our data...

In [None]:
# Load the raw training table and the table to be scored.
df = pd.read_csv(csv_path_1)
df_score = pd.read_csv(csv_path_2)

# Index artifacts / bookkeeping columns that are not supposed to be in the dataset.
initial_cols_to_drop = ["Unnamed: 0","Unnamed: 0.1", "period", "test", "recent_date", "date"]
# errors="ignore" skips names a frame does not contain, so each frame is cleaned
# independently. The previous shared try/except skipped the df_score drop whenever
# the column was missing from df.
df = df.drop(columns=initial_cols_to_drop, errors="ignore")
df_score = df_score.drop(columns=initial_cols_to_drop, errors="ignore")
#df = df.rename(columns={"Unnamed: 0.1": "TODO_FIND_COLUMN_NAME_2"})
display(df.head())

In [None]:
df.describe()

In [None]:
df.info()

# 1. Generate Labels

In [None]:
# converts date from csv to a python datetime object making it easier to work with
def convert_dates(df):
    """Parse the known date columns of ``df`` in place.

    Each of 'most_recent_load_date', 'first_load_date', 'load_day' and 'dt'
    that is present in ``df`` is parsed with the '%Y-%m-%d' format.

    Best-effort by design: a column that is absent is skipped, and a column
    whose values do not match the format is left untouched. Only parsing
    errors are tolerated; anything else propagates instead of being hidden.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None
    """
    dates_columns = ['most_recent_load_date', 'first_load_date', 'load_day', 'dt']
    for col_name in dates_columns:
        if col_name not in df.columns:
            continue
        try:
            df[col_name] = pd.to_datetime(df[col_name], format='%Y-%m-%d')
        except (ValueError, TypeError):
            # Values do not follow the expected format: keep the raw column.
            continue

convert_dates(df)
convert_dates(df_score)

In [None]:
df['most_recent_load_date'].head()

In [None]:
# Thresholds used by the labelling step: the 0.75 quantile of each column,
# computed over every row of df.
total_loads75 = df.total_loads.quantile(0.75) # finds 75th percentile of loads
most_recent_load_date75 = df.most_recent_load_date.quantile(0.75) # finds 75th percentile of most recent load date

# Show both thresholds for a visual sanity check.
print(total_loads75)
print(most_recent_load_date75)


# Manual Check
# sorted_dts = sorted(list(df.most_recent_load_date))
# quartile_estimate_index = int(len(sorted_dts)*0.75)
# print("SORTED INDEX", sorted_dts[quartile_estimate_index])

In [None]:
# A row is labelled 1 when BOTH its total_loads and its most_recent_load_date are at
# or above the 75th-percentile thresholds (total_loads75, most_recent_load_date75),
# otherwise 0.
# Vectorised replacement for the row-by-row loop: a comparison involving a missing
# value evaluates to False, so such rows get label 0 exactly as before.
is_high_volume = df["total_loads"] >= total_loads75
is_recent = df["most_recent_load_date"] >= most_recent_load_date75
df["label"] = (is_high_volume & is_recent).astype(int)

In [None]:
df["dt"].groupby([df["dt"].dt.year, df["dt"].dt.month]).count().plot(kind="bar")

In [None]:
groups = df.groupby("id_driver")

In [None]:
# NO: dt, weekday, year, id_carrier_number, dim_preferred_lanes, load_day, loads
# Collapse the per-load rows of each driver into ONE record per driver.
# The values are appended positionally, in this order:
#   id_driver, dim_carrier_type, <idxmax_cols>, num_trucks, <iloc_cols>,
#   num_trips_made, label
# The last two used to be appended as (label, num_trips_made), which did not match
# the column list applied when the records are turned into a DataFrame
# (... "total_loads", "num_trips_made", "label") and therefore swapped the two columns.

# Columns summarised by their most frequent non-null value.
idxmax_cols = ["dim_carrier_company_name", "home_base_city", "home_base_state", 
               "carrier_trucks", "signup_source", "ts_signup", "ts_first_approved",
              "days_signup_to_approval"]
# Columns summarised by their first non-null value after sorting by load_day, newest first.
iloc_cols = ["interested_in_drayage", "port_qualified", "driver_with_twic", 
             "first_load_date", "most_recent_load_date", "marketplace_loads_otr", 
             "marketplace_loads_atlas", "marketplace_loads", "brokerage_loads_otr",
             "brokerage_loads_atlas", "brokerage_loads", "total_loads"]
# Failures that mean "no usable value for this driver": missing column, empty
# series, or values that cannot be aggregated. They become None in the record.
lookup_errors = (KeyError, ValueError, IndexError, TypeError)

new_arr = []
for key, group in groups:
    # Sort a copy instead of mutating the group slice in place.
    group = group.sort_values(by="load_day", ascending=False)
    temp_arr = [key]

    # Carrier type: "Both" when the driver appears with two distinct types,
    # None when there is no non-null type, otherwise the value of the newest row.
    n_carrier_types = group["dim_carrier_type"].nunique()
    if n_carrier_types == 2:
        temp_arr.append("Both")
    elif n_carrier_types == 0:
        temp_arr.append(None)
    else:
        temp_arr.append((group["dim_carrier_type"].iloc[0]))

    for col in idxmax_cols:
        try:
            temp_arr.append(group[col].value_counts().dropna(how="any").idxmax())
        except lookup_errors:
            temp_arr.append(None)

    try:
        temp_arr.append(group["num_trucks"].dropna(how="any").mean())
    except lookup_errors:
        temp_arr.append(None)

    for col in iloc_cols:
        try:
            temp_arr.append(group[col].dropna(how="any").iloc[0])
        except lookup_errors:
            temp_arr.append(None)

    # num_trips_made: number of load rows recorded for this driver.
    temp_arr.append(group.shape[0])

    # label: the most frequent label among the driver's rows.
    temp_arr.append(group["label"].value_counts().dropna(how="any").idxmax())

    # dtype=object keeps every value as-is; without it numpy may coerce a record
    # made only of strings and numbers into an all-string array.
    new_arr.append(np.array(temp_arr, dtype=object))

In [None]:
# Column names for the per-driver records held in new_arr.
# They are applied positionally by the DataFrame constructor below, so their order
# has to match the order in which values were appended to each record.
column_names = ["id_driver", "dim_carrier_type", "dim_carrier_company_name", "home_base_city", "home_base_state", 
                "carrier_trucks", "signup_source", "ts_signup", "ts_first_approved",
                "days_signup_to_approval", "num_trucks", "interested_in_drayage", 
                "port_qualified", "driver_with_twic", 
                "first_load_date", "most_recent_load_date", "marketplace_loads_otr", 
                "marketplace_loads_atlas", "marketplace_loads", "brokerage_loads_otr",
                "brokerage_loads_atlas", "brokerage_loads", "total_loads", "num_trips_made", "label"]

# Rebinds df: from here on df holds one row per driver instead of one row per load.
df = pd.DataFrame(np.array(new_arr), columns=column_names)

In [None]:
convert = ["id_driver", "days_signup_to_approval", "marketplace_loads_otr", 
               "marketplace_loads_atlas", "marketplace_loads", "brokerage_loads_otr",
               "brokerage_loads_atlas", "brokerage_loads", "total_loads", "num_trips_made", "label",
               "num_trucks", "dim_carrier_type", "dim_carrier_company_name", "home_base_city", "home_base_state",
               "interested_in_drayage", "port_qualified", "signup_source", "driver_with_twic"]
for col in convert:
    df[col] = df[col].convert_dtypes()

In [None]:
df.info()

In [None]:
df["label"].describe()

In [None]:
groups = df_score.groupby("id_driver")

In [None]:
# # NO: dt, weekday, year, id_carrier_number, dim_preferred_lanes, load_day, loads
# new_arr = []
# for key, group in groups:
#     group.sort_values(by="load_day", ascending=False, inplace=True)
#     temp_arr = []
#     temp_arr.append(key)
    
#     if group["dim_carrier_type"].nunique() == 2:
#         temp_arr.append("Both")
#     elif group["dim_carrier_type"].nunique() == 0:
#         temp_arr.append(None)
#     else:
#         temp_arr.append((group["dim_carrier_type"].iloc[0]))
    
    
#     idxmax_cols = ["dim_carrier_company_name", "home_base_city", "home_base_state", 
#                    "carrier_trucks", "signup_source", "ts_signup", "ts_first_approved",
#                   "days_signup_to_approval"]
    
#     for col in idxmax_cols:
#         try:
#             temp_arr.append(group[col].value_counts().dropna(how="any").idxmax())
#         except:
#             temp_arr.append(None)
    
#     try:
#         temp_arr.append(group["num_trucks"].dropna(how="any").mean())
#     except:
#         temp_arr.append(None)
        
#     iloc_cols = ["interested_in_drayage", "port_qualified", "driver_with_twic", 
#                  "first_load_date", "load_day", "marketplace_loads_otr", 
#                  "marketplace_loads_atlas", "marketplace_loads", "brokerage_loads_otr",
#                  "brokerage_loads_atlas", "brokerage_loads"]
#     for col in iloc_cols:
#         try:
#             temp_arr.append(group[col].dropna(how="any").iloc[0])
#         except:
#             temp_arr.append(None)
            
#     temp_arr.append(group.shape[0])
    
#     new_arr.append(np.array(temp_arr))

In [None]:
# column_names = ["id_driver", "dim_carrier_type", "dim_carrier_company_name", "home_base_city", "home_base_state", 
#                 "carrier_trucks", "signup_source", "ts_signup", "ts_first_approved",
#                 "days_signup_to_approval", "num_trucks", "interested_in_drayage", 
#                 "port_qualified", "driver_with_twic", 
#                 "first_load_date", "most_recent_load_date", "marketplace_loads_otr", 
#                 "marketplace_loads_atlas", "marketplace_loads", "brokerage_loads_otr",
#                 "brokerage_loads_atlas", "brokerage_loads", "num_trips_made"]

# df_score = pd.DataFrame(np.array(new_arr), columns=column_names)

In [None]:
# convert = ["id_driver", "days_signup_to_approval", "marketplace_loads_otr", 
#                "marketplace_loads_atlas", "marketplace_loads", "brokerage_loads_otr",
#                "brokerage_loads_atlas", "brokerage_loads", "num_trips_made",
#                "num_trucks", "dim_carrier_type", "dim_carrier_company_name", "home_base_city", "home_base_state",
#                "interested_in_drayage", "port_qualified", "signup_source", "driver_with_twic"]
# for col in convert:
#     df_score[col] = df_score[col].convert_dtypes()

In [None]:
# NO: dt, weekday, year, id_carrier_number, dim_preferred_lanes, load_day, loads
# Broadcast two per-driver aggregates back onto every row of df_score:
#   most_recent_load_date - the latest non-null load_day of the row's driver
#   num_trips_made        - the number of rows recorded for the row's driver
# groupby().transform() returns one value per original row, so no lookup table or
# second pass over the rows is needed. A driver without any valid load_day now gets
# a missing value; the previous lookup stored None and then failed when subscripted.
score_by_driver = df_score.groupby("id_driver")
df_score["most_recent_load_date"] = score_by_driver["load_day"].transform("max")
df_score["num_trips_made"] = score_by_driver["id_driver"].transform("size")

In [None]:
display(df_score["num_trips_made"])

In [None]:
df_score.info()

In [None]:
# # NO: dt, weekday, year, id_carrier_number, dim_preferred_lanes, load_day, loads



# ["id_driver", "dim_carrier_type", "dim_carrier_company_name", "home_base_city", "home_base_state", 
#                 "carrier_trucks", "signup_source", "ts_signup", "ts_first_approved",
#                 "days_signup_to_approval", "num_trucks", "interested_in_drayage", 
#                 "port_qualified", "driver_with_twic", 
#                 "first_load_date", "most_recent_load_date", "marketplace_loads_otr", 
#                 "marketplace_loads_atlas", "marketplace_loads", "brokerage_loads_otr",
#                 "brokerage_loads_atlas", "brokerage_loads", "total_loads", "num_trips_made", "label"]

# Per-load columns that are not kept as features in df_score.
drop_cols = ["dt", "weekday", "year", "id_carrier_number", "dim_preferred_lanes", "load_day", "loads"]
# errors="ignore" skips names that are already gone, without hiding unrelated failures.
df_score = df_score.drop(columns=drop_cols, errors="ignore")

In [None]:
df_score.info()

# 3. Basic Statistics

In [None]:
# Correlations are only defined for numeric/boolean columns. Selecting them
# explicitly keeps this cell working on pandas versions where DataFrame.corr()
# raises on non-numeric columns instead of silently dropping them.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()

# For every column of df, show its correlation with the other numeric columns,
# strongest first; columns left out of the matrix get an explicit message.
for col_name in df.columns:
    print(col_name)
    if col_name in corr_matrix.columns:
        display(corr_matrix[col_name].sort_values(ascending=False))
    else:
        print("{} is not of type integer".format(col_name))
    print('---------------------------------------------------------------------')

Also, year and TODO_FIND_COLUMN_NAME_2 are highly correlated and have a similar impact on the label, so could we drop one of them?

Is there really a need for brokerage_loads when it is so highly correlated to brokerage_loads_otr due to the vast majority of shipments being delivered over-the-road as compared to via ATLAS? 

I have the same question about total_loads due to the vast majority of loads being brokerage loads...

What's the point of having both year and date?

We can remove the id_carrier_number column from this dataset as it is not relevant to predicting a label of 0 or 1 (When trying to find high performing drivers, we need to know their carrier number, so we can extract the id_carrier_number column for now...)

We could one-hot-encode sign-up source and see its effect on labels.

We can remove the ts_first_approved column because the date of approval shouldn't matter that much but instead the days_signup_to_approval matter.

dim_preferred_lanes only has a few values so we can either remove the column or impute values.

Also, first_load_date, most_recent_load_date and load_day shouldn't matter much as raw dates. Instead we can derive values such as:

- number of days doing the job = most_recent_load_date - first_load_date
- days_from_last_load_to_today = todays_date - most_recent_load_date

There are also a couple other features we need to impute.

Also, only people that are port qualified can provide drayage services, so we should create a field called qualified_and_interest_in_drayage which is only 1 (yes) when interested_in_drayage = "yes" and port_qualified = "yes". We can also cross these features...

# 4. Data Feature Extraction Plan and Pipeline

In [None]:
# df["location"] = list(zip(df["home_base_city"], df["home_base_state"]))# feature cross to get (city, state) tuple
# # feature cross for interested in drayage and port qualified
# df["drayage_interested_port_qualified"] = list(zip(df["interested_in_drayage"], df["port_qualified"]))
# display(df["location"])
# display(df["drayage_interested_port_qualified"])

In [None]:
def drayage_feature_cross(df):
    """Add the 'drayage_interested_port_qualified' feature cross to ``df`` in place.

    Each row's (interested_in_drayage, port_qualified) pair is mapped to a
    six-character code string, one code per known combination. A pair that is
    not one of the six known combinations (missing values included) is mapped
    to "000000"; previously such a row produced no code at all, so the column
    assignment failed with a length mismatch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'interested_in_drayage' and 'port_qualified'.

    Returns
    -------
    None
        The new column is written to ``df`` and displayed.
    """
    # (interested_in_drayage, port_qualified) -> code
    cross_codes = {
        ("yes", "yes"): "000001",
        ("yes", "no"): "000010",
        ("no", "yes"): "000100",
        ("no", "no"): "001000",
        ("not specified", "yes"): "010000",
        ("not specified", "no"): "100000",
    }
    unknown_code = "000000"

    # Dictionary lookup never evaluates `value == "yes"`, so pd.NA / NaN entries
    # cannot raise; they simply fall through to unknown_code.
    drayage_arr = [
        cross_codes.get(pair, unknown_code)
        for pair in zip(df["interested_in_drayage"], df["port_qualified"])
    ]

    df["drayage_interested_port_qualified"] = np.array(drayage_arr)
    display(df["drayage_interested_port_qualified"])

drayage_feature_cross(df)
drayage_feature_cross(df_score)

In [None]:
# Keep the driver ids aside before the column is removed from the feature frames.
id_driver_number_col = np.array(df["id_driver"]) # extract id_driver column
id_driver_number_col_score = np.array(df_score["id_driver"]) # extract id_driver column

# Identifier, raw location/drayage inputs (already crossed) and signup timestamps.
drop_cols = ["id_driver", "home_base_city",
             "home_base_state", "interested_in_drayage", "port_qualified", 
             "ts_signup", "ts_first_approved"]
# Each frame is cleaned independently; errors="ignore" skips absent names, so a
# column missing from one frame no longer prevents the drop on the other frame.
df = df.drop(columns=drop_cols, errors="ignore")
df_score = df_score.drop(columns=drop_cols, errors="ignore")

In [None]:
# Tally how many driver rows carry each carrier company name.
names = {}
for company in df["dim_carrier_company_name"]:
    names[company] = int(names.get(company, 0) + 1)

# (name, count) pairs, most frequent first; ties keep first-seen order.
listo = sorted(names.items(), key=lambda pair: int(pair[1]), reverse=True)
#listo[:50]

In [None]:
# Keep the 50 most frequent company names and report how many rows they cover.
listo = listo[:50]
count_50 = sum([x[1] for x in listo])
print("# 50: ", count_50)
# Share of ALL rows covered by the top 50 names. The denominator is the total
# row count; it used to be the number of distinct names, which is not a share.
total_count = sum(names.values())
print("Percentage 50: ", count_50/total_count)

names_arr = [tuples[0] for tuples in listo]
print(names_arr)

In [None]:
def bucketize(df):
    """Bucket rare company names and derive ``days_tenured``, in place.

    * 'dim_carrier_company_name': values found in the module-level ``names_arr``
      are kept; every other value, missing values included, becomes "Other".
    * 'days_tenured': whole days between 'first_load_date' and
      'most_recent_load_date'; missing when either date is missing.

    The previous row loop tested ``value != np.nan``, which is always True, so
    its missing-date branch never ran and a missing date either raised or
    slipped through as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'dim_carrier_company_name', 'most_recent_load_date' and
        'first_load_date'.

    Returns
    -------
    None
    """
    # Missing markers are never a valid bucket, even if one made it into names_arr.
    top_names = [name for name in names_arr if not pd.isna(name)]
    company = df["dim_carrier_company_name"]
    df["dim_carrier_company_name"] = company.where(company.isin(top_names), "Other")

    # to_datetime is a no-op for datetime columns and turns None/NaN into NaT, so
    # the subtraction is defined for every row and missing dates stay missing.
    most_recent = pd.to_datetime(df["most_recent_load_date"])
    first = pd.to_datetime(df["first_load_date"])
    df["days_tenured"] = (most_recent - first).dt.days

bucketize(df)
bucketize(df_score)

In [None]:
# Get total_loads for df_score in order to find labels
# total_loads = marketplace_loads + brokerage_loads, computed column-wise in one
# step instead of one Python-level addition per row.
df_score["total_loads"] = df_score["marketplace_loads"] + df_score["brokerage_loads"]

In [None]:
total_loads_score_75 = df_score.total_loads.quantile(0.75) # finds 75th percentile of loads
most_recent_load_date_score_75 = df_score.most_recent_load_date.quantile(0.75) # finds 75th percentile of most recent load date

print(total_loads_score_75)
print(most_recent_load_date_score_75)

In [None]:
# Label each df_score row with the same rule used for the training data: 1 when
# both total_loads and most_recent_load_date reach their 75th-percentile
# thresholds, else 0. Comparisons against missing values are False, giving 0.
score_is_high_volume = df_score["total_loads"] >= total_loads_score_75
score_is_recent = df_score["most_recent_load_date"] >= most_recent_load_date_score_75
score_labels = (score_is_high_volume & score_is_recent).astype(int).tolist()
print(len(score_labels))
print(score_labels)

In [None]:
# Columns the labels were derived from, plus leftover date columns: they must not
# remain as input features.
cols = ["most_recent_load_date", "first_load_date", "weekday", "load_day", "total_loads"]
# Each frame is cleaned independently; errors="ignore" skips absent names, so a
# column missing from df no longer prevents the drop on df_score.
df = df.drop(columns=cols, errors="ignore")
df_score = df_score.drop(columns=cols, errors="ignore")

In [None]:
df_unlabeled = df.drop(columns=["label"])
labels = df["label"].copy()

In [None]:
df_unlabeled.info()

In [None]:
df_score.info()

In [None]:
cols = list(df_unlabeled.columns.values)
print(cols)

In [None]:
# Give df_score the same columns, in the same order, as the training features.
# .copy() makes it an independent frame so the assignments below do not write
# into a slice of the previous df_score.
df_score = df_score[cols].copy()

# Non-numeric entries become NaN so they can be imputed later.
df_score['num_trucks'] = pd.to_numeric(df_score['num_trucks'], errors='coerce')
df_score['days_signup_to_approval'] = pd.to_numeric(df_score['days_signup_to_approval'], errors='coerce')

convert = ["id_driver", "days_signup_to_approval", "marketplace_loads_otr", 
               "marketplace_loads_atlas", "marketplace_loads", "brokerage_loads_otr",
               "brokerage_loads_atlas", "brokerage_loads", "num_trips_made",
               "num_trucks", "dim_carrier_type", "dim_carrier_company_name",
               "interested_in_drayage", "port_qualified", "signup_source", "driver_with_twic"]
# Some listed columns may no longer exist; skip those explicitly instead of
# hiding every error behind a bare except.
for col in convert:
    if col in df_score.columns:
        df_score[col] = df_score[col].convert_dtypes()

In [None]:
df_concat = pd.concat([df_unlabeled, df_score])
df_concat.info()

In [None]:
# Re-infer dtypes on the combined frame for the columns named in `convert`;
# names that are not columns of df_concat are skipped explicitly.
for col in convert:
    if col in df_concat.columns:
        df_concat[col] = df_concat[col].convert_dtypes()

In [None]:
df_concat.info()

In [None]:
# drop = ["dim_carrier_type", "dim_carrier_company_name", "carrier_trucks", 
#                                 "signup_source", "driver_with_twic", "drayage_interested_port_qualified"]
# df_concat = df_concat.drop(columns=drop)

In [None]:
imputer = IterativeImputer()
# Columns that are one-hot encoded.
# "drayage_interested_port_qualified" holds string category codes ("000001", ...),
# not quantities, so it belongs here; it used to fall into the numeric pipeline
# where the codes were imputed and scaled as if they were numbers.
categorical_features_one_hot = ["dim_carrier_type", "dim_carrier_company_name", "carrier_trucks", 
                                "signup_source", "driver_with_twic",
                                "drayage_interested_port_qualified"]

# Everything that is not one-hot encoded goes through impute + standardise.
df_num = df_concat.drop(columns=categorical_features_one_hot)
# df_num = df_concat
numerical_features = list(df_num)

num_pipeline = Pipeline([
        ('imputer', imputer),
        ('std_scaler', StandardScaler()),
    ])

# NOTE(review): `sparse=` was renamed `sparse_output=` in newer scikit-learn
# releases - confirm against the installed version.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, numerical_features),
        ("cat", OneHotEncoder(sparse=False), categorical_features_one_hot), #sparse=False
    ])
# NOTE(review): the transformers are fitted on training and scoring rows together,
# before any train/test split, so held-out rows influence the imputation/scaling.
df_prepared = full_pipeline.fit_transform(df_concat)

In [None]:
display(df_prepared)

In [None]:
# df_prepared was built from pd.concat([df_unlabeled, df_score]), so the labelled
# rows come first. Derive the boundary from the data instead of hard-coding 5291.
n_labelled = len(df_unlabeled)
X = df_prepared[:n_labelled]
y = labels
# Prepared feature rows of the scoring set (features, despite the name).
score_y = df_prepared[n_labelled:]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

# 5. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
lr_predicted = lin_reg.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, lr_predicted)
rmse = np.sqrt(mse)
rmse

In [None]:
from sklearn.metrics import r2_score

r2_score(y_test, lr_predicted)

In [None]:
import statsmodels.api as sm

X_new = sm.add_constant(X_train)
toyregr_sm = sm.OLS(y_train.astype(float), X_new.astype(float))
results_sm = toyregr_sm.fit()

print(results_sm.summary())

In [None]:
from sklearn.linear_model import LogisticRegression

y_temp = y_train.astype('int')

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_temp)
log_predicted = log_reg.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, log_predicted)
rmse = np.sqrt(mse)
rmse

In [None]:
r2_score(y_test, log_predicted)

# 6. PCA

In [None]:
# Fit PCA on the training split only, then project both splits.
# NOTE(review): X_train and X_test are overwritten with their projections, so every
# later cell sees the reduced features and re-running this cell projects them again.
pca = PCA(n_components=0.95) # Create an instance of PCA model
pca.fit(X_train) # Fit X_train to PCA
X_train = pca.transform(X_train) # transform training data
X_test = pca.transform(X_test) # transform test data
print(pca.explained_variance_)
print(pca.n_components_)

In [None]:
print(X_train)

In [None]:
print(X_test)

In [None]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

# 7. Ensemble Methods (Random Forest and AdaBoost)

In [None]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# assuming we have X_train,X_test,y_train,y_test at this time
# I first run Random Forest using random hard coded settings to get a baseline
# random_state pins the bootstrap samples and feature draws so the baseline score
# is the same on every run; random.seed() alone does not seed scikit-learn.
rf = RandomForestRegressor(n_estimators=80,max_depth=7,max_features=3,random_state=42)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
test_score = r2_score(y_test,y_pred)
test_score

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
# I then use RandomizedSearchCV to find the optimal hyperparameters
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(5, 100, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 7, 10, 20]
min_samples_leaf = [1, 2, 5, 10]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)
rf_random.best_params_

In [None]:
# I then output the r2 score again as a sanity check to verify that my RanomdizedSearchCV actually did find the best settings
rf = RandomForestRegressor(n_estimators=700,max_depth=47,max_features='auto',min_samples_split=2,min_samples_leaf=2,bootstrap=True)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
from sklearn.metrics import r2_score
test_score = r2_score(y_test,y_pred)
test_score

In [None]:
# we then use the hyperparameters we found from the RandomizedSearchCV to do a second more thorough check around that range
from sklearn.model_selection import GridSearchCV
param_grid = {
    'bootstrap': [True],
    'max_depth': [40, 45, 50, 55, 60],
    'max_features': [2, 5, 7, 10, 12],
    'min_samples_leaf': [2, 3, 4, 5, 6],
    'min_samples_split': [2, 3, 4, 5, 6],
    'n_estimators': [100, 200, 500, 700, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# we then output the results using the optimal hyperparameters to check that our model has improved
rf = RandomForestRegressor(n_estimators=500,max_depth=40,max_features=7,min_samples_split=4,min_samples_leaf=2,bootstrap=True)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
from sklearn.metrics import r2_score
test_score = r2_score(y_test,y_pred)
test_score

In [None]:
# AdaBoost using the same settings
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

dt = DecisionTreeClassifier(max_depth=3)
ab = AdaBoostClassifier(base_estimator=dt,learning_rate=1,n_estimators=50)
ab.fit(X_train,y_train)
y_pred = ab.predict(X_test)
test_score = r2_score(y_test,y_pred)
accuracyResult = metrics.accuracy_score(y_test,y_pred)
print("R2 Score: ",test_score)
print("Accuracy Score: ",accuracyResult)

# 8. Neural Network Classifier

In [None]:
# Build the training tensors from the prepared numeric matrices.
# The raw `df` cannot be used here: it still contains string/categorical columns,
# which torch.tensor() cannot convert. X_train / y_train are used as they stand
# when this cell runs (i.e. PCA-projected if the PCA section was executed).
NN_X_train = torch.tensor(np.asarray(X_train, dtype=np.float32))
NN_y_train = torch.tensor(np.asarray(y_train, dtype=np.float32))
trainset = TensorDataset(NN_X_train, NN_y_train)
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)

In [None]:
# Input width is taken from the training tensor instead of being hard-coded.
NUM_FEATURES = NN_X_train.shape[1]
NUM_HIDDEN1_NODES = 400
NUM_EPOCHS = 30

# One hidden layer; the single output unit is the logit of P(label == 1).
model = nn.Sequential(nn.Linear(NUM_FEATURES, NUM_HIDDEN1_NODES),
                      nn.Sigmoid(),
                      nn.Linear(NUM_HIDDEN1_NODES, 1)
                     )

# Binary target with one logit -> BCEWithLogitsLoss. CrossEntropyLoss expects one
# logit per class and cannot be used with a single output unit.
criterion = nn.BCEWithLogitsLoss()
# `optim` was never imported on its own; reach it through the imported torch package.
optimizer = torch.optim.SGD(model.parameters(), lr=0.003)

for e in range(NUM_EPOCHS):
    running_loss = 0
    # The batch target is NOT called `labels`, so the notebook-level `labels`
    # Series is not overwritten by the loop.
    for batch_data, batch_labels in trainloader:
        optimizer.zero_grad()
        output = model(batch_data.float())
        # Targets as a float column vector, matching the (batch, 1) output.
        loss = criterion(output, batch_labels.float().view(-1, 1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Training loss: {running_loss/len(trainloader)}")
    
# class NeuralNet(nn.Module):
#     def __init__(self):
#         super(NeuralNet, self).__init__()
        
#         # Inputs to hidden layer linear transformation
#         self.hidden1 = nn.Linear(NUM_FEATURES, NUM_HIDDEN1_NODES)
#         self.output = nn.Linear(NUM_HIDDEN1_NODES, 1)
        
#         self.sigmoid = nn.Sigmoid()
        
#     def forward(self, x):
#         x = self.hidden1(x)
#         x = self.sigmoid(x)
#         x = self.output(x)
#         x = self.softmax(x)
#         return x

In [None]:
# model = Network()
# model

# 9. Cross-Validate

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# 10-fold cross-validation with a fixed shuffle.
kfold = model_selection.KFold(n_splits=10, random_state=42, shuffle=True)

# `sample()` and `df_labels` were placeholders that are defined nowhere in this
# notebook. A logistic-regression classifier is evaluated on the labelled rows
# (X, y); df_prepared cannot be used directly because it also contains the
# unlabelled scoring rows.
# NOTE(review): swap in whichever estimator should actually be cross-validated.
sample_model_kfold = LogisticRegression(max_iter=1000)

sample_results_kfold = model_selection.cross_val_score(sample_model_kfold, X, y.astype('int'), cv=kfold)