<a href="https://colab.research.google.com/github/tjvilliard/MahineLearning/blob/main/MachineLearning_FinalProj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
files.upload()

In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
def prediction_error(actual, prediction):
    if len(actual) != len(prediction):
        print("arrays not equal length")
        return None
    error = np.equal(actual, prediction)
    error_rate = np.round(1 - (sum(error) / len(actual)), 4)
    return error_rate

In [3]:
train_df = pd.read_csv("train_final.csv")
test_df = pd.read_csv("test_final.csv").set_index("ID")

In [None]:
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income>50K
0,53,Self-emp-not-inc,93449,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,1
1,33,Self-emp-not-inc,123424,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,47,Private,144844,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
3,40,Private,114580,HS-grad,9,Divorced,Craft-repair,Other-relative,White,Female,0,0,40,Vietnam,0
4,39,Private,115618,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,0


In [None]:
train_df.iloc[24999]

age                          18
workclass                     ?
fnlwgt                   192321
education          Some-college
education.num                10
marital.status    Never-married
occupation                    ?
relationship          Own-child
race                      White
sex                        Male
capital.gain                  0
capital.loss                  0
hours.per.week               40
native.country    United-States
income>50K                    0
Name: 24999, dtype: object

In [None]:
train_df.where(train_df.isin(["?"])).any()

TypeError: ignored

In [None]:
test_df.head()

Unnamed: 0_level_0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,33,Self-emp-not-inc,222162,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
2,68,Private,29240,HS-grad,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,0,12,United-States
3,34,Private,103596,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,United-States
4,57,Private,103403,5th-6th,3,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States
5,48,Private,152915,Some-college,10,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States


In [None]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23842 entries, 1 to 23842
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             23842 non-null  int64 
 1   workclass       23842 non-null  object
 2   fnlwgt          23842 non-null  int64 
 3   education       23842 non-null  object
 4   education.num   23842 non-null  int64 
 5   marital.status  23842 non-null  object
 6   occupation      23842 non-null  object
 7   relationship    23842 non-null  object
 8   race            23842 non-null  object
 9   sex             23842 non-null  object
 10  capital.gain    23842 non-null  int64 
 11  capital.loss    23842 non-null  int64 
 12  hours.per.week  23842 non-null  int64 
 13  native.country  23842 non-null  object
dtypes: int64(6), object(8)
memory usage: 2.7+ MB


In [4]:
# Fill missing values in test_df with column mode or median 
train_df_dropna = train_df.replace(["?"], np.nan).dropna()

for (columnName, columnData) in test_df.iteritems():
  if test_df[columnName].dtype == "object":
    most_common = test_df[columnName].mode()[0]
    print(most_common)
    test_df[columnName].fillna(most_common, inplace=True)
    test_df[columnName] = test_df[columnName].replace(["?"], most_common)
  else:
    fill_val = test_df[columnName].mean()
    test_df[columnName].fillna(fill_val, inplace=True)
    test_df[columnName] = test_df[columnName].replace(["?"], fill_val)



Private
HS-grad
Married-civ-spouse
Prof-specialty
Husband
White
Male
United-States


In [5]:
# declare encoder
encoder = OneHotEncoder(handle_unknown='ignore')

# categorical columns
cat_col = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

# fit the encoder to the training data categorical columns
encoder.fit(train_df[cat_col])

# encode the categorical columns of test and train df
train_encoding = encoder.transform(train_df[cat_col])
test_encoding = encoder.transform(test_df[cat_col])


In [6]:
# get the df of encoded cat variable columns
encoded_train_df = pd.DataFrame(train_encoding.toarray())

encoded_test_df = pd.DataFrame(test_encoding.toarray())
# align the index of encoded df
encoded_test_df.index = test_df.index

print(encoded_test_df.shape)
print(encoded_train_df.shape)

(23842, 101)
(25000, 101)


In [7]:
# drop original cat columns and join encoded df
target_col = train_df['income>50K']
encoded_train_df = encoded_train_df.join(target_col)
drop_col = cat_col.copy()
drop_col.append("income>50K")

train_df.drop(columns=drop_col, axis=1, inplace=True)
encoded_train_df = train_df.join(encoded_train_df)



In [8]:
reduced_test_df = test_df.drop(cat_col, axis=1)
encoded_test_df = reduced_test_df.join(encoded_test_df)

In [9]:
# convert df to np array and split x and y
train_array = encoded_train_df.to_numpy()
x_train, y_train = train_array[:, :-1], train_array[:, -1]

# prepare encoded test data
test_array = encoded_test_df.to_numpy()

In [108]:
len(test_array)

23842

In [10]:
x, x_holdout, y, y_holdout = train_test_split(x_train, y_train, test_size=0.10)

# training svm on more than 5-10% of the data resulted in unreasonable training times
# performance was still comparable to other classifiers
x_svm, x_holdout_svm, y_svm, y_holdout_svm = train_test_split(x_train, y_train, test_size=0.95)

#Using Svm to Classify test data

In [None]:
#SVM
# linear performed best experimentally
# kernels = ['linear', 'poly', 'rbf', 'sigmoid']
regulizers = [0.5, 1, 2, 5, 10, 50]
for c in regulizers:
  clf = svm.SVC(kernel='linear', C=c)
  clf.fit(x_svm, y_svm)

  train_pred = clf.predict(x_svm)
  holdout_pred = clf.predict(x_holdout_svm)

  # print("Kernel: ", k)
  print("Train Error: ", prediction_error(y_svm, train_pred))
  print("Test Error: ", prediction_error(y_holdout_svm, holdout_pred))
  print("\n")

Train Error:  0.2152
Test Error:  0.2056


Train Error:  0.2184
Test Error:  0.2059


Train Error:  0.2144
Test Error:  0.2051


Train Error:  0.2176
Test Error:  0.2052


Train Error:  0.2168
Test Error:  0.2052


Train Error:  0.2224
Test Error:  0.2081




In [None]:

svm_test_series = pd.Series(svm_test, name='Prediction')
svm_test_series.index = test_df.index
svm_test_series.to_csv("svm_submission.csv")

NameError: ignored

#Ensemble Methods


## Random Forest


In [11]:
depths =[1, 10, 20, 50, 100]
n_estimators = [100, 500, 1000]

for max_depth in depths:
  print("max_depth: ", max_depth)
  for n in n_estimators:
    print("n_estimators: ", n)
    rf_clf = RandomForestClassifier(n_estimators=n,max_depth=max_depth).fit(x,y)
    print("Train Score: ", rf_clf.score(x, y))
    print("Test Score: ", rf_clf.score(x_holdout, y_holdout))
    print('\n')



max_depth:  1
n_estimators:  100
Train Score:  0.7588444444444444
Test Score:  0.764


n_estimators:  500
Train Score:  0.7588444444444444
Test Score:  0.764


n_estimators:  1000
Train Score:  0.7588444444444444
Test Score:  0.764


max_depth:  10
n_estimators:  100
Train Score:  0.8655555555555555
Test Score:  0.8628


n_estimators:  500
Train Score:  0.8671555555555556
Test Score:  0.8624


n_estimators:  1000
Train Score:  0.8665333333333334
Test Score:  0.8616


max_depth:  20
n_estimators:  100
Train Score:  0.9257777777777778
Test Score:  0.8704


n_estimators:  500
Train Score:  0.9250222222222222
Test Score:  0.8708


n_estimators:  1000
Train Score:  0.9256888888888889
Test Score:  0.8704


max_depth:  50
n_estimators:  100
Train Score:  0.9999555555555556
Test Score:  0.8592


n_estimators:  500
Train Score:  0.9999555555555556
Test Score:  0.8636


n_estimators:  1000
Train Score:  0.9999555555555556
Test Score:  0.864


max_depth:  100
n_estimators:  100
Train Score:  0.99

In [None]:
rf_test = RandomForestClassifier(n_estimators=1000, max_depth=10).fit(x_train, y_train).predict(test_array)

In [None]:
rf_test_series = pd.Series(rf_test, name="Prediction")
rf_test_series.index = test_df.index
rf_test_series.to_csv("rf_submission.csv")

In [None]:
rf_test_series

## AdaBoost 

In [None]:
# learning rate of 1 conistently outperforms others
# learning_rates = [0.5, 0.75, 1, 1.5, 2, 5]
n_estimators = [1000, 2000, 5000, 10000]

for n in n_estimators:
    print("n_estimators: ", n)
  # for lr in learning_rates:
    clf = AdaBoostClassifier(n_estimators=n, learning_rate=1)

    clf.fit(x, y)

    train_pred = clf.predict(x)
    holdout_pred = clf.predict(x_holdout)


    print("Train Error: ", prediction_error(y, train_pred))
    print("Test Error: ", prediction_error(y_holdout, holdout_pred))
    print("\n")




n_estimators:  1000
Train Error:  0.1225
Test Error:  0.1308


n_estimators:  2000
Train Error:  0.1198
Test Error:  0.1328


n_estimators:  5000


KeyboardInterrupt: ignored

In [None]:
clf = AdaBoostClassifier(n_estimators=1000, learning_rate=1)
clf.fit(x_train, y_train)

ada_final_pred = clf.predict(test_array)

ada_test_series = pd.Series(ada_final_pred, name="Prediction")
ada_test_series.index = test_df.index
ada_test_series.to_csv("ada_submission.csv")

# Nearest Neighbors

In [None]:
k_neighbors = [3, 5, 10, 50, 100]
for k in range(5, 15):
  knc_uniform = KNeighborsClassifier(k)
  knc_distance = KNeighborsClassifier(k, weights="distance")

  knc_uniform.fit(x, y)
  knc_distance.fit(x, y)

  uni_train_pred = knc_uniform.predict(x)
  uni_holdout_pred = knc_uniform.predict(x_holdout)

  # dist_train_pred = knc_distance.predict(x)
  # dist_holdout_pred = knc_distance.predict(x_holdout)

  print("k: ", k)
  print('w: Uniform========')
  print("Train Error: ", prediction_error(y, uni_train_pred))
  print("Test Error: ", prediction_error(y_holdout, uni_holdout_pred))
  # print('w: Distance=======')
  # print("Train Error: ", prediction_error(y, uni_train_pred))
  # print("Test Error: ", prediction_error(y_holdout, uni_holdout_pred))
  # print('\n')

# MLP

In [None]:
alpha = [.0001, .001, .1, 1, 10]
for a in alpha:
  mlpc_logistic = MLPClassifier(alpha=a,  max_iter=1000, activation="logistic")
  mlpc_relu = MLPClassifier(alpha=a,  max_iter=1000, activation="relu")

  mlpc_logistic.fit(x, y)
  mlpc_relu.fit(x, y)

  log_train_pred = mlpc_logistic.predict(x)
  log_holdout_pred = mlpc_logistic.predict(x_holdout)

  relu_train_pred = mlpc_relu.predict(x)
  relu_holdout_pred = mlpc_relu.predict(x_holdout)

  print("alpha: ", a)
  print('Logistic ========')
  print("Train Error: ", prediction_error(y, log_train_pred))
  print("Test Error: ", prediction_error(y_holdout, log_holdout_pred))
  print('Relu ========')
  print("Train Error: ", prediction_error(y, relu_train_pred))
  print("Test Error: ", prediction_error(y_holdout, relu_holdout_pred))
  print("\n")

alpha:  0.0001
Train Error:  0.2328
Test Error:  0.24
Train Error:  0.2016
Test Error:  0.2098


alpha:  0.001
Train Error:  0.2333
Test Error:  0.2404
Train Error:  0.2147
Test Error:  0.22


alpha:  0.1
Train Error:  0.239
Test Error:  0.244
Train Error:  0.2018
Test Error:  0.2101


alpha:  1
Train Error:  0.239
Test Error:  0.244
Train Error:  0.214
Test Error:  0.2195


alpha:  10
Train Error:  0.239
Test Error:  0.244
Train Error:  0.761
Test Error:  0.756




# Naive Bays

In [None]:
gmnb = GaussianNB().fit(x, y)
mnb = MultinomialNB().fit(x, y)

gmnb_train_pred = gmnb.predict(x)
gmnb_holdout_pred = gmnb.predict(x_holdout)

mnb_train_pred = mnb.predict(x)
mnb_holdout_pred = mnb.predict(x_holdout)

# print('Gaussian ========')
# print("Train Error: ", prediction_error(y, gmnb_train_pred))
# print("Test Error: ", prediction_error(y_holdout, gmnb_holdout_pred))
# print('Multinomial ========')
# print("Train Error: ", prediction_error(y, mnb_train_pred))
# print("Test Error: ", prediction_error(y_holdout, mnb_holdout_pred))
# print("\n")

print('Gaussian ========')
print("Train Error: ", gmnb.score(x, y))
print("Test Error: ", gmnb.score(x_holdout, y_holdout))
print('Multinomial ========')
print("Train Error: ", mnb.score(x, y))
print("Test Error: ", mnb.score(x_holdout, y_holdout))
print("\n")

Train Error:  0.7949777777777778
Test Error:  0.808
Train Error:  0.7822666666666667
Test Error:  0.7976




In [None]:
# for submission
gnb_final = GaussianNB().fit(x_train, y_train).predict(test_array)
gnb_test_series = pd.Series(gnb_final, name="Prediction")
gnb_test_series.index = test_df.index
gnb_test_series.to_csv("gnb_submission.csv")

# Log Regression 

#Keras Test


In [None]:
!pip install tensorflow

In [13]:
import tensorflow as tf
from tensorflow import keras

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(x.shape[1],)),
    keras.layers.Dense(10, activation=tf.nn.relu),
	  keras.layers.Dense(10, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(x, y, epochs=100, batch_size=32)

print(model.evaluate(x_holdout, y_holdout))


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [16]:
keras_predictions = model.predict(test_array).flatten()





In [17]:
keras_predictions

array([6.1113659e-02, 5.4381764e-04, 2.2346221e-02, ..., 1.0000000e+00,
       5.5828369e-03, 9.9896216e-01], dtype=float32)

In [18]:
keras_test_series = pd.Series(keras_predictions, name="Prediction")
keras_test_series.index = test_df.index
keras_test_series.to_csv("keras_submission.csv")