In [21]:
import numpy as np
import pandas as pd # data manipulation and analysis
import os # to get absolute path
import datetime # to get current date
from sklearn.ensemble import RandomForestClassifier # for creating a random forest classification model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc # to calculate the accuracy of the model
from mlxtend.feature_selection import SequentialFeatureSelector as sfs # for performing stepwise feature selection

In [22]:
# Read data
# absolute path: npath = "/Users/tanlab/Desktop/YC_Selection_02_14_2024/Data/Raw/winequality-white.csv"
# absolute path: npath = os.path.abspath(os.pardir)+"//Data/Raw/winequality-white.csv"

# We want the codes also work on other computers, so relative path will be our first choice.
npath = '../Data/Raw/winequality-white.csv' # relative path
df = pd.read_csv(npath,index_col = 0, sep=';') # This dataset seperates with ";"

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df.values[:,:-1], # This takes all rows and all columns except the last one (data)
    df.values[:,-1:], # This selects all rows and only the last column (result - quality)
    test_size = 0.25,
    random_state = 42) # This sets a seed for the random number generator

y_train = y_train.ravel() # The ravel method is used to reduce a multidimensional array to a one-dimensional array
y_test = y_test.ravel()

print('Training dataset shape:', X_train.shape, y_train.shape) # shape - to view the dimensions of an array
print('Testing dataset shape:', X_test.shape, y_test.shape)

Training dataset shape: (3673, 10) (3673,)
Testing dataset shape: (1225, 10) (1225,)


In [23]:
# Repeat k_features = 5

# Build RF classifier to use in feature selection
clf = RandomForestClassifier(n_estimators = 100, n_jobs = -1) 
# n_estimators=100 - The number of decision trees to be used by the model is 100
# n_jobs = -1 - When set to -1, the algorithm uses all available cores on your computer

# Build step forward feature selection
sfs1 = sfs(clf,
           k_features = 5, # Using k_features = 5. Number of features finally selected is 6 here
           forward = True,
           floating = False,
           verbose = 2,
           scoring = 'accuracy',
           cv = 5)

# Perform SFFS
sfs1 = sfs1.fit(X_train, y_train)
# Our best performing model, given our scoring metric, is some subset of 5 features, with a score of 0.635

# Which features?
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols) # The columns at these indexes are those which were selected

# Build full model with selected features
clf = RandomForestClassifier(n_estimators = 1000, random_state = 42, max_depth = 4) # A bigger tree, still use pevious seed
clf.fit(X_train[:, feat_cols], y_train) # X_train[:, feat_cols] - only use the features we selected

y_train_pred = clf.predict(X_train[:, feat_cols]) # Also only use the features we selected
print('Training accuracy on selected features: %.3f' % acc(y_train, y_train_pred))

y_test_pred = clf.predict(X_test[:, feat_cols])
print('Testing accuracy on selected features: %.3f' % acc(y_test, y_test_pred))

# Build full model on ALL features, for comparison
clf = RandomForestClassifier(n_estimators = 1000, random_state = 42, max_depth = 4) # Same setting as above
clf.fit(X_train, y_train) # Here we use all features

y_train_pred = clf.predict(X_train)
print('Training accuracy on all features: %.3f' % acc(y_train, y_train_pred))

y_test_pred = clf.predict(X_test)
print('Testing accuracy on all features: %.3f' % acc(y_test, y_test_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   25.8s finished

[2024-02-14 14:24:51] Features: 1/5 -- score: 0.49468683386161005[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   23.7s finished

[2024-02-14 14:25:15] Features: 2/5 -- score: 0.5442462325529667[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   23.4s finished

[2024-02-14 14:25:38] Features: 3/5 -- score: 0.6038625368403492[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1

[0, 4, 6, 7, 9]
Training accuracy on selected features: 0.558
Testing accuracy on selected features: 0.513
Training accuracy on all features: 0.567
Testing accuracy on all features: 0.514


In [24]:
# When k_features = 6

# Build RF classifier to use in feature selection
clf = RandomForestClassifier(n_estimators = 100, n_jobs = -1) 
# n_estimators=100 - The number of decision trees to be used by the model is 100
# n_jobs = -1 - When set to -1, the algorithm uses all available cores on your computer

# Build step forward feature selection
sfs1 = sfs(clf,
           k_features = 6, # Using k_features = 6. Number of features finally selected is 6 here
           forward = True,
           floating = False,
           verbose = 2,
           scoring = 'accuracy',
           cv = 5)

# Perform SFFS
sfs1 = sfs1.fit(X_train, y_train)
# Our best performing model, given our scoring metric, is some subset of 6 features, with a score of 0.644 

# Which features?
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols) # The columns at these indexes are those which were selected. Results change here

# Build full model with selected features
clf = RandomForestClassifier(n_estimators = 1000, random_state = 42, max_depth = 4) # A bigger tree, still use pevious seed
clf.fit(X_train[:, feat_cols], y_train) # X_train[:, feat_cols] - only use the features we selected

y_train_pred = clf.predict(X_train[:, feat_cols]) # Also only use the features we selected
print('Training accuracy on selected features: %.3f' % acc(y_train, y_train_pred))

y_test_pred = clf.predict(X_test[:, feat_cols])
print('Testing accuracy on selected features: %.3f' % acc(y_test, y_test_pred))

# Build full model on ALL features, for comparison
clf = RandomForestClassifier(n_estimators = 1000, random_state = 42, max_depth = 4) # Same setting as above
clf.fit(X_train, y_train) # Here we use all features

y_train_pred = clf.predict(X_train)
print('Training accuracy on all features: %.3f' % acc(y_train, y_train_pred))

y_test_pred = clf.predict(X_test)
print('Testing accuracy on all features: %.3f' % acc(y_test, y_test_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   20.2s finished

[2024-02-14 14:27:33] Features: 1/6 -- score: 0.49414150401304935[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   26.3s finished

[2024-02-14 14:27:59] Features: 2/6 -- score: 0.5472379469498971[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   26.6s finished

[2024-02-14 14:28:25] Features: 3/6 -- score: 0.6022291423381342[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1

[0, 1, 2, 6, 7, 9]
Training accuracy on selected features: 0.561
Testing accuracy on selected features: 0.512
Training accuracy on all features: 0.567
Testing accuracy on all features: 0.514


In [18]:
# Calculate descriptive statistics
description = df.describe()

# Get current date
current_month = datetime.datetime.now().strftime("%m")
current_date = datetime.datetime.now().strftime("%d")
current_year = datetime.datetime.now().strftime("%Y")

# I set a relative path to put the "Description" file to the "Results" folder
directory_path = '../Results/' # relative path

# Define file name
file_name = f"{directory_path}/YC_Description_{current_month}_{current_date}_{current_year}.csv"

# Save descriptive statistics to CSV
description.to_csv(file_name)

print(f"Descriptive statistics saved to {file_name}")

Descriptive statistics saved to ../Results//YC_Description_02_14_2024.csv


# **The description of the output:**\
  In this practice, we first used a small random forest to identify five or six features, simplifying the model by reducing the number of features. Then, we employed a larger random forest to test the accuracy. The results showed that the accuracy using only five or six features was also high, only slightly lower than the accuracy achieved using all features.\