In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Rain in Australia (weatherAUS) analysis

In [None]:
# Load the cleaned weatherAUS table from disk.
df_rain = pd.read_csv('datasets/weatherAUS_clean.csv')
rain_feats = df_rain.columns.values
# Features are every column except the first and the last; the last column is the target.
# NOTE(review): the first column is presumably a saved row index - confirm against the CSV.
rain_x = df_rain[rain_feats[1:-1]]
rain_y = df_rain[rain_feats[-1]]
# A fixed random_state keeps the 80/20 split identical across kernel restarts, so the
# losses and feature rankings computed further down are reproducible.
# The *_adult_* names are kept as-is because the rest of the notebook refers to them.
X_adult_train, X_adult_test, Y_adult_train, Y_adult_test = train_test_split(
    rain_x, rain_y, test_size=0.2, random_state=0
)

# Common Functions

In [None]:
def square_loss(y_true, y_pred):
    """Return the summed squared error divided by the number of rows in ``y_true``.

    Args:
        y_true: Array-like with a ``shape`` attribute holding the reference values.
        y_pred: Array-like of predictions that can be subtracted from ``y_true``.

    Returns:
        ``sum((y_pred - y_true) ** 2) / y_true.shape[0]``.
    """
    sample_count = y_true.shape[0]
    residuals = y_pred - y_true
    squared_error_total = np.sum(residuals ** 2)
    return squared_error_total / sample_count

def rank_feature_importance(feature_name, feature_score):
    """Pair each feature name with its score and order the pairs by descending score.

    Args:
        feature_name: Indexable collection of names exposing ``shape[0]``.
        feature_score: Indexable collection of scores, aligned with ``feature_name``.

    Returns:
        A list of ``(name, score)`` tuples, highest score first. Ties keep their
        original relative order.
    """
    total = feature_name.shape[0]
    scored_pairs = ((feature_name[k], feature_score[k]) for k in range(total))
    # To get only the ranked names, take element 0 of every returned pair.
    return sorted(scored_pairs, key=lambda pair: pair[1], reverse=True)

def select_features_filter(all_features, target_features):
    """Build a boolean mask marking which entries of ``all_features`` are targeted.

    Args:
        all_features: Sequence of feature names, in the order the mask should follow.
        target_features: Container of the feature names to keep.

    Returns:
        A list of booleans, one per entry of ``all_features``; ``True`` where the
        entry appears in ``target_features``.
    """
    return [candidate in target_features for candidate in all_features]

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Observation: with n_estimators=100 the resulting ranked_importance is not stable,
# so a larger forest is used for the ranking.
n_estimators = 1000
for i in range(1):
    # Train one forest per seed and score it on the held-out split.
    rf_clf = RandomForestClassifier(n_estimators=n_estimators, random_state=i).fit(
        X_adult_train, Y_adult_train
    )
    y_pred = rf_clf.predict(X_adult_test)
    loss = square_loss(Y_adult_test, y_pred)
    print(f"random state = {i}", f"loss = {loss}", sep="\n")
    # Later cells read ranked_importance, so it is kept as a notebook-level name.
    ranked_importance = rank_feature_importance(
        X_adult_train.columns, rf_clf.feature_importances_
    )
    print(ranked_importance)

In [None]:
# Sweep over "top-j most important features" subsets and record the random-forest
# squared loss for each subset size and each random seed.
all_features = list(X_adult_train.columns)
ranked_features = [x[0] for x in ranked_importance]

random_states = [0, 1]
num_estimators = 100

# Only the largest subset sizes are evaluated: at most the last 15 values of j,
# ending with j == len(ranked_features) (all features). Clamping start to 1
# avoids empty or negative slices when fewer than 15 features are available.
end = len(ranked_features) + 1
start = max(1, end - 15)
subset_sizes = range(start, end)

# One row per random state, one column per evaluated subset size. The width must
# match the number of evaluated sizes, not the total number of features, otherwise
# the row assignment below fails whenever the two differ.
square_loss_matrix = np.zeros((len(random_states), len(subset_sizes)))
for i, rs in enumerate(random_states):
    # Report the actual seed rather than its position in random_states.
    print(f"random state = {rs}")
    square_loss_lst = []
    for j in subset_sizes:
        print(f"j = {j}")
        target_features = ranked_features[:j]
        # Boolean column mask, so the selected columns keep their original order.
        features_filter = select_features_filter(all_features, target_features)
        X_adult_train_selected_features = X_adult_train.loc[:, features_filter]
        X_adult_test_selected_features = X_adult_test.loc[:, features_filter]
        rf_clf = RandomForestClassifier(random_state=rs, n_estimators=num_estimators)
        rf_clf.fit(X_adult_train_selected_features, Y_adult_train)
        square_loss_value = square_loss(Y_adult_test, rf_clf.predict(X_adult_test_selected_features))
        square_loss_lst.append(square_loss_value)
    square_loss_matrix[i, :] = np.array(square_loss_lst)

In [None]:
# Plot squared loss against the number of top-ranked features used, one line per seed.
# start/end are the subset-size bounds defined by the sweep cell.
x_axis = np.arange(start, end)
for i in range(len(random_states)):
    # Label each line with the seed value itself, not its index in random_states,
    # so the legend stays correct if the list of seeds changes.
    plt.plot(x_axis, square_loss_matrix[i], label=f"random state = {random_states[i]}")
plt.xlabel("number of most important features used")
plt.ylabel("squared loss")
plt.title("Random Forest")
plt.legend()
plt.show()

# Random forest selected features
Selecting the top 8 features performs almost as well as using the complete set of features.

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC


# Evaluate three classifiers on the 8 features ranked highest by the random forest.
rf_selected_features = [x[0] for x in ranked_importance[:8]]
num_features = len(rf_selected_features)
X_adult_train_selected_features = X_adult_train.loc[:, rf_selected_features]
X_adult_test_selected_features = X_adult_test.loc[:, rf_selected_features]

# Explicit seed for the forest. The earlier version read the loop variable `rs`
# left over from the feature-subset sweep (whose last value was 1); naming the
# seed here removes that hidden dependency on another cell.
rf_random_state = 1
num_estimators = 1000
rf_clf = RandomForestClassifier(random_state=rf_random_state, n_estimators=num_estimators)
ridge_clf = RidgeClassifier()
svc_clf = SVC()

# Same fit / predict / report sequence for every model, in the order RF, Ridge, SVC.
# After the loop, y_pred / acc / f1 / con_matrix hold the SVC results.
for header, clf in [
    ("RF classifier ------", rf_clf),
    ("Ridge classifier -----", ridge_clf),
    ("SVC Classifier -----", svc_clf),
]:
    print(f"{num_features} significant RF features with {header}")
    clf.fit(X_adult_train_selected_features, Y_adult_train)
    y_pred = clf.predict(X_adult_test_selected_features)

    acc = accuracy_score(Y_adult_test, y_pred)
    print(f"accuracy = {acc}")

    f1 = f1_score(Y_adult_test, y_pred)
    print(f"f1 score = {f1}")

    con_matrix = confusion_matrix(Y_adult_test, y_pred)
    print(f"confusion_matrix = {con_matrix}")


# Lasso Feature Selection

In [None]:
from sklearn import linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import normalize

In [None]:
def select_feature(all_features, feature_filter):
    """Return the entries of ``all_features`` whose mask position is truthy.

    Args:
        all_features: Indexable collection of feature names.
        feature_filter: Sequence of truthy/falsy flags; position ``k`` decides
            whether ``all_features[k]`` is kept.

    Returns:
        A list of the kept feature names, in their original order.
    """
    return [all_features[pos] for pos, keep in enumerate(feature_filter) if keep]

n = 10
loss_lst = []
features_used = []
alphas = np.linspace(1e-5, 1e-4, 15)
# For each L1 strength: fit a Lasso, list the features with non-zero coefficients,
# and record the test squared loss and the number of surviving features.
for a in alphas:
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # on newer versions this call raises. Replacing it with explicit scaling changes
    # the effective alpha scale, so confirm the installed version before migrating.
    lasso_clf = linear_model.Lasso(alpha=a, normalize=True, random_state=1).fit(
        X_adult_train, Y_adult_train
    )
    # A coefficient that is (numerically) zero means Lasso dropped that feature.
    feature_filter = np.logical_not(np.isclose(lasso_clf.coef_, 0))
    selected_features = select_feature(X_adult_train.columns, feature_filter)
    print(len(selected_features))
    print(selected_features)
    y_pred = lasso_clf.predict(X_adult_test)
    loss = square_loss(Y_adult_test, y_pred)
    loss_lst.append(loss)
    features_used.append(np.sum(feature_filter))

In [None]:
# Dual-axis figure over the Lasso alpha sweep: squared loss on the left y-axis (red)
# and number of features with non-zero coefficients on the right y-axis (blue).
fig, ax1 = plt.subplots()

# Left axis: test squared loss recorded for each alpha.
color = 'red'
ax1.set_xlabel('alpha (L1 constant)')
ax1.set_ylabel('squared loss', color=color)
ax1.plot(alphas, loss_lst, color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

# Right axis: count of features kept at each alpha.
color = 'blue'
ax2.set_ylabel('features used', color=color)  # we already handled the x-label with ax1
ax2.plot(alphas, features_used, color=color)
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()  # otherwise the right y-label is slightly clipped
# The title is set through pyplot after the layout call; statement order is kept as-is.
plt.title("Lasso squared Loss vs features used")

plt.show()

# LASSO selected features

['volatile acidity', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'sulphates', 'alcohol']
Random forest didn't use 'free sulfur dioxide'

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC

# Evaluate three classifiers on a hand-picked list of Lasso-selected features.
# NOTE(review): these names look like one-hot Adult-census columns, while the frames
# used here are split from 'datasets/weatherAUS_clean.csv' - confirm the list matches
# the loaded dataset and refresh it from the Lasso sweep output if it does not.
lasso_selected_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'marital_status_ Married-AF-spouse', 'marital_status_ Married-civ-spouse', 'marital_status_ Married-spouse-absent', 'marital_status_ Never-married', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Prof-specialty', 'occupation_ Protective-serv', 'occupation_ Sales', 'occupation_ Tech-support', 'occupation_ Transport-moving', 'relationship_ Husband', 'relationship_ Not-in-family', 'relationship_ Wife', 'race_ White', 'sex_ Female', 'native_country_ ?', 'native_country_ China', 'native_country_ Columbia', 'native_country_ England', 'native_country_ France', 'native_country_ Ireland', 'native_country_ South']

# Fail early with an actionable message if the list does not match the data;
# selecting missing labels with .loc would raise a KeyError anyway.
missing_features = [name for name in lasso_selected_features if name not in X_adult_train.columns]
if missing_features:
    raise KeyError(
        "lasso_selected_features contains columns that are not in X_adult_train: "
        f"{missing_features}. Update the list from the Lasso sweep output for this dataset."
    )

num_features = len(lasso_selected_features)
X_adult_train_selected_features = X_adult_train.loc[:, lasso_selected_features]
X_adult_test_selected_features = X_adult_test.loc[:, lasso_selected_features]

# Explicit seed for the forest. The earlier version read the loop variable `rs`
# left over from the feature-subset sweep (whose last value was 1); naming the
# seed here removes that hidden dependency on another cell.
rf_random_state = 1
num_estimators = 10
rf_clf = RandomForestClassifier(random_state=rf_random_state, n_estimators=num_estimators)
ridge_clf = RidgeClassifier()
svc_clf = SVC()

# Same fit / predict / report sequence for every model, in the order RF, Ridge, SVC.
# After the loop, y_pred / acc / f1 / con_matrix hold the SVC results.
for header, clf in [
    ("RF classifier ------", rf_clf),
    ("Ridge classifier -----", ridge_clf),
    ("SVC Classifier -----", svc_clf),
]:
    print(f"{num_features} significant LASSO features with {header}")
    clf.fit(X_adult_train_selected_features, Y_adult_train)
    y_pred = clf.predict(X_adult_test_selected_features)

    acc = accuracy_score(Y_adult_test, y_pred)
    print(f"accuracy = {acc}")

    f1 = f1_score(Y_adult_test, y_pred)
    print(f"f1 score = {f1}")

    con_matrix = confusion_matrix(Y_adult_test, y_pred)
    print(f"confusion_matrix = {con_matrix}")