In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error

# read in dataset with date column parsed
df = pd.read_csv('cleanedWeatherAUS.csv',
    parse_dates=['Date'],
    index_col='Date')

# split dataset, 80% train 20% test
from sklearn.model_selection import train_test_split
y = df['RainTomorrow']
xs = df[df.columns.difference(['RainTomorrow'])]
x_train, x_test, y_train, y_test = train_test_split(xs, y, test_size=0.2, train_size=0.8)


In [3]:
# dimension reduction? via PCA

from sklearn.decomposition import PCA 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression


pca = PCA()

print(len(x_train.columns))
rmse = 1
min_test_error = 1
for m in range(1, len(x_train.columns)): # M = # columns of data?
    # apply PCA to dataset
    pca.n_components = m
    
    x_train_copy = x_train.copy()
    x_test_copy = x_test.copy()
    x_train_copy = pca.fit_transform(x_train_copy)
    x_test_copy = pca.transform(x_test_copy)
    
#     fit a linear regression model using cross-validation
    model = LogisticRegression()
    cv = GridSearchCV(model, {}, refit=True, cv=5)
    cv = cv.fit(x_train_copy, y_train)
    y_pred = cv.predict(x_test_copy)
    
    rmse = mean_squared_error(y_test, y_pred)
    if rmse < min_test_error:
        min_test_error = rmse
        score = cv.score(x_test_copy, y_test)
        best_m = m
        
    
# obtain test error
print("Score for test: ", score)
print("RMSE for test: ", rmse)

# obtain value of M 
print("M: ", best_m) # obtain what columns are included???

21
Score for test:  0.8441225078237632
RMSE for test:  0.1564752628432786
M:  19


# BEWARE!

In [None]:
NOTE: DO NOT RERUN THE SEQUENTIAL FEATURE SELECTOR WITHOUT GOOD REASON. IT TAKES >15 MINUTES!!!!!

BE WARNED!!!!

In [6]:
# feature selection


from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.ensemble import RandomForestClassifier




# Build RF classifier to use in feature selection
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

# Build step forward feature selection
sfs1 = sfs(clf,
           k_features=5,
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5)

# Perform SFFS
sfs1 = sfs1.fit(x_train, y_train)




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:  2.1min finished

[2023-04-07 17:13:46] Features: 1/5 -- score: 0.8232061956943324[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  3.0min finished

[2023-04-07 17:16:49] Features: 2/5 -- score: 0.8276719860158301[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:  2.8min finished

[2023-04-07 17:19:34] Features: 3/5 -- score: 0.8269950814865764[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

In [7]:
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)

[0, 1, 3, 10, 11]


In [9]:
print('The top features are:', [df.columns[x] for x in feat_cols])
print(sfs1.subsets_)

['Location', 'MinTemp', 'Rainfall', 'WindSpeed9am', 'WindSpeed3pm']
{1: {'feature_idx': (3,), 'cv_scores': array([0.82440332, 0.82457914, 0.824711  , 0.82154631, 0.82079121]), 'avg_score': 0.8232061956943324, 'feature_names': ('Humidity3pm',)}, 2: {'feature_idx': (3, 10), 'cv_scores': array([0.8289306 , 0.82765593, 0.82875478, 0.8262494 , 0.82676923]), 'avg_score': 0.8276719860158301, 'feature_names': ('Humidity3pm', 'RainToday')}, 3: {'feature_idx': (0, 3, 10), 'cv_scores': array([0.82857896, 0.82664498, 0.82967782, 0.82457914, 0.82549451]), 'avg_score': 0.8269950814865764, 'feature_names': ('Cloud3pm', 'Humidity3pm', 'RainToday')}, 4: {'feature_idx': (0, 1, 3, 10), 'cv_scores': array([0.81956837, 0.8188651 , 0.81943651, 0.81552459, 0.816     ]), 'avg_score': 0.8178789152125182, 'feature_names': ('Cloud3pm', 'Cloud9am', 'Humidity3pm', 'RainToday')}, 5: {'feature_idx': (0, 1, 3, 10, 11), 'cv_scores': array([0.8102501 , 0.81152477, 0.8118764 , 0.81227199, 0.81151648]), 'avg_score': 0.81

From the previous output of the "subsets" intermediate results, I see that the CV scores decrease with each added column after 2. 
The features per each number of features are:
1: 'Humidity3pm', 
2: 'RainToday'
3: 'Cloud3pm',
4: 'Cloud9am',
5: 'Rainfall',
Not sure what to make of the scores. 
