In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold
from sklearn.decomposition import PCA 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer 
from os import cpu_count
from IPython.display import display, clear_output

In [14]:
# Read the interaction table from CSV and report how many of its columns
# are float-typed.
all_interaction_data = pd.read_csv('../data/all_interactions.csv')
print(all_interaction_data.select_dtypes(include='float').shape[1])

1539


In [18]:
def do_classification_accuracy(model,x_data,Y_data, cross_validations=10):
    """Return the mean cross-validated accuracy of ``model``.

    Parameters
    ----------
    model : estimator
        Object passed straight to ``cross_val_score`` (e.g. a classifier or
        a ``Pipeline`` ending in one).
    x_data : array-like
        Feature data handed to ``cross_val_score``.
    Y_data : array-like
        Target labels handed to ``cross_val_score``.
    cross_validations : int, default 10
        Number of folds used by the shuffled ``KFold`` splitter.

    Returns
    -------
    float
        Mean of the per-fold ``'accuracy'`` scores.
    """
    # Fixed seed so the shuffled folds are the same on every call.
    splitter = KFold(n_splits=cross_validations, shuffle=True, random_state=123)
    fold_accuracies = cross_val_score(
        model,
        x_data,
        Y_data,
        scoring='accuracy',
        cv=splitter,
        n_jobs=cpu_count(),  # one worker per CPU reported by the OS
    )
    return fold_accuracies.mean()

In [26]:
# Split the loaded table into the feature matrix and the target column.
y = all_interaction_data['is_high_demand']
X = all_interaction_data.drop(columns='is_high_demand')

# Flag as "non-binary" every feature whose value span (max - min) exceeds 1.
# NOTE(review): a continuous feature whose span is <= 1 would not be flagged
# by this rule -- confirm that is intended.
column_ranges = X.max() - X.min()
non_binary_columns = column_ranges.index[column_ranges > 1].tolist()
non_binary_columns

['num__temp',
 'num__dew',
 'num__humidity',
 'num__precip',
 'num__windspeed',
 'num__cloudcover',
 'num__weather_score',
 'num__temp^2',
 'num__temp num__dew',
 'num__temp num__humidity',
 'num__temp num__precip',
 'num__temp num__windspeed',
 'num__temp num__cloudcover',
 'num__temp num__weather_score',
 'num__temp cat__hour_of_day_0',
 'num__temp cat__hour_of_day_1',
 'num__temp cat__hour_of_day_2',
 'num__temp cat__hour_of_day_3',
 'num__temp cat__hour_of_day_4',
 'num__temp cat__hour_of_day_5',
 'num__temp cat__hour_of_day_6',
 'num__temp cat__hour_of_day_7',
 'num__temp cat__hour_of_day_8',
 'num__temp cat__hour_of_day_9',
 'num__temp cat__hour_of_day_10',
 'num__temp cat__hour_of_day_11',
 'num__temp cat__hour_of_day_12',
 'num__temp cat__hour_of_day_13',
 'num__temp cat__hour_of_day_14',
 'num__temp cat__hour_of_day_15',
 'num__temp cat__hour_of_day_16',
 'num__temp cat__hour_of_day_17',
 'num__temp cat__hour_of_day_18',
 'num__temp cat__hour_of_day_19',
 'num__temp cat__hour_

In [31]:
# Sweep the number of PCA components applied to the non-binary columns and
# record, for each setting, the 5-fold cross-validated accuracy of an
# L1-penalised logistic regression fitted on the transformed features.

# Upper bound of the sweep. PCA cannot be asked for more components than the
# number of columns it receives, so cap the bound at that column count.
max_components = min(119, len(non_binary_columns))

scores = []

for k in range(1, max_components + 1):
    clear_output(wait=True)
    # Progress is measured against the number of sweep steps, not against
    # the total number of feature columns (which the loop never reaches).
    display(f'prog: {k}/{max_components}')
    classifier = Pipeline([
        # PCA only on the non-binary columns; all other columns pass through
        # to the classifier unchanged.
        ('pre', ColumnTransformer(
            [('pca', PCA(n_components=k), non_binary_columns)],
            remainder='passthrough',
        )),
        ('classify', LogisticRegression(
            random_state=123,
            max_iter=10000,
            penalty='l1',
            solver='liblinear',
            C=.2,
        )),
    ])
    score = do_classification_accuracy(classifier, X, y, cross_validations=5)
    # Key order fixes the column order of the results table built later.
    result = {'score': score, 'k_components': k}
    scores.append(result)

'prog: 119/1539'

In [32]:
# Tabulate the sweep results: one row per PCA setting, with the mean
# cross-validated accuracy ('score') and the component count ('k_components').
df_scores = pd.DataFrame(data=scores)
df_scores

Unnamed: 0,score,k_components
0,0.877500,1
1,0.884375,2
2,0.888750,3
3,0.891250,4
4,0.890625,5
...,...,...
114,0.895625,115
115,0.897500,116
116,0.898125,117
117,0.896875,118
