In the energy industry, construction projects that produce electricity follow a well-defined, sequential process. Once the build is far enough along to connect to the grid and, in principle, generate power, builders must ask grid operators for approval to turn the system "on". Approval is contingent on a long but predictable list of compliance concerns, from the specifications of the equipment to the workmanship quality of the installation. This process applies to every variety of project, from a homeowner with a few solar panels on the roof to a multimillion-dollar nuclear power plant.

The approving body is called the Authority Having Jurisdiction, or AHJ. 

In [626]:
from pprint import pprint
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Binarizer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.sklearn

from joblib import dump, load

In [608]:
# Load the pickled application records.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle('pv_apps_v2.pkl')

# Lowercase the county names (presumably so differently-cased spellings end
# up as a single category -- confirm against the raw data).
df["service county"] = df["service county"].str.lower()

# Columns that are kept; every other column is scheduled for removal.
data = [
    'utility',
    'technology type',
    'service county',
    'mounting method',
    'tracking',
    'customer sector',
    'system size dc',
    'time_to_approval',
]
cols_to_drop = [col for col in df.columns if col not in data]

In [609]:
# Remove every column that was not selected for modelling.
df = df.drop(columns=cols_to_drop)

# Role of each remaining feature column: "num" is the single numeric feature
# that gets scaled, "cat" lists the categorical features that get one-hot
# encoded.
data_to_transform = {
    "num": 'system size dc',
    "cat": ['utility', 'technology type', 'service county', 'mounting method', 'tracking', 'customer sector'],
}

# Fill missing values with a sentinel category in the categorical columns
# only. Filling the whole frame would also write the string 'not_provided'
# into the numeric feature and the target whenever they contain NaN, turning
# them into object columns that cannot be scaled or regressed on.
df[data_to_transform["cat"]] = df[data_to_transform["cat"]].fillna(value='not_provided')

In [610]:
def transform_cat_features(raw_features):
    """One-hot encode categorical features.

    Args:
        raw_features: Data accepted by ``pandas.get_dummies`` (here, a
            DataFrame of categorical columns).

    Returns:
        The indicator-column DataFrame produced by ``pandas.get_dummies``.
    """
    return pd.get_dummies(raw_features)

In [611]:
# One-hot encode the categorical columns; `a` is the feature frame that the
# later cells extend and save.
a = transform_cat_features(raw_features=df[data_to_transform["cat"]])

In [612]:
def scale_num_features(raw_features):
    """Standardise a single numeric feature with ``StandardScaler``.

    The input is reshaped into one column before scaling, so it is treated
    as a single feature regardless of its original shape.

    Args:
        raw_features: Array-like of numeric values (here, one DataFrame
            column).

    Returns:
        A 2-D array with one column holding the scaled values.

    Note:
        The scaler is created and fitted inside this function and then
        discarded, so the fitted mean/scale cannot be reused on new data.
        NOTE(review): the caller scales the full dataset before the
        train/test split -- confirm that this is intended.
    """
    # Imported locally because the notebook's import cell never imports
    # numpy; without this the function fails with NameError on a fresh kernel.
    import numpy as np

    # StandardScaler expects 2-D input, so force a single column.
    column = np.array(raw_features).reshape(-1, 1)
    return StandardScaler().fit_transform(column)

In [613]:
# Scale the numeric feature; `b` holds the scaled values returned by
# scale_num_features.
b = scale_num_features(raw_features=df[data_to_transform["num"]])

In [631]:
# Collapse the single-column scaler output to 1-D so it fits in one column.
b = b.flatten()

# Assign the array directly so values are placed by position. Wrapping it in
# pd.Series would attach a fresh 0..n-1 index and pandas would then align on
# labels, misplacing values (or inserting NaN) whenever the frame's index is
# not exactly 0..n-1, e.g. after rows were filtered upstream.
a['dc_cap'] = b

# to_pickle('') cannot work: an empty string is not a valid file path.
# NOTE(review): the file name below is a placeholder -- set it to wherever
# the engineered features should be stored.
a.to_pickle('pv_apps_features.pkl')

pandas.core.frame.DataFrame

In [615]:
# Feature matrix: any remaining missing value is replaced by the median of
# its own column.
X = a.fillna(value=a.median())

# Regression target.
y = df["time_to_approval"]


In [616]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [624]:
def fetch_logged_data(run_id):
    """Collect what an MLflow run has logged.

    Args:
        run_id: Identifier of the MLflow run to inspect.

    Returns:
        A 4-tuple ``(params, metrics, tags, artifacts)``: the run's params
        and metrics as stored on the run data, its tags with every key
        starting with ``"mlflow."`` left out, and the paths of the artifacts
        listed under ``"model"``.
    """
    mlflow_client = mlflow.tracking.MlflowClient()
    run_data = mlflow_client.get_run(run_id).data

    # Keep only tags whose key is outside the "mlflow." prefix.
    user_tags = {}
    for key, value in run_data.tags.items():
        if key.startswith("mlflow."):
            continue
        user_tags[key] = value

    artifact_paths = [
        entry.path for entry in mlflow_client.list_artifacts(run_id, "model")
    ]
    return run_data.params, run_data.metrics, user_tags, artifact_paths

LinearRegression()

<bound method BaseEstimator.get_params of LinearRegression()>