In [None]:
from modelhub import ModelHub

In [None]:
# Instantiate the model hub. The time_aggregation string is passed through
# as-is; presumably it is the date format used when aggregating by time
# (here: per day) -- TODO confirm against the ModelHub documentation.
modelhub = ModelHub(time_aggregation='YYYY-MM-DD')
# Get the Bach DataFrame with Objectiv data, requesting data from
# 2022-02-01 onward (no end date is given).
df = modelhub.get_objectiv_dataframe(start_date='2022-02-01')

In [None]:
# Define which events count as conversion events: PressEvents on the
# 'objectiv-on-github' link, with the 'github' link used to fill the rows
# where the first location-stack slice is missing.
primary_link = df.location_stack.json[{'id': 'objectiv-on-github', '_type': 'LinkContext'}:]
fallback_link = df.location_stack.json[{'id': 'github', '_type': 'LinkContext'}:]
modelhub.add_conversion_event(
    location_stack=primary_link.fillna(fallback_link),
    event_type='PressEvent',
    name='github_press',
)

In [None]:
# Add a 'root' column holding the 'id' of the RootLocationContext found in
# each event's location stack; it is used as the feature column below.
df['root'] = df.location_stack.ls.get_from_context_with_type_series(type='RootLocationContext', key='id')
# Add a 'nice_name' column derived from the location stack (presumably a
# human-readable rendering of the stack -- TODO confirm).
df['nice_name'] = df.location_stack.ls.nice_name

### Three levels to new models
1. [Tools to help you prepare a data set for a model.](#1.-Prepare-the-data-set)
2. [The model itself, that works with Bach DataFrames.](#2.-Run-a-logistic-regression-directly-on-Bach-data) This model can be tested against sklearn.
3. [Classes and methods to run a full model](#3.-Run-a-full-model-on-Objectiv-data)
  - A wrapper class around the model that has methods to get the desired output for business questions. It currently offers only a little functionality, but it could be extended with much more, such as sampling.
  - A method in the model hub that transforms the data _and_ fits a model (using the wrapper class) directly. Returns the data set the model used and the fitted model itself. This allows users to modify model parameters etc.

#### 1. Prepare the data set

In [None]:
# Build the model inputs from PressEvents only: X and y are created for the
# 'github_press' conversion event, using 'root' as the feature column.
press_events = df[df.event_type == 'PressEvent']
X, y = modelhub.agg.create_feature_usage_data_set(
    data=press_events, name='github_press', feature_column='root',
)

In [None]:
# Preview the feature data set X created above.
X.head()

In [None]:
# Preview the target y created above.
y.head()

#### 2. Run a logistic regression directly on Bach data
The model works with any data set that has the right dimensions. This example uses the data set created above.

In [None]:
# Create the logistic regression model exposed through modelhub.agg,
# using its default parameters.
lr = modelhub.agg.LogisticRegression()

In [None]:
# Fit while passing a sample_weight argument.
# NOTE(review): 471 is hard-coded; presumably it was the number of rows in X
# when this notebook was written. The data frame has no end date, so the row
# count may have changed since -- confirm, and confirm whether sample_weight
# is supported by this model at all.
lr.fit(X, y, sample_weight=range(471))

In [None]:
# Fit the model on X and y without sample weights.
lr.fit(X, y)

In [None]:
# Display the feature data set object itself.
X

In [None]:
# Compute the decision function for X with the default return type.
lr.decision_function(X)

In [None]:
# Same call with return_bach=True; the result supports .head(), so it is
# presumably a Bach object rather than an in-memory array -- TODO confirm.
lr.decision_function(X, return_bach=True).head()

In [None]:
# Show the coef_ attribute of the fitted model.
lr.coef_

In [None]:
# Call sparsify(); presumably this mirrors sklearn's method that converts
# coef_ to a sparse representation -- TODO confirm.
lr.sparsify()

In [None]:
# Show coef_ again to see the effect of sparsify().
lr.coef_

In [None]:
# Call densify(); presumably the inverse of sparsify() -- TODO confirm.
lr.densify()

In [None]:
# Show coef_ once more to see the effect of densify().
lr.coef_

In [None]:
# Show the model parameters with deep=False.
lr.get_params(deep=False)

In [None]:
# Show the model parameters with deep=True, for comparison with the
# deep=False output.
lr.get_params(deep=True)

In [None]:
# Get predictions for X with the default return type.
lr.predict(X)

In [None]:
# Preview the result of predict_log_proba for X (presumably the log of the
# predicted class probabilities, as in sklearn -- TODO confirm).
lr.predict_log_proba(X).head()

In [None]:
# Call predict_proba with return_bach=False, i.e. explicitly not asking
# for a Bach return value.
lr.predict_proba(X, return_bach=False)

In [None]:
# Call predict_proba with the default return type and preview the result.
lr.predict_proba(X).head()

In [None]:
# Score the fitted model on the same X and y it was trained on (presumably
# mean accuracy, as in sklearn -- TODO confirm).
lr.score(X, y)

**testing framework**

In [None]:
from tests_modelhub.functional.modelhub.data_and_utils import TestLR

In [None]:
# Create the test helper imported from the modelhub functional tests.
test_lr = TestLR()

In [None]:
# Run the helper's fit test (presumably compares the fitted Bach model
# against sklearn -- TODO confirm in TestLR).
test_lr.test_fit()

In [None]:
# Run the helper's generic method test for 'predict'; X=True presumably
# tells it to pass the feature data to the method -- TODO confirm.
test_lr.test_method(method_name='predict', X=True)

In [None]:
# Run the same generic method test for 'predict_proba'.
test_lr.test_method(method_name='predict_proba', X=True)

**considerations**
- The model runs a lot of queries.
- Only the ported model works; other tooling such as pipelines, metrics and grid search does not. All of these are quite important for effective modeling.

#### 3. Run a full model on Objectiv data 

In [None]:
# Prepare the data and fit the model in one step, on PressEvents only.
# The call returns the feature data, the target and the model object.
press_events = df[df.event_type == 'PressEvent']
X, y, model = modelhub.agg.feature_importance(
    data=press_events,
    name='github_press',
    feature_column='root',
)

In [None]:
# Show the results reported by the model returned from feature_importance.
model.get_results()

In [None]:
# Change a parameter on the underlying model (disable the intercept),
# refit it on the same X and y, and show the results again.
model.underlyingmodel.set_params(fit_intercept=False)
model.underlyingmodel.fit(X, y)
model.get_results()