In [22]:
import sys
sys.path.append('../preprocess/')
sys.path.append('../evaluate/')


import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_fscore_support

from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import RandomizedSearchCV as RSCV
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from scipy.stats import randint as sp_randint

import cleanup
import prepare
import evaluate

# n_jobs
NJ = -1

# scoring
SCORE = 'f1_weighted'

# n_splits
STRAT_SPLITS = 3

# n_iter
SEARCH_ITER = 3

data = pd.read_csv("data_transformed.csv")
cleanup.dropFeatures(data, ['appId', 'similarApps'])




######################################
# Load preprocessed Data
#
X_all, y_all = data.drop('price', axis=1), data['price']

print("{0} X_all".format(X_all.shape))

2 features dropped
(24768, 16) X_all


In [27]:
data[['price', 'starRating']]

data.columns


Index(['price', 'starRating', 'category', 'totalNrOfReviews', 'installs',
       'contentRating', 'libraries', 'bigCompany', 'daysSinceLastUpdated',
       'text', '1starReviews', '2starReviews', '3starReviews', '4starReviews',
       '5starReviews', 'requiredAndroidVersion_major',
       'requiredAndroidVersion_minor'],
      dtype='object')

In [2]:
    ######################################
    # Train / Test Split
    #
    X_train, X_eval, y_train, y_eval = train_test_split(X_all, y_all)

    # just to be save
    X_train = X_train.copy()
    X_eval = X_eval.copy()
    y_train = y_train.copy()
    y_eval = y_eval.copy()

    print("{0} X_train".format(X_train.shape))
    print("{0} X_eval".format(X_eval.shape))



    ######################################
    # Preprocessing
    #
    cleanup.dropFeatures(X_train, ['text', 'category'])
    cleanup.dropFeatures(X_eval, ['text', 'category'])

    # calculate price quartiles manually
    # cannot modify y with a custom transformer
    prepare.price_quartiles_transform(y_train, y_eval)

    # column encoder
    column_encoder = prepare.encode(X_train.columns)



    ######################################
    # Model building
    #
    pipe_steps = [
        ('encode', column_encoder),
        ('gradient', GradientBoostingClassifier())
    ]
    pipeline = Pipeline(pipe_steps)

    # here I need to add things



    ######################################
    # Model tuning
    #
    strat_3 = StratifiedKFold(n_splits=STRAT_SPLITS)

    search = RSCV(estimator=pipeline,
                  param_distributions={
                      "gradient__n_estimators": sp_randint(80, 120),
                      "gradient__max_depth": sp_randint(2, 7)
                  },
                  n_iter=SEARCH_ITER, scoring=SCORE, n_jobs=NJ, cv=strat_3)



    ######################################
    # Model fitting
    #
    search.fit(X_train, y_train)



    ######################################
    # Model prediction
    #
    y_pred = search.predict(X_eval)
    y_proba = search.predict_proba(X_eval)
    y_proba_pos = y_proba[:, 1]





(18576, 16) X_train
(6192, 16) X_eval
2 features dropped
2 features dropped

y_train value counts:
2.0    5272
1.0    5145
4.0    4642
3.0    3517
Name: price, dtype: int64

y_test value counts:
2.0    1750
1.0    1642
4.0    1561
3.0    1239
Name: price, dtype: int64


In [21]:
######################################
# Evaluate Model performance
#
rs = evaluate.onSearch(pd.Series(), search)
evaluate.onMetrics(rs, y_eval, y_pred)
print(rs)



encode__default                                            False
encode__df_out                                             False
encode__sparse                                             False
encode__features_installs                       ['LabelEncoder']
encode__features_contentRating                  ['LabelEncoder']
step_1_name                                               encode
step_1_est                                       DataFrameMapper
encode__input_df                                           False
gradient__n_estimators                                        97
gradient__random_state                                      None
gradient__criterion                                 friedman_mse
gradient__max_features                                      None
gradient__presort                                           auto
gradient__loss                                          deviance
gradient__max_depth                                            6
gradient__min_samples_lea