# MDF Single model learning by substances and quantities

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
%matplotlib inline

In [2]:
from prepare_data import NONNUM_TYPES, NON_QUANTITY_TYPES, QUANTITY_TYPES, MEASURE_TYPES, PROCESSED_LIBELLES, NB_COLS, LIBELLE_COLS, BIG, PROCESSED_BIG, PROCESSED_SUBSTANCES, PROCESSED_VOIE_ADMIN, TARGET_COLUMNS, TRAIN, TEST, SOURCE

-- get_unique_noconst --
- Number of found duplicated rows :  1276
- Remove constant columns :  (10959, 42) -> (10959, 42)
- Dropped const columns :  


In [3]:
def _train_rows(frame):
    """Return the rows of `frame` whose SOURCE value equals TRAIN, without the SOURCE column."""
    return frame[frame[SOURCE] == TRAIN].drop(SOURCE, axis=1)


# Training subset of each processed frame.
PROCESSED_TRAIN = _train_rows(PROCESSED_BIG)
PROCESSED_TRAIN_SUBS = _train_rows(PROCESSED_SUBSTANCES)
PROCESSED_TRAIN_VOIE_ADMIN = _train_rows(PROCESSED_VOIE_ADMIN)
PROCESSED_TRAIN_LIBELLES = _train_rows(PROCESSED_LIBELLES)

In [4]:
# Shapes of the four training frames, printed on one line separated by spaces.
print("%s %s %s %s" % (
    PROCESSED_TRAIN.shape,
    PROCESSED_TRAIN_SUBS.shape,
    PROCESSED_TRAIN_VOIE_ADMIN.shape,
    PROCESSED_TRAIN_LIBELLES.shape,
))

(7496, 40) (7496, 1767) (7496, 45) (7496, 82)


In [None]:
# Display the column names of the main training frame.
PROCESSED_TRAIN.columns.values

In [None]:
# Display the column names of the unfiltered PROCESSED_VOIE_ADMIN frame
# (not the TRAIN-only subset built above).
PROCESSED_VOIE_ADMIN.columns.values

In [None]:
# Display the column names of the unfiltered PROCESSED_SUBSTANCES frame
# (not the TRAIN-only subset built above).
PROCESSED_SUBSTANCES.columns.values

In [None]:
# Display MEASURE_TYPES as imported from prepare_data.
MEASURE_TYPES

## Metric MAPE

In [5]:
# Mean Absolute Percentage Error
def mape_error(y_true, y_pred, **kwargs):
    """Return the mean absolute percentage error of `y_pred` against `y_true`, in percent.

    Both inputs are converted to float NumPy arrays before the computation, so
    the comparison is positional. Without the conversion, two pandas objects
    with different indexes are aligned on their labels and the result is NaN,
    and integer inputs are floor-divided on Python 2.

    Parameters
    ----------
    y_true : array-like
        Reference values. An entry equal to 0 makes the result inf or nan.
    y_pred : array-like
        Predicted values, same length as `y_true`.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    float
        mean(|(y_true - y_pred) / y_true|) * 100
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## Learn by medicine types, substances and their quantities

In [6]:
# Short aliases for the training frames used by the cells below.
df = PROCESSED_TRAIN
df_libelles = PROCESSED_TRAIN_LIBELLES
# Substance features without the '_ENCODED_' column.
df_subs = PROCESSED_TRAIN_SUBS.drop(['_ENCODED_'], axis=1)

### Learn by medicine measure type: ml

In [7]:
# Keep only the rows whose 'ml' value in the libelle features exceeds 0.0001.
mask = df_libelles['ml'] > 0.0001
df_ml = df_libelles.loc[mask]
print(df_ml.shape)

# 'logprix' of the same rows, used as the regression target below.
df_prices = df.loc[mask, 'logprix']

(1220, 82)


#### Add other features 

In [None]:
# NOTE(review): `df_subs_quants_packs` is not defined by any earlier cell shown
# in this notebook, so this cell raises NameError on a fresh kernel -- confirm
# which frame was intended for the 'ml' subset.
cols_to_drop = list(NONNUM_TYPES)
for extra_cols in (NB_COLS, LIBELLE_COLS, TARGET_COLUMNS):
    cols_to_drop.extend(extra_cols)
# Remaining columns of `df`, restricted to the masked rows, appended as extra features.
other_features = df.drop(cols_to_drop, axis=1)
df_subs_quants_others = pd.concat([df_subs_quants_packs, other_features.loc[mask]], axis=1)
print(df_subs_quants_others.shape)

#### Add voie admin

In [8]:
# Append the PROCESSED_TRAIN_VOIE_ADMIN columns of the masked rows to df_ml.
df_ml_va = pd.concat([df_ml, PROCESSED_TRAIN_VOIE_ADMIN.loc[mask]], axis=1)
print(df_ml_va.shape)

(1220, 127)


#### Remove all zero columns

In [9]:
from common.preprocessing_helper import drop_const_cols

In [10]:
# Apply drop_const_cols to both feature frames, then print the resulting shapes.
df_ml = drop_const_cols(df_ml)
df_ml_va = drop_const_cols(df_ml_va)

for label, frame in (("df_ml -> ", df_ml), ("df_ml_va -> ", df_ml_va)):
    print("%s %s" % (label, frame.shape))

df_ml ->  (1220, 50)
df_ml_va ->  (1220, 88)


#### Split and learn

In [15]:
# Split first and fit the scaler on the training rows only, so that held-out
# rows do not influence the scaling (the scaler used to be fitted on all rows
# before the split). random_state is fixed to make the split reproducible.
# The intermediate `X` (scaled full matrix) is no longer created.
y = df_prices
X_train, X_test, y_train, y_test = train_test_split(df_ml_va, y, train_size=0.75, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
# Random forest with 35 trees on all cores; random_state is fixed so that
# repeated runs build the same forest and report the same scores.
rf = RandomForestRegressor(n_estimators=35, n_jobs=-1, verbose=0, random_state=42)

In [17]:
# Fit the forest on the training split and print its score on that same split.
score = rf.fit(X_train, y_train).score(X_train, y_train)
print(score)

# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = rf.predict(X_train), rf.predict(X_test)
print(mape_error(y_train, res0))
print(mape_error(y_test, res1))

# First ten test predictions next to the first ten true targets.
print(res1[:10])
print(y_test.values[:10])

0.908143577266
41.5343153881
59.2763731294
[ 3.05625521  4.16141549  1.56144815  1.88485163  4.20382754  0.68093861
  0.97979969  3.91351846  2.10347914  3.96899771]
[ 3.76514581  4.4804003   1.48160454  1.89009537  5.22024771  0.41871033
  0.52472853  3.52105234  2.05668455  0.33647224]


In [None]:
# NOTE(review): `df_subs_quants_packs_va` is not defined by any earlier cell
# shown in this notebook -- confirm which frame was intended here.
# Split first and fit the scaler on the training rows only, so that held-out
# rows do not influence the scaling (the scaler used to be fitted on all rows
# before the split). random_state is fixed to make the split reproducible.
# The intermediate `X` (scaled full matrix) is no longer created.
X0 = df_subs_quants_packs_va
y = df_prices

X_train, X_test, y_train, y_test = train_test_split(X0, y, train_size=0.75, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Random forest with 35 trees on all cores; random_state is fixed so that
# repeated runs build the same forest and report the same scores.
rf = RandomForestRegressor(n_estimators=35, n_jobs=-1, verbose=0, random_state=42)

In [None]:
# Fit the forest on the training split and print its score on that same split.
score = rf.fit(X_train, y_train).score(X_train, y_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = rf.predict(X_train), rf.predict(X_test)
print(mape_error(y_train, res0))
print(mape_error(y_test, res1))
# First ten test predictions next to the first ten true targets.
print(res1[:10])
print(y_test.values[:10])

In [None]:
# Linear-kernel SVR with C=3: fit on the training split and print its score there.
svr = SVR(kernel='linear', C=3)
score = svr.fit(X_train, y_train).score(X_train, y_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = svr.predict(X_train), svr.predict(X_test)
print(mape_error(y_train, res0))
print(mape_error(y_test, res1))

In [None]:
# Gradient boosting regressor: fit on the training split and print its score there.
gbr = GradientBoostingRegressor(verbose=0, max_features='auto')
score = gbr.fit(X_train, y_train).score(X_train, y_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = gbr.predict(X_train), gbr.predict(X_test)
print(mape_error(y_train, res0))
print(mape_error(y_test, res1))

#### Cross validation scoring

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Cross-validate `clf` on `data`/`targets` and print the per-fold MAPE.

    The features are standardised inside a pipeline, so the scaler is fitted
    on the training part of each fold only. Fitting it once on the full data
    (as was done before) lets held-out rows influence the scaling.

    Parameters
    ----------
    data : feature matrix (e.g. a DataFrame), one row per sample.
    targets : target values, one per row of `data`.
    clf : scikit-learn regressor to evaluate.
    cv : passed through to `cross_val_score` (10 by default).

    Prints the array of fold scores (MAPE in %), then their mean and standard
    deviation on one line. Returns None.
    """
    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

In [None]:
# Cross-validated MAPE of the random forest on each feature set in turn.
for features in (df_subs_quants, df_subs_quants_packs, df_subs_quants_others, df_subs_quants_packs_va):
    run_cv(features, df_prices, rf)

In [None]:
# Cross-validated MAPE of the SVR on each feature set in turn.
for features in (df_subs_quants, df_subs_quants_packs, df_subs_quants_others, df_subs_quants_packs_va):
    run_cv(features, df_prices, svr)

In [None]:
# Cross-validated MAPE of the gradient boosting model on each feature set in turn.
for features in (df_subs_quants, df_subs_quants_packs, df_subs_quants_others, df_subs_quants_packs_va):
    run_cv(features, df_prices, gbr)

### Learn by type : 'gelule'

In [None]:
# Select the rows that have the chosen type and none of the other types.
# NOTE(review): `types` is not defined or imported in the cells shown in this
# notebook -- confirm where it comes from.
index = 2
print("%s %s" % ("Select type only : ", types[index]))
mask = df[types[index]] > 0
print("%s %s" % ("Found lines (med has the type): ", len(df[mask])))
# Positions of all the other types; each of them must be below 1 for a row to stay.
indices = [pos for pos in range(len(types)) if pos != index]
for i in indices:
    mask &= df[types[i]] < 1

print("%s %s" % ("Found lines (med has the type only) : ", len(df[mask])))
df_logprices = df.loc[mask, 'logprix']

In [None]:
# Build the substance/quantity features for the selected rows.
# NOTE(review): `create_df_subs_quants` is not defined or imported in the cells
# shown in this notebook -- confirm where it comes from.
df_subs_quants = create_df_subs_quants(df_subs.loc[mask], df.loc[mask, types[index]])
print(df_subs_quants.shape)

#### Add packages

In [None]:
# Append the `packages` columns of the selected rows.
# NOTE(review): `packages` is not defined or imported in the cells shown in
# this notebook -- confirm where it comes from.
df_subs_quants_packs = pd.concat([df_subs_quants, df.loc[mask, packages]], axis=1)
print(df_subs_quants_packs.shape)

#### Add other features 

In [None]:
# Columns of `df` that must not be reused as extra features.
cols_to_drop = list(NONNUM_TYPES)
for extra_cols in (NB_COLS, LIBELLE_COLS, TARGET_COLUMNS):
    cols_to_drop.extend(extra_cols)
# Remaining columns of `df`, restricted to the selected rows, appended as extra features.
other_features = df.drop(cols_to_drop, axis=1)
df_subs_quants_others = pd.concat([df_subs_quants_packs, other_features.loc[mask]], axis=1)
print(df_subs_quants_others.shape)

#### Add voie admin

In [None]:
# Append the PROCESSED_TRAIN_VOIE_ADMIN columns of the selected rows.
df_subs_quants_packs_va = pd.concat([df_subs_quants_packs, PROCESSED_TRAIN_VOIE_ADMIN.loc[mask]], axis=1)
print(df_subs_quants_packs_va.shape)

#### Remove all zero columns

In [None]:
# Apply drop_const_cols to each feature frame, then print the resulting shapes.
df_subs_quants = drop_const_cols(df_subs_quants)
df_subs_quants_packs = drop_const_cols(df_subs_quants_packs)
df_subs_quants_others = drop_const_cols(df_subs_quants_others)
df_subs_quants_packs_va = drop_const_cols(df_subs_quants_packs_va)

for label, frame in (
    ("df_subs_quants -> ", df_subs_quants),
    ("df_subs_quants_packs -> ", df_subs_quants_packs),
    ("df_subs_quants_others -> ", df_subs_quants_others),
    ("df_subs_quants_packs_va -> ", df_subs_quants_packs_va),
):
    print("%s %s" % (label, frame.shape))

#### Split and learn

In [None]:
# Split first and fit the scaler on the training rows only, so that held-out
# rows do not influence the scaling (the scaler used to be fitted on all rows
# before the split). random_state is fixed to make the split reproducible.
# The intermediate `X` (scaled full matrix) is no longer created.
X0 = df_subs_quants_packs_va
ylog = df_logprices

X_train, X_test, ylog_train, ylog_test = train_test_split(X0, ylog, train_size=0.80, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Random forest with 50 trees on all cores; random_state is fixed so that
# repeated runs build the same forest and report the same scores.
rf = RandomForestRegressor(n_estimators=50, n_jobs=-1, verbose=0, random_state=42)

In [None]:
# Fit the forest on the training split and print its score on that same split.
score = rf.fit(X_train, ylog_train).score(X_train, ylog_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = rf.predict(X_train), rf.predict(X_test)
print(mape_error(ylog_train, res0))
print(mape_error(ylog_test, res1))

In [None]:
# Linear-kernel SVR with C=1: fit on the training split and print its score there.
svr = SVR(kernel='linear', C=1)
score = svr.fit(X_train, ylog_train).score(X_train, ylog_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = svr.predict(X_train), svr.predict(X_test)
print(mape_error(ylog_train, res0))
print(mape_error(ylog_test, res1))

In [None]:
# Gradient boosting regressor: fit on the training split and print its score there.
gbr = GradientBoostingRegressor(verbose=0, max_features='auto')
score = gbr.fit(X_train, ylog_train).score(X_train, ylog_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = gbr.predict(X_train), gbr.predict(X_test)
print(mape_error(ylog_train, res0))
print(mape_error(ylog_test, res1))

#### Cross validation scoring

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Cross-validate `clf` on `data`/`targets` and print the per-fold MAPE.

    The features are standardised inside a pipeline, so the scaler is fitted
    on the training part of each fold only. Fitting it once on the full data
    (as was done before) lets held-out rows influence the scaling.

    Parameters
    ----------
    data : feature matrix (e.g. a DataFrame), one row per sample.
    targets : target values, one per row of `data`.
    clf : scikit-learn regressor to evaluate.
    cv : passed through to `cross_val_score` (10 by default).

    Prints the array of fold scores (MAPE in %), then their mean and standard
    deviation on one line. Returns None.
    """
    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

In [None]:
# Cross-validated MAPE of the random forest on each feature set in turn.
for features in (df_subs_quants, df_subs_quants_packs, df_subs_quants_others, df_subs_quants_packs_va):
    run_cv(features, ylog, rf)

In [None]:
# Cross-validated MAPE of the SVR on each feature set in turn.
for features in (df_subs_quants, df_subs_quants_packs, df_subs_quants_others, df_subs_quants_packs_va):
    run_cv(features, ylog, svr)

In [None]:
# Cross-validated MAPE of the gradient boosting model on each feature set in turn.
for features in (df_subs_quants, df_subs_quants_packs, df_subs_quants_others, df_subs_quants_packs_va):
    run_cv(features, ylog, gbr)

### Learn by type : 'comprime' and 'gelule'

In [None]:
# Select the rows that have at least one of the two chosen types.
# NOTE(review): the later 'comprime' section selects types[1], whereas index1
# is 0 here -- confirm that position 0 is the intended type.
index1 = 0
index2 = 2
print("%s %s %s" % ("Select types : ", types[index1], types[index2]))
mask = (df[types[index1]] > 0) | (df[types[index2]] > 0)
print("%s %s" % ("Found lines (med has the type): ", len(df[mask])))
df_logprices = df.loc[mask, 'logprix']

In [None]:
# This cell used `types[index]`, but this section only sets `index1` and
# `index2`; `index` was whatever the previous section left behind (2 when the
# notebook runs top to bottom, i.e. the same value as `index2`). `index2` is
# used explicitly so the cell no longer depends on stale kernel state.
# NOTE(review): only the quantities of `types[index2]` are passed on; confirm
# whether those of `types[index1]` should be included as well.
df_subs_quants = create_df_subs_quants(df_subs[mask], df[mask][types[index2]])
print(df_subs_quants.shape)

#### Add packages

In [None]:
# Append the `packages` columns of the selected rows.
df_subs_quants_packs = pd.concat([df_subs_quants, df.loc[mask, packages]], axis=1)
print(df_subs_quants_packs.shape)

#### Add other features 

In [None]:
# Columns of `df` that must not be reused as extra features: the type and
# package columns, the targets and the libelle_* columns listed below.
cols_to_drop = list(types)
cols_to_drop.extend(packages)
cols_to_drop.extend(TARGET_COLUMNS)
cols_to_drop.extend([
    u'libelle_ampoule', u'libelle_capsule', u'libelle_comprime', u'libelle_film',
    u'libelle_flacon', u'libelle_gelule', u'libelle_pilulier', u'libelle_plaquette',
    u'libelle_poche', u'libelle_sachet', u'libelle_seringue', u'libelle_stylo',
    u'libelle_tube',
])
# Remaining columns of `df`, restricted to the selected rows, appended as extra features.
other_features = df.drop(cols_to_drop, axis=1)
df_subs_quants_others = pd.concat([df_subs_quants_packs, other_features.loc[mask]], axis=1)
print(df_subs_quants_others.shape)

#### Add voie admin

In [None]:
# Append the PROCESSED_TRAIN_VOIE_ADMIN columns of the selected rows.
df_subs_quants_packs_va = pd.concat([df_subs_quants_packs, PROCESSED_TRAIN_VOIE_ADMIN.loc[mask]], axis=1)
print(df_subs_quants_packs_va.shape)

#### Remove all zero columns

In [None]:
# Apply drop_const_cols to each feature frame, then print the resulting shapes.
df_subs_quants = drop_const_cols(df_subs_quants)
df_subs_quants_packs = drop_const_cols(df_subs_quants_packs)
df_subs_quants_others = drop_const_cols(df_subs_quants_others)
df_subs_quants_packs_va = drop_const_cols(df_subs_quants_packs_va)

for label, frame in (
    ("df_subs_quants -> ", df_subs_quants),
    ("df_subs_quants_packs -> ", df_subs_quants_packs),
    ("df_subs_quants_others -> ", df_subs_quants_others),
    ("df_subs_quants_packs_va -> ", df_subs_quants_packs_va),
):
    print("%s %s" % (label, frame.shape))

#### Split and learn

In [None]:
# Split first and fit the scaler on the training rows only, so that held-out
# rows do not influence the scaling (the scaler used to be fitted on all rows
# before the split). random_state is fixed to make the split reproducible.
# The intermediate `X` (scaled full matrix) is no longer created.
X0 = df_subs_quants_packs_va
ylog = df_logprices

X_train, X_test, ylog_train, ylog_test = train_test_split(X0, ylog, train_size=0.80, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Random forest with 50 trees on all cores; random_state is fixed so that
# repeated runs build the same forest and report the same scores.
rf = RandomForestRegressor(n_estimators=50, n_jobs=-1, verbose=0, random_state=42)

In [None]:
# Fit the forest on the training split and print its score on that same split.
score = rf.fit(X_train, ylog_train).score(X_train, ylog_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = rf.predict(X_train), rf.predict(X_test)
print(mape_error(ylog_train, res0))
print(mape_error(ylog_test, res1))

In [None]:
# Linear-kernel SVR with C=1: fit on the training split and print its score there.
svr = SVR(kernel='linear', C=1)
score = svr.fit(X_train, ylog_train).score(X_train, ylog_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = svr.predict(X_train), svr.predict(X_test)
print(mape_error(ylog_train, res0))
print(mape_error(ylog_test, res1))

In [None]:
# Gradient boosting regressor: fit on the training split and print its score there.
gbr = GradientBoostingRegressor(verbose=0, max_features='auto')
score = gbr.fit(X_train, ylog_train).score(X_train, ylog_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = gbr.predict(X_train), gbr.predict(X_test)
print(mape_error(ylog_train, res0))
print(mape_error(ylog_test, res1))

#### Cross validation scoring

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Cross-validate `clf` on `data`/`targets` and print the per-fold MAPE.

    The features are standardised inside a pipeline, so the scaler is fitted
    on the training part of each fold only. Fitting it once on the full data
    (as was done before) lets held-out rows influence the scaling.

    Parameters
    ----------
    data : feature matrix (e.g. a DataFrame), one row per sample.
    targets : target values, one per row of `data`.
    clf : scikit-learn regressor to evaluate.
    cv : passed through to `cross_val_score` (10 by default).

    Prints the array of fold scores (MAPE in %), then their mean and standard
    deviation on one line. Returns None.
    """
    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

In [None]:
# Cross-validated MAPE of the random forest on each feature set in turn.
for features in (df_subs_quants, df_subs_quants_packs, df_subs_quants_others, df_subs_quants_packs_va):
    run_cv(features, ylog, rf)

In [None]:
# Cross-validated MAPE of the SVR on each feature set in turn.
for features in (df_subs_quants, df_subs_quants_packs, df_subs_quants_others, df_subs_quants_packs_va):
    run_cv(features, ylog, svr)

In [None]:
# Cross-validated MAPE of the gradient boosting model on each feature set in turn.
for features in (df_subs_quants, df_subs_quants_packs, df_subs_quants_others, df_subs_quants_packs_va):
    run_cv(features, ylog, gbr)

### Learn by type : 'ml'

In [None]:
# Select the rows that have the chosen type and none of the other types.
index = 3
print("%s %s" % ("Select type only : ", types[index]))
mask = df[types[index]] > 0
print("%s %s" % ("Found lines (med has the type): ", len(df[mask])))
# Positions of all the other types; each of them must be below 1 for a row to stay.
indices = [pos for pos in range(len(types)) if pos != index]
for i in indices:
    mask &= df[types[i]] < 1

print("%s %s" % ("Found lines (med has the type only) : ", len(df[mask])))
df_logprices = df.loc[mask, 'logprix']

In [None]:
# Build the substance/quantity features for the selected rows.
df_subs_quants = create_df_subs_quants(df_subs.loc[mask], df.loc[mask, types[index]])
print(df_subs_quants.shape)

#### Add packages

In [None]:
# Append the `packages` columns of the selected rows.
df_subs_quants_packs = pd.concat([df_subs_quants, df.loc[mask, packages]], axis=1)
print(df_subs_quants_packs.shape)

#### Add other features 

In [None]:
# Columns of `df` that must not be reused as extra features: the type and
# package columns, the targets and the libelle_* columns listed below.
cols_to_drop = list(types)
cols_to_drop.extend(packages)
cols_to_drop.extend(TARGET_COLUMNS)
cols_to_drop.extend([
    u'libelle_ampoule', u'libelle_capsule', u'libelle_comprime', u'libelle_film',
    u'libelle_flacon', u'libelle_gelule', u'libelle_pilulier', u'libelle_plaquette',
    u'libelle_poche', u'libelle_sachet', u'libelle_seringue', u'libelle_stylo',
    u'libelle_tube',
])
# Remaining columns of `df`, restricted to the selected rows, appended as extra features.
other_features = df.drop(cols_to_drop, axis=1)
df_subs_quants_others = pd.concat([df_subs_quants_packs, other_features.loc[mask]], axis=1)
print(df_subs_quants_others.shape)

#### Add voie admin

In [None]:
# Append the PROCESSED_TRAIN_VOIE_ADMIN columns of the selected rows.
df_subs_quants_packs_va = pd.concat([df_subs_quants_packs, PROCESSED_TRAIN_VOIE_ADMIN.loc[mask]], axis=1)
print(df_subs_quants_packs_va.shape)

#### Remove all zero columns

In [None]:
# Apply drop_const_cols to each feature frame, then print the resulting shapes.
df_subs_quants = drop_const_cols(df_subs_quants)
df_subs_quants_packs = drop_const_cols(df_subs_quants_packs)
df_subs_quants_others = drop_const_cols(df_subs_quants_others)
df_subs_quants_packs_va = drop_const_cols(df_subs_quants_packs_va)

for label, frame in (
    ("df_subs_quants -> ", df_subs_quants),
    ("df_subs_quants_packs -> ", df_subs_quants_packs),
    ("df_subs_quants_others -> ", df_subs_quants_others),
    ("df_subs_quants_packs_va -> ", df_subs_quants_packs_va),
):
    print("%s %s" % (label, frame.shape))

#### Split and learn

In [None]:
# Split first and fit the scaler on the training rows only, so that held-out
# rows do not influence the scaling (the scaler used to be fitted on all rows
# before the split). random_state is fixed to make the split reproducible.
# The intermediate `X` (scaled full matrix) is no longer created.
X0 = df_subs_quants_packs_va
ylog = df_logprices

X_train, X_test, ylog_train, ylog_test = train_test_split(X0, ylog, train_size=0.80, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Random forest with 50 trees on all cores; random_state is fixed so that
# repeated runs build the same forest and report the same scores.
rf = RandomForestRegressor(n_estimators=50, n_jobs=-1, verbose=0, random_state=42)

In [None]:
# Fit the forest on the training split and print its score on that same split.
score = rf.fit(X_train, ylog_train).score(X_train, ylog_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = rf.predict(X_train), rf.predict(X_test)
print(mape_error(ylog_train, res0))
print(mape_error(ylog_test, res1))

In [None]:
# Linear-kernel SVR with C=3: fit on the training split and print its score there.
svr = SVR(kernel='linear', C=3)
score = svr.fit(X_train, ylog_train).score(X_train, ylog_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = svr.predict(X_train), svr.predict(X_test)
print(mape_error(ylog_train, res0))
print(mape_error(ylog_test, res1))

#### Cross validation scoring

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Cross-validate `clf` on `data`/`targets` and print the per-fold MAPE.

    The features are standardised inside a pipeline, so the scaler is fitted
    on the training part of each fold only. Fitting it once on the full data
    (as was done before) lets held-out rows influence the scaling.

    Parameters
    ----------
    data : feature matrix (e.g. a DataFrame), one row per sample.
    targets : target values, one per row of `data`.
    clf : scikit-learn regressor to evaluate.
    cv : passed through to `cross_val_score` (10 by default).

    Prints the array of fold scores (MAPE in %), then their mean and standard
    deviation on one line. Returns None.
    """
    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

In [None]:
# 7-fold cross-validated MAPE of the random forest on each feature set in turn.
for features in (df_subs_quants, df_subs_quants_packs, df_subs_quants_others, df_subs_quants_packs_va):
    run_cv(features, ylog, rf, cv=7)

In [None]:
# run_cv(df_subs_quants, ylog, svr, cv=7)
# run_cv(df_subs_quants_packs, ylog, svr, cv=7)
# run_cv(df_subs_quants_others, ylog, svr, cv=7)
# run_cv(df_subs_quants_packs_va, ylog, svr, cv=7)

### Learn by type : 'comprime'

In [None]:
# Select the rows that have the chosen type and none of the other types.
index = 1
print("%s %s" % ("Select type only : ", types[index]))
mask = df[types[index]] > 0
print("%s %s" % ("Found lines (med has the type): ", len(df[mask])))
# Positions of all the other types; each of them must be below 1 for a row to stay.
indices = [pos for pos in range(len(types)) if pos != index]
for i in indices:
    mask &= df[types[i]] < 1

print("%s %s" % ("Found lines (med has the type only) : ", len(df[mask])))
df_logprices = df.loc[mask, 'logprix']

In [None]:
# Build the substance/quantity features for the selected rows.
df_subs_quants = create_df_subs_quants(df_subs.loc[mask], df.loc[mask, types[index]])
print(df_subs_quants.shape)

#### Add packages

In [None]:
# Append the `packages` columns of the selected rows.
df_subs_quants_packs = pd.concat([df_subs_quants, df.loc[mask, packages]], axis=1)
print(df_subs_quants_packs.shape)

#### Add other features 

In [None]:
# Columns of `df` that must not be reused as extra features: the type and
# package columns, the targets and the libelle_* columns listed below.
cols_to_drop = list(types)
cols_to_drop.extend(packages)
cols_to_drop.extend(TARGET_COLUMNS)
cols_to_drop.extend([
    u'libelle_ampoule', u'libelle_capsule', u'libelle_comprime', u'libelle_film',
    u'libelle_flacon', u'libelle_gelule', u'libelle_pilulier', u'libelle_plaquette',
    u'libelle_poche', u'libelle_sachet', u'libelle_seringue', u'libelle_stylo',
    u'libelle_tube',
])
# Remaining columns of `df`, restricted to the selected rows, appended as extra features.
other_features = df.drop(cols_to_drop, axis=1)
df_subs_quants_others = pd.concat([df_subs_quants_packs, other_features.loc[mask]], axis=1)
print(df_subs_quants_others.shape)

#### Add voie admin

In [None]:
# Append the PROCESSED_TRAIN_VOIE_ADMIN columns of the selected rows.
df_subs_quants_packs_va = pd.concat([df_subs_quants_packs, PROCESSED_TRAIN_VOIE_ADMIN.loc[mask]], axis=1)
print(df_subs_quants_packs_va.shape)

#### Remove all zero columns

In [None]:
# Apply drop_const_cols to each feature frame, then print the resulting shapes.
df_subs_quants = drop_const_cols(df_subs_quants)
df_subs_quants_packs = drop_const_cols(df_subs_quants_packs)
df_subs_quants_others = drop_const_cols(df_subs_quants_others)
df_subs_quants_packs_va = drop_const_cols(df_subs_quants_packs_va)

for label, frame in (
    ("df_subs_quants -> ", df_subs_quants),
    ("df_subs_quants_packs -> ", df_subs_quants_packs),
    ("df_subs_quants_others -> ", df_subs_quants_others),
    ("df_subs_quants_packs_va -> ", df_subs_quants_packs_va),
):
    print("%s %s" % (label, frame.shape))

#### Split and learn

In [None]:
# Split first and fit the scaler on the training rows only, so that held-out
# rows do not influence the scaling (the scaler used to be fitted on all rows
# before the split). random_state is fixed to make the split reproducible.
# The intermediate `X` (scaled full matrix) is no longer created.
X0 = df_subs_quants_packs_va
ylog = df_logprices

X_train, X_test, ylog_train, ylog_test = train_test_split(X0, ylog, train_size=0.80, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Random forest with 50 trees on all cores; random_state is fixed so that
# repeated runs build the same forest and report the same scores.
rf = RandomForestRegressor(n_estimators=50, n_jobs=-1, verbose=0, random_state=42)

In [None]:
# Fit the forest on the training split and print its score on that same split.
score = rf.fit(X_train, ylog_train).score(X_train, ylog_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = rf.predict(X_train), rf.predict(X_test)
print(mape_error(ylog_train, res0))
print(mape_error(ylog_test, res1))

In [None]:
# Linear-kernel SVR with C=3: fit on the training split and print its score there.
svr = SVR(kernel='linear', C=3)
score = svr.fit(X_train, ylog_train).score(X_train, ylog_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = svr.predict(X_train), svr.predict(X_test)
print(mape_error(ylog_train, res0))
print(mape_error(ylog_test, res1))

#### Cross validation scoring

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Cross-validate `clf` on `data`/`targets` and print the per-fold MAPE.

    The features are standardised inside a pipeline, so the scaler is fitted
    on the training part of each fold only. Fitting it once on the full data
    (as was done before) lets held-out rows influence the scaling.

    Parameters
    ----------
    data : feature matrix (e.g. a DataFrame), one row per sample.
    targets : target values, one per row of `data`.
    clf : scikit-learn regressor to evaluate.
    cv : passed through to `cross_val_score` (10 by default).

    Prints the array of fold scores (MAPE in %), then their mean and standard
    deviation on one line. Returns None.
    """
    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

In [None]:
# 5-fold cross-validated MAPE of the random forest on each feature set in turn.
for features in (df_subs_quants, df_subs_quants_packs, df_subs_quants_others, df_subs_quants_packs_va):
    run_cv(features, ylog, rf, cv=5)

In [None]:
# run_cv(df_subs_quants, ylog, svr, cv=5)
# run_cv(df_subs_quants_packs, ylog, svr, cv=5)
# run_cv(df_subs_quants_others, ylog, svr, cv=5)
# run_cv(df_subs_quants_packs_va, ylog, svr, cv=5)

### Learn all types

In [None]:
# Display the (rows, columns) shape of the training frame `df`.
df.shape

In [None]:
# Count the rows where every type column equals 0.
mask = df[types[0]] == 0
# Positions of the remaining types (all but the first).
indices = list(range(1, len(types)))
for i in indices:
    mask &= df[types[i]] == 0

print("%s %s" % ("Found lines : ", len(df[mask])))

In [None]:
# Count the rows where at least one type column is positive.
mask = df[types[0]] > 0
# Positions of the remaining types (all but the first).
indices = list(range(1, len(types)))
for i in indices:
    mask |= df[types[i]] > 0

print("%s %s" % ("Found lines : ", len(df[mask])))

In [None]:
# Rows whose 'nb_plaquette' value is positive. This overwrites the `mask`
# built by the previous cell.
mask = df['nb_plaquette'] > 0

In [None]:
# Preview the first rows selected by the current `mask`.
df[mask].head()

In [None]:
from prepare_data import BIG

In [None]:
# Inspect one row of the BIG frame through `.ix` with the key 81.
# NOTE(review): `.ix` is deprecated in recent pandas releases -- confirm whether
# 81 is meant as an index label (`.loc`) or a position (`.iloc`) before replacing it.
BIG.ix[81]

In [None]:
# Select the rows where at least one type column is positive, and their 'logprix'.
mask = df[types[0]] > 0
# Positions of the remaining types (all but the first).
indices = list(range(1, len(types)))
for i in indices:
    mask |= df[types[i]] > 0

print("%s %s" % ("Found lines : ", len(df[mask])))
df_logprices = df.loc[mask, 'logprix']

In [None]:
# Build the substance/quantity features for the selected rows.
# NOTE(review): `index` is not set in this section; when the notebook runs top
# to bottom it still holds 1 from the 'comprime' section, so only the
# quantities of types[1] are passed on -- confirm what was intended for the
# all-types case.
df_subs_quants = create_df_subs_quants(df_subs.loc[mask], df.loc[mask, types[index]])
print(df_subs_quants.shape)

In [None]:
# Select the rows where at least one type column is positive, and their 'logprix'.
# NOTE(review): this cell repeats an identical cell found two cells earlier.
mask = df[types[0]] > 0
# Positions of the remaining types (all but the first).
indices = list(range(1, len(types)))
for i in indices:
    mask |= df[types[i]] > 0

print("%s %s" % ("Found lines : ", len(df[mask])))
df_logprices = df.loc[mask, 'logprix']

#### Add packages

In [None]:
# Append the `packages` columns of the selected rows.
df_subs_quants_packs = pd.concat([df_subs_quants, df.loc[mask, packages]], axis=1)
print(df_subs_quants_packs.shape)

#### Add other features 

In [None]:
# Columns of `df` that must not be reused as extra features: the type and
# package columns, the targets and the libelle_* columns listed below.
cols_to_drop = list(types)
cols_to_drop.extend(packages)
cols_to_drop.extend(TARGET_COLUMNS)
cols_to_drop.extend([
    u'libelle_ampoule', u'libelle_capsule', u'libelle_comprime', u'libelle_film',
    u'libelle_flacon', u'libelle_gelule', u'libelle_pilulier', u'libelle_plaquette',
    u'libelle_poche', u'libelle_sachet', u'libelle_seringue', u'libelle_stylo',
    u'libelle_tube',
])
# Remaining columns of `df`, restricted to the selected rows, appended as extra features.
other_features = df.drop(cols_to_drop, axis=1)
df_subs_quants_others = pd.concat([df_subs_quants_packs, other_features.loc[mask]], axis=1)
print(df_subs_quants_others.shape)

#### Add voie admin

In [None]:
# Append the PROCESSED_TRAIN_VOIE_ADMIN columns of the selected rows.
df_subs_quants_packs_va = pd.concat([df_subs_quants_packs, PROCESSED_TRAIN_VOIE_ADMIN.loc[mask]], axis=1)
print(df_subs_quants_packs_va.shape)

#### Remove all zero columns

In [None]:
# Apply drop_const_cols to each feature frame, then print the resulting shapes.
df_subs_quants = drop_const_cols(df_subs_quants)
df_subs_quants_packs = drop_const_cols(df_subs_quants_packs)
df_subs_quants_others = drop_const_cols(df_subs_quants_others)
df_subs_quants_packs_va = drop_const_cols(df_subs_quants_packs_va)

for label, frame in (
    ("df_subs_quants -> ", df_subs_quants),
    ("df_subs_quants_packs -> ", df_subs_quants_packs),
    ("df_subs_quants_others -> ", df_subs_quants_others),
    ("df_subs_quants_packs_va -> ", df_subs_quants_packs_va),
):
    print("%s %s" % (label, frame.shape))

#### Split and learn

In [None]:
# Split first and fit the scaler on the training rows only, so that held-out
# rows do not influence the scaling (the scaler used to be fitted on all rows
# before the split). random_state is fixed to make the split reproducible.
# The intermediate `X` (scaled full matrix) is no longer created.
X0 = df_subs_quants_packs_va
ylog = df_logprices

X_train, X_test, ylog_train, ylog_test = train_test_split(X0, ylog, train_size=0.80, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Random forest with 50 trees on all cores; random_state is fixed so that
# repeated runs build the same forest and report the same scores.
rf = RandomForestRegressor(n_estimators=50, n_jobs=-1, verbose=0, random_state=42)

In [None]:
# Fit the forest on the training split and print its score on that same split.
score = rf.fit(X_train, ylog_train).score(X_train, ylog_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = rf.predict(X_train), rf.predict(X_test)
print(mape_error(ylog_train, res0))
print(mape_error(ylog_test, res1))

In [None]:
# Linear-kernel SVR with C=3: fit on the training split and print its score there.
svr = SVR(kernel='linear', C=3)
score = svr.fit(X_train, ylog_train).score(X_train, ylog_train)
print(score)
# Predict both splits and print the MAPE (in %) on train, then on test.
res0, res1 = svr.predict(X_train), svr.predict(X_test)
print(mape_error(ylog_train, res0))
print(mape_error(ylog_test, res1))

#### Cross validation scoring

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Cross-validate `clf` on `data`/`targets` and print the per-fold MAPE.

    The features are standardised inside a pipeline, so the scaler is fitted
    on the training part of each fold only. Fitting it once on the full data
    (as was done before) lets held-out rows influence the scaling.

    Parameters
    ----------
    data : feature matrix (e.g. a DataFrame), one row per sample.
    targets : target values, one per row of `data`.
    clf : scikit-learn regressor to evaluate.
    cv : passed through to `cross_val_score` (10 by default).

    Prints the array of fold scores (MAPE in %), then their mean and standard
    deviation on one line. Returns None.
    """
    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

In [None]:
# 5-fold cross-validated MAPE of the random forest on each feature set in turn.
for features in (df_subs_quants, df_subs_quants_packs, df_subs_quants_others, df_subs_quants_packs_va):
    run_cv(features, ylog, rf, cv=5)

In [None]:
# run_cv(df_subs_quants, ylog, svr, cv=5)
# run_cv(df_subs_quants_packs, ylog, svr, cv=5)
# run_cv(df_subs_quants_others, ylog, svr, cv=5)
# run_cv(df_subs_quants_packs_va, ylog, svr, cv=5)