# MDF Single model learning by substances and quantities

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
%matplotlib inline



In [2]:
from prepare_data import BIG, PROCESSED_BIG, PROCESSED_SUBSTANCES, PROCESSED_VOIE_ADMIN, TARGET_COLUMNS, TRAIN, TEST, SOURCE

-- get_unique_noconst --
- Number of found duplicated rows :  1276
- Remove constant columns :  (10959, 42) -> (10959, 42)
- Dropped const columns :  


In [3]:
# For each processed table keep only the rows whose SOURCE marker equals TRAIN,
# then drop the SOURCE column (it holds a single value after the filter).
PROCESSED_TRAIN, PROCESSED_TRAIN_SUBS, PROCESSED_TRAIN_VOIE_ADMIN = [
    frame[frame[SOURCE] == TRAIN].drop(SOURCE, axis=1)
    for frame in (PROCESSED_BIG, PROCESSED_SUBSTANCES, PROCESSED_VOIE_ADMIN)
]

In [None]:
print PROCESSED_TRAIN.shape, PROCESSED_TRAIN_SUBS.shape, PROCESSED_TRAIN_VOIE_ADMIN.shape

In [None]:
PROCESSED_TRAIN.columns.values

In [None]:
PROCESSED_VOIE_ADMIN.columns.values

In [None]:
PROCESSED_SUBSTANCES.columns.values

## Metric MAPE

In [None]:
def mape_error(y_true, y_pred, **kwards): 
    """Mean Absolute Percentage Error, expressed in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like of reference values.
    y_pred : array-like of predicted values, same length as `y_true`.
    **kwards : accepted and ignored.

    Returns
    -------
    The error as a float. A zero in `y_true` makes the result inf or nan,
    because the relative error is undefined there.
    """
    # Compare element by element on plain float arrays:
    #  * pandas would otherwise align two Series on their index labels instead
    #    of their positions;
    #  * integer inputs would be floor-divided under Python 2;
    #  * plain Python lists do not support `-` and `/`.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## Learn by medicine types, substances and their quantities

In [4]:
df = PROCESSED_TRAIN
df_subs = PROCESSED_TRAIN_SUBS.drop(['_ENCODED_'], axis=1)

In [5]:
# Count columns: number of units of each packaging / dosage form.
nb_cols = [
    u'nb_ampoule', u'nb_capsule', u'nb_comprime', u'nb_film',
    u'nb_flacon', u'nb_gelule', u'nb_ml', u'nb_pilulier',
    u'nb_plaquette', u'nb_poche', u'nb_sachet', u'nb_seringue',
    u'nb_stylo', u'nb_tube',
]
# Matching libelle_* columns (there is none for 'ml').
libelle_cols = [
    u'libelle_ampoule', u'libelle_capsule', u'libelle_comprime', u'libelle_film',
    u'libelle_flacon', u'libelle_gelule', u'libelle_pilulier',
    u'libelle_plaquette', u'libelle_poche', u'libelle_sachet', u'libelle_seringue',
    u'libelle_stylo', u'libelle_tube',
]

In [6]:
index = 8
print "Select nb_cols only : ", nb_cols[index]
mask = df[nb_cols[index]] > 0
print "Found lines (med has the type): ", len(df[mask])
indices = range(len(nb_cols))
indices.remove(index)
for i in indices:
    mask &= df[nb_cols[i]] < 0.00001
print "Found lines (med has the type only) : ", len(df[mask])

Select nb_cols only :  nb_plaquette
Found lines (med has the type):  218
Found lines (med has the type only) :  1


In [None]:
df[mask].head()

In [10]:
import re

In [23]:
# Sanity check: every value of column 6 of BIG must be parseable by the
# packaging regex below; the first value that is not aborts the cell.
# NOTE(review): column 6 presumably holds the packaging description -- confirm.
# NOTE(review): `.ix` is deprecated in pandas (label-or-position lookup); it is
# left untouched here because its meaning depends on BIG's index types.
vals = BIG.ix[:, 6].values
for t in vals:
    # Pattern: optional count (digits/commas), a word (word chars or hyphens),
    # then up to two further optional "count word" pairs separated by non-digits.
    m = re.match(r'^([\d,]*)\s*([\w\-]+)\D*([\d,]*)\s*([\w\-]*)\D*([\d,]*)\s*([\w\-]*)', t)
    if m is None:
        raise Exception("Not found : %s" % t)

In [35]:
# Preview of the "<count> <word>" pairs extracted from a slice of column 6 of BIG.
# NOTE(review): with an integer row index `.ix[500:600]` selects by label and
# includes both ends -- keep that in mind if this is ported to .loc/.iloc.
vals = BIG.ix[500:600, 6].values

for t in vals:
    tt = t
    # A description that does not start with a digit gets an implicit count of 1.
    if not tt[0].isdigit():
        tt = '1 ' + tt    
    # Every "<digits/commas> <word>" pair found in the description.
    groups = re.findall(r'([\d,]+)\s+([\w\-]+)', tt)
    if len(groups) > 0:
        print groups
        print t
    else:
        # No pair at all: stop so the offending description can be inspected.
        raise Exception("Not found : %s" % t)

[(u'1', u'plaquette'), (u'90', u'comprime')]
plaquette thermoformee polypropylene aluminium de 90 comprime
[(u'1', u'plaquette'), (u'30', u'comprime')]
plaquette thermoformee PVC PVDC aluminium de 30 comprime
[(u'1', u'plaquette'), (u'30', u'gelule')]
plaquette thermoformee aluminium de 30 gelule
[(u'1', u'plaquette'), (u'14', u'comprime')]
plaquette thermoformee PVC PVDC aluminium de 14 comprime
[(u'1', u'flacon'), (u'30', u'gelule')]
1 flacon en verre de 30 gelule
[(u'1', u'plaquette'), (u'90', u'comprime')]
plaquette thermoformee aluminium de 90 comprime
[(u'1', u'flacon'), (u'30', u'gelule')]
1 flacon polyethylene de 30 gelule
[(u'1', u'sachet'), (u'8', u'g')]
1 sachet papier polyethylene aluminium de 8 g
[(u'20', u'ampoule'), (u'5', u'ml')]
20 ampoule en verre brun de 5  ml
[(u'1', u'plaquette'), (u'28', u'comprime')]
plaquette thermoformee PVC PVDC aluminium de 28 comprime
[(u'1', u'plaquette'), (u'90', u'comprime')]
plaquette thermoformee PVC polyethylene PVDC aluminium de 90 co

In [None]:
quantities = {
    u'nb_comprime': [None, u'nb_plaquette', u'nb_pilulier', u'nb_flacon', u'nb_tube', u'nb_sachet'],
    u'nb_capsule': [None, u'nb_plaquette', u'nb_flacon'],
}

In [None]:
# Count columns treated as the medicine "type" (what is being counted).
types = [
    u'nb_capsule',
    u'nb_comprime',
    u'nb_gelule',
    u'nb_ml',
    u'nb_sachet',
    u'nb_flacon',
]
# Count columns treated as packaging.
packages = [
    u'nb_ampoule',
    u'nb_film',
    u'nb_pilulier',
    u'nb_plaquette',
    u'nb_poche',
    u'nb_seringue',
    u'nb_stylo',
    u'nb_tube',
]
# Types first, then packages, as a new independent list.
types_and_packages = types + packages

In [None]:
from common.preprocessing_helper import drop_const_cols

def create_df_subs_quants(df_subs, series_quants):
    """Scale every substance column by the per-row quantity.

    Parameters
    ----------
    df_subs : DataFrame with one column per substance.
    series_quants : Series of quantities, matched to the rows of `df_subs`
        by index label.

    Returns
    -------
    A new DataFrame with the columns of `df_subs`, where each value has been
    multiplied by the quantity of its row. `df_subs` is not modified.
    """
    # One vectorized row-wise product instead of inserting the columns one by
    # one into an empty frame.
    return df_subs.multiply(series_quants, axis=0)

### Learn by type : 'capsule'

In [None]:
index = 0
print "Select type only : ", types[index]
mask = df[types[index]] > 0
print "Found lines (med has the type): ", len(df[mask])
indices = range(len(types))
indices.remove(index)
for i in indices:
    mask &= df[types[i]] < 1
    
print "Found lines (med has the type only) : ", len(df[mask])
df_prices = df[mask]['logprix']

In [None]:
df_prices.hist(bins=100)

In [None]:
df_subs_quants = create_df_subs_quants(df_subs[mask], df[mask][types[index]])
print df_subs_quants.shape

#### Add packages

In [None]:
df_subs_quants_packs = pd.concat([df_subs_quants, df[mask][packages]], axis=1)
print df_subs_quants_packs.shape

#### Add other features 

In [None]:
cols_to_drop = list(types); cols_to_drop.extend(packages); cols_to_drop.extend(TARGET_COLUMNS)
cols_to_drop.extend([u'libelle_ampoule', u'libelle_capsule', u'libelle_comprime', u'libelle_film', u'libelle_flacon', u'libelle_gelule', u'libelle_pilulier', u'libelle_plaquette', u'libelle_poche', u'libelle_sachet', u'libelle_seringue', u'libelle_stylo', u'libelle_tube'])
other_features = df.drop(cols_to_drop, axis=1)
df_subs_quants_others = pd.concat([df_subs_quants_packs, other_features[mask]], axis=1)
print df_subs_quants_others.shape

#### Add voie admin

In [None]:
df_subs_quants_packs_va = pd.concat([df_subs_quants_packs, PROCESSED_TRAIN_VOIE_ADMIN[mask]], axis=1)
print df_subs_quants_packs_va.shape

#### Remove all zero columns

In [None]:
df_subs_quants = drop_const_cols(df_subs_quants)
df_subs_quants_packs = drop_const_cols(df_subs_quants_packs)
df_subs_quants_others = drop_const_cols(df_subs_quants_others)
df_subs_quants_packs_va = drop_const_cols(df_subs_quants_packs_va)

print "df_subs_quants -> ", df_subs_quants.shape
print "df_subs_quants_packs -> ", df_subs_quants_packs.shape
print "df_subs_quants_others -> ", df_subs_quants_others.shape
print "df_subs_quants_packs_va -> ", df_subs_quants_packs_va.shape
# print "- Remove constant columns : ", df_subs_quants_others.shape, '->', res.shape
# print "- Dropped const columns : ", 
# if len(df_subs_quants_others.columns) > len(res.columns):
#     print df_subs_quants_others.columns.difference(res.columns).values
# print ""
# df_subs_quants_others = res
# del res

#### Split and learn

In [None]:
y = df_prices
X = StandardScaler().fit_transform(df_subs_quants)
XX = StandardScaler().fit_transform(df_subs_quants_packs_va)

X_train, X_test, XX_train, XX_test, y_train, y_test = train_test_split(X, XX, y, train_size=0.75)

In [None]:
rf = RandomForestRegressor(n_estimators = 35, n_jobs = -1, verbose = 0)

In [None]:
rf.fit(X_train, y_train)
score = rf.score(X_train, y_train)
print score
res0 = rf.predict(X_train)
res1 = rf.predict(X_test)
print mape_error(y_train, res0)
print mape_error(y_test, res1)

rf.fit(XX_train, y_train)
score = rf.score(XX_train, y_train)
print score
res01 = rf.predict(XX_train)
res11 = rf.predict(XX_test)
print mape_error(y_train, res01)
print mape_error(y_test, res11)


print res1[:10]
print res11[:10]
print y_test.values[:10]

In [None]:
X0 = df_subs_quants_packs_va
y = df_prices

X = StandardScaler().fit_transform(X0)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

In [None]:
rf = RandomForestRegressor(n_estimators = 35, n_jobs = -1, verbose = 0)

In [None]:
rf.fit(X_train, y_train)
score = rf.score(X_train, y_train)
print score
res0 = rf.predict(X_train)
res1 = rf.predict(X_test)
print mape_error(y_train, res0)
print mape_error(y_test, res1)
print res1[:10]
print y_test.values[:10]

In [None]:
svr = SVR(kernel='linear', C=3)
svr.fit(X_train, y_train)
score = svr.score(X_train, y_train)
print score
res0 = svr.predict(X_train)
res1 = svr.predict(X_test)
print mape_error(y_train, res0)
print mape_error(y_test, res1)

In [None]:
gbr = GradientBoostingRegressor(verbose = 0, max_features='auto')
gbr.fit(X_train, y_train)
score = gbr.score(X_train, y_train)
print score
res0 = gbr.predict(X_train)
res1 = gbr.predict(X_test)
print mape_error(y_train, res0)
print mape_error(y_test, res1)

#### Cross validation scoring

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Print the cross-validated MAPE of `clf` on (`data`, `targets`).

    The features are standardised inside each fold: the scaler is fitted on
    the training part of the fold only and then applied to the held-out part,
    so no statistic of the validation rows leaks into training.

    Parameters
    ----------
    data : feature table given to the estimator.
    targets : target values, one per row of `data`.
    clf : scikit-learn regressor to evaluate.
    cv : cross-validation setting forwarded to `cross_val_score` (default 10).

    Prints the per-fold `mape_error` scores, then their mean and standard
    deviation. Returns None.
    """
    from sklearn.pipeline import make_pipeline

    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

In [None]:
run_cv(df_subs_quants, df_prices, rf)
run_cv(df_subs_quants_packs, df_prices, rf)
run_cv(df_subs_quants_others, df_prices, rf)
run_cv(df_subs_quants_packs_va, df_prices, rf)

In [None]:
run_cv(df_subs_quants, df_prices, svr)
run_cv(df_subs_quants_packs, df_prices, svr)
run_cv(df_subs_quants_others, df_prices, svr)
run_cv(df_subs_quants_packs_va, df_prices, svr)

In [None]:
run_cv(df_subs_quants, df_prices, gbr)
run_cv(df_subs_quants_packs, df_prices, gbr)
run_cv(df_subs_quants_others, df_prices, gbr)
run_cv(df_subs_quants_packs_va, df_prices, gbr)

### Learn by type : 'gelule'

In [None]:
index = 2
print "Select type only : ", types[index]
mask = df[types[index]] > 0
print "Found lines (med has the type): ", len(df[mask])
indices = range(len(types))
indices.remove(index)
for i in indices:
    mask &= df[types[i]] < 1
    
print "Found lines (med has the type only) : ", len(df[mask])
df_logprices = df[mask]['logprix']

In [None]:
df_subs_quants = create_df_subs_quants(df_subs[mask], df[mask][types[index]])
print df_subs_quants.shape

#### Add packages

In [None]:
df_subs_quants_packs = pd.concat([df_subs_quants, df[mask][packages]], axis=1)
print df_subs_quants_packs.shape

#### Add other features 

In [None]:
cols_to_drop = list(types); cols_to_drop.extend(packages); cols_to_drop.extend(TARGET_COLUMNS)
cols_to_drop.extend([u'libelle_ampoule', u'libelle_capsule', u'libelle_comprime', u'libelle_film', u'libelle_flacon', u'libelle_gelule', u'libelle_pilulier', u'libelle_plaquette', u'libelle_poche', u'libelle_sachet', u'libelle_seringue', u'libelle_stylo', u'libelle_tube'])
other_features = df.drop(cols_to_drop, axis=1)
df_subs_quants_others = pd.concat([df_subs_quants_packs, other_features[mask]], axis=1)
print df_subs_quants_others.shape

#### Add voie admin

In [None]:
df_subs_quants_packs_va = pd.concat([df_subs_quants_packs, PROCESSED_TRAIN_VOIE_ADMIN[mask]], axis=1)
print df_subs_quants_packs_va.shape

#### Remove all zero columns

In [None]:
df_subs_quants = drop_const_cols(df_subs_quants)
df_subs_quants_packs = drop_const_cols(df_subs_quants_packs)
df_subs_quants_others = drop_const_cols(df_subs_quants_others)
df_subs_quants_packs_va = drop_const_cols(df_subs_quants_packs_va)

print "df_subs_quants -> ", df_subs_quants.shape
print "df_subs_quants_packs -> ", df_subs_quants_packs.shape
print "df_subs_quants_others -> ", df_subs_quants_others.shape
print "df_subs_quants_packs_va -> ", df_subs_quants_packs_va.shape

#### Split and learn

In [None]:
X0 = df_subs_quants_packs_va
ylog = df_logprices

X = StandardScaler().fit_transform(X0)
X_train, X_test, ylog_train, ylog_test = train_test_split(X, ylog, train_size=0.80)

In [None]:
rf = RandomForestRegressor(n_estimators = 50, n_jobs = -1, verbose = 0)

In [None]:
rf.fit(X_train, ylog_train)
score = rf.score(X_train, ylog_train)
print score
res0 = rf.predict(X_train)
res1 = rf.predict(X_test)
print mape_error(ylog_train, res0)
print mape_error(ylog_test, res1)

In [None]:
svr = SVR(kernel='linear', C=1)
svr.fit(X_train, ylog_train)
score = svr.score(X_train, ylog_train)
print score
res0 = svr.predict(X_train)
res1 = svr.predict(X_test)
print mape_error(ylog_train, res0)
print mape_error(ylog_test, res1)

In [None]:
gbr = GradientBoostingRegressor(verbose = 0, max_features='auto')
gbr.fit(X_train, ylog_train)
score = gbr.score(X_train, ylog_train)
print score
res0 = gbr.predict(X_train)
res1 = gbr.predict(X_test)
print mape_error(ylog_train, res0)
print mape_error(ylog_test, res1)

#### Cross validation scoring

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Print the cross-validated MAPE of `clf` on (`data`, `targets`).

    The features are standardised inside each fold: the scaler is fitted on
    the training part of the fold only and then applied to the held-out part,
    so no statistic of the validation rows leaks into training.

    Parameters
    ----------
    data : feature table given to the estimator.
    targets : target values, one per row of `data`.
    clf : scikit-learn regressor to evaluate.
    cv : cross-validation setting forwarded to `cross_val_score` (default 10).

    Prints the per-fold `mape_error` scores, then their mean and standard
    deviation. Returns None.
    """
    from sklearn.pipeline import make_pipeline

    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

In [None]:
run_cv(df_subs_quants, ylog, rf)
run_cv(df_subs_quants_packs, ylog, rf)
run_cv(df_subs_quants_others, ylog, rf)
run_cv(df_subs_quants_packs_va, ylog, rf)

In [None]:
run_cv(df_subs_quants, ylog, svr)
run_cv(df_subs_quants_packs, ylog, svr)
run_cv(df_subs_quants_others, ylog, svr)
run_cv(df_subs_quants_packs_va, ylog, svr)

In [None]:
run_cv(df_subs_quants, ylog, gbr)
run_cv(df_subs_quants_packs, ylog, gbr)
run_cv(df_subs_quants_others, ylog, gbr)
run_cv(df_subs_quants_packs_va, ylog, gbr)

### Learn by type : 'comprime' and 'gelule'

In [None]:
# Select the medicines that contain at least one unit of either of two types.
# NOTE(review): the section heading names 'comprime' and 'gelule', but
# types[0] is u'nb_capsule' (u'nb_comprime' is types[1]) -- confirm which pair
# is intended before trusting the results of this section.
index1 = 0
index2 = 2
print "Select types : ", types[index1], types[index2]
# Unlike the single-type sections, rows containing other types are NOT excluded.
mask = (df[types[index1]] > 0) | (df[types[index2]] > 0)
print "Found lines (med has the type): ", len(df[mask])
# Target: log-price of the selected rows.
df_logprices = df[mask]['logprix']

In [None]:
df_subs_quants = create_df_subs_quants(df_subs[mask], df[mask][types[index]])
print df_subs_quants.shape

#### Add packages

In [None]:
df_subs_quants_packs = pd.concat([df_subs_quants, df[mask][packages]], axis=1)
print df_subs_quants_packs.shape

#### Add other features 

In [None]:
cols_to_drop = list(types); cols_to_drop.extend(packages); cols_to_drop.extend(TARGET_COLUMNS)
cols_to_drop.extend([u'libelle_ampoule', u'libelle_capsule', u'libelle_comprime', u'libelle_film', u'libelle_flacon', u'libelle_gelule', u'libelle_pilulier', u'libelle_plaquette', u'libelle_poche', u'libelle_sachet', u'libelle_seringue', u'libelle_stylo', u'libelle_tube'])
other_features = df.drop(cols_to_drop, axis=1)
df_subs_quants_others = pd.concat([df_subs_quants_packs, other_features[mask]], axis=1)
print df_subs_quants_others.shape

#### Add voie admin

In [None]:
df_subs_quants_packs_va = pd.concat([df_subs_quants_packs, PROCESSED_TRAIN_VOIE_ADMIN[mask]], axis=1)
print df_subs_quants_packs_va.shape

#### Remove all zero columns

In [None]:
df_subs_quants = drop_const_cols(df_subs_quants)
df_subs_quants_packs = drop_const_cols(df_subs_quants_packs)
df_subs_quants_others = drop_const_cols(df_subs_quants_others)
df_subs_quants_packs_va = drop_const_cols(df_subs_quants_packs_va)

print "df_subs_quants -> ", df_subs_quants.shape
print "df_subs_quants_packs -> ", df_subs_quants_packs.shape
print "df_subs_quants_others -> ", df_subs_quants_others.shape
print "df_subs_quants_packs_va -> ", df_subs_quants_packs_va.shape

#### Split and learn

In [None]:
X0 = df_subs_quants_packs_va
ylog = df_logprices

X = StandardScaler().fit_transform(X0)
X_train, X_test, ylog_train, ylog_test = train_test_split(X, ylog, train_size=0.80)

In [None]:
rf = RandomForestRegressor(n_estimators = 50, n_jobs = -1, verbose = 0)

In [None]:
rf.fit(X_train, ylog_train)
score = rf.score(X_train, ylog_train)
print score
res0 = rf.predict(X_train)
res1 = rf.predict(X_test)
print mape_error(ylog_train, res0)
print mape_error(ylog_test, res1)

In [None]:
svr = SVR(kernel='linear', C=1)
svr.fit(X_train, ylog_train)
score = svr.score(X_train, ylog_train)
print score
res0 = svr.predict(X_train)
res1 = svr.predict(X_test)
print mape_error(ylog_train, res0)
print mape_error(ylog_test, res1)

In [None]:
gbr = GradientBoostingRegressor(verbose = 0, max_features='auto')
gbr.fit(X_train, ylog_train)
score = gbr.score(X_train, ylog_train)
print score
res0 = gbr.predict(X_train)
res1 = gbr.predict(X_test)
print mape_error(ylog_train, res0)
print mape_error(ylog_test, res1)

#### Cross validation scoring

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Print the cross-validated MAPE of `clf` on (`data`, `targets`).

    The features are standardised inside each fold: the scaler is fitted on
    the training part of the fold only and then applied to the held-out part,
    so no statistic of the validation rows leaks into training.

    Parameters
    ----------
    data : feature table given to the estimator.
    targets : target values, one per row of `data`.
    clf : scikit-learn regressor to evaluate.
    cv : cross-validation setting forwarded to `cross_val_score` (default 10).

    Prints the per-fold `mape_error` scores, then their mean and standard
    deviation. Returns None.
    """
    from sklearn.pipeline import make_pipeline

    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

In [None]:
run_cv(df_subs_quants, ylog, rf)
run_cv(df_subs_quants_packs, ylog, rf)
run_cv(df_subs_quants_others, ylog, rf)
run_cv(df_subs_quants_packs_va, ylog, rf)

In [None]:
run_cv(df_subs_quants, ylog, svr)
run_cv(df_subs_quants_packs, ylog, svr)
run_cv(df_subs_quants_others, ylog, svr)
run_cv(df_subs_quants_packs_va, ylog, svr)

In [None]:
run_cv(df_subs_quants, ylog, gbr)
run_cv(df_subs_quants_packs, ylog, gbr)
run_cv(df_subs_quants_others, ylog, gbr)
run_cv(df_subs_quants_packs_va, ylog, gbr)

### Learn by type : 'ml'

In [None]:
index = 3
print "Select type only : ", types[index]
mask = df[types[index]] > 0
print "Found lines (med has the type): ", len(df[mask])
indices = range(len(types))
indices.remove(index)
for i in indices:
    mask &= df[types[i]] < 1
    
print "Found lines (med has the type only) : ", len(df[mask])
df_logprices = df[mask]['logprix']

In [None]:
df_subs_quants = create_df_subs_quants(df_subs[mask], df[mask][types[index]])
print df_subs_quants.shape

#### Add packages

In [None]:
df_subs_quants_packs = pd.concat([df_subs_quants, df[mask][packages]], axis=1)
print df_subs_quants_packs.shape

#### Add other features 

In [None]:
cols_to_drop = list(types); cols_to_drop.extend(packages); cols_to_drop.extend(TARGET_COLUMNS)
cols_to_drop.extend([u'libelle_ampoule', u'libelle_capsule', u'libelle_comprime', u'libelle_film', u'libelle_flacon', u'libelle_gelule', u'libelle_pilulier', u'libelle_plaquette', u'libelle_poche', u'libelle_sachet', u'libelle_seringue', u'libelle_stylo', u'libelle_tube'])
other_features = df.drop(cols_to_drop, axis=1)
df_subs_quants_others = pd.concat([df_subs_quants_packs, other_features[mask]], axis=1)
print df_subs_quants_others.shape

#### Add voie admin

In [None]:
df_subs_quants_packs_va = pd.concat([df_subs_quants_packs, PROCESSED_TRAIN_VOIE_ADMIN[mask]], axis=1)
print df_subs_quants_packs_va.shape

#### Remove all zero columns

In [None]:
df_subs_quants = drop_const_cols(df_subs_quants)
df_subs_quants_packs = drop_const_cols(df_subs_quants_packs)
df_subs_quants_others = drop_const_cols(df_subs_quants_others)
df_subs_quants_packs_va = drop_const_cols(df_subs_quants_packs_va)

print "df_subs_quants -> ", df_subs_quants.shape
print "df_subs_quants_packs -> ", df_subs_quants_packs.shape
print "df_subs_quants_others -> ", df_subs_quants_others.shape
print "df_subs_quants_packs_va -> ", df_subs_quants_packs_va.shape

#### Split and learn

In [None]:
X0 = df_subs_quants_packs_va
ylog = df_logprices

X = StandardScaler().fit_transform(X0)
X_train, X_test, ylog_train, ylog_test = train_test_split(X, ylog, train_size=0.80)

In [None]:
rf = RandomForestRegressor(n_estimators = 50, n_jobs = -1, verbose = 0)

In [None]:
rf.fit(X_train, ylog_train)
score = rf.score(X_train, ylog_train)
print score
res0 = rf.predict(X_train)
res1 = rf.predict(X_test)
print mape_error(ylog_train, res0)
print mape_error(ylog_test, res1)

In [None]:
svr = SVR(kernel='linear', C=3)
svr.fit(X_train, ylog_train)
score = svr.score(X_train, ylog_train)
print score
res0 = svr.predict(X_train)
res1 = svr.predict(X_test)
print mape_error(ylog_train, res0)
print mape_error(ylog_test, res1)

#### Cross validation scoring

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Print the cross-validated MAPE of `clf` on (`data`, `targets`).

    The features are standardised inside each fold: the scaler is fitted on
    the training part of the fold only and then applied to the held-out part,
    so no statistic of the validation rows leaks into training.

    Parameters
    ----------
    data : feature table given to the estimator.
    targets : target values, one per row of `data`.
    clf : scikit-learn regressor to evaluate.
    cv : cross-validation setting forwarded to `cross_val_score` (default 10).

    Prints the per-fold `mape_error` scores, then their mean and standard
    deviation. Returns None.
    """
    from sklearn.pipeline import make_pipeline

    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

In [None]:
run_cv(df_subs_quants, ylog, rf, cv=7)
run_cv(df_subs_quants_packs, ylog, rf, cv=7)
run_cv(df_subs_quants_others, ylog, rf, cv=7)
run_cv(df_subs_quants_packs_va, ylog, rf, cv=7)

In [None]:
# run_cv(df_subs_quants, ylog, svr, cv=7)
# run_cv(df_subs_quants_packs, ylog, svr, cv=7)
# run_cv(df_subs_quants_others, ylog, svr, cv=7)
# run_cv(df_subs_quants_packs_va, ylog, svr, cv=7)

### Learn by type : 'comprime'

In [None]:
index = 1
print "Select type only : ", types[index]
mask = df[types[index]] > 0
print "Found lines (med has the type): ", len(df[mask])
indices = range(len(types))
indices.remove(index)
for i in indices:
    mask &= df[types[i]] < 1
    
print "Found lines (med has the type only) : ", len(df[mask])
df_logprices = df[mask]['logprix']

In [None]:
df_subs_quants = create_df_subs_quants(df_subs[mask], df[mask][types[index]])
print df_subs_quants.shape

#### Add packages

In [None]:
df_subs_quants_packs = pd.concat([df_subs_quants, df[mask][packages]], axis=1)
print df_subs_quants_packs.shape

#### Add other features 

In [None]:
cols_to_drop = list(types); cols_to_drop.extend(packages); cols_to_drop.extend(TARGET_COLUMNS)
cols_to_drop.extend([u'libelle_ampoule', u'libelle_capsule', u'libelle_comprime', u'libelle_film', u'libelle_flacon', u'libelle_gelule', u'libelle_pilulier', u'libelle_plaquette', u'libelle_poche', u'libelle_sachet', u'libelle_seringue', u'libelle_stylo', u'libelle_tube'])
other_features = df.drop(cols_to_drop, axis=1)
df_subs_quants_others = pd.concat([df_subs_quants_packs, other_features[mask]], axis=1)
print df_subs_quants_others.shape

#### Add voie admin

In [None]:
df_subs_quants_packs_va = pd.concat([df_subs_quants_packs, PROCESSED_TRAIN_VOIE_ADMIN[mask]], axis=1)
print df_subs_quants_packs_va.shape

#### Remove all zero columns

In [None]:
df_subs_quants = drop_const_cols(df_subs_quants)
df_subs_quants_packs = drop_const_cols(df_subs_quants_packs)
df_subs_quants_others = drop_const_cols(df_subs_quants_others)
df_subs_quants_packs_va = drop_const_cols(df_subs_quants_packs_va)

print "df_subs_quants -> ", df_subs_quants.shape
print "df_subs_quants_packs -> ", df_subs_quants_packs.shape
print "df_subs_quants_others -> ", df_subs_quants_others.shape
print "df_subs_quants_packs_va -> ", df_subs_quants_packs_va.shape

#### Split and learn

In [None]:
X0 = df_subs_quants_packs_va
ylog = df_logprices

X = StandardScaler().fit_transform(X0)
X_train, X_test, ylog_train, ylog_test = train_test_split(X, ylog, train_size=0.80)

In [None]:
rf = RandomForestRegressor(n_estimators = 50, n_jobs = -1, verbose = 0)

In [None]:
rf.fit(X_train, ylog_train)
score = rf.score(X_train, ylog_train)
print score
res0 = rf.predict(X_train)
res1 = rf.predict(X_test)
print mape_error(ylog_train, res0)
print mape_error(ylog_test, res1)

In [None]:
svr = SVR(kernel='linear', C=3)
svr.fit(X_train, ylog_train)
score = svr.score(X_train, ylog_train)
print score
res0 = svr.predict(X_train)
res1 = svr.predict(X_test)
print mape_error(ylog_train, res0)
print mape_error(ylog_test, res1)

#### Cross validation scoring

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Print the cross-validated MAPE of `clf` on (`data`, `targets`).

    The features are standardised inside each fold: the scaler is fitted on
    the training part of the fold only and then applied to the held-out part,
    so no statistic of the validation rows leaks into training.

    Parameters
    ----------
    data : feature table given to the estimator.
    targets : target values, one per row of `data`.
    clf : scikit-learn regressor to evaluate.
    cv : cross-validation setting forwarded to `cross_val_score` (default 10).

    Prints the per-fold `mape_error` scores, then their mean and standard
    deviation. Returns None.
    """
    from sklearn.pipeline import make_pipeline

    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

In [None]:
run_cv(df_subs_quants, ylog, rf, cv=5)
run_cv(df_subs_quants_packs, ylog, rf, cv=5)
run_cv(df_subs_quants_others, ylog, rf, cv=5)
run_cv(df_subs_quants_packs_va, ylog, rf, cv=5)

In [None]:
# run_cv(df_subs_quants, ylog, svr, cv=5)
# run_cv(df_subs_quants_packs, ylog, svr, cv=5)
# run_cv(df_subs_quants_others, ylog, svr, cv=5)
# run_cv(df_subs_quants_packs_va, ylog, svr, cv=5)

### Learn all types

In [None]:
df.shape

In [None]:
mask = df[types[0]] == 0
indices = range(len(types))
indices.remove(0)
for i in indices:
    mask &= df[types[i]] == 0
    
print "Found lines : ", len(df[mask])

In [None]:
mask = df[types[0]] > 0
indices = range(len(types))
indices.remove(0)
for i in indices:
    mask |= df[types[i]] > 0
    
print "Found lines : ", len(df[mask])

In [None]:
mask = df['nb_plaquette'] > 0

In [None]:
df[mask].head()

In [None]:
from prepare_data import BIG

In [None]:
BIG.ix[81]

In [None]:
mask = df[types[0]] > 0
indices = range(len(types))
indices.remove(0)
for i in indices:
    mask |= df[types[i]] > 0
    
print "Found lines : ", len(df[mask])
df_logprices = df[mask]['logprix']

In [None]:
df_subs_quants = create_df_subs_quants(df_subs[mask], df[mask][types[index]])
print df_subs_quants.shape

In [None]:
mask = df[types[0]] > 0
indices = range(len(types))
indices.remove(0)
for i in indices:
    mask |= df[types[i]] > 0
    
print "Found lines : ", len(df[mask])
df_logprices = df[mask]['logprix']

#### Add packages

In [None]:
df_subs_quants_packs = pd.concat([df_subs_quants, df[mask][packages]], axis=1)
print df_subs_quants_packs.shape

#### Add other features 

In [None]:
cols_to_drop = list(types); cols_to_drop.extend(packages); cols_to_drop.extend(TARGET_COLUMNS)
cols_to_drop.extend([u'libelle_ampoule', u'libelle_capsule', u'libelle_comprime', u'libelle_film', u'libelle_flacon', u'libelle_gelule', u'libelle_pilulier', u'libelle_plaquette', u'libelle_poche', u'libelle_sachet', u'libelle_seringue', u'libelle_stylo', u'libelle_tube'])
other_features = df.drop(cols_to_drop, axis=1)
df_subs_quants_others = pd.concat([df_subs_quants_packs, other_features[mask]], axis=1)
print df_subs_quants_others.shape

#### Add voie admin

In [None]:
df_subs_quants_packs_va = pd.concat([df_subs_quants_packs, PROCESSED_TRAIN_VOIE_ADMIN[mask]], axis=1)
print df_subs_quants_packs_va.shape

#### Remove all zero columns

In [None]:
df_subs_quants = drop_const_cols(df_subs_quants)
df_subs_quants_packs = drop_const_cols(df_subs_quants_packs)
df_subs_quants_others = drop_const_cols(df_subs_quants_others)
df_subs_quants_packs_va = drop_const_cols(df_subs_quants_packs_va)

print "df_subs_quants -> ", df_subs_quants.shape
print "df_subs_quants_packs -> ", df_subs_quants_packs.shape
print "df_subs_quants_others -> ", df_subs_quants_others.shape
print "df_subs_quants_packs_va -> ", df_subs_quants_packs_va.shape

#### Split and learn

In [None]:
X0 = df_subs_quants_packs_va
ylog = df_logprices

X = StandardScaler().fit_transform(X0)
X_train, X_test, ylog_train, ylog_test = train_test_split(X, ylog, train_size=0.80)

In [None]:
rf = RandomForestRegressor(n_estimators = 50, n_jobs = -1, verbose = 0)

In [None]:
rf.fit(X_train, ylog_train)
score = rf.score(X_train, ylog_train)
print score
res0 = rf.predict(X_train)
res1 = rf.predict(X_test)
print mape_error(ylog_train, res0)
print mape_error(ylog_test, res1)

In [None]:
svr = SVR(kernel='linear', C=3)
svr.fit(X_train, ylog_train)
score = svr.score(X_train, ylog_train)
print score
res0 = svr.predict(X_train)
res1 = svr.predict(X_test)
print mape_error(ylog_train, res0)
print mape_error(ylog_test, res1)

#### Cross validation scoring

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Print the cross-validated MAPE of `clf` on (`data`, `targets`).

    The features are standardised inside each fold: the scaler is fitted on
    the training part of the fold only and then applied to the held-out part,
    so no statistic of the validation rows leaks into training.

    Parameters
    ----------
    data : feature table given to the estimator.
    targets : target values, one per row of `data`.
    clf : scikit-learn regressor to evaluate.
    cv : cross-validation setting forwarded to `cross_val_score` (default 10).

    Prints the per-fold `mape_error` scores, then their mean and standard
    deviation. Returns None.
    """
    from sklearn.pipeline import make_pipeline

    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(scores)
    print("%s %s" % (scores.mean(), scores.std()))

In [None]:
run_cv(df_subs_quants, ylog, rf, cv=5)
run_cv(df_subs_quants_packs, ylog, rf, cv=5)
run_cv(df_subs_quants_others, ylog, rf, cv=5)
run_cv(df_subs_quants_packs_va, ylog, rf, cv=5)

In [None]:
# run_cv(df_subs_quants, ylog, svr, cv=5)
# run_cv(df_subs_quants_packs, ylog, svr, cv=5)
# run_cv(df_subs_quants_others, ylog, svr, cv=5)
# run_cv(df_subs_quants_packs_va, ylog, svr, cv=5)