In [1]:
import projectFunctions as pF
import pandas as pd
import matplotlib.pyplot as plt
from xgboost.sklearn import XGBClassifier
random_seed = 206

In [2]:
# Load datasets
df_train = pF.loadData("project-space/bank-product-recommendation/data/train.txt")
df_train.head()

df_test = pF.loadData("project-space/bank-product-recommendation/data/test.txt")

In [3]:
df_train.shape

(3350601, 8)

In [4]:
n_users = df_train.ID_Customer.unique().shape[0]
n_items = df_train['Cod_Prod'].unique().shape[0]
print(str(n_users) + ' users') 
print(str(n_items) + ' products') 

676370 users
94 products


In [5]:
# Train transformation
df_train = pF.tratamientoFecha(df_train)
df_train = pF.mapProduct(df_train)

# Test transformacion
df_test = pF.tratamientoFecha(df_test)
df_test = pF.mapProdByDict(df_test)

  primerMes = pd.datetime(1950, 1, 1, 0, 0, 0)


In [6]:
df_train

Unnamed: 0,ID_Customer,Cod_Prod,Cod_Fecha,Socio_Demo_01,Socio_Demo_02,Socio_Demo_03,Socio_Demo_04,Socio_Demo_05,Num_Dias
0,A0000001,15,2006-03-01,5,4,3,1,0,20513
1,A0000001,14,2006-03-01,5,4,3,1,0,20513
2,A0000001,0,2007-05-01,5,4,3,1,0,20939
3,A0000001,10,2011-04-01,5,4,3,1,0,22370
4,A0000001,9,2013-04-01,5,4,3,1,0,23101
...,...,...,...,...,...,...,...,...,...
3350596,A0676369,0,2015-07-01,3,2,3,1,0,23922
3350597,A0676369,6,2015-07-01,3,2,3,1,0,23922
3350598,A0676369,3,2015-10-01,3,2,3,1,0,24014
3350599,A0676370,0,2015-11-01,3,2,2,1,0,24045


In [7]:
# Number of previous products to considerate
nant = 8
incluir_num_dias = True

# Train
df_train = pF.addProdAnt(df_train, nant, incluir_num_dias)

# Test
df_test = pF.addProdAnt(df_test, nant, incluir_num_dias)

In [8]:
df_train[list(df_train.columns)[8:]]

Unnamed: 0,Num_Dias,Cod_Prod_Ant1,Cod_Fecha_Ant,Num_Dias_Ant1,Cod_Prod_Ant2,Num_Dias_Ant2,Cod_Prod_Ant3,Num_Dias_Ant3,Cod_Prod_Ant4,Num_Dias_Ant4,Cod_Prod_Ant5,Num_Dias_Ant5,Cod_Prod_Ant6,Num_Dias_Ant6,Cod_Prod_Ant7,Num_Dias_Ant7,Cod_Prod_Ant8,Num_Dias_Ant8
0,20513,-1,1950-01-01,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0
1,20513,15,2006-03-01,20513,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0
2,20939,14,2006-03-01,20513,15,20513,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0
3,22370,0,2007-05-01,20939,14,20513,15,20513,-1,0,-1,0,-1,0,-1,0,-1,0
4,23101,10,2011-04-01,22370,0,20939,14,20513,15,20513,-1,0,-1,0,-1,0,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3350596,23922,2,2015-07-01,23922,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0
3350597,23922,0,2015-07-01,23922,2,23922,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0
3350598,24014,6,2015-07-01,23922,0,23922,2,23922,-1,0,-1,0,-1,0,-1,0,-1,0
3350599,24045,-1,1950-01-01,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0


In [9]:
df_train.columns

Index(['ID_Customer', 'Cod_Prod', 'Cod_Fecha', 'Socio_Demo_01',
       'Socio_Demo_02', 'Socio_Demo_03', 'Socio_Demo_04', 'Socio_Demo_05',
       'Num_Dias', 'Cod_Prod_Ant1', 'Cod_Fecha_Ant', 'Num_Dias_Ant1',
       'Cod_Prod_Ant2', 'Num_Dias_Ant2', 'Cod_Prod_Ant3', 'Num_Dias_Ant3',
       'Cod_Prod_Ant4', 'Num_Dias_Ant4', 'Cod_Prod_Ant5', 'Num_Dias_Ant5',
       'Cod_Prod_Ant6', 'Num_Dias_Ant6', 'Cod_Prod_Ant7', 'Num_Dias_Ant7',
       'Cod_Prod_Ant8', 'Num_Dias_Ant8'],
      dtype='object')

In [10]:
# Training subset
dfTr2model = pF.ultimoElementoSerie(df_train)

# Evaluation subset
dfTs2eval = pF.ultimoElementoSerie(df_test)

# Prediction subset
dfTs2predict = pF.createTest(df_test)

We add the number of products purchased as a feature. For the prediction dataframe we indicate that you take all the products purchased.

In [11]:
dfTr2model["Num_Prod_Ant"] = pF.numProductosComprados(df_train)
dfTs2eval["Num_Prod_Ant"] = pF.numProductosComprados(df_train)
dfTs2predict["Num_Prod_Ant"] = pF.numProductosComprados(df_train, test = True)

In [12]:
dfTr2model

Unnamed: 0,ID_Customer,Cod_Prod,Cod_Fecha,Socio_Demo_01,Socio_Demo_02,Socio_Demo_03,Socio_Demo_04,Socio_Demo_05,Num_Dias,Cod_Prod_Ant1,...,Num_Dias_Ant4,Cod_Prod_Ant5,Num_Dias_Ant5,Cod_Prod_Ant6,Num_Dias_Ant6,Cod_Prod_Ant7,Num_Dias_Ant7,Cod_Prod_Ant8,Num_Dias_Ant8,Num_Prod_Ant
0,A0000001,9,2013-04-01,5,4,3,1,0,23101,10,...,20513,-1,0,-1,0,-1,0,-1,0,4
1,A0000002,11,2015-02-01,5,5,1,1,0,23772,23,...,0,-1,0,-1,0,-1,0,-1,0,3
2,A0000003,2,2016-02-01,5,5,5,2,0,24137,1,...,13088,-1,0,-1,0,-1,0,-1,0,4
3,A0000004,12,2016-05-01,5,5,3,1,0,24227,2,...,0,-1,0,-1,0,-1,0,-1,0,3
4,A0000005,0,1984-04-01,5,5,3,1,0,12509,1,...,0,-1,0,-1,0,-1,0,-1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676365,A0676366,0,2016-09-01,1,1,2,1,0,24350,-1,...,0,-1,0,-1,0,-1,0,-1,0,0
676366,A0676367,12,2016-09-01,1,1,1,2,0,24350,0,...,0,-1,0,-1,0,-1,0,-1,0,1
676367,A0676368,0,2016-09-01,1,1,1,2,0,24350,-1,...,0,-1,0,-1,0,-1,0,-1,0,0
676368,A0676369,3,2015-10-01,3,2,3,1,0,24014,6,...,0,-1,0,-1,0,-1,0,-1,0,3


In [1]:
__ = pF.mapAparicionProd(df_train)
__ = pF.mapDiasInicio(df_train)
__ = pF.mapDiasInicio(df_test)
__ = pF.mapYearPIB()

NameError: name 'pF' is not defined

### Dataset reduction

**Sub-sampling**

Due to the large amount of data provided in the training file, we performed a random sub-sampling of the * train * dataframe in order to carry out the training of the models in a reasonable time.
This sub-sampling is performed in a stratified manner, as explained in the report.
In our case we are left with 20% of the train data, which is more than 120,000 records.

Once the training model to be implemented in the validation phase has been chosen, in order to estimate the goodness of the classifier on the test set, it is decided to evaluate the success rate on this set.

In [14]:
# Training subset
size = .2
dfTr2model = pF.subset(df_train, df_train.Cod_Prod, size = size)

# Test subset, for evaluation on this data set
size = .2
dfTs2eval = pF.subset(df_test, df_test.Cod_Prod, size = size)

**Elimination of minority classes**

In order to eliminate the minority classes of the training set we *prune* it.

For the test case it would not be necessary because you do not need to model any class with this.

In [15]:
clases = 60
resto = False

dfTr2model = pF.classPrune2(dfTr2model, dfTr2model.Cod_Prod, nc = clases, resto = resto)

Number of classes: 60 
Percentage of data: 99.71766250820747 %
Minimum of samples: 215.0




<a id="feat"></a>
## Treatment of features

**New features**

In the next cell we add the characteristics to the data sets. Some of these functions use the maps loaded at the beginning of the Notebook. This process may take a while to run, but not more than 5 minutes (depends on the computer and the amount of data).

- **restaFechas**: is responsible for subtracting the days between the previous products that each customer has purchased and creating a corresponding column.


- **acontecimiento**: creates a column with the event of Cajamar according to the time in which the user buys the previous product. The event is marked by the merger of the rural banks.


- **addPIBAnt**: creates a column with the GDP per capita of the years in which the user buys the previous product.


- **addDiasInicioAnt**: create a column with the days that have passed since the user started buying, taking into account the date of purchase of the previous product.

In [None]:
dfTr2model = pF.restaFechas(dfTr2model)
dfTr2model = pF.acontecimiento(dfTr2model)
dfTr2model = pF.addPIBAnt(dfTr2model)
dfTr2model = pF.addDiasInicioAnt(dfTr2model)
print("Creation of features for training set completed.")

dfTs2eval = pF.restaFechas(dfTs2eval)
dfTs2eval = pF.acontecimiento(dfTs2eval)
dfTs2eval = pF.addPIBAnt(dfTs2eval)
dfTs2eval = pF.addDiasInicioAnt(dfTs2eval)
print("Creation of features for evaluation set completed.")

dfTs2predict = pF.restaFechas(dfTs2predict)
dfTs2predict = pF.acontecimiento(dfTs2predict)
dfTs2predict = pF.addPIBAnt(dfTs2predict)
dfTs2predict = pF.addDiasInicioAnt(dfTs2predict)
print("Creation of features for prediction set completed.")

**Expansion of variables**

In the next cell we carry out the expansion of categorical variables, in which as many columns as possible values can be created the variable passed by argument.

In [14]:
variable = "Socio_Demo_05"

dfTr2model = pF.expandirVariable(dfTr2model, variable)
dfTs2eval = pF.expandirVariable(dfTs2eval, variable)
dfTs2predict = pF.expandirVariable(dfTs2predict, variable)

Expanding Socio_Demo_05 to dimension 4
Expanding Socio_Demo_05 to dimension 4
Expanding Socio_Demo_05 to dimension 4


#### **Feature selection**

In the next cell we eliminate attributes that have served us to calculate other variables but that we do not really have at the time of making the prediction.

The attributes to eliminate in this part are:

- **Cod_Fecha**: this attribute is eliminated, since in the real scenario we can not know when the user is going to buy the new product.


- **Cod_Fecha_Ant**: it is a column in date format when I bought the previous product, we eliminated it because we have already made the relevant transformations, and it is not useful in date format.


- **Num_Dias**: this column is not available because we do not have the current purchase date to know how many days have passed since 1950.

In [15]:
columns2drop = ["Cod_Fecha", "Cod_Fecha_Ant", "Num_Dias"]

dfTr2model = dfTr2model.drop(columns2drop, axis=1)
dfTs2eval = dfTs2eval.drop(columns2drop, axis=1)
dfTs2predict = dfTs2predict.drop(columns2drop,axis=1)

## **Machine Learning model**
In the next cell we extract the columns with which we will carry out the training of our classifier.



In [None]:
# We choose the training data by eliminating the product code and the user ID.
columns = list(dfTr2model.columns[2:len(dfTr2model.columns)])

print(columns)

['Socio_Demo_01', 'Socio_Demo_02', 'Socio_Demo_03', 'Socio_Demo_04', 'Cod_Prod_Ant1', 'Num_Dias_Ant1', 'Cod_Prod_Ant2', 'Num_Dias_Ant2', 'Cod_Prod_Ant3', 'Num_Dias_Ant3', 'Cod_Prod_Ant4', 'Num_Dias_Ant4', 'Cod_Prod_Ant5', 'Num_Dias_Ant5', 'Cod_Prod_Ant6', 'Num_Dias_Ant6', 'Cod_Prod_Ant7', 'Num_Dias_Ant7', 'Cod_Prod_Ant8', 'Num_Dias_Ant8', 'Diferencia_Fechas_1', 'Diferencia_Fechas_2', 'Diferencia_Fechas_3', 'Diferencia_Fechas_4', 'Diferencia_Fechas_5', 'Diferencia_Fechas_6', 'Diferencia_Fechas_7', 'AcontecimientoAnt', 'PIB_Ant', 'DiasDesde_Inicio', 'Socio_Demo_05_1', 'Socio_Demo_05_2', 'Socio_Demo_05_3', 'Socio_Demo_05_4']


<a id="train"> </a>
## **Training**

In the next cell we carry out the entry of our classifier *XGBoost* with the optimal parameters obtained from the validation phase. 

***Note: We should do a tuning of parameters to find the best values that improve the model.***

In [None]:
# Uncoment and comment from import ... If we have a trained model file
# we can load it with:
#import pickle
#estimator = pickle.load(open("trained_model.pickle.dat", "rb"))

import multiprocessing
nproc = multiprocessing.cpu_count() # For a fastest train, we use all cpu processors
print('Number of processors: ',nproc)
estimator=XGBClassifier(learning_rate = 0.1,
                       nthread = nproc,
                       base_score = 0.2,
                       n_estimators = 200,
                       seed = random_seed,
                       max_depth = 8)

Number of processors:  40


In [None]:
estimator.fit(dfTr2model[columns], dfTr2model.Cod_Prod)

XGBClassifier(base_score=0.2, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=8, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=40, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=206,
       silent=True, subsample=1)

In [None]:
# Uncomment if we want save model to file
#import pickle
#pickle.dump(estimator, open("trained_model.pickle.dat", "wb"))

**Evaluation of the test data**

In order to observe the results of the test predictions, the trained classifier is evaluated on the subset of test data.

In [None]:
tsScore = estimator.score(dfTs2eval[columns], dfTs2eval.Cod_Prod)

print("Score obtained in test: "+ str(tsScore))

<a id="predict"> </a>
## **Prediction**

We make the prediction of the future products to be hired by the customers of the test dataset.

In [None]:
Cod_Prod_predicted = estimator.predict(dfTs2predict[columns])

  if diff:


**Creation of the results dataframe**

In the next cell, the creation of a dataframe with the customer's ID and the product code to be purchased is carried out.

Subsequently, an inverse mapping of these product codes is performed.

In [16]:
# We create a new column with the prediction.
dfresults = pd.DataFrame([dfTs2predict["ID_Customer"],Cod_Prod_predicted],["ID_Customer", "Cod_Prod"]).transpose()

# The mapping of the products is carried out in reverse.
dfresults = pF.getDfMap(dfresults)

dfresults.head()

NameError: name 'Cod_Prod_predicted' is not defined

**Export of results**

Finally, we export the dataframe with the results to the output format: ID_Customer, Cod_Prod

In [None]:
fileName = "output_prediction.csv"
dfresults.to_csv(fileName, encoding='utf-8', index=False)