In [1]:
import pandas as pd

# Read the CSV keeping every column as text; the file's first column becomes the row index
df = pd.read_csv('../rec_sys_alumnos.csv', dtype=str, index_col=0)
# Preview the first ten rows
df.head(n=10)

Unnamed: 0,cod_persona,mes,pais,sexo,edad,fecha1,xti_empleado,xti_nuevo_cliente,num_antiguedad,xti_rel,...,ind_prod16,ind_prod17,ind_prod18,ind_prod19,ind_prod20,ind_prod21,ind_prod22,ind_prod23,ind_prod24,ind_prod25
0,178103,2015-01-28,ES,H,35,2015-01-12,N,0.0,6,1.0,...,0,0,0,0,0,0,0.0,0.0,0,0
1,503082,2015-01-28,ES,V,27,2012-08-10,N,0.0,35,1.0,...,0,0,0,0,0,0,0.0,0.0,0,0
2,502996,2015-01-28,ES,V,37,2012-08-10,N,0.0,35,1.0,...,0,0,0,0,0,0,0.0,0.0,0,0
3,503053,2015-01-28,ES,H,23,2012-08-10,N,0.0,35,1.0,...,0,0,0,0,0,0,0.0,0.0,1,0
4,503031,2015-01-28,ES,H,44,2012-08-10,N,0.0,35,1.0,...,0,0,0,0,0,0,0.0,0.0,0,0
5,503112,2015-01-28,ES,H,23,2012-08-10,N,0.0,35,1.0,...,0,0,0,0,0,0,0.0,0.0,0,0
6,503163,2015-01-28,ES,V,22,2012-08-10,N,0.0,35,1.0,...,0,0,0,0,0,0,0.0,0.0,0,0
7,503154,2015-01-28,ES,H,23,2012-08-10,N,0.0,35,1.0,...,0,0,0,0,0,0,0.0,0.0,0,0
8,503142,2015-01-28,ES,H,23,2012-08-10,N,0.0,35,1.0,...,0,0,0,0,0,0,0.0,0.0,0,0
9,502804,2015-01-28,ES,H,24,2012-08-10,N,0.0,35,1.0,...,0,0,0,0,0,0,0.0,0.0,0,0


In [2]:
# Remove leading and trailing whitespace from every value, one column at a time
df = df.apply(lambda col: col.str.strip())

# Turn the literal string 'NA' into a real missing value (NaN)
df = df.replace(to_replace='NA', value=float('nan'))

In [3]:
columns = df.columns.tolist()

# Manual classification of columns by position (products are already binary)
asint = [0, 4, 7, 8, 9, 18]
products = list(range(23, 48))
asfloat = [20, 22]
asdate = [1, 5, 10]
ascategory = [2, 6, 11, 12, 15, 21]
asbinary = [3, 13, 14, 16, 17, 19]

# No longer needed by this cell; the import is kept so that the name `mod`
# stays defined in case a later cell relies on it.
from numpy import mod


def _binary_codes(values):
    """Build the {label: code} mapping used to binarise one column.

    Missing values are ignored. Among the remaining distinct labels, the
    highest-sorting one is coded 1 and the one just before it (if present)
    is coded 0; any further labels are left out of the mapping.

    Deriving the mapping from the non-null labels only keeps the encoding
    independent of whether the column happens to contain NaN, and guarantees
    that NaN itself is never turned into a code.

    Parameters
    ----------
    values : pandas.Series
        The column to encode.

    Returns
    -------
    dict
        Mapping from original label to 0/1 (empty if the column has no
        non-null values).
    """
    labels = sorted(values.dropna().unique())
    # zip stops after two pairs: last label -> 1, second-to-last label -> 0
    return dict(zip(reversed(labels), (1, 0)))


# Transform some columns to binary
for i in asbinary:
    codes = _binary_codes(df[columns[i]])
    if codes:
        # replace() leaves NaN and any unmapped label untouched
        df[columns[i]] = df[columns[i]].replace(codes)

# Convert numeric columns to float (including binary)
for i in asfloat + asint + products + asbinary:
    df[columns[i]] = df[columns[i]].astype('float64')

# Try to convert integer columns to int (not possible if a column contains nan, since this value is float)
for i in asint + products + asbinary:
    try:
        df[columns[i]] = df[columns[i]].astype('int64')
    except ValueError:
        # Deliberate best effort: the column stays float64 when the cast is rejected
        continue

# Convert date columns to datetime
for i in asdate:
    df[columns[i]] = pd.to_datetime(df[columns[i]])

# Convert some columns to categorical
for i in ascategory:
    df[columns[i]] = df[columns[i]].astype('category')

In [4]:
# Print a summary of the frame: dtype and non-null count for each column
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 634682 entries, 0 to 641706
Data columns (total 48 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   cod_persona            634682 non-null  int64         
 1   mes                    634682 non-null  datetime64[ns]
 2   pais                   634626 non-null  category      
 3   sexo                   634617 non-null  float64       
 4   edad                   634626 non-null  float64       
 5   fecha1                 634626 non-null  datetime64[ns]
 6   xti_empleado           634626 non-null  category      
 7   xti_nuevo_cliente      634626 non-null  float64       
 8   num_antiguedad         634626 non-null  float64       
 9   xti_rel                634626 non-null  float64       
 10  fec_ult_cli_1t         36 non-null      datetime64[ns]
 11  xti_rel_1mes           629040 non-null  category      
 12  tip_rel_1mes           629040 non-null  cate

In [5]:
# Count the missing (NaN/NaT) values in each column
df.isna().sum()

cod_persona                   0
mes                           0
pais                         56
sexo                         65
edad                         56
fecha1                       56
xti_empleado                 56
xti_nuevo_cliente            56
num_antiguedad               56
xti_rel                      56
fec_ult_cli_1t           634646
xti_rel_1mes               5642
tip_rel_1mes               5642
indresi                      56
indext                       56
des_canal                  7178
xti_extra                    56
tip_dom                       0
cod_provincia              2946
xti_actividad_cliente        56
imp_renta                124164
id_segmento                7403
mean_engagement            5642
ind_prod1                     0
ind_prod2                     0
ind_prod3                     0
ind_prod4                     0
ind_prod5                     0
ind_prod6                     0
ind_prod7                     0
ind_prod8                     0
ind_prod