In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import gc
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

%matplotlib inline

In [2]:
plt.rc('figure',dpi=100);

In [3]:
avisos = pd.read_csv("../../Data/fiuba_entrenamiento/gian/avisosONEHOT.csv")
postulaciones = pd.read_csv("../../Data/fiuba_entrenamiento/gian/postulaciones.csv")
postulantes = pd.read_csv("../../Data/fiuba_entrenamiento/gian/postulantesONEHOT.csv")
areas=avisos['nombre_area'].value_counts()
avisos['nombre_area']=avisos['nombre_area'].replace(areas.index, areas.values)

avisos_no_one_hot= pd.read_csv("../../Data/fiuba_entrenamiento/gian/avisos.csv")
avisos_no_one_hot=avisos_no_one_hot[['idaviso','tipo_de_trabajo','nivel_laboral','nombre_area']];
avisos_no_one_hot['tipo_de_trabajo']=avisos_no_one_hot['tipo_de_trabajo'].astype('category')
avisos_no_one_hot['nivel_laboral']=avisos_no_one_hot['nivel_laboral'].astype('category')
avisos_no_one_hot['nombre_area']=avisos_no_one_hot['nombre_area'].astype('category')
avisos_no_one_hot=avisos_no_one_hot.rename(columns={'nombre_area': 'nombre_area_aux'})

postulaciones['sepostulo']=1
postulaciones.drop(['fechapostulacion','dia','mes','semana','diadelasemana','hora'],axis=1,inplace=True)

In [4]:
vistas = pd.read_csv("../../Data/fiuba_entrenamiento/gian/vistas.csv")
vistas.drop(['timestamp','dia','mes','semana','diadelasemana','hora'],axis=1,inplace=True)
vistas=vistas.rename(columns={'idAviso': 'idaviso'})
vistas.head()

Unnamed: 0,idaviso,idpostulante
0,1111780242,YjVJQ6Z
1,1112263876,BmVpYoR
2,1112327963,wVkBzZd
3,1112318643,OqmP9pv
4,1111903673,DrpbXDP


In [5]:
vistas_por_area=vistas.copy()
vistas_por_area['vistas_por_area']=0;
vistas_por_area=pd.merge(vistas_por_area,avisos_no_one_hot,on='idaviso', how='left')
vistas_por_area=vistas_por_area.groupby(['idpostulante','nombre_area_aux'],as_index=False).agg({'vistas_por_area':'count'})
vistas_por_area.head()

Unnamed: 0,idpostulante,nombre_area_aux,vistas_por_area
0,0005E,Tecnologia / Sistemas,2
1,00Lkv,Comercio Exterior,2
2,00dMd,Comercio Exterior,4
3,01QAq,Contabilidad,1
4,021OM,Comercio Exterior,5


In [6]:
vistas_por_nivel=vistas.copy()
vistas_por_nivel['vistas_por_nivel']=0;
vistas_por_nivel=pd.merge(vistas_por_nivel,avisos_no_one_hot,on='idaviso', how='left')
vistas_por_nivel=vistas_por_nivel.groupby(['idpostulante','nivel_laboral'],as_index=False).agg({'vistas_por_nivel':'count'})
vistas_por_nivel.head()

Unnamed: 0,idpostulante,nivel_laboral,vistas_por_nivel
0,0005E,Otro,2
1,00Lkv,Otro,2
2,00dMd,Otro,4
3,01QAq,Otro,1
4,021OM,Otro,5


In [7]:
vistas_por_tipo=vistas.copy()
vistas_por_tipo['vistas_por_tipo']=0;
vistas_por_tipo=pd.merge(vistas_por_tipo,avisos_no_one_hot,on='idaviso', how='left')
vistas_por_tipo=vistas_por_tipo.groupby(['idpostulante','tipo_de_trabajo'],as_index=False).agg({'vistas_por_tipo':'count'})
vistas_por_tipo.head()

Unnamed: 0,idpostulante,tipo_de_trabajo,vistas_por_tipo
0,0005E,Full-time,2
1,00Lkv,Full-time,2
2,00dMd,Full-time,4
3,01QAq,Full-time,1
4,021OM,Full-time,5


In [8]:
vistas['cantidad']=0
vistas2=vistas.groupby(['idaviso','idpostulante'],as_index=False).agg({'cantidad':'count'})
vistas2.head()

Unnamed: 0,idaviso,idpostulante,cantidad
0,18,BolNL,2
1,48375,RwVdKR,1
2,169730,1KjXB,2
3,169730,2AKzxa,2
4,169730,6LJ64,1


In [9]:
vistas=vistas.groupby(['idpostulante'],as_index=False).agg({'cantidad':'count'});
vistas.head()

Unnamed: 0,idpostulante,cantidad
0,0002q,15
1,0005E,58
2,000R8,14
3,001XE,26
4,003k9,32


In [10]:
vistas3=pd.merge(vistas2,vistas,on='idpostulante',how='left')

In [11]:
vistas3.head()

Unnamed: 0,idaviso,idpostulante,cantidad_x,cantidad_y
0,18,BolNL,2,28
1,48375,RwVdKR,1,15
2,169730,1KjXB,2,99
3,169730,2AKzxa,2,2
4,169730,6LJ64,1,23


In [12]:
vistas3['cantidad']=vistas3['cantidad_x']/vistas3['cantidad_y']
vistas3.drop(['cantidad_x','cantidad_y'],axis=1,inplace=True)
vistas3.head()

Unnamed: 0,idaviso,idpostulante,cantidad
0,18,BolNL,0.071429
1,48375,RwVdKR,0.066667
2,169730,1KjXB,0.020202
3,169730,2AKzxa,1.0
4,169730,6LJ64,0.043478


In [13]:
vistas=vistas.rename(columns={'cantidad': 'vistas_postulante'})
vistas2=vistas2.rename(columns={'cantidad': 'vistas_al_aviso'})

In [14]:
postulaciones2= pd.read_csv("../../Data/fiuba_entrenamiento/gian/no-postulaciones/12.csv").sample(len(postulaciones)//100)
print(len(postulaciones2))
postulaciones2

66045


Unnamed: 0,idaviso,idpostulante,sepostulo
1690822,1112297991,8o3bNx,0
6048260,1112342513,Nz0ZoLJ,0
2479405,1112472340,96X30eP,0
6573059,1112303836,YjVG1KY,0
3353651,1112297714,4rdqZKz,0
1509670,1112256010,bOjAbD4,0
1575085,1112464787,8MPEYNL,0
1393999,1112238424,kPLp9Za,0
924721,1112474200,bO4mRMB,0
2669957,1112421137,4rdY3lz,0


In [15]:
postulaciones.head()

Unnamed: 0,idaviso,idpostulante,sepostulo
0,1112257047,NM5M,1
1,1111920714,NM5M,1
2,1112346945,NM5M,1
3,1112345547,NM5M,1
4,1112237522,5awk,1


In [16]:
postulaciones=postulaciones.append(postulaciones2, ignore_index=True)
print(len(postulaciones))
postulaciones2=0
gc.collect()
postulaciones.drop_duplicates(['idaviso','idpostulante'],keep='first',inplace=True)
print(len(postulaciones))

6670579
6669757


In [17]:
postulaciones=pd.merge(postulaciones,postulantes,on='idpostulante')
postulaciones=pd.merge(postulaciones,avisos,on='idaviso')
postulaciones=pd.merge(postulaciones,avisos_no_one_hot,on='idaviso')
postulaciones=pd.merge(postulaciones,vistas,on=['idpostulante'],how='left')
postulaciones=pd.merge(postulaciones,vistas2,on=['idaviso','idpostulante'],how='left')
postulaciones=pd.merge(postulaciones,vistas3,on=['idaviso','idpostulante'],how='left')
postulaciones.sample(20)

Unnamed: 0,idaviso,idpostulante,sepostulo,edad,sexo_FEM,sexo_MASC,sexo_NO_DECLARA,Doctorado_Abandonado,Doctorado_En Curso,Doctorado_Graduado,...,nivel_laboral_Jefe / Supervisor / Responsable,nivel_laboral_Junior,nivel_laboral_Otro,nivel_laboral_Senior / Semi-Senior,tipo_de_trabajo,nivel_laboral,nombre_area_aux,vistas_postulante,vistas_al_aviso,cantidad
5397086,1112345184,RzrxVrx,1,32.0,True,False,False,False,False,False,...,False,False,False,True,Full-time,Senior / Semi-Senior,Camareros,130.0,,
5156298,1111469635,lDbpjG1,1,46.0,True,False,False,False,False,False,...,False,True,False,False,Full-time,Junior,Salud,11.0,,
4066663,1112452620,2zPavk6,1,19.0,True,False,False,False,False,False,...,False,False,False,True,Full-time,Senior / Semi-Senior,Ventas,580.0,2.0,0.003448
5142635,1112254122,Nz0XA9l,1,44.0,True,False,False,False,False,False,...,False,False,False,True,Full-time,Senior / Semi-Senior,Gastronomia,102.0,,
6045147,1112448310,PmGWAa0,1,21.0,False,True,False,False,False,False,...,False,False,False,True,Full-time,Senior / Semi-Senior,Tesorería,21.0,2.0,0.095238
4296296,1112418593,6rQGbmx,1,28.0,True,False,False,False,False,False,...,False,True,False,False,Full-time,Junior,Atención al Cliente,106.0,1.0,0.009434
4872818,1112307101,8MBYmzx,1,31.0,False,True,False,False,False,False,...,False,False,False,True,Full-time,Senior / Semi-Senior,Telecomunicaciones,45.0,,
683364,1112281263,EzZ06E0,1,24.0,True,False,False,False,False,False,...,False,False,False,True,Por Contrato,Senior / Semi-Senior,Selección,77.0,,
2665292,1112419006,Y2el8Y,1,28.0,True,False,False,False,False,False,...,False,True,False,False,Full-time,Junior,Jóvenes Profesionales,7.0,2.0,0.285714
3995827,1112376163,4rPY850,1,39.0,True,False,False,False,False,False,...,False,False,False,True,Full-time,Senior / Semi-Senior,Comercial,111.0,,


In [18]:
postulaciones['cantidad']=postulaciones['cantidad'].fillna(0)
postulaciones['vistas_postulante']=postulaciones['vistas_postulante'].fillna(0)
postulaciones['vistas_al_aviso']=postulaciones['vistas_al_aviso'].fillna(0)
postulaciones['lo_vio']=(postulaciones['vistas_al_aviso']>0)
postulaciones.sample(20)

Unnamed: 0,idaviso,idpostulante,sepostulo,edad,sexo_FEM,sexo_MASC,sexo_NO_DECLARA,Doctorado_Abandonado,Doctorado_En Curso,Doctorado_Graduado,...,nivel_laboral_Junior,nivel_laboral_Otro,nivel_laboral_Senior / Semi-Senior,tipo_de_trabajo,nivel_laboral,nombre_area_aux,vistas_postulante,vistas_al_aviso,cantidad,lo_vio
3000012,1112351502,qek1WQx,1,28.0,False,True,False,False,False,False,...,False,True,False,Full-time,Otro,Atención al Cliente,1.0,0.0,0.0,False
5195319,1112382252,ekmDZ1B,1,33.0,False,True,False,False,False,False,...,False,False,True,Full-time,Senior / Semi-Senior,Almacén / Depósito / Expedición,73.0,1.0,0.013699,True
2360973,1112268555,Y3Doxw,1,25.0,False,True,False,False,False,False,...,False,False,True,Full-time,Senior / Semi-Senior,Corporate Finance / Banca Inversión,65.0,2.0,0.030769,True
2816166,1112309742,e9GKxp,1,32.0,False,True,False,False,False,False,...,False,False,True,Full-time,Senior / Semi-Senior,Producción,0.0,0.0,0.0,False
1658191,1112306344,qe22dm9,1,20.0,True,False,False,False,False,False,...,False,False,True,Full-time,Senior / Semi-Senior,Recepcionista,0.0,0.0,0.0,False
4300556,1112409128,E6alE0,1,33.0,True,False,False,False,False,False,...,False,False,True,Full-time,Senior / Semi-Senior,Tesorería,286.0,4.0,0.013986,True
5835115,1112249063,ZD2bXRE,1,40.0,True,False,False,False,False,False,...,False,False,True,Full-time,Senior / Semi-Senior,Laboratorio,2.0,0.0,0.0,False
5358732,1112320563,QNrJev2,1,26.0,False,True,False,False,False,False,...,False,False,True,Full-time,Senior / Semi-Senior,Testing / QA / QC,4.0,0.0,0.0,False
5476592,1112247517,Oqrx0ZB,1,22.0,True,False,False,False,False,False,...,False,False,True,Full-time,Senior / Semi-Senior,Comercial,104.0,0.0,0.0,False
4751801,1112328640,6Evbzj,1,30.0,False,True,False,False,False,False,...,False,False,True,Por Horas,Senior / Semi-Senior,Educación/ Docentes,8.0,0.0,0.0,False


In [19]:
postulaciones_por_area=postulaciones.copy()
postulaciones_por_area=postulaciones_por_area.groupby(by=['idpostulante','nombre_area_aux'],as_index=False).agg({'sepostulo':'count'})
postulaciones_por_area=postulaciones_por_area.rename(columns={'sepostulo':'postulaciones_misma_area'})
postulaciones_por_area.head()

Unnamed: 0,idpostulante,nombre_area_aux,postulaciones_misma_area
0,0z5Dmrd,Recursos Humanos,2
1,0z5JW1r,Almacén / Depósito / Expedición,1
2,0z5JW1r,Comercial,1
3,0z5JW1r,Desarrollo de Negocios,1
4,0z5JW1r,Gastronomia,1


In [20]:
postulaciones_por_nivel=postulaciones.copy()
postulaciones_por_nivel=postulaciones_por_nivel.groupby(by=['idpostulante','nivel_laboral'],as_index=False).agg({'sepostulo':'count'})
postulaciones_por_nivel=postulaciones_por_nivel.rename(columns={'sepostulo':'postulaciones_mismo_nivel'})
postulaciones_por_nivel.head()

Unnamed: 0,idpostulante,nivel_laboral,postulaciones_mismo_nivel
0,0z5Dmrd,Senior / Semi-Senior,2
1,0z5JW1r,Gerencia / Alta Gerencia / Dirección,2
2,0z5JW1r,Jefe / Supervisor / Responsable,1
3,0z5JW1r,Senior / Semi-Senior,4
4,0z5VvGv,-,1


In [21]:
postulaciones_por_tipo=postulaciones.copy()
postulaciones_por_tipo=postulaciones_por_tipo.groupby(by=['idpostulante','tipo_de_trabajo'],as_index=False).agg({'sepostulo':'count'})
postulaciones_por_tipo=postulaciones_por_tipo.rename(columns={'sepostulo':'postulaciones_mismo_tipo'})
postulaciones_por_tipo.head()

Unnamed: 0,idpostulante,tipo_de_trabajo,postulaciones_mismo_tipo
0,0z5Dmrd,Full-time,2
1,0z5JW1r,Full-time,7
2,0z5VvGv,Full-time,39
3,0z5VvGv,Part-time,8
4,0z5VvGv,Temporario,1


In [22]:
postulaciones=pd.merge(postulaciones,postulaciones_por_area,on=['idpostulante','nombre_area_aux'],how='left')
postulaciones=pd.merge(postulaciones,postulaciones_por_tipo,on=['idpostulante','tipo_de_trabajo'],how='left')
postulaciones=pd.merge(postulaciones,postulaciones_por_nivel,on=['idpostulante','nivel_laboral'],how='left')
postulaciones['postulaciones_misma_area']=postulaciones['postulaciones_misma_area'].fillna(0).apply(lambda x: x if(x<=0) else x-1)
postulaciones['postulaciones_mismo_nivel']=postulaciones['postulaciones_mismo_nivel'].fillna(0).apply(lambda x: x if(x<=0) else x-1)
postulaciones['postulaciones_mismo_tipo']=postulaciones['postulaciones_mismo_tipo'].fillna(0).apply(lambda x: x if(x<=0) else x-1)
postulaciones=pd.merge(postulaciones,vistas_por_area,on=['idpostulante','nombre_area_aux'],how='left')
postulaciones=pd.merge(postulaciones,vistas_por_tipo,on=['idpostulante','tipo_de_trabajo'],how='left')
postulaciones=pd.merge(postulaciones,vistas_por_nivel,on=['idpostulante','nivel_laboral'],how='left')
postulaciones['vistas_por_area']=postulaciones['vistas_por_area'].fillna(0)
postulaciones['vistas_por_tipo']=postulaciones['vistas_por_tipo'].fillna(0)
postulaciones['vistas_por_nivel']=postulaciones['vistas_por_tipo'].fillna(0)
postulaciones.sample(20)

Unnamed: 0,idaviso,idpostulante,sepostulo,edad,sexo_FEM,sexo_MASC,sexo_NO_DECLARA,Doctorado_Abandonado,Doctorado_En Curso,Doctorado_Graduado,...,vistas_postulante,vistas_al_aviso,cantidad,lo_vio,postulaciones_misma_area,postulaciones_mismo_tipo,postulaciones_mismo_nivel,vistas_por_area,vistas_por_tipo,vistas_por_nivel
3141143,1112335686,8MlNLdD,1,24.0,True,False,False,False,False,False,...,837.0,0.0,0.0,False,6,487,352,8.0,644.0,644.0
111091,1112421437,PmGOOwj,1,28.0,True,False,False,False,False,False,...,355.0,2.0,0.005634,True,35,82,63,66.0,284.0,284.0
4577406,1112431556,ZD8qYEV,1,22.0,True,False,False,False,False,False,...,172.0,0.0,0.0,False,5,66,1,4.0,115.0,115.0
4624458,1112407528,LNwkWk5,1,36.0,True,False,False,False,False,False,...,10.0,0.0,0.0,False,8,43,23,1.0,8.0,8.0
4266991,1112369238,MV6B20x,1,21.0,False,True,False,False,False,False,...,3.0,0.0,0.0,False,51,61,11,1.0,3.0,3.0
5427296,1112314997,BmolMPL,1,26.0,False,True,False,False,False,False,...,108.0,0.0,0.0,False,1,20,12,2.0,96.0,96.0
3763411,1112408962,2zQVv66,1,31.0,True,False,False,False,False,False,...,11.0,0.0,0.0,False,21,66,0,5.0,10.0,10.0
992534,1112305277,pzMJBbN,1,20.0,True,False,False,False,False,False,...,0.0,0.0,0.0,False,1,22,11,0.0,0.0,0.0
166484,1112458756,PmG8qJ1,1,35.0,True,False,False,False,False,False,...,364.0,1.0,0.002747,True,15,217,178,20.0,264.0,264.0
2211021,1112280941,MVdVjA9,1,24.0,False,True,False,False,False,False,...,103.0,1.0,0.009709,True,18,30,8,50.0,97.0,97.0


In [23]:
train, test=train_test_split(postulaciones,test_size=0.05)
print("Train: ",len(train),"Test: ",len(test))

Train:  5941237 Test:  312697


In [24]:
features=['nombre_area', 'edad','sexo_FEM', 'sexo_MASC',
'Doctorado_En Curso', 'Doctorado_Graduado',
'Master_En Curso', 'Master_Graduado',
'Posgrado_En Curso', 'Posgrado_Graduado',
'Secundario_En Curso', 'Secundario_Graduado',
'Terciario/Técnico_En Curso', 'Terciario/Técnico_Graduado',
'Universitario_En Curso', 'Universitario_Graduado', 'pide_hombre',
'pide_mujer', 'ingles', 'experiencia', 'paquete_office', 'liderazgo', 'secundario',
'viajar', 'empresa_importante', 'capacitacion', 'remuneracion_pretendida', 'graduados',
'estudiantes', 'conocimientos_especificos', 'vistas_postulante',
'vistas_al_aviso', 'cantidad', 'postulaciones_misma_area', 'postulaciones_mismo_tipo', 
'postulaciones_mismo_nivel','lo_vio','vistas_por_area','vistas_por_nivel','vistas_por_tipo']

In [25]:
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB
import random
import gc
class RandomNaiveBayes:

    naives = []
    chosen_features=[]
    
    def __init__(self,  n_iterations, n_features, retries, subset_sample, verbose):
        self.n_iterations=n_iterations
        self.n_features=n_features
        self.retries=retries
        self.subset_sample=subset_sample
        self.verbose=verbose
    
    def roc_auc_score(self, test, to_predict):
        y_test=test[to_predict]
        return roc_auc_score(y_test, self.predict_proba_class(1,test))
    
    def predict_proba_class(self,number,data):
        if(not self.naives):
            raise Exception('No naive bayes has been trained.')
        preds=[]
        for i in range(0,len(self.naives)):
            y=self.naives[i].predict_proba(data[self.chosen_features[i]])
            y_pred=[]
            for elem in y:
                y_pred.append(elem[number])
            preds.append(y_pred)
        y_pred=[sum(x) for x in zip(*preds)]
        y_pred=[i / len(self.naives) for i in y_pred]
        return y_pred
                             
    def fit(self,train, test, to_predict, features):
        best=0
        for i in range(1,self.n_iterations+1):
            features_aux=[]
            if(self.verbose):
                print("Iteration "+str(i))
            for j in range(0,self.n_features):
                features_aux.append(random.choice(features))
            if(i==1):
                features_aux=features
            c=GaussianNB()
            subsample=train.sample(int(round(len(train)*self.subset_sample)))
            c.fit(subsample[features_aux],subsample[to_predict])
            self.naives.append(c)
            self.chosen_features.append(features_aux)
            score=self.roc_auc_score(test,to_predict)
            if(self.verbose):
                print("Score: "+str(score)+" with features "+str(features_aux))
            if(score<=best):
                for k in range(0,self.retries):
                    features_aux=[]
                    self.naives.pop()
                    self.chosen_features.pop()
                    for j in range(0,self.n_features):
                        features_aux.append(random.choice(features))
                    c=GaussianNB()
                    subsample=train.sample(int(round(len(train)*self.subset_sample)))
                    c.fit(subsample[features_aux],subsample[to_predict])
                    self.naives.append(c)
                    self.chosen_features.append(features_aux)
                    score=self.roc_auc_score(test,to_predict)
                    if(self.verbose):
                        print("Retry number "+str(k+1)+" with score: "+str(score)+" and features: "+str(features_aux))
                    if(score>best):
                        break
            if(score<=best):
                self.naives.pop()
                self.chosen_features.pop()
                print("Stopping")
                return;
            best=score
            subsample=0
            gc.collect()
    

In [26]:
postulaciones=0
c=0
gc.collect()

28

In [27]:
c=RandomNaiveBayes(n_iterations=10,n_features=5,retries=3,subset_sample=1,verbose=True)
#c = joblib.load("../../Data/fiuba_entrenamiento/gian/modelos/....pkl")
dt=c.fit(train,test,'sepostulo',features)

Iteration 1
Score: 0.8793862734770148 with features ['nombre_area', 'edad', 'sexo_FEM', 'sexo_MASC', 'Doctorado_En Curso', 'Doctorado_Graduado', 'Master_En Curso', 'Master_Graduado', 'Posgrado_En Curso', 'Posgrado_Graduado', 'Secundario_En Curso', 'Secundario_Graduado', 'Terciario/Técnico_En Curso', 'Terciario/Técnico_Graduado', 'Universitario_En Curso', 'Universitario_Graduado', 'pide_hombre', 'pide_mujer', 'ingles', 'experiencia', 'paquete_office', 'liderazgo', 'secundario', 'viajar', 'empresa_importante', 'capacitacion', 'remuneracion_pretendida', 'graduados', 'estudiantes', 'conocimientos_especificos', 'vistas_postulante', 'vistas_al_aviso', 'cantidad', 'postulaciones_misma_area', 'postulaciones_mismo_tipo', 'postulaciones_mismo_nivel', 'lo_vio', 'vistas_por_area', 'vistas_por_nivel', 'vistas_por_tipo']
Iteration 2
Score: 0.8993307762592352 with features ['capacitacion', 'Terciario/Técnico_Graduado', 'postulaciones_misma_area', 'cantidad', 'Master_En Curso']
Iteration 3
Score: 0.89

In [28]:
y_test=test['sepostulo']
score=roc_auc_score(y_test, c.predict_proba_class(1,test))
print(score)

0.8993307762592352


In [80]:
joblib.dump(c, "../../Data/fiuba_entrenamiento/gian/modelos/RandomNaiveBayes.pkl")

['../../Data/fiuba_entrenamiento/gian/modelos/RandomNaiveBayes.pkl']

In [29]:
prediccion = pd.read_csv("../../Data/fiuba_entrenamiento/test_final_100k.csv")
print(len(prediccion))
prediccion=pd.merge(prediccion,postulantes,on='idpostulante',how='inner')
print(len(prediccion))
prediccion=pd.merge(prediccion,avisos,on='idaviso',how='inner')
print(len(prediccion))
prediccion=pd.merge(prediccion,avisos_no_one_hot,on='idaviso',how='inner')
print(len(prediccion))
prediccion=pd.merge(prediccion,vistas,on=['idpostulante'],how='left')
prediccion=pd.merge(prediccion,vistas2,on=['idaviso','idpostulante'],how='left')
prediccion=pd.merge(prediccion,vistas3,on=['idaviso','idpostulante'],how='left')
prediccion.cantidad=prediccion.cantidad.fillna(0)
prediccion['vistas_postulante']=prediccion['vistas_postulante'].fillna(0)
prediccion['vistas_al_aviso']=prediccion['vistas_al_aviso'].fillna(0)
prediccion['lo_vio']=(prediccion['vistas_al_aviso']>0)
prediccion=pd.merge(prediccion,postulaciones_por_area,on=['idpostulante','nombre_area_aux'],how='left')
prediccion=pd.merge(prediccion,postulaciones_por_tipo,on=['idpostulante','tipo_de_trabajo'],how='left')
prediccion=pd.merge(prediccion,postulaciones_por_nivel,on=['idpostulante','nivel_laboral'],how='left')
prediccion=pd.merge(prediccion,vistas_por_area,on=['idpostulante','nombre_area_aux'],how='left')
prediccion=pd.merge(prediccion,vistas_por_tipo,on=['idpostulante','tipo_de_trabajo'],how='left')
prediccion=pd.merge(prediccion,vistas_por_nivel,on=['idpostulante','nivel_laboral'],how='left')
prediccion['vistas_por_area']=prediccion['vistas_por_area'].fillna(0)
prediccion['vistas_por_tipo']=prediccion['vistas_por_tipo'].fillna(0)
prediccion['vistas_por_nivel']=prediccion['vistas_por_tipo'].fillna(0)
prediccion['postulaciones_misma_area']=prediccion['postulaciones_misma_area'].fillna(0)
prediccion['postulaciones_mismo_nivel']=prediccion['postulaciones_mismo_nivel'].fillna(0)
prediccion['postulaciones_mismo_tipo']=prediccion['postulaciones_mismo_tipo'].fillna(0)
prediccion.sample(10)

100000
100000
100000
100000


Unnamed: 0,id,idaviso,idpostulante,edad,sexo_FEM,sexo_MASC,sexo_NO_DECLARA,Doctorado_Abandonado,Doctorado_En Curso,Doctorado_Graduado,...,vistas_postulante,vistas_al_aviso,cantidad,lo_vio,postulaciones_misma_area,postulaciones_mismo_tipo,postulaciones_mismo_nivel,vistas_por_area,vistas_por_tipo,vistas_por_nivel
15852,67919,1112460519,4rNzLje,24.0,True,False,False,False,False,False,...,470.0,0.0,0.0,False,148.0,998.0,781.0,45.0,281.0,281.0
66805,12879,1112029048,eXQrE,42.0,True,False,False,False,False,False,...,7.0,0.0,0.0,False,0.0,6.0,5.0,0.0,7.0,7.0
95652,95124,1112487453,eaPxOE,36.0,True,False,False,False,False,False,...,22.0,0.0,0.0,False,0.0,11.0,8.0,0.0,22.0,22.0
2498,13048,1112033906,BmZ0KKZ,21.0,True,False,False,False,False,False,...,25.0,3.0,0.12,True,4.0,7.0,10.0,4.0,21.0,21.0
87932,42529,1112423624,BmDqe5v,52.0,False,True,False,False,False,False,...,151.0,3.0,0.019868,True,11.0,12.0,12.0,101.0,104.0,104.0
41968,45130,1112427944,88jxlM,51.0,True,False,False,False,False,False,...,7.0,0.0,0.0,False,4.0,59.0,47.0,1.0,3.0,3.0
75630,95189,1112487778,6vNmmx,35.0,False,True,False,False,False,False,...,568.0,0.0,0.0,False,0.0,507.0,317.0,0.0,523.0,523.0
90057,54484,1112446563,5Ao4Lv,27.0,True,False,False,False,False,False,...,4.0,0.0,0.0,False,0.0,26.0,18.0,0.0,4.0,4.0
55666,5457,1111556156,NAYvlz,36.0,False,True,False,False,False,False,...,0.0,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0
90559,58772,1112453056,1ljw8Q,40.0,True,False,False,False,False,False,...,5.0,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
y_final=c.predict_proba_class(1,prediccion)
y_final

[0.132918947719328,
 0.3457866879161412,
 0.13291896130806002,
 0.13291941814139085,
 0.13291925474752578,
 0.13291904632724863,
 0.17590538201403086,
 0.2525880801719168,
 0.17590538178332932,
 0.13292019303716368,
 0.1329189753438674,
 0.13292006401048412,
 0.13292104976334645,
 0.1759054725158645,
 0.25258812570071104,
 0.17590533152760263,
 0.13291893166550595,
 0.13498506898496315,
 0.1742385142224661,
 0.1329189377799758,
 0.13291893783970987,
 0.13291891446100096,
 0.13291891450019228,
 0.13291896078863072,
 0.13292522780211585,
 0.13291898064055477,
 0.1349853163906291,
 0.13292043058758696,
 0.1329189862308479,
 0.13291895945986446,
 0.13291895594655087,
 0.1329189569237798,
 0.38398796830398446,
 0.132947327812292,
 0.2025754456233811,
 0.20257542548910426,
 0.1347772458283159,
 0.2125131911716134,
 0.1382949529057418,
 0.2923694334880309,
 0.13291894343815144,
 0.13292672540033934,
 0.2025754054306189,
 0.5521918388918206,
 0.13291898035814242,
 0.6329173065940616,
 0.132918

In [31]:
prediccion['sepostulo']=y_final

In [32]:
prediccion.head()

Unnamed: 0,id,idaviso,idpostulante,edad,sexo_FEM,sexo_MASC,sexo_NO_DECLARA,Doctorado_Abandonado,Doctorado_En Curso,Doctorado_Graduado,...,vistas_al_aviso,cantidad,lo_vio,postulaciones_misma_area,postulaciones_mismo_tipo,postulaciones_mismo_nivel,vistas_por_area,vistas_por_tipo,vistas_por_nivel,sepostulo
0,0,739260,6M9ZQR,42.0,True,False,False,False,False,True,...,0.0,0.0,False,0.0,2.0,0.0,4.0,24.0,24.0,0.132919
1,1,739260,6v1xdL,30.0,False,True,False,False,False,False,...,0.0,0.0,False,5.0,44.0,0.0,0.0,6.0,6.0,0.345787
2,2,739260,ezRKm9,36.0,True,False,False,False,False,False,...,0.0,0.0,False,0.0,1.0,0.0,1.0,7.0,7.0,0.132919
3,3,758580,1Q35ej,68.0,False,True,False,False,False,False,...,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,0.132919
4,4,758580,EAN4J6,32.0,True,False,False,False,False,False,...,0.0,0.0,False,0.0,1.0,0.0,0.0,3.0,3.0,0.132919


In [33]:
prediccion=prediccion[['id','sepostulo']]
prediccion.head()

Unnamed: 0,id,sepostulo
0,0,0.132919
1,1,0.345787
2,2,0.132919
3,3,0.132919
4,4,0.132919


In [34]:
prediccion.to_csv("../../Data/fiuba_entrenamiento/gian/predicciones/prediccion55.csv",index=False)

In [35]:
prediccion.sepostulo.mean()

0.4980318388909141