In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import gc
from lightgbm import LGBMClassifier



SAMPLE_SIZE = 0.5
%matplotlib inline

In [74]:
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB
import random
import gc
class RandomLightGBM:

    
    def __init__(self,  n_iterations, n_features, retries, subset_sample, verbose, meta_parameters = []):
        self.n_iterations=n_iterations
        self.n_features=n_features
        self.retries=retries
        self.subset_sample=subset_sample
        self.verbose=verbose
        self.LGBMs = []
        self.chosen_features=[]
        self.meta_parameters = meta_parameters
    
    
    def roc_auc_score(self, test, to_predict):
        y_test=test[to_predict]
        return roc_auc_score(y_test, self.predict_proba_class(1,test))
    
    def predict_proba_class(self,number,data):
        n_models = len(self.LGBMs)
        if(n_models == 0):
            raise Exception('No naive bayes has been trained.')
        preds=[]
        for i in range(0,n_models):
            y=self.LGBMs[i].predict_proba(data[self.chosen_features[i]])
            y_pred=[]
            for elem in y:
                y_pred.append(elem[number])
            preds.append(y_pred)
            
        y_pred=[sum(x) for x in zip(*preds)]
        y_pred=[i / n_models for i in y_pred]
        return y_pred
    
    def score_iteration(self,train,features,to_predict):
        #TODO: Configurar para pasarle metaparametros
        c=LGBMClassifier(learning_rate=0.05,objective='binary',num_leaves=300,max_depth=9,n_estimators=1000,colsample_bytree=0.8,n_jobs=-1,random_state=0,silent=False,subsample=0.8,sumsample_freq=0.5)
        subsample=train.sample(int(round(len(train)*self.subset_sample)))
        c.fit(subsample[features],subsample[to_predict])
        self.LGBMs.append(c)
        self.chosen_features.append(features)
        return self.roc_auc_score(test,to_predict)
        
    def fit(self,train, test, to_predict, features):
        best=0
        
        #Por cada iteracion, entrena un clasificador LGBM con los meta parametros, un subsample aleatorio del set de entrenamiento
        #de tamano subsample y compara los scores
        for i in range(1,self.n_iterations+1):
            features_aux=[]
            for j in range(0,self.n_features):
                feat = random.choice(features)
                if(feat not in features_aux): features_aux.append(feat)
               
            if(self.verbose):
                print("Iteration "+str(i)+" with features "+str(features_aux))
            score = self.score_iteration(train,features_aux,to_predict)
            if(self.verbose):
                print("Score: "+str(score)+" with features "+str(features_aux))
           
            if(score<=best):
                for k in range(0,self.retries):
                    features_aux=[]
                    self.LGBMs.pop()
                    self.chosen_features.pop()
                    for j in range(0,self.n_features*i):
                        feat = random.choice(features)
                        if(feat not in features_aux): features_aux.append(feat)
                    score = self.score_iteration(train,features_aux,to_predict)
                    if(self.verbose):
                        print("Retry number "+str(k+1)+" with score: "+str(score)+" and features: "+str(features_aux))
                    if(score>best):
                        break
            if(score<=best):
                self.LGBMs.pop()
                self.chosen_features.pop()
                print("Stopping")
                return;
            best=score
            subsample=0
            gc.collect()

In [75]:
avisos = pd.read_csv("Data/fiuba_entrenamiento/pocho/avisos.csv")
postulaciones = pd.read_csv("Data/fiuba_entrenamiento/pocho/postulaciones.csv")
postulantes = pd.read_csv("Data/fiuba_entrenamiento/pocho/postulantes.csv")

postulantes['sexo']=postulantes['sexo'].astype('category')
postulantes['Doctorado']=postulantes['Doctorado'].astype('category')
postulantes['Master']=postulantes['Master'].astype('category')
postulantes['Otro']=postulantes['Otro'].astype('category')
postulantes['Posgrado']=postulantes['Posgrado'].astype('category')
postulantes['Secundario']=postulantes['Secundario'].astype('category')
postulantes['Terciario/Técnico']=postulantes['Terciario/Técnico'].astype('category')
postulantes['Universitario']=postulantes['Universitario'].astype('category')

avisos['nombre_zona']=avisos['nombre_zona'].astype('category')
avisos['tipo_de_trabajo']=avisos['tipo_de_trabajo'].astype('category')
avisos['nivel_laboral']=avisos['nivel_laboral'].astype('category')
avisos['nombre_area']=avisos['nombre_area'].astype('category')

vistas = pd.read_csv("Data/fiuba_entrenamiento/pocho/vistas.csv")

postulaciones['sepostulo']=1
postulaciones.drop(['fechapostulacion','dia','mes','semana','diadelasemana','hora'],axis=1,inplace=True)

In [76]:
vistas['cantidad']=0
vistas2=vistas.groupby(['idaviso','idpostulante'],as_index=False).agg({'cantidad':'count'})
vistas2.head()

Unnamed: 0,idaviso,idpostulante,cantidad
0,18,BolNL,1
1,48375,RwVdKR,1
2,169730,1KjXB,1
3,169730,2AKzxa,1
4,169730,6LJ64,1


In [77]:
vistas=vistas.groupby(['idpostulante'],as_index=False).agg({'cantidad':'count'});
vistas.head()

Unnamed: 0,idpostulante,cantidad
0,0002q,3
1,0005E,25
2,000R8,4
3,001XE,12
4,003k9,26


In [78]:
vistas3=pd.merge(vistas2,vistas,on='idpostulante',how='left')
vistas3.head()

Unnamed: 0,idaviso,idpostulante,cantidad_x,cantidad_y
0,18,BolNL,1,20
1,48375,RwVdKR,1,10
2,169730,1KjXB,1,34
3,169730,2AKzxa,1,1
4,169730,6LJ64,1,10


In [79]:
vistas3['cantidad']=vistas3['cantidad_x']/vistas3['cantidad_y']
vistas3.drop(['cantidad_x','cantidad_y'],axis=1,inplace=True)
vistas3.head()

Unnamed: 0,idaviso,idpostulante,cantidad
0,18,BolNL,0.05
1,48375,RwVdKR,0.1
2,169730,1KjXB,0.029412
3,169730,2AKzxa,1.0
4,169730,6LJ64,0.1


In [80]:
vistas=vistas.rename(columns={'cantidad': 'vistas_postulante'})
vistas2=vistas2.rename(columns={'cantidad': 'vistas_al_aviso'})
vistas_final = pd.merge(vistas3,avisos, on= "idaviso", how = "left")
vistas_final.head()

Unnamed: 0,idaviso,idpostulante,cantidad,titulo,nombre_zona,tipo_de_trabajo,nivel_laboral,nombre_area,denominacion_empresa,titulo_uppercase,...,estudiantes,conocimientos_especificos,buen_ambiente,indica_sueldo,part_time,full_time,ofrece_crecimiento,ofrece_beneficios,obra_social,multinacional
0,18,BolNL,0.05,,,,,,,,...,,,,,,,,,,
1,48375,RwVdKR,0.1,,,,,,,,...,,,,,,,,,,
2,169730,1KjXB,0.029412,,,,,,,,...,,,,,,,,,,
3,169730,2AKzxa,1.0,,,,,,,,...,,,,,,,,,,
4,169730,6LJ64,0.1,,,,,,,,...,,,,,,,,,,


In [82]:
no_postulaciones = pd.read_csv("Data/fiuba_entrenamiento/pocho/no-postulaciones/azar_total.csv")
postulaciones=postulaciones.append(no_postulaciones, ignore_index=True)
postulaciones.drop_duplicates(['idaviso','idpostulante'],keep='first',inplace=True)
postulaciones = postulaciones.sample(int(round(len(postulaciones)*SAMPLE_SIZE)))
no_postulaciones = 0
gc.collect()

945

In [83]:
vistas_final["vistas_por_area"]=0
vistas_final["vistas_por_tipo"]=0
vistas_final["vistas_por_nivel"]=0

vistas_area = vistas_final.groupby(['idpostulante','nombre_area'],as_index=False).agg({'vistas_por_area':'count'})
vistas_tipo= vistas_final.groupby(['idpostulante','tipo_de_trabajo'],as_index=False).agg({'vistas_por_tipo':'count'})
vistas_nivel=vistas_final.groupby(['idpostulante','nivel_laboral'],as_index=False).agg({'vistas_por_nivel':'count'})

vistas_final = 0
gc.collect()

126

In [84]:
postulaciones=pd.merge(postulaciones,postulantes,on='idpostulante')
postulaciones=pd.merge(postulaciones,avisos,on='idaviso')
postulaciones=pd.merge(postulaciones,vistas,on=['idpostulante'],how='left')
postulaciones=pd.merge(postulaciones,vistas2,on=['idaviso','idpostulante'],how='left')
postulaciones=pd.merge(postulaciones,vistas3,on=['idaviso','idpostulante'],how='left')
postulaciones=pd.merge(postulaciones,vistas_area,on=['idpostulante','nombre_area'],how='left')
postulaciones=pd.merge(postulaciones,vistas_tipo,on=['idpostulante','tipo_de_trabajo'],how='left')
postulaciones=pd.merge(postulaciones,vistas_nivel,on=['idpostulante','nivel_laboral'],how='left')
postulaciones['vistas_por_area']=postulaciones['vistas_por_area'].fillna(0)
postulaciones['vistas_por_tipo']=postulaciones['vistas_por_tipo'].fillna(0)
postulaciones['vistas_por_nivel']=postulaciones['vistas_por_tipo'].fillna(0)
postulaciones["vistas_al_aviso"] = postulaciones["vistas_al_aviso"].fillna(0)
postulaciones['lo_vio']=(postulaciones['vistas_al_aviso']>0)


postulaciones.sample(20)

Unnamed: 0,idaviso,idpostulante,sepostulo,sexo,edad,Doctorado,Master,Otro,Posgrado,Secundario,...,ofrece_beneficios,obra_social,multinacional,vistas_postulante,vistas_al_aviso,cantidad,vistas_por_area,vistas_por_tipo,vistas_por_nivel,lo_vio
1988416,1112366912,YNaqNw,0,MASC,45.0,-,-,-,-,-,...,False,False,False,39.0,0.0,,0.0,29.0,29.0,False
3143536,1112393194,6rPJ3oj,1,MASC,19.0,-,-,-,-,Graduado,...,False,False,True,9.0,0.0,,1.0,2.0,2.0,False
5164511,1111947372,ekOOZd8,1,MASC,29.0,-,-,-,-,-,...,False,False,True,52.0,0.0,,1.0,31.0,31.0,False
5864248,1112206351,4rPRr2e,0,MASC,29.0,-,-,-,-,Graduado,...,False,False,False,10.0,0.0,,0.0,10.0,10.0,False
2626116,1112358080,6aGWNv,1,MASC,30.0,-,-,-,-,-,...,False,False,True,39.0,0.0,,4.0,35.0,35.0,False
5140718,1112275218,xkvX6p0,1,MASC,20.0,-,-,-,-,Graduado,...,False,False,False,,0.0,,0.0,0.0,0.0,False
2994678,1112431054,X9x5X08,1,FEM,38.0,-,-,En Curso,-,Graduado,...,True,False,False,148.0,1.0,0.006757,2.0,102.0,102.0,True
4634912,1112445434,96M6d69,0,MASC,57.0,-,-,-,-,-,...,False,False,False,2.0,0.0,,0.0,2.0,2.0,False
1909303,1112283238,3NP1w6X,1,FEM,48.0,-,-,-,-,-,...,False,False,False,5.0,0.0,,2.0,4.0,4.0,False
2542193,1112386877,ZDPm6Kb,1,MASC,27.0,-,-,-,-,Graduado,...,False,False,False,41.0,1.0,0.02439,7.0,39.0,39.0,True


In [85]:
postulaciones_area = postulaciones.groupby(['idpostulante','nombre_area'],as_index=False).agg({'sepostulo':'count'})
postulaciones_tipo= postulaciones.groupby(['idpostulante','tipo_de_trabajo'],as_index=False).agg({'sepostulo':'count'})
postulaciones_nivel =postulaciones.groupby(['idpostulante','nivel_laboral'],as_index=False).agg({'sepostulo':'count'})
postulaciones_area=postulaciones_area.rename(columns={'sepostulo':'postulaciones_misma_area'})
postulaciones_tipo=postulaciones_tipo.rename(columns={'sepostulo':'postulaciones_mismo_tipo'})
postulaciones_nivel=postulaciones_nivel.rename(columns={'sepostulo':'postulaciones_mismo_nivel'})

In [86]:
postulaciones.head()



Unnamed: 0,idaviso,idpostulante,sepostulo,sexo,edad,Doctorado,Master,Otro,Posgrado,Secundario,...,ofrece_beneficios,obra_social,multinacional,vistas_postulante,vistas_al_aviso,cantidad,vistas_por_area,vistas_por_tipo,vistas_por_nivel,lo_vio
0,1112271497,zv642lk,1,FEM,32.0,-,-,-,Graduado,Graduado,...,False,False,False,,0.0,,0.0,0.0,0.0,False
1,1112271497,X9xaNv8,1,FEM,41.0,-,-,-,Graduado,-,...,False,False,False,22.0,0.0,,2.0,22.0,22.0,False
2,1112271497,6rQdqjl,1,MASC,32.0,-,-,-,-,Graduado,...,False,False,False,14.0,0.0,,1.0,14.0,14.0,False
3,1112271497,ar8OeE,1,MASC,38.0,-,-,-,-,-,...,False,False,False,430.0,0.0,,30.0,358.0,358.0,False
4,1112271497,5mraqEZ,1,MASC,28.0,-,-,Graduado,Graduado,Graduado,...,False,False,False,249.0,0.0,,33.0,240.0,240.0,False


In [87]:
postulaciones=pd.merge(postulaciones,postulaciones_area,on=['idpostulante','nombre_area'],how='left')
postulaciones=pd.merge(postulaciones,postulaciones_tipo,on=['idpostulante','tipo_de_trabajo'],how='left')
postulaciones=pd.merge(postulaciones,postulaciones_nivel,on=['idpostulante','nivel_laboral'],how='left')
postulaciones['postulaciones_misma_area']=postulaciones['postulaciones_misma_area'].fillna(0).apply(lambda x: x if(x<=0) else x-1)
postulaciones['postulaciones_mismo_nivel']=postulaciones['postulaciones_mismo_nivel'].fillna(0).apply(lambda x: x if(x<=0) else x-1)
postulaciones['postulaciones_mismo_tipo']=postulaciones['postulaciones_mismo_tipo'].fillna(0).apply(lambda x: x if(x<=0) else x-1)


In [88]:
postulaciones.sample(10)

Unnamed: 0,idaviso,idpostulante,sepostulo,sexo,edad,Doctorado,Master,Otro,Posgrado,Secundario,...,vistas_postulante,vistas_al_aviso,cantidad,vistas_por_area,vistas_por_tipo,vistas_por_nivel,lo_vio,postulaciones_misma_area,postulaciones_mismo_tipo,postulaciones_mismo_nivel
836951,1112344955,LNKvk4P,1,NO_DECLARA,200.0,-,-,-,-,Graduado,...,26.0,1.0,0.038462,1.0,24.0,24.0,True,0,10,4
718056,1112371554,qe24dq9,0,FEM,21.0,-,-,-,-,-,...,,0.0,,0.0,0.0,0.0,False,1,4,2
1569298,1112444243,A3X1Rw1,1,MASC,31.0,-,-,-,-,Graduado,...,64.0,0.0,,0.0,44.0,44.0,False,1,31,7
3550848,1112298991,Nzrb8q4,0,MASC,36.0,-,-,-,-,Graduado,...,18.0,0.0,,0.0,18.0,18.0,False,0,7,3
5511037,1112157764,eMG8A4,0,MASC,30.0,-,-,-,-,Graduado,...,2.0,0.0,,0.0,1.0,1.0,False,0,11,5
6118090,1112207320,5mkJvNw,0,MASC,35.0,-,-,-,-,Graduado,...,1.0,0.0,,0.0,1.0,1.0,False,0,6,1
3421588,1112437934,lDLK053,0,FEM,24.0,-,-,Graduado,-,Graduado,...,,0.0,,0.0,0.0,0.0,False,0,8,2
4375593,1112308845,lDLRWq6,0,MASC,19.0,-,-,-,-,En Curso,...,1.0,0.0,,0.0,1.0,1.0,False,0,9,3
5130717,1112241117,vVkxYVX,1,MASC,24.0,-,-,-,-,Graduado,...,,0.0,,0.0,0.0,0.0,False,10,81,63
1806787,1112343977,jk6Br4e,1,FEM,47.0,-,-,-,-,Graduado,...,26.0,0.0,,9.0,16.0,16.0,False,0,10,6


In [89]:
train, test=train_test_split(postulaciones,test_size=0.10)
print("Train: ",len(train),"Test: ",len(test))
features=list(postulaciones.columns)
features.remove('idaviso')
features.remove('idpostulante')
features.remove('sepostulo')
features.remove('titulo')
features.remove('denominacion_empresa')

x_train=train[features]
y_train=train['sepostulo']

x_test=test[features]
y_test=test['sepostulo']

postulaciones=0

gc.collect()

Train:  5754323 Test:  639370


98

In [90]:
model = RandomLightGBM(n_iterations=5,n_features=15,retries=3,subset_sample=0.3,verbose=True)
dt = model.fit(train,test,'sepostulo',features)

Iteration 1 with features ['vistas_por_area', 'liderazgo', 'vistas_por_nivel', 'titulo_uppercase', 'nombre_area', 'ingles', 'supera_largo_medio', 'sexo', 'postulaciones_mismo_tipo', 'largo_descripcion', 'estudiantes', 'buen_ambiente', 'pide_mujer']
Score: 0.955272660975176 with features ['vistas_por_area', 'liderazgo', 'vistas_por_nivel', 'titulo_uppercase', 'nombre_area', 'ingles', 'supera_largo_medio', 'sexo', 'postulaciones_mismo_tipo', 'largo_descripcion', 'estudiantes', 'buen_ambiente', 'pide_mujer']
Iteration 2 with features ['tipo_de_trabajo', 'pide_mujer', 'Secundario', 'postulaciones_misma_area', 'vistas_por_area', 'secundario', 'empresa_importante', 'postulaciones_mismo_nivel', 'pide_excel', 'sexo', 'Posgrado', 'part_time', 'viajar']
Score: 0.9514566263993018 with features ['tipo_de_trabajo', 'pide_mujer', 'Secundario', 'postulaciones_misma_area', 'vistas_por_area', 'secundario', 'empresa_importante', 'postulaciones_mismo_nivel', 'pide_excel', 'sexo', 'Posgrado', 'part_time',

In [91]:
score=roc_auc_score(y_test, model.predict_proba_class(1,test))
print(score)


0.9734617505869821


In [92]:
joblib.dump(model, "Data/fiuba_entrenamiento/pocho/modelos/RandomLGBM6.pkl")

['Data/fiuba_entrenamiento/pocho/modelos/RandomLGBM6.pkl']

In [93]:
vistas3.head()

Unnamed: 0,idaviso,idpostulante,cantidad
0,18,BolNL,0.05
1,48375,RwVdKR,0.1
2,169730,1KjXB,0.029412
3,169730,2AKzxa,1.0
4,169730,6LJ64,0.1


In [94]:
prediccion = pd.read_csv("Data/fiuba_entrenamiento/test_final_100k.csv")
prediccion = pd.merge(prediccion,postulantes,on='idpostulante',how='inner')
prediccion = pd.merge(prediccion,avisos,on='idaviso',how='inner')
prediccion = pd.merge(prediccion,vistas,on='idpostulante',how='left')
prediccion.sample(10)


prediccion=pd.merge(prediccion,vistas2,on=['idaviso','idpostulante'],how='left')
prediccion=pd.merge(prediccion,vistas3,on=['idaviso','idpostulante'],how='left')
prediccion['cantidad']=prediccion['cantidad'].fillna(0)
prediccion['vistas_postulante']=prediccion['vistas_postulante'].fillna(0)
prediccion['vistas_al_aviso']=prediccion['vistas_al_aviso'].fillna(0)
prediccion['lo_vio']=(prediccion['vistas_al_aviso']>0)

prediccion=pd.merge(prediccion,postulaciones_area,on=['idpostulante','nombre_area'],how='left')
prediccion=pd.merge(prediccion,postulaciones_tipo,on=['idpostulante','tipo_de_trabajo'],how='left')
prediccion=pd.merge(prediccion,postulaciones_nivel,on=['idpostulante','nivel_laboral'],how='left')
prediccion=pd.merge(prediccion,vistas_area,on=['idpostulante','nombre_area'],how='left')
prediccion=pd.merge(prediccion,vistas_tipo,on=['idpostulante','tipo_de_trabajo'],how='left')
prediccion=pd.merge(prediccion,vistas_nivel,on=['idpostulante','nivel_laboral'],how='left')
prediccion['vistas_por_area']=prediccion['vistas_por_area'].fillna(0)
prediccion['vistas_por_tipo']=prediccion['vistas_por_tipo'].fillna(0)
prediccion['vistas_por_nivel']=prediccion['vistas_por_tipo'].fillna(0)
prediccion['postulaciones_misma_area']=prediccion['postulaciones_misma_area'].fillna(0)
prediccion['postulaciones_mismo_nivel']=prediccion['postulaciones_mismo_nivel'].fillna(0)
prediccion['postulaciones_mismo_tipo']=prediccion['postulaciones_mismo_tipo'].fillna(0)
prediccion.sample(10)

Unnamed: 0,id,idaviso,idpostulante,sexo,edad,Doctorado,Master,Otro,Posgrado,Secundario,...,vistas_postulante,vistas_al_aviso,cantidad,lo_vio,postulaciones_misma_area,postulaciones_mismo_tipo,postulaciones_mismo_nivel,vistas_por_area,vistas_por_tipo,vistas_por_nivel
76506,22749,1112296264,8894zz,MASC,39.0,-,-,-,-,Graduado,...,14.0,0.0,0.0,False,0.0,8.0,1.0,0.0,14.0,14.0
11115,25912,1112334809,8XoxzR,MASC,31.0,-,-,-,-,Graduado,...,5.0,0.0,0.0,False,0.0,3.0,2.0,0.0,5.0,5.0
48116,63026,1112456971,avao4W,FEM,29.0,-,-,En Curso,-,Graduado,...,189.0,1.0,0.005291,True,23.0,184.0,130.0,20.0,172.0,172.0
39724,50478,1112439336,RzrmGbq,MASC,23.0,-,-,Graduado,-,Graduado,...,1.0,0.0,0.0,False,16.0,23.0,16.0,1.0,1.0,1.0
1514,86524,1112466769,BmaVDME,FEM,31.0,-,-,-,-,-,...,1.0,0.0,0.0,False,9.0,33.0,7.0,0.0,1.0,1.0
38004,21989,1112280954,6QQNr,MASC,43.0,-,Graduado,-,-,Graduado,...,3.0,0.0,0.0,False,0.0,12.0,1.0,0.0,2.0,2.0
88380,44793,1112427197,6jOrpr,FEM,47.0,-,-,-,-,-,...,1.0,0.0,0.0,False,0.0,9.0,8.0,0.0,1.0,1.0
25606,20912,1112264647,ZW0208,MASC,32.0,-,-,-,-,Graduado,...,24.0,1.0,0.041667,True,0.0,24.0,17.0,1.0,15.0,15.0
60615,8784,1111793352,NAlOm4,FEM,36.0,-,-,-,-,-,...,20.0,0.0,0.0,False,2.0,7.0,5.0,0.0,0.0,0.0
80395,45694,1112428575,8GEYj,MASC,43.0,-,-,-,-,Graduado,...,0.0,0.0,0.0,False,0.0,8.0,7.0,0.0,0.0,0.0


In [95]:
y_final=model.predict_proba_class(1,prediccion)
prediccion['sepostulo']=y_final
prediccion.head()


Unnamed: 0,id,idaviso,idpostulante,sexo,edad,Doctorado,Master,Otro,Posgrado,Secundario,...,vistas_al_aviso,cantidad,lo_vio,postulaciones_misma_area,postulaciones_mismo_tipo,postulaciones_mismo_nivel,vistas_por_area,vistas_por_tipo,vistas_por_nivel,sepostulo
0,0,739260,6M9ZQR,FEM,42.0,Graduado,-,-,-,Graduado,...,0.0,0.0,False,0.0,8.0,1.0,3.0,20.0,20.0,0.319913
1,1,739260,6v1xdL,MASC,30.0,-,-,Graduado,-,-,...,0.0,0.0,False,2.0,30.0,0.0,0.0,3.0,3.0,0.402862
2,2,739260,ezRKm9,FEM,36.0,-,-,-,-,Graduado,...,0.0,0.0,False,0.0,5.0,0.0,1.0,6.0,6.0,0.158007
3,3,758580,1Q35ej,MASC,68.0,-,-,-,Graduado,Graduado,...,0.0,0.0,False,0.0,6.0,0.0,0.0,0.0,0.0,0.098621
4,4,758580,EAN4J6,FEM,32.0,-,-,-,-,-,...,0.0,0.0,False,1.0,10.0,0.0,0.0,1.0,1.0,0.211196


In [96]:
prediccion=prediccion[['id','sepostulo']]
print(prediccion.sepostulo.mean())
prediccion.head()

0.5409887203457424


Unnamed: 0,id,sepostulo
0,0,0.319913
1,1,0.402862
2,2,0.158007
3,3,0.098621
4,4,0.211196


In [97]:
prediccion.to_csv("Data/fiuba_entrenamiento/pocho/predicciones/pred-rlgbm5.csv",index=False)