In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import VotingClassifier, BaggingRegressor, AdaBoostRegressor
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [62]:
train_data = pd.read_csv('Features/train_data.csv',index_col = 0)
test_data = pd.read_csv('Features/test_data.csv',index_col=0)

In [63]:
train_data.columns

Index(['id', 'keyword', 'location', 'text', 'target', 'caracteres_usados',
       'menciones_realizadas', 'permite_location', 'use_keyword', 'cita_url',
       'use_hashtag', 'cant_palabras', 'cant_abreviaciones', 'location_usa',
       'has_emoji', 'has_repeated_letter'],
      dtype='object')

In [64]:
features = ['caracteres_usados',
       'menciones_realizadas', 'permite_location', 'use_keyword', 'cita_url',
       'use_hashtag', 'cant_palabras', 'cant_abreviaciones', 'location_usa',
       'has_emoji', 'has_repeated_letter']

# Creando los algoritmos con los parametros obtenidos con grid-search

In [65]:
xgbRegressor = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=1, learning_rate=0.075,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=95, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=0.8, seed=15, silent=True,
       subsample=0.8)

xgbClassifier = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=1, learning_rate=0.075,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=95, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=0.8, seed=15, silent=True,
       subsample=0.8)

In [66]:
randomForestClassifier = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=9, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=45, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [67]:
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

## Organizando los sets de entrenamiento/test

In [68]:
X = train_data[features]
Y = train_data['target']

X_train,X_test,Y_train,Y_test = train_test_split(X,Y)

In [69]:
kfold = KFold(n_splits=10)

In [70]:
mejor_score = 0
for train, test in kfold.split(X_train, Y_train):
    train_datas = train_data.iloc[train]
    test_datas = train_data.iloc[test]
    
    X_train_data = train_datas[features]
    Y_train_data = train_datas['target']
    
    X_test_data = test_datas[features]
    Y_test_data = test_datas['target']
    
    xgbRegressor.fit(X_train_data,Y_train_data)
    print(roc_auc_score(Y_test_data,xgbRegressor.predict(X_test_data)))
    if (roc_auc_score(Y_test_data, xgbRegressor.predict(X_test_data))>mejor_score):
        mejor_score = roc_auc_score(Y_test_data,xgbRegressor.predict(X_test_data))
        X_train_optimo = X_train_data
        Y_train_optimo = Y_train_data
        X_test_optimo = X_test_data
        Y_test_optimo = Y_test_data

0.6716524946214527
0.6299833297864794
0.6360976148409894
0.7007489065201913
0.6662101313320825
0.6783405834500724
0.7203126954346467
0.672653191856028
0.6219323782527644
0.7184981386545669


## Voting

In [15]:
voting = VotingClassifier(estimators = [('xgb',xgbClassifier),('randomforest',randomForestClassifier),('knn',knn)],voting='soft')

In [16]:
voting.fit(X_train_optimo,Y_train_optimo)

VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=1.0, gamma=1,
                                            learning_rate=0.075,
                                            max_delta_step=0, max_depth=4,
                                            min_child_weight=1, missing=None,
                                            n_estimators=95, n_jobs=1,
                                            nthread=None,
                                            objective='binary:logistic',
                                            random_state=0, reg_alpha=0,
                                            reg_lambda=1, scale_pos_wei...
                                                     min_samples_leaf=1,
                                        

In [18]:
predic = voting.predict_proba(test_data[features])

In [20]:
predic = pd.DataFrame(predic)

In [22]:
test_data['prediction'] = predic[1]

In [75]:
test_data = test_data[['id','prediction']]
test_data = test_data.rename(columns={'prediction':'target'})

In [76]:
test_data['target'] = test_data['target'].transform(lambda x: 0.0 if x<0.5 else 1.0)

In [39]:
test_data.to_csv('submit_kaggle_voting.csv',index=False)

## Bagging

In [49]:
bagging = BaggingRegressor(base_estimator=xgbRegressor,n_estimators=95)

In [50]:
bagging.fit(X_train_optimo,Y_train_optimo)

BaggingRegressor(base_estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                             colsample_bylevel=1,
                                             colsample_bynode=1,
                                             colsample_bytree=1.0, gamma=1,
                                             importance_type='gain',
                                             learning_rate=0.075,
                                             max_delta_step=0, max_depth=4,
                                             min_child_weight=1, missing=None,
                                             n_estimators=95, n_jobs=1,
                                             nthread=None,
                                             objective='binary:logistic',
                                             random_state=0, reg_alpha=0,
                                             reg_lambda=1, scale_pos_weight=0.8,
                                             seed=15, silent=Tru

In [52]:
roc_auc_score(Y_test_optimo,bagging.predict(X_test_optimo))

0.7235334584115072

In [53]:
test_data['prediction'] = bagging.predict(test_data[features])

In [60]:
test_data.to_csv('submit_kaggle_bagging.csv',index=False)

## AdaBoost

In [61]:
adaboost = AdaBoostRegressor(base_estimator=xgbRegressor,n_estimators=95,learning_rate=0.075)

In [71]:
adaboost.fit(X_train_optimo,Y_train_optimo)

AdaBoostRegressor(base_estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1.0, gamma=1,
                                              importance_type='gain',
                                              learning_rate=0.075,
                                              max_delta_step=0, max_depth=4,
                                              min_child_weight=1, missing=None,
                                              n_estimators=95, n_jobs=1,
                                              nthread=None,
                                              objective='binary:logistic',
                                              random_state=0, reg_alpha=0,
                                              reg_lambda=1,
                                              scale_pos_weight=0.8, seed=

In [72]:
roc_auc_score(Y_test_optimo,adaboost.predict(X_test_optimo))

0.7259599749843653

In [73]:
test_data['prediction'] = adaboost.predict(test_data[features])

In [77]:
test_data.to_csv('submit_kaggle_adaBoost.csv',index=False)