In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_csv('Features/train_data.csv', index_col=0)
test_data = pd.read_csv('Features/test_data.csv', index_col=0)

In [3]:
train_data.columns

Index(['id', 'keyword', 'location', 'text', 'target', 'caracteres_usados',
       'menciones_realizadas', 'permite_location', 'use_keyword', 'cita_url',
       'use_hashtag', 'cant_palabras', 'cant_abreviaciones', 'location_usa',
       'has_emoji', 'has_repeated_letter'],
      dtype='object')

In [4]:
features = ['caracteres_usados',
       'menciones_realizadas', 'permite_location', 'use_keyword', 'cita_url',
       'use_hashtag', 'cant_palabras', 'cant_abreviaciones', 'location_usa',
       'has_emoji', 'has_repeated_letter']

In [5]:
randomForest_KFold = RandomForestClassifier()

X = np.array(train_data[features])
y = np.array(train_data['target'])

X_train,X_test,Y_train,Y_test = train_test_split(X,y)

In [6]:
kfold = KFold(n_splits=10)

In [7]:
mejor_score = 0
for train, test in kfold.split(X_train, Y_train):
    train_datas = train_data.iloc[train]
    test_datas = train_data.iloc[test]
    
    X_train_data = train_datas[features]
    Y_train_data = train_datas['target']
    
    X_test_data = test_datas[features]
    Y_test_data = test_datas['target']
    
    randomForest_KFold.fit(X=X_train_data,y=Y_train_data)
    print(roc_auc_score(Y_test_data,randomForest_KFold.predict(X_test_data)))
    if (roc_auc_score(Y_test_data, randomForest_KFold.predict(X_test_data))>mejor_score):
        mejor_score = roc_auc_score(Y_test_data,randomForest_KFold.predict(X_test_data))
        X_train_optimo = X_train_data
        Y_train_optimo = Y_train_data
        X_test_optimo = X_test_data
        Y_test_optimo = Y_test_data

0.6073404364306936
0.5442736043129744
0.5856461032587359
0.5908223985352457
0.5604440275171982
0.6076089360760895
0.6227454659161976
0.5855880253083532
0.5699200694604051
0.6370564420100024


In [8]:
randomForest_KFold.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [9]:
params={'n_estimators':list(range(10,50)),'max_features':list(range(1,11))}

In [10]:
cv = GridSearchCV(estimator=randomForest_KFold,param_grid=params,scoring='roc_auc',cv=5)
cv.fit(X_train_optimo,Y_train_optimo)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [11]:
cv.best_score_

0.6355533458065767

In [12]:
randomForest = cv.best_estimator_

In [13]:
randomForest.fit(X_train_optimo,Y_train_optimo)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=45,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [14]:
predict = randomForest.predict_proba(X_test_optimo)

In [18]:
predict = pd.DataFrame(predict)

In [20]:
test_data['prediction'] = predict[1]

In [21]:
test_data = test_data[['id','prediction']]
test_data = test_data.rename(columns={'prediction':'target'})

In [22]:
test_data['target'] = test_data['target'].transform(lambda x: 0.0 if x<0.5 else 1.0)

In [24]:
test_data.to_csv('submit_kaggle_RF.csv',index=False)

## Gradient Boosting Classifier


In [129]:
from sklearn.ensemble import GradientBoostingRegressor

In [130]:
gbc = GradientBoostingRegressor()

In [131]:
gbc.fit(X_train,Y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [132]:
gbc.score(X_test,Y_test)

0.12153655092569604

In [133]:
gbc.score(X_train,Y_train)

0.16988892711941872

In [134]:
test_data['prediction'] = gbc.predict(test_data[features])

In [135]:
test_data['prediction'] = test_data['prediction'].transform(lambda x: 0 if x < 0.5 else x)
test_data['prediction'] = test_data['prediction'].transform(lambda x: 1 if x>= 0.5 else x)

In [136]:
test_data = test_data[['id','prediction']]

In [137]:
test_data = test_data.rename(columns={'prediction':'target'})

In [75]:
#test_data['target']=test_data['target'].transform(lambda x: 0 if x==0.0 else 1)

In [139]:
test_data.to_csv('submit_kaggle_RF_GBC.csv',index=False)

In [138]:
test_data

Unnamed: 0,id,target
0,0,0.0
1,2,1.0
2,3,1.0
3,9,0.0
4,11,1.0
...,...,...
3258,10861,1.0
3259,10865,1.0
3260,10868,1.0
3261,10874,1.0
