In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import auc

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Reading the Data

train_data = pd.read_csv(r'../input/edvancer-project-1-consumer-services/Consumer_Complaints_train.csv')
test_data = pd.read_csv(r'../input/edvancer-project-1-consumer-services/Consumer_Complaints_test_share.csv')

In [3]:
y = train_data['Consumer disputed?']
x = train_data.drop('Consumer disputed?',1)
x_test = test_data


y = y.replace(('Yes','No'),(1,0))

In [40]:
x_test.shape[0]

119606

In [None]:
x_train

In [31]:
y.sum()

101431

In [4]:
date_cols = ['Date received','Date sent to company']

drop_cols = ['Complaint ID',
             'ZIP code',
             'Consumer complaint narrative',
             'Tags']


nrml_cols = ['Sub-product','Company','State','Issue',
             'State','Sub-issue',
             'Consumer consent provided?',
             'Company public response']


In [5]:
# dropping columns

x_temp = x.drop(columns = drop_cols,axis=1)
x_test_temp = x_test.drop(columns = drop_cols,axis=1)

In [6]:
# Dealing with datetime formats

for col in date_cols : 
    x_temp[col] = pd.to_datetime(x_temp[col])
    x_test_temp[col] = pd.to_datetime(x_test_temp[col])
    
x_temp['Process_period'] = pd.to_numeric(x_temp['Date sent to company'] - x_temp['Date received'])
x_test_temp['Process_period'] = pd.to_numeric(x_test_temp['Date sent to company'] - x_test_temp['Date received'])

for col in date_cols : 
    x_temp.drop(columns = col,axis = 1,inplace=True)
    x_test_temp.drop(columns = col,axis = 1,inplace=True)

In [7]:
# Imputing missing values

def impute_na(df,cols):
    for col in cols :
        if df[col].dtype=='O' :
            df[col].fillna('missing',inplace=True)
        else:
            df[col].fillna(x_temp.median(),inplace=True)
            
impute_na(x_temp,nrml_cols)
impute_na(x_test_temp,nrml_cols)

In [8]:
# Creating dummy_dummies

def dummy_dummies(df_train,df_test,cols):
    
    list = []
    var_cat_dict = {}
    freq_cutoff = df_train.shape[0]/20
    
    for col in cols:
        k=df_train[col].value_counts()
            
        if(k<=freq_cutoff).sum()==0:
            cats=set(k.index[:1])
        else:
            cats=set(k.index[k>freq_cutoff])
        
        for cat in cats:
            name=col+'_'+str(cat)
            x_temp[name]=(x_temp[col]==str(cat)).astype(int)
            x_test_temp[name]=(x_test_temp[col]==str(cat)).astype(int)
       
    x_temp.drop(columns=cols,axis=1,inplace=True)
    x_test_temp.drop(columns=cols,axis=1,inplace=True)

dummy_dummies(x_temp,x_test_temp,nrml_cols)

In [9]:
# getting dummies for the rest object columns

dummy_cols = x_temp.select_dtypes('O').columns

pd.get_dummies(x_temp,dummy_cols,drop_first=True)
pd.get_dummies(x_test_temp,dummy_cols,drop_first=True)

x_temp.drop(columns=dummy_cols,inplace=True)
x_test_temp.drop(columns=dummy_cols,inplace=True)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

x_train, x_valid, y_train, y_valid = train_test_split(x_temp, y)

xgb_params = {  
                "learning_rate":[0.01,0.05,0.1,0.3,0.5],
                "gamma":[i/10.0 for i in range(0,5)],
                "max_depth": [2,3,4,5,6,7,8],
                "min_child_weight":[1,2,5,10],
                "max_delta_step":[0,1,2,5,10],
                "subsample":[i/10.0 for i in range(5,10)],
                "colsample_bytree":[i/10.0 for i in range(5,10)],
                "colsample_bylevel":[i/10.0 for i in range(5,10)],
                "reg_lambda":[1e-5, 1e-2, 0.1, 1, 100], 
                "reg_alpha":[1e-5, 1e-2, 0.1, 1, 100],
                "scale_pos_weight":[1,2,3,4,5,6,7,8,9],
                "n_estimators":[100,500,700,1000]
             }

In [11]:
xgb_params = {  
                "learning_rate":[0.01,0.05,0.1,0.3,0.5],
                "max_depth": [2,3,4,5,6,7,8],
                "n_estimators":[100,500,700,1000]
             }

In [12]:
from sklearn.model_selection import RandomizedSearchCV

my_model = XGBClassifier(objective='binary:logistic')

n_iter=10

random_search=RandomizedSearchCV(my_model,n_jobs=-1,cv=10,n_iter=n_iter,scoring='roc_auc',param_distributions=xgb_params)

In [13]:
random_search.fit(x_train,y_train)

RandomizedSearchCV(cv=10,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=None,
                                           gpu_id=None, grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None, max_bin=None...
                                           max_delta_step=None, max_depth=None,
                                           max_leaves=None

In [32]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=1000,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [16]:
prediction = random_search.predict(x_valid)

In [18]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(prediction,y_valid))

0.593799499878733


In [20]:
print_preds = random_search.predict(x_test_temp)

In [25]:
to_print = pd.DataFrame({'Complaint ID':x_test['Complaint ID'],'Consumer disputed?':print_preds})

In [30]:
to_print.to_csv('Project1_Submission2_XGBoost_YogeswaraReddy.csv',index = False)

In [88]:
xg = XGBClassifier(learning_rate = 0.05, max_depth = 5, n_estimators = 5000, subsample = 0.25, objective='binary:logistic')

In [89]:
xg.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=5000,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [80]:
predict = xg.predict(x_valid)
print(roc_auc_score(predict, y_valid))

0.5654548458438104


In [81]:
predict.sum()

274

In [101]:
predict_print = xg.predict(x_test_temp)

In [102]:
predict_print.sum()

217

In [90]:
from sklearn.linear_model import LogisticRegression

In [91]:
params={'class_weight':['balanced',None],'penalty':['l1','l2'],'C':np.linspace(0.0001,1000,10)}

In [92]:
model=LogisticRegression(fit_intercept=True)

In [93]:
from sklearn.model_selection import GridSearchCV

grid_search=GridSearchCV(model,param_grid=params,cv=10,scoring="roc_auc",n_jobs=-1)

In [96]:
grid_search.fit(x_train,y_train)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_L

GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.000000e-04, 1.111112e+02, 2.222223e+02, 3.333334e+02,
       4.444445e+02, 5.555556e+02, 6.666667e+02, 7.777778e+02,
       8.888889e+02, 1.000000e+03]),
                         'class_weight': ['balanced', None],
                         'penalty': ['l1', 'l2']},
             scoring='roc_auc')

In [97]:
grid_search.best_estimator_

LogisticRegression(C=0.0001, class_weight='balanced')

In [99]:
doc = grid_search.predict(x_valid)

In [100]:
print(roc_auc_score(doc,y_valid))

0.5205804869404649
