## ML Hackathon : Solving citizens’ grievances

In [18]:
import numpy as np
import pandas as pd
import pickle
from itertools import chain
import warnings
warnings.simplefilter("ignore")

# plot
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go

# text preprocessing
import re
import nltk
# uncomment if not not downloaded
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('brown')
#nltk.download('names')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('universal_tagset')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
#from normalise import normalise


# feature Engineering and feature Selection
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel,SelectKBest,chi2,mutual_info_classif

# ML model
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.preprocessing import StandardScaler,LabelEncoder,OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
# Read Data Provided
train = pd.read_csv(r'C:\Users\Admin\Downloads\bee0b3604b5011eb\dataset\train.csv')
test = pd.read_csv(r'C:\Users\Admin\Downloads\bee0b3604b5011eb\dataset\test.csv')
test['importance']=-1
train['label'] = 'train'
test['label'] = 'test'
combined = pd.concat([train,test],axis=0)
combined.shape

(13638, 329)

### Feature Engineering : 

#### Part-1
<br>1) Intiution says that a severe crime translates to multiple violations. How do we measure multiple violations ? By counting the total number of issues a case is registered under. This is done by  combine_and_count_issues function </br>

<br>2) Again, we seek to establish whether No of respondents is related to severity of a grievance. 
   <br> H0 : There is no relationship between number of respondents</br>
    <br>H1 : There is a relationship </br></br>
<br>  encode_and_count_respondents is the function for it </br>

<br> 3) Many Columns do not have much variance, and therefore do not contribute to the output . These were removed
Functions : drop_cols , remove_constant_cols</br>

<br> 4) Date Columns were compared to calculated the number of days elapsed between milestones 
Functions: generate_date_features</br>

<br> 5) Item-Id indicates an unique id for a case. However, the numbers were in increasing order. So, this could indicate the recency of a case. This column is label-encoded to give it a relationship such that a particular Case came before the others
The other columns were one-hot encoded, as they can't be compared on a nominal scale. 
Functions: col_encode </br>

<br> 6) A simple function to impute the missing values with 0. This is because most of the features are kind of dummy categorical variables, so no need to impute with mean or median </br>

<br> 7) A function to count the total number of articles or paragraphs referenced per case
Function: calculate_articles_paragraph </br>



In [4]:
# sharepointid is kind of an unique id assigned to an opinion, not important for modelling, should be dropped
print(combined.iloc[np.where(train['sharepointid'].value_counts() > 1)].shape)

(0, 329)


In [5]:
# Step 1: Create Features from the Given Data
# In the second step, we will vectorize the data

def combine_and_count_issues(df):
    print('Combining and Counting Total Issues')
    issue_columns = [x for x in list(df.columns) if 'issue' in x]
    df['issues'] = df[issue_columns].apply(lambda x: '. '.join([val for val in x if pd.notna(val)]), axis=1)
    df['total_issues'] = df[issue_columns].apply(lambda x:  sum([int(pd.notna(i)) for i in x]),axis=1) 
    df.drop(issue_columns, axis=1, inplace=True)
    df[['issues']].fillna('',inplace=True)
    return df

# This function encodes the respondents and counts the total respondents
def encode_and_count_respondents(df):
    print('Encoding and Counting Respondents')
    dict1 = dict([(c,[b]) for b,c in zip( df['respondentOrderEng'],df['country.name'])])
    dict2 = dict([(c,[b]) for b,c in zip( combined['respondentOrderEng'],combined['respondent.0'])]) 
    dict1.update(dict2)
    df['respondent.0'] = df['respondent.0'].apply(lambda x: dict1[x][0])
    df['respondent.1'] = df['respondent.1'].apply(lambda x: dict1[x][0] if pd.notnull(x) else x)
    df['respondent.2'] = df['respondent.2'].apply(lambda x: dict1[x][0] if pd.notnull(x) else x)
    df['respondent.3'] = df['respondent.3'].apply(lambda x: dict1[x][0] if pd.notnull(x) else x)
    df['respondent.4'] = df['respondent.4'].apply(lambda x: dict1[x][0] if pd.notnull(x) else x)
    respondent_cols = [col for col in list(df.columns) if'respondent.' in col]
    df['total_respondents'] = df[respondent_cols].apply(lambda x:  sum([int(pd.notna(i)) for i in x]),axis=1) 
    #df.drop(respondent_cols, axis=1, inplace=True)
    df.drop('respondentOrderEng', axis=1, inplace=True)
    return df

# Remove cols not important for Modelling
def drop_cols(df):
    cols =['parties.0','parties.2', 'country.alpha2', 'parties.1', \
           'country.name', 'docname', 'appno', 'ecli', 'kpdate','sharepointid','originatingbody_name']
    df.drop(cols, axis=1, inplace=True)
    return df

# This function removes features where there is no variance
def remove_constant_cols(df):
    print('Remove Constant Columns')
    for col in df.columns:
        if df[col].nunique()==1:
            print(col,end=', ' )
            del df[col]
    return df

# This function generates new date features
def generate_date_features(df):
    print('generate date features')
    df['days_between_intro_decision'] = (pd.to_datetime(df['decisiondate']) - pd.to_datetime(df['introductiondate'])).dt.days
    df['days_between_intro_judgement'] = (pd.to_datetime(df['judgementdate']) - pd.to_datetime(df['introductiondate'])).dt.days
    df['days_between_decision_judgement'] = (pd.to_datetime(df['judgementdate']) - pd.to_datetime(df['decisiondate'])).dt.days
    df.drop(['decisiondate','introductiondate','judgementdate'], axis=1, inplace=True)
    return df

# Encoding for few more feature columns
def col_encode(df):
    print('One-hot encoding Relevcant Rows')
    le = LabelEncoder()
    # Item Ids are in ascending order of judgement date..So let's convert it using label encoder
    df['itemid'] = df['itemid'].apply(lambda x: x[4:7])  
    pd.get_dummies(df,columns=['doctypebranch'])
    df.drop(['doctypebranch'], axis=1, inplace=True )
    df['separateopinion'] = le.fit_transform(df['separateopinion'])   
    return df

# Fill Any missing values with 0 except for issues
def fill_missing(df):
    print('Replace NA values in Numerical Columns with 0')
    for col in df.columns:
        if col not in ['issues','label']:
            df[col].fillna(0,inplace=True)
            df[col] = df[col].astype('int')
    return df

def calculate_articles_paragraph(df):
    articles_columns = [x for x in list(df.columns) if 'article' in x and '_article' not in x]
    df['total_articles'] = df[articles_columns].apply(lambda x:  sum([i for i in x]),axis=1)
    paragraph_columns = [x for x in list(df.columns) if 'paragraphs' in x]
    df['total_paragraphs'] = df[paragraph_columns].apply(lambda x:  sum([i for i in x]),axis=1)
    return df

In [6]:
combined = combine_and_count_issues(combined)
combined = encode_and_count_respondents(combined)
combined = drop_cols(combined)
combined = remove_constant_cols(combined)
combined = generate_date_features(combined)
combined = col_encode(combined)
combined = fill_missing(combined)
combined = calculate_articles_paragraph(combined)

Combining and Counting Total Issues
Encoding and Counting Respondents
Remove Constant Columns
application, languageisocode, originatingbody_type, respondent.3, respondent.4, documentcollectionid=CASELAW, documentcollectionid=JUDGMENTS, documentcollectionid=ENG, generate date features
One-hot encoding Relevcant Rows
Replace NA values in Numerical Columns with 0


### Feature Engineering

#### Part 2 : Text Features

<br> 1) Lemmatize the 'issues' column </br>
<br> 2) Generate Hand Crafted Features : Calculate the number of times a POS ( Parts of Speech) occured.</br>
Calculate the number of times a sequence of Digits occured. This would indicate the number of articles referenced 
<br> 3) Create a special Column for Criminal grievances </br>

<br> Once basic text pre-processing is done, generate features from the text Column using TF-IDF with ngrams between 1 and 5 </br>

In [7]:
# Text Processing

def tokenize(text):
    return nltk.word_tokenize(text)

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def words_lemmatizer(text, encoding="utf8"):
    words = nltk.word_tokenize(text)
    lemma_words = []
    wl = WordNetLemmatizer()
    for word in words:
        pos = find_pos(word)
        lemma_words.append(wl.lemmatize(word, pos))
    return " ".join(lemma_words)

def stemmer(text):
    porter_stemmer=nltk.PorterStemmer()
    words = [porter_stemmer.stem(word) for word in nltk.word_tokenize(text)]
    return " ".join(words)


# Function to find part of speech tag for a word
def find_pos(word):
    # Part of Speech constants
    # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    pos = nltk.pos_tag(nltk.word_tokenize(word))[0][1]
    
    # Adjective tags - 'JJ', 'JJR', 'JJS'
    if pos.lower()[0] == 'j':
        return 'a'
    # Adverb tags - 'RB', 'RBR', 'RBS'
    elif pos.lower()[0] == 'r':
        return 'r'
    # Verb tags - 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'
    elif pos.lower()[0] == 'v':
        return 'v'
    # Noun tags - 'NN', 'NNS', 'NNP', 'NNPS'
    else:
        return 'n'

def remove_stopwords(text, lang='english'):
    """
    :param text: text input
    :return: text with stopwords removed
    :rtype: str 
    """
    words = nltk.word_tokenize(text)
    lang_stopwords= [i for i in stopwords.words(lang) if i not in ['not', 'no']]
    stopwords_removed= [w for w in words if w not in lang_stopwords]
    return " ".join(stopwords_removed)

In [8]:
def do_preprocessing(df):
    """
    create handcrafted features
    """
    df['Issue_cleaned'] = df['issues'].str.lower().\
                                str.replace('[^\w\s]|_', ' ').\
                                apply(remove_numbers).\
                                apply(remove_stopwords)
    df['Issues_cleaned_lemma'] = df['Issue_cleaned'].apply(words_lemmatizer)
    df['Issues_cleaned_stem'] = df['Issue_cleaned'].apply(stemmer)
    # hand-crafted features
    df['words'] = df['issues'].apply(tokenize)
    df['text__len'] = df['words'].apply(len)
    df['sent__num'] = df['issues'].apply(lambda x: len(nltk.sent_tokenize(x)))
    df['digit__cnt'] = df['words'].apply(lambda x: sum([re.search('\d', i) is not None for i in x]))
    df['bracket__cnt'] = df['words'].apply(lambda x: sum([re.search('\(|\)|\[|\]', i) is not None for i in x]))
    df['equal__cnt'] = df['issues'].apply(lambda x: len(re.findall('=|<-', x)))
    df['verb__cnt'] = df['issues'].apply(lambda x: sum([tag[0]=='V' for token, tag in TextBlob(x).tags]))
    df['noun__cnt'] = df['issues'].apply(lambda x: sum([tag[0]=='N' for token, tag in TextBlob(x).tags]))
    df['adv__cnt'] = df['issues'].apply(lambda x: sum([tag[0]=='R' for token, tag in TextBlob(x).tags]))
    df['adj__cnt'] = df['issues'].apply(lambda x: sum([tag[0]=='J' for token, tag in TextBlob(x).tags]))
    df['criminal__cnt'] = df['issues'].apply(lambda x: len(re.findall('criminal|crime', x)))
    #df['keywrds__weight'] = df['Comment_cleaned_lemma'].apply(lambda x: compute_weight(x))    
    df['nonStop__cnt'] = df['words'].apply(lambda x: len([i for i in x if i not in stopwords.words('english')]))
    df['continuousChar__cnt'] = df['issues'].apply(lambda x: len([match.group() for match in re.compile(r'([a-z])\1{2,}').finditer(x)]))
    df['continuousDigit__cnt'] = df['issues'].apply(lambda x: len([match.group() for match in re.compile(r'([0-9])\1{2,}').finditer(x)]))
    df['continuousPunct__cnt'] = df['issues'].apply(lambda x: len([match.group() for match in re.compile(r'([\W|_])\1{2,}').finditer(x)]))
    return df

### Feature Selection :
<br> For doing a feature selection, We have to split the dataset firt. Due to presence of multiple labels in the output column, We do a stratify sampling to generate the train and test sets. </br>

<br> Feature Selection is done using a mutual_info_classifier, as it was found to be empirically better than chi-square test

In [9]:
combined_train = combined.query('label == "train"').drop(['label'] , axis=1)
combined_train = do_preprocessing(combined_train)

In [53]:
# Train-Test Split in 80:20 Ratio
X_train, X_test, Y_train, Y_test = train_test_split(combined_train.drop('importance', axis=1),combined_train['importance'],test_size=0.2,stratify=combined_train['importance'])

In [54]:
# Vectorize the issue column and add the generated features to the X_train matrix
vect = TfidfVectorizer(stop_words='english',ngram_range=(1,5),#token_pattern=r'b[^\d\W]+\b',
                       min_df=5,binary=True)
X_train_dtm = vect.fit_transform(X_train['Issues_cleaned_lemma'])
df1 = pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())


In [55]:
# Merge X_train with features matrix
X_train.drop(['Issues_cleaned_lemma','issues','Issue_cleaned','Issues_cleaned_stem','words'],axis = 1, inplace = True)
X_train.reset_index(drop=True, inplace = True)
res = pd.concat([X_train, df1], axis=1)

In [56]:
# Prepare X_test Matrix
X_test_dtm =  vect.transform(X_test['Issues_cleaned_lemma'])
df_test = pd.DataFrame(X_test_dtm.toarray(), columns=vect.get_feature_names())
X_test.drop(['Issues_cleaned_lemma','issues','Issue_cleaned','Issues_cleaned_stem','words'],axis = 1, inplace = True)
X_test.reset_index(drop=True, inplace = True)
res_test = pd.concat([X_test, df_test], axis=1)

In [58]:
res_test.shape

(1776, 1752)

In [47]:
# feature selection
def select_features(X_train, y_train, X_test,k):
    fs = SelectKBest(score_func=mutual_info_classif, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs

In [62]:
# prepare input data
#X_train_enc,X_test_enc = prepare_inputs(res,res_test)
# prepare output data
#y_train_enc,y_test_enc = prepare_targets(Y_train,Y_test)
# feature selection
X_train_fs, X_test_fs = select_features(X_train, Y_train, X_test,150)

### Model Building : xgboost, LGBM, Catboost, Voting Classifier

#### LGBM Training and Prediction

In [70]:
combined_test = combined.query('label == "test"')
combined_test = do_preprocessing(combined_test)
test = combined_test.drop(['issues','label','Issue_cleaned','Issues_cleaned_stem','words', 'importance'] , axis=1)

In [75]:
X_test_dtm_s=vect.transform(test['Issues_cleaned_lemma'])
df2 = pd.DataFrame(X_test_dtm_s.toarray(), columns=vect.get_feature_names())
del test['Issues_cleaned_lemma']
test.reset_index(drop = True,inplace = True)
res2 = pd.concat([test, df2], axis=1)

In [30]:
clf_dict = {"LGBM Classifier": 
            {'classifier': LGBMClassifier(),
                 'params': [
                            {
                             'learning_rate': [0.01],
                             'n_estimators' :[1000],
                             'max_depth':[7],
                            }
                           ]
            },
           }

In [31]:
res_df  = pd.DataFrame()
num_clf = len(clf_dict.keys())
res_df = pd.DataFrame(
    data=np.zeros(shape=(num_clf, 3)),
    columns = ['classifier',
                   'train_score', 
                   'test_score',
            ]
)

In [104]:
%%time
X_train_fs, X_test_fs = select_features(res, Y_train, res_test,150)
count = 0
for key, clf in clf_dict.items():
    print(key, clf)

    grid = GridSearchCV(clf["classifier"],
                        clf["params"],
                        refit=True,
                        cv=5,
                        scoring = 'accuracy',
                        n_jobs = -1,
                        verbose=0
                        
                       )
    estimator = grid.fit(
                        X_train_fs,
                        Y_train)
    train_score = estimator.score(X_train_fs,
                                      Y_train)
    test_score = estimator.score(X_test_fs,Y_test)
    count+=1
    
    res_df.loc[count,'classifier'] = key
    res_df.loc[count,'train_score'] = train_score
    res_df.loc[count,'test_score'] = test_score
    print(f"{key} best params: {grid.best_params_}")
res_df.iloc[1:, :]

LGBM Classifier {'classifier': LGBMClassifier(), 'params': [{'learning_rate': [0.01], 'n_estimators': [1000], 'max_depth': [7]}]}
LGBM Classifier best params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 1000}
Wall time: 4min 20s


Unnamed: 0,classifier,train_score,test_score
1,LGBM Classifier,0.946353,0.886824


In [73]:
%%time
X_train_fs,X_test_fs = select_features(res, Y_train, res_test,200)
count = 0
for key, clf in clf_dict.items():
    print(key, clf)

    grid = GridSearchCV(clf["classifier"],
                        clf["params"],
                        refit=True,
                        cv=5,
                        scoring = 'accuracy',
                        n_jobs = -1,
                        verbose=0
                        
                       )
    estimator = grid.fit(
                        X_train_fs,
                        Y_train)
    train_score = estimator.score(X_train_fs,
                                      Y_train)
    test_score = estimator.score(X_test_fs,Y_test)
    count+=1
    
    res_df.loc[count,'classifier'] = key
    res_df.loc[count,'train_score'] = train_score
    res_df.loc[count,'test_score'] = test_score
    print(f"{key} best params: {grid.best_params_}")
res_df.iloc[1:, :]

LGBM Classifier {'classifier': LGBMClassifier(), 'params': [{'learning_rate': [0.01], 'n_estimators': [1000], 'max_depth': [7]}]}
LGBM Classifier best params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 1000}
Wall time: 4min 5s


Unnamed: 0,classifier,train_score,test_score
1,LGBM Classifier,0.948184,0.892455


In [74]:
%%time
X_train_fs,X_test_fs = select_features(res, Y_train, res_test,100)
count = 0
for key, clf in clf_dict.items():
    print(key, clf)

    grid = GridSearchCV(clf["classifier"],
                        clf["params"],
                        refit=True,
                        cv=5,
                        scoring = 'accuracy',
                        n_jobs = -1,
                        verbose=0
                        
                       )
    estimator = grid.fit(
                        X_train_fs,
                        Y_train)
    train_score = estimator.score(X_train_fs,
                                      Y_train)
    test_score = estimator.score(X_test_fs,Y_test)
    count+=1
    
    res_df.loc[count,'classifier'] = key
    res_df.loc[count,'train_score'] = train_score
    res_df.loc[count,'test_score'] = test_score
    print(f"{key} best params: {grid.best_params_}")
res_df.iloc[1:, :]

LGBM Classifier {'classifier': LGBMClassifier(), 'params': [{'learning_rate': [0.01], 'n_estimators': [1000], 'max_depth': [7]}]}
LGBM Classifier best params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 1000}
Wall time: 4min 9s


Unnamed: 0,classifier,train_score,test_score
1,LGBM Classifier,0.948465,0.886824


In [111]:
X_train_fs,X_test_fs = select_features(res, Y_train, res2,150)
lg150 = LGBMClassifier(learning_rate=0.01,max_depth=7,n_estimators=1000)
lg150.fit(X_train_fs,Y_train)
preds150 = lg150.predict(X_test_fs)
preds150

array([4, 4, 4, ..., 4, 4, 4])

In [108]:
test.shape

(4760, 327)

In [None]:
X_train_fs,X_test_fs = select_features(res, Y_train, res2,200)
lg200 = LGBMClassifier(learning_rate=0.01,max_depth=7,n_estimators=1000)
lg200.fit(X_train_fs,Y_train)
preds200 = lg.predict(X_test_fs)
preds200

In [112]:
test = pd.read_csv(r'C:\Users\Admin\Downloads\bee0b3604b5011eb\dataset\test.csv')
sub = pd.DataFrame(columns=["appno","importance"])
sub["appno"] = test.appno
sub["importance"] = preds150
sub.to_csv("result_tunedlgbfs150_2.csv", index=False)

### xgboost Modelling and Prediction

In [91]:
# With 200 features
X_train_fs,X_test_fs = select_features(res, Y_train, res_test,200)
XG200 = XGBClassifier()
xg_train = cross_val_predict(XG200, X_train_fs, Y_train, cv=5, 
                                  n_jobs=-1, method="predict")
print("cv score: Train ", accuracy_score(xg_train, Y_train) * 100)

XG200.fit(X_train_fs,Y_train)
xg_test = XG200.predict(X_test_fs)

print("cv score: Train ", accuracy_score(xg_test, Y_test) * 100)

cv score: Train  88.98901717825964
cv score: Train  89.07657657657657


In [92]:
# With 150 features
X_train_fs,X_test_fs = select_features(res, Y_train, res_test,150)
XG = XGBClassifier()
xg_train = cross_val_predict(XG, X_train_fs, Y_train, cv=5, 
                                  n_jobs=-1, method="predict")
print("cv score: Train ", accuracy_score(xg_train, Y_train) * 100)

XG.fit(X_train_fs,Y_train)
xg_test = XG.predict(X_test_fs)

print("cv score: Test ", accuracy_score(xg_test, Y_test) * 100)

cv score: Train  88.89045339341031
cv score: Test  88.96396396396396


In [94]:
X_train_fs,X_test_fs = select_features(res, Y_train, res2,150)
xg_test_set = XG.predict(X_test_fs)

test = pd.read_csv(r'C:\Users\Admin\Downloads\bee0b3604b5011eb\dataset\test.csv')
sub = pd.DataFrame(columns=["appno","importance"])
sub["appno"] = test.appno
sub["importance"] = xg_test_set
sub.to_csv("result_tunedxg150.csv", index=False)

In [None]:
X_train_fs,X_test_fs = select_features(res, Y_train, res2,200)
xg_test_set = XG200.predict(X_test_fs)

test = pd.read_csv(r'C:\Users\Admin\Downloads\bee0b3604b5011eb\dataset\test.csv')
sub = pd.DataFrame(columns=["appno","importance"])
sub["appno"] = test.appno
sub["importance"] = xg_test_set
sub.to_csv("result_tunedxg200.csv", index=False)

### Catboost Modeling and Prediction

In [97]:
from catboost import CatBoostClassifier

# With 150 features
X_train_fs,X_test_fs = select_features(res, Y_train, res_test,150)

cat150 = CatBoostClassifier(random_state=40,n_estimators=1005)
cat_pred = cross_val_predict(cat150, X_train_fs, Y_train, cv=5, 
                                  n_jobs=-1, method="predict")
print("cv score: ", accuracy_score(cat_pred, Y_train) * 100)

cat150.fit(X_train_fs,Y_train)


cv score:  88.8200506899465
Learning rate set to 0.087079
0:	learn: 1.2238017	total: 51.8ms	remaining: 52s
1:	learn: 1.1062167	total: 96.6ms	remaining: 48.4s
2:	learn: 1.0111103	total: 153ms	remaining: 51.1s
3:	learn: 0.9328750	total: 201ms	remaining: 50.3s
4:	learn: 0.8663717	total: 264ms	remaining: 52.8s
5:	learn: 0.8092933	total: 316ms	remaining: 52.6s
6:	learn: 0.7666912	total: 364ms	remaining: 51.9s
7:	learn: 0.7234920	total: 422ms	remaining: 52.6s
8:	learn: 0.6863434	total: 483ms	remaining: 53.5s
9:	learn: 0.6565510	total: 545ms	remaining: 54.2s
10:	learn: 0.6301550	total: 595ms	remaining: 53.7s
11:	learn: 0.6057783	total: 651ms	remaining: 53.9s
12:	learn: 0.5862278	total: 705ms	remaining: 53.8s
13:	learn: 0.5665064	total: 764ms	remaining: 54.1s
14:	learn: 0.5491334	total: 814ms	remaining: 53.7s
15:	learn: 0.5319487	total: 864ms	remaining: 53.4s
16:	learn: 0.5185658	total: 919ms	remaining: 53.4s
17:	learn: 0.5062426	total: 971ms	remaining: 53.2s
18:	learn: 0.4939426	total: 1.03s	

162:	learn: 0.2971875	total: 8.46s	remaining: 43.7s
163:	learn: 0.2970507	total: 8.51s	remaining: 43.6s
164:	learn: 0.2963325	total: 8.55s	remaining: 43.6s
165:	learn: 0.2960643	total: 8.6s	remaining: 43.5s
166:	learn: 0.2958254	total: 8.67s	remaining: 43.5s
167:	learn: 0.2954055	total: 8.72s	remaining: 43.4s
168:	learn: 0.2949999	total: 8.77s	remaining: 43.4s
169:	learn: 0.2946789	total: 8.82s	remaining: 43.3s
170:	learn: 0.2945555	total: 8.87s	remaining: 43.3s
171:	learn: 0.2942463	total: 8.92s	remaining: 43.2s
172:	learn: 0.2940144	total: 8.97s	remaining: 43.1s
173:	learn: 0.2936347	total: 9.03s	remaining: 43.1s
174:	learn: 0.2934053	total: 9.08s	remaining: 43.1s
175:	learn: 0.2933320	total: 9.13s	remaining: 43s
176:	learn: 0.2931062	total: 9.17s	remaining: 42.9s
177:	learn: 0.2925849	total: 9.22s	remaining: 42.9s
178:	learn: 0.2924525	total: 9.27s	remaining: 42.8s
179:	learn: 0.2921229	total: 9.32s	remaining: 42.7s
180:	learn: 0.2920780	total: 9.38s	remaining: 42.7s
181:	learn: 0.2

321:	learn: 0.2591409	total: 18.2s	remaining: 38.6s
322:	learn: 0.2590068	total: 18.3s	remaining: 38.6s
323:	learn: 0.2589578	total: 18.4s	remaining: 38.6s
324:	learn: 0.2589254	total: 18.4s	remaining: 38.6s
325:	learn: 0.2583119	total: 18.5s	remaining: 38.6s
326:	learn: 0.2582851	total: 18.6s	remaining: 38.5s
327:	learn: 0.2580390	total: 18.6s	remaining: 38.4s
328:	learn: 0.2579459	total: 18.7s	remaining: 38.5s
329:	learn: 0.2578744	total: 18.8s	remaining: 38.4s
330:	learn: 0.2574249	total: 18.9s	remaining: 38.4s
331:	learn: 0.2571107	total: 18.9s	remaining: 38.4s
332:	learn: 0.2570669	total: 19s	remaining: 38.3s
333:	learn: 0.2567592	total: 19s	remaining: 38.3s
334:	learn: 0.2564292	total: 19.1s	remaining: 38.2s
335:	learn: 0.2562119	total: 19.2s	remaining: 38.1s
336:	learn: 0.2558804	total: 19.2s	remaining: 38.1s
337:	learn: 0.2558363	total: 19.3s	remaining: 38s
338:	learn: 0.2551363	total: 19.3s	remaining: 37.9s
339:	learn: 0.2546947	total: 19.4s	remaining: 37.9s
340:	learn: 0.2544

482:	learn: 0.2332949	total: 29.1s	remaining: 31.4s
483:	learn: 0.2330971	total: 29.2s	remaining: 31.4s
484:	learn: 0.2330642	total: 29.2s	remaining: 31.3s
485:	learn: 0.2330196	total: 29.3s	remaining: 31.3s
486:	learn: 0.2329846	total: 29.4s	remaining: 31.3s
487:	learn: 0.2327182	total: 29.5s	remaining: 31.2s
488:	learn: 0.2326368	total: 29.5s	remaining: 31.2s
489:	learn: 0.2326073	total: 29.6s	remaining: 31.1s
490:	learn: 0.2325105	total: 29.7s	remaining: 31.1s
491:	learn: 0.2324370	total: 29.7s	remaining: 31s
492:	learn: 0.2322784	total: 29.8s	remaining: 31s
493:	learn: 0.2321307	total: 29.9s	remaining: 30.9s
494:	learn: 0.2318916	total: 30s	remaining: 30.9s
495:	learn: 0.2318393	total: 30s	remaining: 30.8s
496:	learn: 0.2315430	total: 30.1s	remaining: 30.8s
497:	learn: 0.2314178	total: 30.2s	remaining: 30.7s
498:	learn: 0.2313844	total: 30.2s	remaining: 30.7s
499:	learn: 0.2313475	total: 30.3s	remaining: 30.6s
500:	learn: 0.2312067	total: 30.4s	remaining: 30.6s
501:	learn: 0.231074

644:	learn: 0.2134501	total: 41.5s	remaining: 23.1s
645:	learn: 0.2133068	total: 41.5s	remaining: 23.1s
646:	learn: 0.2130719	total: 41.6s	remaining: 23s
647:	learn: 0.2128942	total: 41.7s	remaining: 23s
648:	learn: 0.2127640	total: 41.7s	remaining: 22.9s
649:	learn: 0.2127095	total: 41.8s	remaining: 22.8s
650:	learn: 0.2124936	total: 41.9s	remaining: 22.8s
651:	learn: 0.2124676	total: 41.9s	remaining: 22.7s
652:	learn: 0.2123648	total: 42s	remaining: 22.6s
653:	learn: 0.2121692	total: 42s	remaining: 22.6s
654:	learn: 0.2119015	total: 42.1s	remaining: 22.5s
655:	learn: 0.2118513	total: 42.2s	remaining: 22.4s
656:	learn: 0.2118156	total: 42.2s	remaining: 22.4s
657:	learn: 0.2116226	total: 42.3s	remaining: 22.3s
658:	learn: 0.2115231	total: 42.4s	remaining: 22.2s
659:	learn: 0.2113955	total: 42.5s	remaining: 22.2s
660:	learn: 0.2112735	total: 42.5s	remaining: 22.1s
661:	learn: 0.2111948	total: 42.6s	remaining: 22.1s
662:	learn: 0.2110635	total: 42.6s	remaining: 22s
663:	learn: 0.2110172	

805:	learn: 0.1941519	total: 52.6s	remaining: 13s
806:	learn: 0.1940292	total: 52.6s	remaining: 12.9s
807:	learn: 0.1938197	total: 52.7s	remaining: 12.8s
808:	learn: 0.1936584	total: 52.8s	remaining: 12.8s
809:	learn: 0.1935641	total: 52.8s	remaining: 12.7s
810:	learn: 0.1934262	total: 52.9s	remaining: 12.6s
811:	learn: 0.1933177	total: 52.9s	remaining: 12.6s
812:	learn: 0.1932830	total: 53s	remaining: 12.5s
813:	learn: 0.1930094	total: 53.1s	remaining: 12.5s
814:	learn: 0.1928605	total: 53.1s	remaining: 12.4s
815:	learn: 0.1927993	total: 53.2s	remaining: 12.3s
816:	learn: 0.1925196	total: 53.3s	remaining: 12.3s
817:	learn: 0.1923766	total: 53.3s	remaining: 12.2s
818:	learn: 0.1923451	total: 53.4s	remaining: 12.1s
819:	learn: 0.1922539	total: 53.5s	remaining: 12.1s
820:	learn: 0.1917832	total: 53.6s	remaining: 12s
821:	learn: 0.1916600	total: 53.6s	remaining: 11.9s
822:	learn: 0.1914696	total: 53.7s	remaining: 11.9s
823:	learn: 0.1913869	total: 53.7s	remaining: 11.8s
824:	learn: 0.1913

965:	learn: 0.1760578	total: 1m 3s	remaining: 2.56s
966:	learn: 0.1759934	total: 1m 3s	remaining: 2.5s
967:	learn: 0.1759257	total: 1m 3s	remaining: 2.43s
968:	learn: 0.1758735	total: 1m 3s	remaining: 2.36s
969:	learn: 0.1756426	total: 1m 3s	remaining: 2.3s
970:	learn: 0.1755094	total: 1m 3s	remaining: 2.24s
971:	learn: 0.1754124	total: 1m 3s	remaining: 2.17s
972:	learn: 0.1751318	total: 1m 4s	remaining: 2.1s
973:	learn: 0.1750832	total: 1m 4s	remaining: 2.04s
974:	learn: 0.1750659	total: 1m 4s	remaining: 1.97s
975:	learn: 0.1749762	total: 1m 4s	remaining: 1.91s
976:	learn: 0.1747534	total: 1m 4s	remaining: 1.84s
977:	learn: 0.1747002	total: 1m 4s	remaining: 1.77s
978:	learn: 0.1743915	total: 1m 4s	remaining: 1.71s
979:	learn: 0.1741911	total: 1m 4s	remaining: 1.64s
980:	learn: 0.1740962	total: 1m 4s	remaining: 1.58s
981:	learn: 0.1739717	total: 1m 4s	remaining: 1.51s
982:	learn: 0.1738973	total: 1m 4s	remaining: 1.45s
983:	learn: 0.1737870	total: 1m 4s	remaining: 1.38s
984:	learn: 0.1

In [98]:
cat_test = cat150.predict(X_test_fs)

print("cv score: Test ", accuracy_score(cat_test, Y_test) * 100)

cv score: Test  89.47072072072072


In [99]:
# With 200 features
X_train_fs,X_test_fs = select_features(res, Y_train, res_test,200)

cat200 = CatBoostClassifier(random_state=40,n_estimators=1005)
cat_pred = cross_val_predict(cat200, X_train_fs, Y_train, cv=5, 
                                  n_jobs=-1, method="predict")
print("cv score: ", accuracy_score(cat_pred, Y_train) * 100)

cat200.fit(X_train_fs,Y_train)
cat_test = cat200.predict(X_test_fs)

print("cv score: Test ", accuracy_score(cat_test, Y_test) * 100)

cv score:  88.79188960856096
Learning rate set to 0.087079
0:	learn: 1.2272609	total: 52.5ms	remaining: 52.7s
1:	learn: 1.1112516	total: 103ms	remaining: 51.8s
2:	learn: 1.0160945	total: 186ms	remaining: 1m 1s
3:	learn: 0.9340312	total: 246ms	remaining: 1m 1s
4:	learn: 0.8687228	total: 303ms	remaining: 1m
5:	learn: 0.8177343	total: 364ms	remaining: 1m
6:	learn: 0.7707751	total: 419ms	remaining: 59.8s
7:	learn: 0.7309729	total: 466ms	remaining: 58.1s
8:	learn: 0.6966347	total: 530ms	remaining: 58.7s
9:	learn: 0.6615855	total: 579ms	remaining: 57.6s
10:	learn: 0.6328154	total: 630ms	remaining: 57s
11:	learn: 0.6104660	total: 677ms	remaining: 56s
12:	learn: 0.5871233	total: 730ms	remaining: 55.7s
13:	learn: 0.5694932	total: 787ms	remaining: 55.7s
14:	learn: 0.5505039	total: 840ms	remaining: 55.4s
15:	learn: 0.5327989	total: 887ms	remaining: 54.9s
16:	learn: 0.5181766	total: 944ms	remaining: 54.8s
17:	learn: 0.5046902	total: 1s	remaining: 55s
18:	learn: 0.4918940	total: 1.05s	remaining: 54

162:	learn: 0.2974437	total: 8.8s	remaining: 45.5s
163:	learn: 0.2970749	total: 8.85s	remaining: 45.4s
164:	learn: 0.2969260	total: 8.89s	remaining: 45.3s
165:	learn: 0.2967606	total: 8.94s	remaining: 45.2s
166:	learn: 0.2964739	total: 9s	remaining: 45.2s
167:	learn: 0.2962790	total: 9.05s	remaining: 45.1s
168:	learn: 0.2959107	total: 9.1s	remaining: 45s
169:	learn: 0.2952336	total: 9.27s	remaining: 45.5s
170:	learn: 0.2946610	total: 9.39s	remaining: 45.8s
171:	learn: 0.2938143	total: 9.49s	remaining: 46s
172:	learn: 0.2937696	total: 9.58s	remaining: 46.1s
173:	learn: 0.2936425	total: 9.66s	remaining: 46.1s
174:	learn: 0.2935316	total: 9.74s	remaining: 46.2s
175:	learn: 0.2932726	total: 9.8s	remaining: 46.2s
176:	learn: 0.2930237	total: 9.86s	remaining: 46.1s
177:	learn: 0.2928168	total: 9.91s	remaining: 46.1s
178:	learn: 0.2923562	total: 9.97s	remaining: 46s
179:	learn: 0.2920893	total: 10s	remaining: 45.9s
180:	learn: 0.2913269	total: 10.1s	remaining: 45.9s
181:	learn: 0.2909240	tota

323:	learn: 0.2581704	total: 18s	remaining: 37.9s
324:	learn: 0.2581384	total: 18.1s	remaining: 37.8s
325:	learn: 0.2580054	total: 18.1s	remaining: 37.8s
326:	learn: 0.2578639	total: 18.2s	remaining: 37.7s
327:	learn: 0.2577689	total: 18.2s	remaining: 37.6s
328:	learn: 0.2575814	total: 18.3s	remaining: 37.6s
329:	learn: 0.2572089	total: 18.3s	remaining: 37.5s
330:	learn: 0.2569928	total: 18.4s	remaining: 37.4s
331:	learn: 0.2569157	total: 18.4s	remaining: 37.4s
332:	learn: 0.2566291	total: 18.5s	remaining: 37.3s
333:	learn: 0.2563891	total: 18.5s	remaining: 37.2s
334:	learn: 0.2559545	total: 18.6s	remaining: 37.2s
335:	learn: 0.2557326	total: 18.6s	remaining: 37.1s
336:	learn: 0.2553314	total: 18.7s	remaining: 37.1s
337:	learn: 0.2551567	total: 18.7s	remaining: 37s
338:	learn: 0.2551147	total: 18.8s	remaining: 36.9s
339:	learn: 0.2550721	total: 18.9s	remaining: 36.9s
340:	learn: 0.2548997	total: 18.9s	remaining: 36.9s
341:	learn: 0.2546895	total: 19s	remaining: 36.8s
342:	learn: 0.2543

485:	learn: 0.2309917	total: 26.8s	remaining: 28.6s
486:	learn: 0.2308791	total: 26.8s	remaining: 28.5s
487:	learn: 0.2306864	total: 26.9s	remaining: 28.5s
488:	learn: 0.2305404	total: 26.9s	remaining: 28.4s
489:	learn: 0.2304478	total: 27s	remaining: 28.3s
490:	learn: 0.2303678	total: 27s	remaining: 28.3s
491:	learn: 0.2302257	total: 27s	remaining: 28.2s
492:	learn: 0.2299902	total: 27.1s	remaining: 28.1s
493:	learn: 0.2297744	total: 27.1s	remaining: 28.1s
494:	learn: 0.2296971	total: 27.2s	remaining: 28s
495:	learn: 0.2295222	total: 27.2s	remaining: 27.9s
496:	learn: 0.2294551	total: 27.3s	remaining: 27.9s
497:	learn: 0.2293443	total: 27.3s	remaining: 27.8s
498:	learn: 0.2291393	total: 27.4s	remaining: 27.8s
499:	learn: 0.2289166	total: 27.5s	remaining: 27.7s
500:	learn: 0.2287118	total: 27.5s	remaining: 27.7s
501:	learn: 0.2286258	total: 27.6s	remaining: 27.6s
502:	learn: 0.2284143	total: 27.6s	remaining: 27.6s
503:	learn: 0.2283553	total: 27.7s	remaining: 27.5s
504:	learn: 0.228292

644:	learn: 0.2092577	total: 35s	remaining: 19.5s
645:	learn: 0.2091344	total: 35s	remaining: 19.5s
646:	learn: 0.2090587	total: 35.1s	remaining: 19.4s
647:	learn: 0.2088398	total: 35.1s	remaining: 19.4s
648:	learn: 0.2087783	total: 35.2s	remaining: 19.3s
649:	learn: 0.2086037	total: 35.2s	remaining: 19.2s
650:	learn: 0.2084420	total: 35.3s	remaining: 19.2s
651:	learn: 0.2083679	total: 35.4s	remaining: 19.1s
652:	learn: 0.2083252	total: 35.4s	remaining: 19.1s
653:	learn: 0.2082400	total: 35.5s	remaining: 19s
654:	learn: 0.2081445	total: 35.5s	remaining: 19s
655:	learn: 0.2080611	total: 35.6s	remaining: 18.9s
656:	learn: 0.2077666	total: 35.6s	remaining: 18.9s
657:	learn: 0.2076893	total: 35.7s	remaining: 18.8s
658:	learn: 0.2076247	total: 35.7s	remaining: 18.8s
659:	learn: 0.2076027	total: 35.8s	remaining: 18.7s
660:	learn: 0.2075456	total: 35.8s	remaining: 18.6s
661:	learn: 0.2073802	total: 35.9s	remaining: 18.6s
662:	learn: 0.2072088	total: 35.9s	remaining: 18.5s
663:	learn: 0.206968

807:	learn: 0.1907024	total: 43.7s	remaining: 10.7s
808:	learn: 0.1905010	total: 43.8s	remaining: 10.6s
809:	learn: 0.1903402	total: 43.8s	remaining: 10.6s
810:	learn: 0.1903134	total: 43.9s	remaining: 10.5s
811:	learn: 0.1902445	total: 43.9s	remaining: 10.4s
812:	learn: 0.1901835	total: 44s	remaining: 10.4s
813:	learn: 0.1899947	total: 44s	remaining: 10.3s
814:	learn: 0.1898913	total: 44.1s	remaining: 10.3s
815:	learn: 0.1898190	total: 44.1s	remaining: 10.2s
816:	learn: 0.1896827	total: 44.2s	remaining: 10.2s
817:	learn: 0.1896668	total: 44.2s	remaining: 10.1s
818:	learn: 0.1895849	total: 44.3s	remaining: 10.1s
819:	learn: 0.1894739	total: 44.3s	remaining: 10s
820:	learn: 0.1893122	total: 44.4s	remaining: 9.95s
821:	learn: 0.1892023	total: 44.4s	remaining: 9.89s
822:	learn: 0.1891047	total: 44.5s	remaining: 9.84s
823:	learn: 0.1890468	total: 44.5s	remaining: 9.78s
824:	learn: 0.1889413	total: 44.6s	remaining: 9.72s
825:	learn: 0.1889183	total: 44.6s	remaining: 9.67s
826:	learn: 0.1887

967:	learn: 0.1744225	total: 52.5s	remaining: 2.01s
968:	learn: 0.1743718	total: 52.6s	remaining: 1.95s
969:	learn: 0.1742401	total: 52.6s	remaining: 1.9s
970:	learn: 0.1740783	total: 52.7s	remaining: 1.84s
971:	learn: 0.1739170	total: 52.7s	remaining: 1.79s
972:	learn: 0.1738516	total: 52.8s	remaining: 1.74s
973:	learn: 0.1738118	total: 52.8s	remaining: 1.68s
974:	learn: 0.1735860	total: 52.9s	remaining: 1.63s
975:	learn: 0.1734653	total: 52.9s	remaining: 1.57s
976:	learn: 0.1734328	total: 53s	remaining: 1.52s
977:	learn: 0.1733842	total: 53s	remaining: 1.46s
978:	learn: 0.1732775	total: 53.1s	remaining: 1.41s
979:	learn: 0.1731688	total: 53.1s	remaining: 1.35s
980:	learn: 0.1730837	total: 53.2s	remaining: 1.3s
981:	learn: 0.1729799	total: 53.2s	remaining: 1.25s
982:	learn: 0.1729472	total: 53.3s	remaining: 1.19s
983:	learn: 0.1729213	total: 53.3s	remaining: 1.14s
984:	learn: 0.1728227	total: 53.4s	remaining: 1.08s
985:	learn: 0.1727677	total: 53.5s	remaining: 1.03s
986:	learn: 0.1726

In [100]:
X_train_fs,X_test_fs = select_features(res, Y_train, res2,150)
cat_test = cat150.predict(X_test_fs)

test = pd.read_csv(r'C:\Users\Admin\Downloads\bee0b3604b5011eb\dataset\test.csv')
sub = pd.DataFrame(columns=["appno","importance"])
sub["appno"] = test.appno
sub["importance"] = cat_test
sub.to_csv("result_cat150.csv", index=False)

### Voting Classifier

In [113]:
from sklearn.ensemble import  VotingClassifier
eclf1 = VotingClassifier(estimators=[('lgb', lg150), ('xg', XG), ('cat', cat150)], voting='soft')
X_train_fs,X_test_fs = select_features(res, Y_train, res2,150)
eclf1 = eclf1.fit(X_train_fs, Y_train)
print(eclf1.predict(X_test_fs))

Learning rate set to 0.087079
0:	learn: 1.2241196	total: 55.3ms	remaining: 55.5s
1:	learn: 1.0981311	total: 94.5ms	remaining: 47.4s
2:	learn: 1.0013981	total: 135ms	remaining: 45s
3:	learn: 0.9292110	total: 173ms	remaining: 43.4s
4:	learn: 0.8658757	total: 237ms	remaining: 47.3s
5:	learn: 0.8087217	total: 285ms	remaining: 47.5s
6:	learn: 0.7625829	total: 328ms	remaining: 46.8s
7:	learn: 0.7233611	total: 373ms	remaining: 46.5s
8:	learn: 0.6880419	total: 426ms	remaining: 47.1s
9:	learn: 0.6569303	total: 468ms	remaining: 46.6s
10:	learn: 0.6292377	total: 516ms	remaining: 46.6s
11:	learn: 0.6048909	total: 560ms	remaining: 46.4s
12:	learn: 0.5856092	total: 613ms	remaining: 46.8s
13:	learn: 0.5663530	total: 658ms	remaining: 46.6s
14:	learn: 0.5500235	total: 699ms	remaining: 46.2s
15:	learn: 0.5336970	total: 743ms	remaining: 45.9s
16:	learn: 0.5199885	total: 792ms	remaining: 46s
17:	learn: 0.5063360	total: 847ms	remaining: 46.5s
18:	learn: 0.4951108	total: 891ms	remaining: 46.2s
19:	learn: 0.

162:	learn: 0.3004620	total: 7.53s	remaining: 38.9s
163:	learn: 0.2999103	total: 7.57s	remaining: 38.8s
164:	learn: 0.2997084	total: 7.61s	remaining: 38.8s
165:	learn: 0.2996418	total: 7.65s	remaining: 38.7s
166:	learn: 0.2993322	total: 7.69s	remaining: 38.6s
167:	learn: 0.2991594	total: 7.74s	remaining: 38.6s
168:	learn: 0.2987224	total: 7.79s	remaining: 38.5s
169:	learn: 0.2982264	total: 7.83s	remaining: 38.5s
170:	learn: 0.2978238	total: 7.87s	remaining: 38.4s
171:	learn: 0.2971783	total: 7.92s	remaining: 38.4s
172:	learn: 0.2970495	total: 7.96s	remaining: 38.3s
173:	learn: 0.2969250	total: 8s	remaining: 38.2s
174:	learn: 0.2966413	total: 8.04s	remaining: 38.1s
175:	learn: 0.2964016	total: 8.09s	remaining: 38.1s
176:	learn: 0.2962061	total: 8.13s	remaining: 38s
177:	learn: 0.2961593	total: 8.17s	remaining: 37.9s
178:	learn: 0.2952378	total: 8.21s	remaining: 37.9s
179:	learn: 0.2948923	total: 8.25s	remaining: 37.8s
180:	learn: 0.2948460	total: 8.29s	remaining: 37.8s
181:	learn: 0.294

322:	learn: 0.2619414	total: 14.5s	remaining: 30.7s
323:	learn: 0.2617022	total: 14.6s	remaining: 30.6s
324:	learn: 0.2615390	total: 14.6s	remaining: 30.6s
325:	learn: 0.2613929	total: 14.7s	remaining: 30.5s
326:	learn: 0.2612391	total: 14.7s	remaining: 30.5s
327:	learn: 0.2611511	total: 14.8s	remaining: 30.5s
328:	learn: 0.2609673	total: 14.8s	remaining: 30.4s
329:	learn: 0.2607993	total: 14.8s	remaining: 30.3s
330:	learn: 0.2603267	total: 14.9s	remaining: 30.3s
331:	learn: 0.2602147	total: 14.9s	remaining: 30.3s
332:	learn: 0.2601143	total: 15s	remaining: 30.2s
333:	learn: 0.2598100	total: 15s	remaining: 30.2s
334:	learn: 0.2595839	total: 15s	remaining: 30.1s
335:	learn: 0.2594142	total: 15.1s	remaining: 30s
336:	learn: 0.2593160	total: 15.1s	remaining: 30s
337:	learn: 0.2591847	total: 15.2s	remaining: 30s
338:	learn: 0.2590842	total: 15.2s	remaining: 29.9s
339:	learn: 0.2588502	total: 15.3s	remaining: 29.8s
340:	learn: 0.2586882	total: 15.3s	remaining: 29.8s
341:	learn: 0.2586436	to

482:	learn: 0.2335491	total: 22.6s	remaining: 24.4s
483:	learn: 0.2333962	total: 22.7s	remaining: 24.4s
484:	learn: 0.2333204	total: 22.7s	remaining: 24.4s
485:	learn: 0.2330473	total: 22.8s	remaining: 24.3s
486:	learn: 0.2329846	total: 22.8s	remaining: 24.3s
487:	learn: 0.2328986	total: 22.9s	remaining: 24.2s
488:	learn: 0.2327476	total: 22.9s	remaining: 24.2s
489:	learn: 0.2326321	total: 22.9s	remaining: 24.1s
490:	learn: 0.2325441	total: 23s	remaining: 24.1s
491:	learn: 0.2321813	total: 23s	remaining: 24s
492:	learn: 0.2319389	total: 23.1s	remaining: 24s
493:	learn: 0.2318494	total: 23.1s	remaining: 23.9s
494:	learn: 0.2316537	total: 23.2s	remaining: 23.9s
495:	learn: 0.2314864	total: 23.2s	remaining: 23.8s
496:	learn: 0.2314091	total: 23.3s	remaining: 23.8s
497:	learn: 0.2313853	total: 23.3s	remaining: 23.7s
498:	learn: 0.2313269	total: 23.3s	remaining: 23.7s
499:	learn: 0.2310260	total: 23.4s	remaining: 23.6s
500:	learn: 0.2309969	total: 23.4s	remaining: 23.6s
501:	learn: 0.230871

641:	learn: 0.2116383	total: 30.2s	remaining: 17.1s
642:	learn: 0.2115277	total: 30.3s	remaining: 17s
643:	learn: 0.2114132	total: 30.3s	remaining: 17s
644:	learn: 0.2111772	total: 30.4s	remaining: 16.9s
645:	learn: 0.2109676	total: 30.4s	remaining: 16.9s
646:	learn: 0.2109063	total: 30.5s	remaining: 16.9s
647:	learn: 0.2108856	total: 30.5s	remaining: 16.8s
648:	learn: 0.2107883	total: 30.5s	remaining: 16.8s
649:	learn: 0.2106866	total: 30.6s	remaining: 16.7s
650:	learn: 0.2106441	total: 30.6s	remaining: 16.7s
651:	learn: 0.2105167	total: 30.7s	remaining: 16.6s
652:	learn: 0.2103401	total: 30.7s	remaining: 16.6s
653:	learn: 0.2102475	total: 30.8s	remaining: 16.5s
654:	learn: 0.2101536	total: 30.8s	remaining: 16.5s
655:	learn: 0.2100024	total: 30.9s	remaining: 16.4s
656:	learn: 0.2097259	total: 30.9s	remaining: 16.4s
657:	learn: 0.2096863	total: 30.9s	remaining: 16.3s
658:	learn: 0.2094759	total: 31s	remaining: 16.3s
659:	learn: 0.2093025	total: 31s	remaining: 16.2s
660:	learn: 0.209189

801:	learn: 0.1939700	total: 37.9s	remaining: 9.61s
802:	learn: 0.1938976	total: 38s	remaining: 9.56s
803:	learn: 0.1938395	total: 38s	remaining: 9.51s
804:	learn: 0.1937459	total: 38.1s	remaining: 9.46s
805:	learn: 0.1936551	total: 38.1s	remaining: 9.41s
806:	learn: 0.1935927	total: 38.2s	remaining: 9.37s
807:	learn: 0.1934264	total: 38.2s	remaining: 9.32s
808:	learn: 0.1931917	total: 38.3s	remaining: 9.27s
809:	learn: 0.1930826	total: 38.3s	remaining: 9.22s
810:	learn: 0.1929664	total: 38.4s	remaining: 9.18s
811:	learn: 0.1928449	total: 38.4s	remaining: 9.13s
812:	learn: 0.1926489	total: 38.5s	remaining: 9.08s
813:	learn: 0.1926325	total: 38.5s	remaining: 9.04s
814:	learn: 0.1924455	total: 38.6s	remaining: 8.99s
815:	learn: 0.1922703	total: 38.6s	remaining: 8.95s
816:	learn: 0.1921618	total: 38.7s	remaining: 8.91s
817:	learn: 0.1920048	total: 38.8s	remaining: 8.87s
818:	learn: 0.1918861	total: 38.8s	remaining: 8.82s
819:	learn: 0.1918066	total: 38.9s	remaining: 8.77s
820:	learn: 0.19

961:	learn: 0.1760286	total: 46.2s	remaining: 2.07s
962:	learn: 0.1759615	total: 46.4s	remaining: 2.02s
963:	learn: 0.1757806	total: 46.6s	remaining: 1.98s
964:	learn: 0.1757140	total: 46.8s	remaining: 1.94s
965:	learn: 0.1756873	total: 46.9s	remaining: 1.89s
966:	learn: 0.1755707	total: 47s	remaining: 1.84s
967:	learn: 0.1754580	total: 47.1s	remaining: 1.8s
968:	learn: 0.1753973	total: 47.2s	remaining: 1.75s
969:	learn: 0.1753700	total: 47.3s	remaining: 1.71s
970:	learn: 0.1753105	total: 47.4s	remaining: 1.66s
971:	learn: 0.1750820	total: 47.5s	remaining: 1.61s
972:	learn: 0.1749963	total: 47.5s	remaining: 1.56s
973:	learn: 0.1749511	total: 47.6s	remaining: 1.51s
974:	learn: 0.1748691	total: 47.7s	remaining: 1.47s
975:	learn: 0.1747803	total: 47.7s	remaining: 1.42s
976:	learn: 0.1746639	total: 47.8s	remaining: 1.37s
977:	learn: 0.1744355	total: 47.8s	remaining: 1.32s
978:	learn: 0.1744063	total: 47.9s	remaining: 1.27s
979:	learn: 0.1743490	total: 48s	remaining: 1.22s
980:	learn: 0.174

In [114]:
test = pd.read_csv(r'C:\Users\Admin\Downloads\bee0b3604b5011eb\dataset\test.csv')
sub = pd.DataFrame(columns=["appno","importance"])
sub["appno"] = test.appno
sub["importance"] = eclf1.predict(X_test_fs)
sub.to_csv("result_vote150.csv", index=False)

Conclusion : LGBM with 150 features gave the best result

In [115]:
from sklearn.ensemble import  VotingClassifier
eclf1 = VotingClassifier(estimators=[('lgb', lg150), ('xg', XG), ('cat', cat150)], voting='hard')
X_train_fs,X_test_fs = select_features(res, Y_train, res2,150)
eclf1 = eclf1.fit(X_train_fs, Y_train)
print(eclf1.predict(X_test_fs))

Learning rate set to 0.087079
0:	learn: 1.2214915	total: 103ms	remaining: 1m 43s
1:	learn: 1.1025471	total: 197ms	remaining: 1m 38s
2:	learn: 1.0127748	total: 287ms	remaining: 1m 35s
3:	learn: 0.9341273	total: 371ms	remaining: 1m 32s
4:	learn: 0.8670488	total: 486ms	remaining: 1m 37s
5:	learn: 0.8133301	total: 558ms	remaining: 1m 32s
6:	learn: 0.7682574	total: 634ms	remaining: 1m 30s
7:	learn: 0.7309113	total: 708ms	remaining: 1m 28s
8:	learn: 0.6988116	total: 799ms	remaining: 1m 28s
9:	learn: 0.6650606	total: 875ms	remaining: 1m 27s
10:	learn: 0.6398509	total: 971ms	remaining: 1m 27s
11:	learn: 0.6167341	total: 1.04s	remaining: 1m 25s
12:	learn: 0.5941939	total: 1.11s	remaining: 1m 24s
13:	learn: 0.5744614	total: 1.21s	remaining: 1m 25s
14:	learn: 0.5550331	total: 1.3s	remaining: 1m 26s
15:	learn: 0.5413541	total: 1.38s	remaining: 1m 25s
16:	learn: 0.5251399	total: 1.47s	remaining: 1m 25s
17:	learn: 0.5122380	total: 1.55s	remaining: 1m 24s
18:	learn: 0.5016143	total: 1.62s	remaining: 

158:	learn: 0.3041186	total: 13.3s	remaining: 1m 10s
159:	learn: 0.3039635	total: 13.4s	remaining: 1m 10s
160:	learn: 0.3038724	total: 13.5s	remaining: 1m 10s
161:	learn: 0.3032948	total: 13.6s	remaining: 1m 10s
162:	learn: 0.3028679	total: 13.6s	remaining: 1m 10s
163:	learn: 0.3024865	total: 13.7s	remaining: 1m 10s
164:	learn: 0.3020683	total: 13.8s	remaining: 1m 10s
165:	learn: 0.3014634	total: 13.8s	remaining: 1m 9s
166:	learn: 0.3010741	total: 13.9s	remaining: 1m 9s
167:	learn: 0.3004817	total: 14s	remaining: 1m 9s
168:	learn: 0.3002823	total: 14.1s	remaining: 1m 9s
169:	learn: 0.2999586	total: 14.1s	remaining: 1m 9s
170:	learn: 0.2996884	total: 14.2s	remaining: 1m 9s
171:	learn: 0.2992931	total: 14.3s	remaining: 1m 9s
172:	learn: 0.2991720	total: 14.3s	remaining: 1m 8s
173:	learn: 0.2988970	total: 14.4s	remaining: 1m 8s
174:	learn: 0.2987046	total: 14.5s	remaining: 1m 8s
175:	learn: 0.2983905	total: 14.6s	remaining: 1m 8s
176:	learn: 0.2982148	total: 14.6s	remaining: 1m 8s
177:	le

317:	learn: 0.2671427	total: 25.1s	remaining: 54.2s
318:	learn: 0.2671069	total: 25.2s	remaining: 54.1s
319:	learn: 0.2668987	total: 25.3s	remaining: 54.1s
320:	learn: 0.2667113	total: 25.3s	remaining: 54s
321:	learn: 0.2666594	total: 25.4s	remaining: 53.9s
322:	learn: 0.2664905	total: 25.5s	remaining: 53.8s
323:	learn: 0.2660384	total: 25.5s	remaining: 53.7s
324:	learn: 0.2654940	total: 25.6s	remaining: 53.6s
325:	learn: 0.2653971	total: 25.7s	remaining: 53.4s
326:	learn: 0.2650210	total: 25.7s	remaining: 53.3s
327:	learn: 0.2646063	total: 25.8s	remaining: 53.2s
328:	learn: 0.2644688	total: 25.9s	remaining: 53.1s
329:	learn: 0.2641009	total: 25.9s	remaining: 53s
330:	learn: 0.2639523	total: 26s	remaining: 52.9s
331:	learn: 0.2637870	total: 26.1s	remaining: 52.8s
332:	learn: 0.2636012	total: 26.1s	remaining: 52.8s
333:	learn: 0.2634649	total: 26.2s	remaining: 52.7s
334:	learn: 0.2634080	total: 26.3s	remaining: 52.6s
335:	learn: 0.2633189	total: 26.3s	remaining: 52.5s
336:	learn: 0.2631

478:	learn: 0.2400128	total: 38.8s	remaining: 42.6s
479:	learn: 0.2399114	total: 38.8s	remaining: 42.5s
480:	learn: 0.2396579	total: 38.9s	remaining: 42.4s
481:	learn: 0.2395617	total: 39s	remaining: 42.3s
482:	learn: 0.2393751	total: 39.1s	remaining: 42.2s
483:	learn: 0.2391543	total: 39.1s	remaining: 42.1s
484:	learn: 0.2390122	total: 39.2s	remaining: 42.1s
485:	learn: 0.2387318	total: 39.3s	remaining: 42s
486:	learn: 0.2386677	total: 39.4s	remaining: 41.9s
487:	learn: 0.2386203	total: 39.5s	remaining: 41.8s
488:	learn: 0.2385483	total: 39.6s	remaining: 41.7s
489:	learn: 0.2384237	total: 39.7s	remaining: 41.7s
490:	learn: 0.2383496	total: 39.7s	remaining: 41.6s
491:	learn: 0.2382779	total: 39.9s	remaining: 41.6s
492:	learn: 0.2382334	total: 40s	remaining: 41.6s
493:	learn: 0.2379901	total: 40.1s	remaining: 41.5s
494:	learn: 0.2378506	total: 40.2s	remaining: 41.5s
495:	learn: 0.2374274	total: 40.3s	remaining: 41.4s
496:	learn: 0.2371412	total: 40.4s	remaining: 41.3s
497:	learn: 0.2370

639:	learn: 0.2184572	total: 51.3s	remaining: 29.3s
640:	learn: 0.2182937	total: 51.4s	remaining: 29.2s
641:	learn: 0.2182637	total: 51.4s	remaining: 29.1s
642:	learn: 0.2181183	total: 51.5s	remaining: 29s
643:	learn: 0.2180623	total: 51.6s	remaining: 28.9s
644:	learn: 0.2178291	total: 51.7s	remaining: 28.8s
645:	learn: 0.2177585	total: 51.7s	remaining: 28.8s
646:	learn: 0.2174742	total: 51.8s	remaining: 28.7s
647:	learn: 0.2172984	total: 51.9s	remaining: 28.6s
648:	learn: 0.2172313	total: 51.9s	remaining: 28.5s
649:	learn: 0.2171636	total: 52s	remaining: 28.4s
650:	learn: 0.2170060	total: 52.1s	remaining: 28.3s
651:	learn: 0.2167629	total: 52.1s	remaining: 28.2s
652:	learn: 0.2166311	total: 52.2s	remaining: 28.1s
653:	learn: 0.2164461	total: 52.3s	remaining: 28.1s
654:	learn: 0.2163968	total: 52.4s	remaining: 28s
655:	learn: 0.2163352	total: 52.9s	remaining: 28.1s
656:	learn: 0.2162925	total: 53.3s	remaining: 28.2s
657:	learn: 0.2161943	total: 53.5s	remaining: 28.2s
658:	learn: 0.2161

799:	learn: 0.1976917	total: 1m 10s	remaining: 18s
800:	learn: 0.1975713	total: 1m 10s	remaining: 17.9s
801:	learn: 0.1974791	total: 1m 10s	remaining: 17.8s
802:	learn: 0.1974614	total: 1m 10s	remaining: 17.7s
803:	learn: 0.1972696	total: 1m 10s	remaining: 17.7s
804:	learn: 0.1970822	total: 1m 10s	remaining: 17.6s
805:	learn: 0.1969363	total: 1m 10s	remaining: 17.5s
806:	learn: 0.1968579	total: 1m 10s	remaining: 17.4s
807:	learn: 0.1968247	total: 1m 11s	remaining: 17.3s
808:	learn: 0.1966656	total: 1m 11s	remaining: 17.2s
809:	learn: 0.1965076	total: 1m 11s	remaining: 17.1s
810:	learn: 0.1964296	total: 1m 11s	remaining: 17.1s
811:	learn: 0.1963705	total: 1m 11s	remaining: 17s
812:	learn: 0.1962936	total: 1m 11s	remaining: 16.9s
813:	learn: 0.1961866	total: 1m 11s	remaining: 16.8s
814:	learn: 0.1961284	total: 1m 11s	remaining: 16.8s
815:	learn: 0.1960871	total: 1m 12s	remaining: 16.7s
816:	learn: 0.1958963	total: 1m 12s	remaining: 16.6s
817:	learn: 0.1957997	total: 1m 12s	remaining: 16.

956:	learn: 0.1808413	total: 1m 29s	remaining: 4.47s
957:	learn: 0.1805999	total: 1m 29s	remaining: 4.38s
958:	learn: 0.1805580	total: 1m 29s	remaining: 4.28s
959:	learn: 0.1804728	total: 1m 29s	remaining: 4.19s
960:	learn: 0.1803643	total: 1m 29s	remaining: 4.09s
961:	learn: 0.1801504	total: 1m 29s	remaining: 4s
962:	learn: 0.1800546	total: 1m 29s	remaining: 3.9s
963:	learn: 0.1799350	total: 1m 29s	remaining: 3.81s
964:	learn: 0.1797145	total: 1m 29s	remaining: 3.72s
965:	learn: 0.1796477	total: 1m 29s	remaining: 3.62s
966:	learn: 0.1795160	total: 1m 29s	remaining: 3.53s
967:	learn: 0.1794582	total: 1m 29s	remaining: 3.44s
968:	learn: 0.1791473	total: 1m 30s	remaining: 3.34s
969:	learn: 0.1790878	total: 1m 30s	remaining: 3.25s
970:	learn: 0.1789546	total: 1m 30s	remaining: 3.16s
971:	learn: 0.1788824	total: 1m 30s	remaining: 3.06s
972:	learn: 0.1787462	total: 1m 30s	remaining: 2.97s
973:	learn: 0.1786857	total: 1m 30s	remaining: 2.88s
974:	learn: 0.1784895	total: 1m 30s	remaining: 2.7

ValueError: could not broadcast input array from shape (4760,1) into shape (4760)

In [None]:
sub["appno"] = test.appno
sub["importance"] = eclf1.predict(X_test_fs)
sub.to_csv("result_vote150hard.csv", index=False)

In [117]:
X_test_fs.shape

(4760, 150)

In [118]:
print(eclf1.predict(X_test_fs))

ValueError: could not broadcast input array from shape (4760,1) into shape (4760)