In [1]:
%%time
import time
print(time.asctime())

import pandas as pd
import numpy as np
import autosklearn.classification
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import pickle


Wed Oct 17 22:25:15 2018
CPU times: user 617 ms, sys: 88.5 ms, total: 706 ms
Wall time: 709 ms


  self.re = re.compile( self.reString )
  from numpy.core.umath_tests import inner1d


In [20]:
%%time
## Pipeline configuration shared by the cells below.

## Location and naming pattern of the raw input files.
data_dir = "/home/fedora/programs/hackathon/"
data_file_prefix = "orange_large_train_data_chunk"
data_file_suffix = ".csv"
target_file = "orange_large_train_churn.labels"

## Number of chunk files and the feature-exclusion thresholds
## (null_val_threshold is compared against a percentage of null values).
tot_files = 5
null_val_threshold = 90
unique_threshold = 1
null_filler = "ffill"

## NUMROWS label rows are read; the first SELROWS of them are used for training.
NUMROWS, SELROWS = 50000, 25000

cols_to_drop = []

## Cols to read. Beyond this the cols are categorical.
col_range = list(range(14740))
col_names = ["Var%d" % position for position in range(1, 14741)]

CPU times: user 8.1 ms, sys: 1.16 ms, total: 9.25 ms
Wall time: 9.03 ms


In [9]:
%%time

def _is_uninformative_feature(column, max_null_pct, constant_unique_count):
    """Return True when a feature column should be excluded.

    A column is excluded when any of the following holds:
      * it is empty or every value is null;
      * its share of null values, in percent, exceeds ``max_null_pct``;
      * it is numeric and every non-null value is zero;
      * its number of distinct values (nulls included) equals
        ``constant_unique_count``.
    """
    n_rows = len(column)
    null_count = column.isnull().sum()
    if null_count == n_rows:
        # Also covers the empty column, so the division below is safe.
        return True
    if (null_count / n_rows) * 100 > max_null_pct:
        return True
    # Test the values themselves rather than their sum: a sum of zero also
    # occurs when positive and negative values cancel out.
    if pd.api.types.is_numeric_dtype(column) and (column.dropna() == 0).all():
        return True
    return len(column.unique()) == constant_unique_count


def select_and_drop_features():
    """Load all CSV chunks, drop uninformative features and pickle the result.

    Reads ``tot_files`` chunk files named
    ``data_dir + data_file_prefix + <i> + data_file_suffix``, restricted to
    ``col_range`` and renamed to ``col_names``. Columns rejected by
    ``_is_uninformative_feature`` (using ``null_val_threshold`` and
    ``unique_threshold``) are removed and the remaining frame is written to
    the pickle file "partialchunkdf" in the working directory.

    Returns:
        None. Progress and memory figures are printed.
    """
    ## Read all chunk files and make a single dataframe
    chunk_frames = []
    for i in range(1, tot_files + 1):
        # The first chunk is read with a header row, the remaining ones without.
        hdr = 0 if i == 1 else None
        chunk_df = pd.read_csv(data_dir + data_file_prefix + str(i) + data_file_suffix,
                               header=hdr,
                               usecols=col_range)
        chunk_df.columns = col_names
        chunk_frames.append(chunk_df)

    # A single concat avoids re-copying the accumulated frame once per chunk.
    full_chunk_df = pd.concat(chunk_frames, ignore_index=True)

    ## Print the shape of dataframe
    print("\nshape of full chunk df", full_chunk_df.shape)
    print ("\nMemory footprint of full chunk df = %0.3fGB" %(full_chunk_df.memory_usage(deep=True).sum()/1024**3))

    ## Preprocessing Step#1, principle of feature exclusion
    ## (criteria are documented on _is_uninformative_feature)
    cols_to_drop = [col for col in col_names
                    if _is_uninformative_feature(full_chunk_df[col],
                                                 null_val_threshold,
                                                 unique_threshold)]

    print("\nlength of columns to drop from full chunk dataframe", len(cols_to_drop))

    ## Dropping the identified features
    full_chunk_df = full_chunk_df.drop(columns=cols_to_drop)
    print("\nshape of full chunk df after dropping few columns", full_chunk_df.shape)
    print("\ndtypes of full chunk df after dropping few columns", full_chunk_df.dtypes)
    print ("\nMemory footprint of partial chunk df = %0.3fGB" %(full_chunk_df.memory_usage(deep=True).sum()/1024**3))

    ## Write the resultant dataframe to a pickle
    full_chunk_df.to_pickle("partialchunkdf")
     
    
# Stage 1: load the CSV chunks, drop unusable features, write the "partialchunkdf" pickle.
select_and_drop_features()





shape of full chunk df (50000, 14740)

Memory footprint of full chunk df = 5.491GB

length of columns to drop from full chunk dataframe 1692

shape of full chunk df after dropping few columns (50000, 13048)

dtypes of full chunk df after dropping few columns Var1          int64
Var2          int64
Var3          int64
Var4          int64
Var5          int64
Var6          int64
Var7        float64
Var8          int64
Var9          int64
Var10         int64
Var11         int64
Var12       float64
Var13         int64
Var14         int64
Var16         int64
Var17         int64
Var18         int64
Var19         int64
Var20         int64
Var21         int64
Var22       float64
Var23         int64
Var24         int64
Var25         int64
Var26         int64
Var27         int64
Var28       float64
Var29         int64
Var30         int64
Var31         int64
             ...   
Var14708      int64
Var14709      int64
Var14710      int64
Var14711      int64
Var14712      int64
Var14713      int64


In [3]:
def handle_missing_feature_values():
    """Fill missing feature values with each column's most frequent value.

    Preprocessing step 2. Loads the "partialchunkdf" pickle, imputes it
    column-wise (axis=0) with the 'most_frequent' strategy, restores the
    original column names and writes the result to the "imputedchunkdf" pickle.

    NOTE(review): ``Imputer`` is deprecated in newer scikit-learn releases in
    favour of ``sklearn.impute.SimpleImputer`` - confirm against the installed
    version before upgrading.
    """
    source_df = pd.read_pickle("partialchunkdf")
    print("\npartial chunk df shape\n", source_df.shape)

    # Fit on the frame, then transform its underlying array.
    imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis=0, copy = False)
    fitted_imputer = imputer.fit(source_df)
    imputed_chunk_df = pd.DataFrame(
        fitted_imputer.transform(source_df.values),
        columns=source_df.columns,
    )

    print("\ndtypes of imputed chunk df", imputed_chunk_df.dtypes)
    footprint_gb = imputed_chunk_df.memory_usage(deep=True).sum()/1024**3
    print ("\nMemory footprint of full chunk df after dropping few columns = %0.3fGB" %(footprint_gb))

    ## Persist the imputed frame for the next stage
    imputed_chunk_df.to_pickle("imputedchunkdf")

# Stage 2: impute missing values; reads "partialchunkdf", writes "imputedchunkdf".
handle_missing_feature_values()



partial chunk df shape
 (50000, 13048)

dtypes of imputed chunk df Var1        float64
Var2        float64
Var3        float64
Var4        float64
Var5        float64
Var6        float64
Var7        float64
Var8        float64
Var9        float64
Var10       float64
Var11       float64
Var12       float64
Var13       float64
Var14       float64
Var16       float64
Var17       float64
Var18       float64
Var19       float64
Var20       float64
Var21       float64
Var22       float64
Var23       float64
Var24       float64
Var25       float64
Var26       float64
Var27       float64
Var28       float64
Var29       float64
Var30       float64
Var31       float64
             ...   
Var14708    float64
Var14709    float64
Var14710    float64
Var14711    float64
Var14712    float64
Var14713    float64
Var14714    float64
Var14715    float64
Var14718    float64
Var14719    float64
Var14720    float64
Var14721    float64
Var14723    float64
Var14724    float64
Var14725    float64
Var14726    

In [4]:
def scale_feature_values():
    """Min-max scale every feature of the imputed data.

    Loads the "imputedchunkdf" pickle, applies ``preprocessing.MinMaxScaler``
    with its default settings, keeps the original column names and writes the
    scaled frame to the "minmaxscaledchunkdf" pickle. The frame is printed
    before and after scaling.
    """
    source_df = pd.read_pickle("imputedchunkdf")
    print(source_df)

    minmax_scaled_df = pd.DataFrame(
        preprocessing.MinMaxScaler().fit_transform(source_df),
        columns=source_df.columns,
    )
    print(minmax_scaled_df)
    print("\nshape of full chunk df after minmax scaling", minmax_scaled_df.shape)
    print("\ndtypes of full chunk df after minmax scaling", minmax_scaled_df.dtypes)
    footprint_gb = minmax_scaled_df.memory_usage(deep=True).sum()/1024**3
    print ("\nMemory footprint of minmax scaled chunk df = %0.3fGB" %(footprint_gb))

    ## Persist the scaled frame for the next stage
    minmax_scaled_df.to_pickle("minmaxscaledchunkdf")

# Stage 3: min-max scale the features; reads "imputedchunkdf", writes "minmaxscaledchunkdf".
scale_feature_values()

       Var1  Var2  Var3  Var4  Var5  Var6  Var7  Var8  Var9  Var10    ...     \
0       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
1       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
2       0.0   0.0   0.0   0.0   6.0   0.0   0.0   0.0   0.0    0.0    ...      
3       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
4       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
5       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
6       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
7       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
8       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
9       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
10      0.0   0.0   0.0   0.0   6.0   0.0   0.0   0.0   0.0    0.0    ...      
11      0.0   0.0   0.0   0.0   0.0   0.

       Var1  Var2  Var3  Var4  Var5  Var6  Var7  Var8  Var9  Var10    ...     \
0       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
1       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
2       0.0   0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0    0.0    ...      
3       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
4       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
5       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
6       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
7       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
8       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
9       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0    0.0    ...      
10      0.0   0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0    0.0    ...      
11      0.0   0.0   0.0   0.0   0.0   0.


Memory footprint of minmax scaled chunk df = 4.861GB


In [3]:
def _narrowest_signed_int(series):
    """Return the smallest narrower signed-int dtype name that holds ``series``.

    Only dtypes strictly narrower than the current one are considered. Both the
    minimum and the maximum must fit. Returns None when no narrower dtype fits
    (including an empty column, whose min/max are NaN).
    """
    lowest, highest = series.min(), series.max()
    current_size = series.dtype.itemsize
    for name in ("int8", "int16", "int32"):
        if np.dtype(name).itemsize >= current_size:
            break
        limits = np.iinfo(name)
        if limits.min <= lowest and highest <= limits.max:
            return name
    return None


def resize_feature_dtypes():
    """Downcast numeric columns of the scaled data to reduce memory use.

    Loads the "minmaxscaledchunkdf" pickle, converts int64/int32/int16 columns
    to the narrowest signed integer type that holds all of their values and
    float64 columns to float32 when every value lies inside the float32 range,
    then writes the frame to the "resizedchunkdf" pickle. The memory footprint
    is printed before and after.

    The range checks use both the column minimum and maximum, so negative
    values cannot wrap around or overflow during the cast. float32 keeps fewer
    significant digits than float64, so values may be rounded.
    """
    minmax_scaled_df  = pd.read_pickle("minmaxscaledchunkdf")
    print ("\nMemory footprint of minmax scaled data = %0.3fGB\n" %(minmax_scaled_df.memory_usage(deep=True).sum()/1024**3))

    float32_limit = np.finfo("float32").max
    for col in sorted(minmax_scaled_df.columns):
        series = minmax_scaled_df[col]
        ##convert to lower int types if applicable
        if series.dtype in (np.int64, np.int32, np.int16):
            target = _narrowest_signed_int(series)
            if target is not None:
                minmax_scaled_df[col] = series.astype(target)
        ##convert to lower float types if applicable
        elif series.dtype == np.float64:
            # Comparisons with NaN are False, so all-NaN columns are left as is.
            if series.max() < float32_limit and series.min() > -float32_limit:
                minmax_scaled_df[col] = series.astype("float32")

    print ("\nMemory footprint of resized data = %0.3fGB\n" %(minmax_scaled_df.memory_usage(deep=True).sum()/1024**3))
    minmax_scaled_df.to_pickle("resizedchunkdf")
    
# Stage 4: downcast numeric dtypes; reads "minmaxscaledchunkdf", writes "resizedchunkdf".
resize_feature_dtypes()



Memory footprint of minmax scaled data = 4.861GB


Memory footprint of resized data = 2.430GB



In [4]:
def generate_train_and_test_data(n_train=25000):
    """Split the resized data into a leading training part and a trailing test part.

    Loads the "resizedchunkdf" pickle, writes its first ``n_train`` rows to the
    "trainpickle" file and all remaining rows to the "testpickle" file. The
    split is purely positional, so it does not depend on the index labels of
    the stored frame, and the loaded frame is not mutated.

    Args:
        n_train: Number of leading rows that form the training set.

    Raises:
        ValueError: If ``n_train`` is negative or exceeds the number of rows.
    """
    resized_chunk_df  = pd.read_pickle("resizedchunkdf")
    total_rows = len(resized_chunk_df)
    if not 0 <= n_train <= total_rows:
        raise ValueError("n_train must be between 0 and %d, got %r" % (total_rows, n_train))

    train_df = resized_chunk_df.iloc[:n_train]
    print("\nshape of training data", train_df.shape)
    print ("\nMemory footprint of train data = %0.3fGB\n" %(train_df.memory_usage(deep=True).sum()/1024**3))
    train_df.to_pickle("trainpickle")

    test_df = resized_chunk_df.iloc[n_train:]
    print("\nshape of test data", test_df.shape)
    print ("\nMemory footprint of test data = %0.3fGB\n" %(test_df.memory_usage(deep=True).sum()/1024**3))
    test_df.to_pickle("testpickle")

# Stage 5: split into train/test; reads "resizedchunkdf", writes "trainpickle" and "testpickle".
generate_train_and_test_data()


shape of training data (25000, 13048)

Memory footprint of train data = 1.215GB


shape of test data (25000, 13048)

Memory footprint of test data = 1.215GB



In [21]:
def _untruncated_str(values):
    """Render an array as text without numpy's "..." summarisation.

    For small arrays the result is the same text ``str()`` produces; for large
    ones every element is kept so the written results stay recoverable.
    """
    arr = np.asarray(values)
    return np.array2string(arr, threshold=arr.size + 1)


def apply_RFE():
    """Fit the module-level ``rfecv`` selector and score it on the test split.

    Steps:
      1. Read ``NUMROWS`` target labels from ``data_dir + target_file``; the
         first ``SELROWS`` are the training targets, the rest the test targets.
      2. Load the "trainpickle" / "testpickle" feature frames.
      3. Fit ``rfecv`` on the training data, print its results and write them
         to "rfecv_results.txt" (the file is overwritten).
      4. Pickle the fitted ``rfecv`` object to "rfecv_model.pkl".
      5. Predict on the test features and append precision, recall and F1 for
         the "micro", "macro" and "weighted" averages to "rfecv_results.txt".

    Relies on the globals ``rfecv``, ``data_dir``, ``target_file``, ``NUMROWS``
    and ``SELROWS`` being defined before the call.
    """
    ## Target data: leading SELROWS rows for training, remaining rows for testing
    target_df = pd.read_csv(data_dir+target_file, header=None, skiprows=0, nrows=NUMROWS).squeeze("columns")
    traintarget_df = target_df.iloc[:SELROWS]
    testtarget_series = target_df.iloc[SELROWS:]

    ## Input data for training
    train_df  = pd.read_pickle("trainpickle")
    print("\nInput train data shape\n", train_df.shape)

    ## Input data for testing
    test_df  = pd.read_pickle("testpickle")
    print("\nInput test data shape\n", test_df.shape)

    y = traintarget_df
    x = train_df

    selector=rfecv.fit(x, y)
    print("\n Applied RFECV fit")
    print("\n no of features selected", selector.n_features_)
    print("\n mask of features selected", selector.support_)
    print("\n ranking of features selected", selector.ranking_)
    # NOTE(review): grid_scores_ may be unavailable in newer scikit-learn
    # releases (cv_results_ is the newer attribute) - confirm when upgrading.
    print("\n the cross validation score is", selector.grid_scores_)

    with open ("rfecv_results.txt", "w") as cv:
        cv.write ("-----------------------cv_results--------------------\n\n")
        cv.write ("\n\n--------------no of features selected-----------------------\n\n")
        cv.write (str (selector.n_features_) + "\n")
        cv.write ("\n\n------------------mask of features selected----------------------------\n\n")
        cv.write (_untruncated_str (selector.support_) + "\n")
        cv.write ("\n\n------------------ranking of features selected----------------------------\n\n")
        cv.write (_untruncated_str (selector.ranking_) + "\n")
        cv.write ("\n\n------------------the cross validation score is----------------------------\n\n")
        cv.write (_untruncated_str (selector.grid_scores_) + "\n")

    # The context manager closes the file handle once the dump has finished.
    with open("rfecv_model" + ".pkl", "wb") as model_file:
        pickle.dump(file=model_file, obj=rfecv)
    print("\n RFECV object dumped to pickle")

    ## Predictions
    raw_predictions = rfecv.predict(test_df)
    print("\n RFECV prediction made")

    ## Estimated churn from test: negative values map to -1, everything else to 1
    estchurntest = [-1 if item < 0 else 1 for item in raw_predictions]
    estchurntest_series = pd.Series(estchurntest)
    print("\nestchurntestseries\n", estchurntest_series)

    ## Check accuracy
    avg_vals = ["micro", "macro", "weighted"]
    metrics = (("Precision", precision_score),
               ("Recall", recall_score),
               ("F1", f1_score))

    with open ("rfecv_results.txt", "a") as cv:
        cv.write ("------------------Scores----------------------------\n\n")
        for avg_val in avg_vals:
            for label, metric in metrics:
                str_val = ("\n\n" + label + " ("
                           + avg_val
                           + ") = "
                           + str(metric(y_true=testtarget_series, y_pred=estchurntest_series, average=avg_val)))

                print (str_val)
                cv.write (str_val)


# Base estimator wrapped by the feature selector below.
# NOTE(review): no random_state is passed, so results presumably differ between runs - confirm whether that is intended.
rf = RandomForestClassifier(n_estimators=50, min_samples_leaf=5)
print("\n Initialized random forest classifer")

# Cross-validated recursive feature elimination around `rf`, configured with
# step=25, cv=3 and weighted-F1 scoring. apply_RFE() reads this object as a
# module-level global, so it must be created before the call below.
rfecv = RFECV(rf, step=25, cv=3, scoring='f1_weighted')
print("\n Initialized random forest classifer cross validator")       

# Stage 6: fit the selector, write "rfecv_results.txt" and "rfecv_model.pkl".
apply_RFE()





 Initialized random forest classifer

 Initialized random forest classifer cross validator

Input train data shape
 (25000, 13048)

Input test data shape
 (25000, 13048)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



 Applied RFECV fit

 no of features selected 248

 mask of features selected [False False False ... False False False]

 ranking of features selected [513 512 511 ... 489 491 496]

 the cross validation score is [0.89038905 0.89096828 0.89082958 0.89098853 0.89080935 0.89081214
 0.89102619 0.89079217 0.89089027 0.89079217 0.8910865  0.89079204
 0.89067384 0.89075223 0.89084981 0.89086978 0.89071162 0.89084981
 0.8910463  0.89067384 0.89073186 0.89087031 0.89073186 0.89081214
 0.89094858 0.89073199 0.89087029 0.89087045 0.89073186 0.89081
 0.89071188 0.89089003 0.89077194 0.89063389 0.89089041 0.89067384
 0.89075196 0.89081214 0.89100861 0.89085034 0.89085046 0.89079204
 0.89087044 0.89085021 0.89091024 0.89098839 0.89096841 0.8907521
 0.89077208 0.89089027 0.89079204 0.89071378 0.89075196 0.89085034
 0.89071378 0.89089001 0.89089041 0.89079205 0.89075209 0.89067383
 0.89065386 0.89069381 0.89079218 0.89087044 0.89069381 0.89089041
 0.89079204 0.89071378 0.89071378 0.89077207 0.8907921

In [52]:
## RFECV has ranked the features (in scikit-learn, selected features get rank 1
## and a larger rank number means the feature was judged less useful).
## Keep only the features whose rank is <= RANK_CUTOFF and drop every other
## feature from both the train and test input, then persist the reduced frames.
##
## NOTE: this cell depends on `rfecv` already existing in the kernel
## (presumably fitted by an earlier cell -- it is not created here).
## NOTE(review): rfecv.ranking_ is matched to train_df.columns purely by
## position; confirm that the column order/length is the one RFECV was fitted on.
## NOTE(review): pd.read_pickle must only be used on trusted files; these
## pickles are presumably written earlier in this notebook -- confirm.

## Single source of truth for the rank threshold used in the filter and messages.
RANK_CUTOFF = 201


def _memory_gb(frame):
    """Return the deep memory usage of `frame` expressed in GB (bytes / 1024**3)."""
    return frame.memory_usage(deep=True).sum() / 1024**3


train_df = pd.read_pickle("trainpickle")
print("\nInput train data shape\n", train_df.shape)
print("\nMemory footprint of train data = %0.3fGB\n" % _memory_gb(train_df))

test_df = pd.read_pickle("testpickle")
print("\nInput test data shape\n", test_df.shape)
print("\nMemory footprint of test data = %0.3fGB\n" % _memory_gb(test_df))

## Column names whose RFECV rank is worse (numerically larger) than the cutoff.
low_ranked_features = [
    train_df.columns[position]
    for position, rank in enumerate(rfecv.ranking_)
    if rank > RANK_CUTOFF
]

## Re-bind instead of mutating in place so each statement has a clear input/output.
train_df = train_df.drop(columns=low_ranked_features)
print("\nshape of train df after dropping features with rank worse than %d" % RANK_CUTOFF, train_df.shape)
print("\nMemory footprint of train data = %0.3fGB\n" % _memory_gb(train_df))
print("\ntrain df features", train_df.columns)

## The same column list is removed from the test frame so both keep identical features.
test_df = test_df.drop(columns=low_ranked_features)
print("\nshape of test df after dropping features with rank worse than %d" % RANK_CUTOFF, test_df.shape)
print("\nMemory footprint of test data = %0.3fGB\n" % _memory_gb(test_df))
print("\ntest df features", test_df.columns)

## Persist the reduced frames for the downstream cells.
train_df.to_pickle("highrankedtrainpickle")
test_df.to_pickle("highrankedtestpickle")

        
    


Input train data shape
 (25000, 13048)

Memory footprint of train data = 1.215GB


Input test data shape
 (25000, 13048)

Memory footprint of test data = 1.215GB


shape of train df after dropping features with rank worse than 201 (25000, 5248)

Memory footprint of train data = 0.489GB


train df features Index(['Var17', 'Var18', 'Var19', 'Var20', 'Var21', 'Var22', 'Var23', 'Var24',
       'Var25', 'Var26',
       ...
       'Var14723', 'Var14724', 'Var14725', 'Var14726', 'Var14727', 'Var14728',
       'Var14729', 'Var14730', 'Var14731', 'Var14732'],
      dtype='object', length=5248)

shape of test df after dropping features with rank worse than 201 (25000, 5248)

Memory footprint of test data = 0.489GB


test df features Index(['Var17', 'Var18', 'Var19', 'Var20', 'Var21', 'Var22', 'Var23', 'Var24',
       'Var25', 'Var26',
       ...
       'Var14723', 'Var14724', 'Var14725', 'Var14726', 'Var14727', 'Var14728',
       'Var14729', 'Var14730', 'Var14731', 'Var14732'],
      dtype='obj