In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set(style="whitegrid")
import matplotlib.pyplot as plt
%matplotlib inline
import warnings

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

##### Histogram of the number of empty cells per instance (row)

df -- the input dataframe (must already be loaded; feature columns contain 'feature' in their name)

In [None]:
# Build a boolean "<col>_ismissing" indicator for every feature column that has
# at least one null, count the indicators per row, and plot the distribution of
# that per-row count.
feature_cols = [col for col in df.columns if 'feature' in col]

for col in feature_cols:
    missing = df[col].isnull()

    # Only columns that actually contain nulls get an indicator column.
    if missing.any():
        print('created missing indicator for: {}'.format(col))
        df['{}_ismissing'.format(col)] = missing

ismissing_cols = [col for col in df.columns if 'ismissing' in col]

# Summing booleans across the indicator columns gives the number of missing
# feature values in each row.
df['num_missing'] = df[ismissing_cols].sum(axis=1)

# Sort the counts by the number of missing values (the index of value_counts)
# and plot the Series directly. This avoids relying on the column names that
# reset_index() produces, which differ between pandas versions.
ax = df['num_missing'].value_counts().sort_index().plot.bar()
ax.set_xlabel('number of missing feature values in a row')
ax.set_ylabel('number of rows');

##### An approach for working with outliers

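df, df_ -- dataframe containing the feature columns and the target column

target -- name of the target column

col -- name of the feature column being analysed

perc -- percentile (between 0 and 100) used as the cut-off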

In [None]:
def one_factor_analysis_gini(df_, target, col, left_perc=0, right_perc=100):
    """Gini (2 * ROC AUC - 1) of a random forest trained on a single feature.

    Rows whose ``col`` value lies outside the ``[left_perc, right_perc]``
    percentile range are removed before the model is trained, so the score
    shows how trimming the tails of ``col`` affects its predictive power.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame containing at least ``col`` and ``target``. It is not modified.
    target : str
        Name of the target column.
    col : str
        Name of the feature column to evaluate.
    left_perc, right_perc : float
        Lower / upper percentile bounds (0-100) passed to ``np.percentile``.

    Returns
    -------
    float or int
        ``2 * roc_auc - 1`` on a 30% hold-out split, or ``0`` when the score
        cannot be computed (no usable rows, or only one class present in the
        filtered data, the train part or the test part).
    """
    # Selecting columns yields a new frame, so the caller's data is untouched.
    # NaNs in `col` would turn both percentiles into NaN and filter out every
    # row, so they are dropped up front.
    data = df_[[col, target]].dropna(subset=[col])
    if data.empty:
        return 0

    values = data[col]
    l_perc = np.percentile(values, left_perc)
    r_perc = np.percentile(values, right_perc)
    data = data[(values >= l_perc) & (values <= r_perc)]

    y = data[target]
    # With fewer than two classes there is nothing to rank (this also covers
    # an empty selection), so skip the split entirely.
    if y.nunique() < 2:
        return 0
    X = data.drop(target, axis=1)

    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        random_state=123)
    # ROC AUC is undefined for a single-class test set, and a forest fitted on
    # a single-class train set exposes only one predict_proba column, which
    # would make the `[:, 1]` lookup below fail.
    if y_test.nunique() < 2 or y_train.nunique() < 2:
        return 0

    clf = RandomForestClassifier(random_state=123, max_depth=10)
    clf = clf.fit(X_train, y_train)

    roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    return 2 * roc_auc - 1


In [None]:
def box_plots(data, features, n_cols=5, figsize=(20, 94)):
    """Draw one box plot per feature on a grid of subplots.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to plot.
    features : iterable of str
        Column names to plot, one subplot each, filled row by row.
    n_cols : int, default 5
        Number of subplots per row.
    figsize : tuple, default (20, 94)
        Size of the whole figure, passed to ``plt.figure``.

    Returns
    -------
    None
        The figure is created through pyplot and nothing is returned.
    """
    features = list(features)
    if not features:
        # Nothing to draw; avoid creating an empty figure.
        return

    # Ceiling division: just enough rows for all features, without the spare
    # empty row that `len(features) // n_cols + 1` adds for exact multiples.
    n_rows = (len(features) + n_cols - 1) // n_cols

    fig = plt.figure(figsize=figsize)

    for i, col in enumerate(features):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        # Draw on the explicit axes instead of relying on pyplot's
        # "current axes" state.
        data.boxplot(column=col, ax=ax)

        ax.set_xlabel(col, fontsize=17)
        ax.tick_params(axis='x', which='major', labelsize=9, pad=-2)
        ax.tick_params(axis='y', which='major', labelsize=12)

In [None]:
# One box plot per entry of `features`, to eyeball the outliers.
# NOTE(review): `features` is not defined in the cells shown above -- confirm it is set before this cell runs.
box_plots(data=df, features=features)

In [None]:
# Coarse grid of candidate upper-tail percentiles: 90, 91, ..., 100.
first_partition = np.arange(90, 101)

In [None]:
# For every feature, search for the upper percentile cut-off that maximises the
# single-feature Gini: first on the coarse grid in `first_partition`, then on a
# 0.1-step grid around the best coarse value.
# Results are collected in a list and turned into a frame once at the end.
right_records = []

for feat in features:

    best_gini = 0
    # Default to "no trimming" so the variable is always bound and never
    # carries over the value found for the previous feature.
    best_perc_right = 100

    for perc in first_partition:
        gini = one_factor_analysis_gini(df, 'target', feat, right_perc=perc)
        # `>=` keeps the largest percentile among ties.
        if gini >= best_gini:
            best_gini = gini
            best_perc_right = perc

    if best_perc_right == 100:
        second_partition = np.arange(99.1, 99.9, 0.1)
    else:
        second_partition = np.arange(best_perc_right - 0.9, best_perc_right + 0.9, 0.1)
    # Rounded to one decimal to strip floating-point noise from arange.
    second_partition = np.round(second_partition, 1)

    for perc in second_partition:
        gini = one_factor_analysis_gini(df, 'target', feat, right_perc=perc)
        if gini > best_gini:
            best_gini = gini
            best_perc_right = perc

    right_records.append({'feature': feat, 'right_perc': best_perc_right})

high_perc_df = pd.DataFrame(right_records, columns=['feature', 'right_perc'])
# Label the rows by feature name (the 'feature' column is kept as well) so that
# `high_perc_df.loc[<feature>, 'right_perc']` returns a single value.
high_perc_df.index = high_perc_df['feature'].tolist()

In [None]:
# Cap every feature at its selected upper percentile; work on a copy so `df`
# keeps the raw values.
new_df = df.copy()

# Read the feature names from the 'feature' column when it exists (falling back
# to the index) so the lookup does not depend on how `high_perc_df` is indexed.
if 'feature' in high_perc_df.columns:
    capped_features = high_perc_df['feature']
else:
    capped_features = high_perc_df.index

for feature, perc in zip(capped_features, high_perc_df['right_perc']):
    change_val = np.percentile(new_df[feature], perc)
    # Values above the cut-off are replaced by the cut-off itself.
    new_df[feature] = np.where(new_df[feature] > change_val, change_val, new_df[feature])

In [None]:
# Coarse grid of candidate lower-tail percentiles: 0, 1, ..., 5.
first_partition = np.arange(6)

In [None]:
# For every feature, search for the lower percentile cut-off that maximises the
# single-feature Gini on `new_df`: first on the coarse grid in
# `first_partition`, then on a 0.1-step grid around the best coarse value.
# Results are collected in a list and turned into a frame once at the end.
left_records = []

for feat in features:

    best_gini = 0
    # Default to "no trimming" so the variable is always bound, even when no
    # candidate produces a positive Gini, and never carries over the value
    # found for the previous feature.
    best_perc_left = 0

    for perc in first_partition:
        gini = one_factor_analysis_gini(new_df, 'target', feat, left_perc=perc)
        # Strict `>` keeps the smallest percentile among ties.
        if gini > best_gini:
            best_gini = gini
            best_perc_left = perc

    if best_perc_left == 0:
        second_partition = np.arange(0, 1, 0.1)
    else:
        second_partition = np.arange(best_perc_left - 0.9, best_perc_left + 0.9, 0.1)
    # Rounded to one decimal to strip floating-point noise from arange.
    second_partition = np.round(second_partition, 1)

    for perc in second_partition:
        gini = one_factor_analysis_gini(new_df, 'target', feat, left_perc=perc)
        if gini > best_gini:
            best_gini = gini
            best_perc_left = perc

    left_records.append({'feature': feat, 'left_perc': best_perc_left})

low_perc_df = pd.DataFrame(left_records, columns=['feature', 'left_perc'])
# Label the rows by feature name (the 'feature' column is kept as well) so that
# `low_perc_df.loc[<feature>, 'left_perc']` returns a single value.
low_perc_df.index = low_perc_df['feature'].tolist()

In [None]:
# Raise every feature in `new_df` to at least its selected lower percentile.
# Read the feature names from the 'feature' column when it exists (falling back
# to the index) so the lookup does not depend on how `low_perc_df` is indexed.
if 'feature' in low_perc_df.columns:
    floored_features = low_perc_df['feature']
else:
    floored_features = low_perc_df.index

for feature, perc in zip(floored_features, low_perc_df['left_perc']):

    change_val = np.percentile(new_df[feature], perc)
    # Values below the cut-off are replaced by the cut-off itself.
    new_df[feature] = np.where(new_df[feature] < change_val, change_val, new_df[feature])