In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math

%matplotlib inline

In [27]:
def CategoricalPlots(data, cols, ref='accepted'):
    '''
    Plot categorical data from the DataFrame 'data' using the columns in cols.
    Split the categorical data in line with a label reference
    '''
    if not('dummy' in data.columns):
        data['dummy'] = np.ones(shape = data.shape[0])
    for col in cols:
        print(col)
        counts = data[['dummy', ref, col]].groupby([ref, col], as_index = False).count()
        _ = plt.figure(figsize = (13,4))
        plt.subplot(1, 2, 1)
        temp = counts[counts[ref] == 0][[col, 'dummy']]
        plt.bar(temp[col], temp.dummy)
        plt.xticks(rotation=90)
        plt.title('Counts for ' + col + '\n not accepted')
        plt.ylabel('count')
        plt.subplot(1, 2, 2)
        temp = counts[counts[ref] == 1][[col, 'dummy']]
        plt.bar(temp[col], temp.dummy)
        plt.xticks(rotation=90)
        plt.title('Counts for ' + col + '\n accepted')
        plt.ylabel('count')
        plt.show()

def BoxPlots(data, columns, ref = 'accepted'):
    for column in columns:
        sns.boxplot(x=ref, y=column, data=data)
        plt.xlabel(ref)
        plt.ylabel(column)
        plt.show()
        
def DistPlots(data, columns):
    for column in columns:
        ax = sns.distplot(data[column])
        plt.xlabel('value')
        plt.ylabel('density')
        plt.title('Histogram of ' + column)
        plt.show()
        
def ReplaceWithMedian(data, ref = 'property_type', col='applicant_income'):
    '''
    Replacing a missing value in the "col" based on the median calculation from the reference column
    '''
    for value in data[ref].unique():
        replace_median = data[data[ref]==value][col].median()
        data.loc[(data[ref]==value) & (data[col].isnull()), col] = replace_median

In [3]:
data = pd.read_csv('train_values.csv', index_col=0)
print(data.shape)
data.head()

(500000, 21)


Unnamed: 0_level_0,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,applicant_ethnicity,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3,1,1,1,70.0,3,18,37,246,2,...,1,24.0,6203.0,44.23,60588.0,50.933,716.0,2642.0,4536,False
1,1,1,3,1,178.0,3,369,52,299,1,...,1,57.0,5774.0,15.905,54821.0,100.0,1622.0,2108.0,2458,False
2,2,1,3,1,163.0,3,16,10,306,2,...,1,67.0,6094.0,61.27,67719.0,100.0,760.0,1048.0,5710,False
3,1,1,1,1,155.0,1,305,47,180,2,...,1,105.0,6667.0,6.246,78439.0,100.0,2025.0,2299.0,5888,True
4,1,1,1,1,305.0,3,24,37,20,2,...,2,71.0,6732.0,100.0,63075.0,82.2,1464.0,1847.0,289,False


In [None]:
#data['missing_census'] = data[['number_of_owner-occupied_units', 'number_of_1_to_4_family_units',
#                               'ffiecmedian_family_income', 'minority_population_pct', 
#                               'tract_to_msa_md_income_pct','population']].isna().all(axis=1)

print(data['state_code'].unique().shape)
print(data['county_code'].unique().shape)
data['state_code'].value_counts().sort_index()

In [4]:
labels = pd.read_csv('train_labels.csv', index_col=0)
print(labels.shape)
labels.head()

(500000, 1)


Unnamed: 0_level_0,accepted
row_id,Unnamed: 1_level_1
0,1
1,0
2,1
3,1
4,1


In [5]:
cols_numerical = ['loan_amount', 'applicant_income', 'number_of_owner-occupied_units', 'number_of_1_to_4_family_units',
                  'ffiecmedian_family_income', 'minority_population_pct', 'tract_to_msa_md_income_pct','population']
cols_categorical = ['loan_type', 'property_type', 'loan_purpose', 'occupancy', 'preapproval', 
                    'applicant_ethnicity', 'applicant_race', 'applicant_sex','co_applicant']

cols_property_location = ['msa_md', 'state_code', 'county_code']

cols_lender = ['lender']

In [6]:
#data[cols_categorical] = data[cols_categorical].astype('category')

# let's drop the property location data and the lender information. This information is basically noise and
# does not contribute to meaningfull split between accepted and not accepted state

data.drop(['msa_md', 'state_code', 'county_code', 'lender'], inplace=True, axis = 1)

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 17 columns):
loan_type                         500000 non-null int64
property_type                     500000 non-null int64
loan_purpose                      500000 non-null int64
occupancy                         500000 non-null int64
loan_amount                       500000 non-null float64
preapproval                       500000 non-null int64
applicant_ethnicity               500000 non-null int64
applicant_race                    500000 non-null int64
applicant_sex                     500000 non-null int64
applicant_income                  460052 non-null float64
population                        477535 non-null float64
minority_population_pct           477534 non-null float64
ffiecmedian_family_income         477560 non-null float64
tract_to_msa_md_income_pct        477486 non-null float64
number_of_owner-occupied_units    477435 non-null float64
number_of_1_to_4_family_units     4

In [None]:
cols_categorical

In [None]:
codes = [['loan_type', {1: 'Conventional',
                        2: 'FHA_insured',
                        3: 'VA_guaranteed',
                        4: 'FSA/RHS'}],
         ['property_type', {1: 'One to four_family',
                            2: 'Manufactured housing',
                            3: 'Multifamily'}],
         ['loan_purpose', {1: 'Home purchase',
                           2: 'Home improvement',
                           3: 'Refinancing'}],
         ['occupancy', {1: 'Owner_occupied',
                        2: 'Not owner_occupied',
                        3: 'Not applicable'}],
         ['preapproval',{1: 'Preapproval was requested',
                         2: 'Preapproval was not requested',
                         3: 'Not applicable'}],
         ['applicant_ethnicity',{1: 'Hispanic or Latino',
                                   2: 'Not Hispanic or Latino',
                                   3: 'Information not provided',
                                   4: 'Not applicable',
                                   5: 'No co-applicant'}],
         ['applicant_race', {1: 'American Indian or Alaska Native',
                             2: 'Asian',
                             3: 'Black or African American',
                             4: 'Native Hawaiian or Other Pacific Islander',
                             5: 'White',
                             6: 'Information not provided',
                             7: 'Not applicable',
                             8: 'No co-applicant'}],
         ['applicant_sex', {1: 'Male',
                            2: 'Female',
                            3: 'Information not provided',
                            4: 'Not applicable',
                            5: 'Not applicable'}]
]
for code in codes:
    col = code[0]
    dic = code[1]
    data[col] = [dic[x] for x in data[col]]

In [7]:
# we can remove it entirely. It is roughly 22000 rows which is 4.4% of the entire dataset
# but before this we need to merge the values with labels.

data = data.merge(labels, on='row_id')

data.drop(data[data[['number_of_owner-occupied_units', 'number_of_1_to_4_family_units',
                               'ffiecmedian_family_income', 'minority_population_pct', 
                               'tract_to_msa_md_income_pct','population']].isna().all(axis=1)].index, inplace=True)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 477560 entries, 0 to 499999
Data columns (total 18 columns):
loan_type                         477560 non-null int64
property_type                     477560 non-null int64
loan_purpose                      477560 non-null int64
occupancy                         477560 non-null int64
loan_amount                       477560 non-null float64
preapproval                       477560 non-null int64
applicant_ethnicity               477560 non-null int64
applicant_race                    477560 non-null int64
applicant_sex                     477560 non-null int64
applicant_income                  438420 non-null float64
population                        477535 non-null float64
minority_population_pct           477534 non-null float64
ffiecmedian_family_income         477560 non-null float64
tract_to_msa_md_income_pct        477486 non-null float64
number_of_owner-occupied_units    477435 non-null float64
number_of_1_to_4_family_units     4

In [15]:
ref = 'co_applicant'
col = 'applicant_income'

print(data[ref].unique())

for value in data[ref].unique():
    replace_median = data[data[ref]==value][col].median()
    data.loc[(data[ref]==value) & (data[col].isnull()), col] = replace_median

[False  True]


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 477560 entries, 0 to 499999
Data columns (total 18 columns):
loan_type                         477560 non-null int64
property_type                     477560 non-null int64
loan_purpose                      477560 non-null int64
occupancy                         477560 non-null int64
loan_amount                       477560 non-null float64
preapproval                       477560 non-null int64
applicant_ethnicity               477560 non-null int64
applicant_race                    477560 non-null int64
applicant_sex                     477560 non-null int64
applicant_income                  477560 non-null float64
population                        477535 non-null float64
minority_population_pct           477534 non-null float64
ffiecmedian_family_income         477560 non-null float64
tract_to_msa_md_income_pct        477486 non-null float64
number_of_owner-occupied_units    477435 non-null float64
number_of_1_to_4_family_units     4

Encoding

In [17]:
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import feature_selection as fs


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 477560 entries, 0 to 499999
Data columns (total 18 columns):
loan_type                         477560 non-null int64
property_type                     477560 non-null int64
loan_purpose                      477560 non-null int64
occupancy                         477560 non-null int64
loan_amount                       477560 non-null float64
preapproval                       477560 non-null int64
applicant_ethnicity               477560 non-null int64
applicant_race                    477560 non-null int64
applicant_sex                     477560 non-null int64
applicant_income                  477560 non-null float64
population                        477560 non-null float64
minority_population_pct           477560 non-null float64
ffiecmedian_family_income         477560 non-null float64
tract_to_msa_md_income_pct        477560 non-null float64
number_of_owner-occupied_units    477560 non-null float64
number_of_1_to_4_family_units     4

In [28]:
 [ 'number_of_owner-occupied_units', 'number_of_1_to_4_family_units',
               'minority_population_pct', 'tract_to_msa_md_income_pct','population']

for col in  [ 'number_of_owner-occupied_units', 'number_of_1_to_4_family_units',
               'minority_population_pct', 'tract_to_msa_md_income_pct','population']:
    replace_median = data[col].median()
    data.loc[(data[col].isnull()), col] = replace_median

In [None]:
data.shape

In [80]:
def encode_category(numeric_category):
    ## First encode the strings to numeric categories
    #enc = preprocessing.LabelEncoder()
    #enc.fit(cat_features)
    #enc_cat_features = enc.transform(cat_features)
    ## Now, apply one hot encoding
    ohe = preprocessing.OneHotEncoder(categories='auto')
    encoded = ohe.fit(numeric_category.values.reshape(-1,1))
    return encoded.transform(numeric_category.values.reshape(-1,1)).toarray()

features = np.array([])

for col in cols_categorical:
    temp = encode_category(data[col])
    if features.shape[0] == 0:
        features = temp
    else:
        features = np.concatenate([features, temp], axis = 1)

In [81]:
print(features.shape)
features[0,:]

(477560, 33)


array([0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.])

In [82]:
features = np.concatenate([features, np.array(data[cols_numerical])], axis = 1)

In [None]:
features = np.concatenate([features, np.array(data[cols_property_location_lender])], axis = 1)

In [90]:
features[0,:]

array([0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 1.0000e+00, 0.0000e+00, 7.0000e+01, 2.4000e+01,
       7.1600e+02, 2.6420e+03, 6.0588e+04, 4.4230e+01, 5.0933e+01,
       6.2030e+03])

In [91]:
print(features.shape)

## Define the variance threhold and fit the threshold to the feature array. 
sel = fs.VarianceThreshold(threshold=(.9 * (1 - .9)))
features_reduced = sel.fit_transform(features)

## Print the support and shape for the transformed features
print(sel.get_support())
print(features_reduced.shape)

(477560, 41)
[ True  True False False False False False  True False  True  True  True
 False False  True  True  True  True  True False False False False False
  True  True False  True  True False False  True  True  True  True  True
  True  True  True  True  True]
(477560, 25)


*Training*

In [92]:
X_train, X_test, y_train, y_test = train_test_split(features_reduced, data['accepted'], test_size=0.3) # 70% training and 30% test

In [None]:
n_est = [50,150,250,500]
n_lrn = [0.01, 0.1, 0.5, 0.9]

for est in n_est:
    for lrn in n_lrn:
        # Create adaboost classifer object
        abc = AdaBoostClassifier(n_estimators=est, learning_rate=lrn)
        # Train Adaboost Classifer
        model = abc.fit(X_train, y_train)

        #Predict the response for test dataset
        y_pred = model.predict(X_test)
        
        print("Estimators:", est, "Learners:", lrn, "Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
scale = preprocessing.StandardScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)

In [None]:
X_test = scale.transform(X_test)

In [None]:
for est in [5,50, 100]:
        # Create adaboost classifer object
        abc = RandomForestClassifier(n_estimators=est)
        # Train Adaboost Classifer
        model = abc.fit(X_train, y_train)

        #Predict the response for test dataset
        y_pred = model.predict(X_test)
        
        print("Estimators:", est, "Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [86]:
abc = AdaBoostClassifier(n_estimators=250, learning_rate=0.1)
model_abc = abc.fit(X_train, y_train)
y_pred = model_abc.predict(X_test)
        
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6349917636876343


In [93]:
rfc = RandomForestClassifier(n_estimators=60)
model_rfc = rfc.fit(X_train, y_train)
y_pred = model_rfc.predict(X_test)
        
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6476882485970349


In [None]:
nn_mod = MLPClassifier(hidden_layer_sizes = (50,))
nn_model = nn_mod.fit(X_train, y_train)
y_pred = nn_model.predict(X_test)
        
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))


In [87]:
nb = BernoulliNB()
model_nb = nb.fit(X_train, y_train)
y_pred = model_nb.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6090613395873468


In [88]:
ng = GaussianNB()
model_ng = ng.fit(X_train, y_train)
y_pred = model_ng.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6055434570176174


Score model

In [None]:
def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])
scores = score_model(probabilities, 0.5)
print(np.array(scores[:15]))
print(y_test[:15])

In [None]:
def print_metrics(labels, scores):
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy  %0.2f' % sklm.accuracy_score(labels, scores))
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])


    
print_metrics(y_test, scores) 

In [68]:
test_data = pd.read_csv('test_values.csv', index_col=0)
print(test_data.shape)
test_data.head()

(500000, 21)


Unnamed: 0_level_0,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,applicant_ethnicity,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,1,3,1,115.0,3,101,16,276,2,...,1,,6329.0,59.536,69889.0,85.78,1874.0,2410.0,3791,True
1,1,1,1,1,252.0,2,87,20,68,2,...,1,107.0,2473.0,8.05,65313.0,100.0,947.0,1214.0,2839,True
2,1,1,1,1,270.0,1,-1,-1,-1,2,...,2,119.0,,,,,,,4701,False
3,2,1,1,1,179.0,2,376,20,11,2,...,2,44.0,4795.0,29.676,57766.0,100.0,1426.0,1765.0,2153,True
4,2,1,1,1,36.0,2,254,48,156,3,...,3,32.0,5246.0,5.11,63332.0,100.0,1452.0,2092.0,5710,False


In [69]:
test_data.drop(['msa_md', 'state_code', 'county_code', 'lender'], inplace=True, axis = 1)

In [70]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 17 columns):
loan_type                         500000 non-null int64
property_type                     500000 non-null int64
loan_purpose                      500000 non-null int64
occupancy                         500000 non-null int64
loan_amount                       500000 non-null float64
preapproval                       500000 non-null int64
applicant_ethnicity               500000 non-null int64
applicant_race                    500000 non-null int64
applicant_sex                     500000 non-null int64
applicant_income                  459859 non-null float64
population                        477520 non-null float64
minority_population_pct           477518 non-null float64
ffiecmedian_family_income         477547 non-null float64
tract_to_msa_md_income_pct        477483 non-null float64
number_of_owner-occupied_units    477426 non-null float64
number_of_1_to_4_family_units     4

In [71]:
for col in  [  'number_of_owner-occupied_units', 'number_of_1_to_4_family_units', 'ffiecmedian_family_income', 
               'minority_population_pct', 'tract_to_msa_md_income_pct','population']:
    replace_median = test_data[col].median()
    test_data.loc[(test_data[col].isnull()), col] = replace_median

In [72]:
ReplaceWithMedian(test_data, ref = 'property_type', col='applicant_income')

In [77]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 17 columns):
loan_type                         500000 non-null int64
property_type                     500000 non-null int64
loan_purpose                      500000 non-null int64
occupancy                         500000 non-null int64
loan_amount                       500000 non-null float64
preapproval                       500000 non-null int64
applicant_ethnicity               500000 non-null int64
applicant_race                    500000 non-null int64
applicant_sex                     500000 non-null int64
applicant_income                  500000 non-null float64
population                        500000 non-null float64
minority_population_pct           500000 non-null float64
ffiecmedian_family_income         500000 non-null float64
tract_to_msa_md_income_pct        500000 non-null float64
number_of_owner-occupied_units    500000 non-null float64
number_of_1_to_4_family_units     5

In [76]:
ReplaceWithMedian(test_data, ref = 'co_applicant', col='applicant_income')

In [78]:
def encode_category(numeric_category):
    ## First encode the strings to numeric categories
    #enc = preprocessing.LabelEncoder()
    #enc.fit(cat_features)
    #enc_cat_features = enc.transform(cat_features)
    ## Now, apply one hot encoding
    ohe = preprocessing.OneHotEncoder(categories='auto')
    encoded = ohe.fit(numeric_category.values.reshape(-1,1))
    return encoded.transform(numeric_category.values.reshape(-1,1)).toarray()

test_features = np.array([])

for col in cols_categorical:
    temp = encode_category(test_data[col])
    if test_features.shape[0] == 0:
        test_features = temp
    else:
        test_features = np.concatenate([test_features, temp], axis = 1)

In [79]:
test_features = np.concatenate([test_features, np.array(test_data[cols_numerical])], axis = 1)

In [96]:
print(test_features[0])
print(sel.get_support())
print(test_features.shape[1])
print(sel.get_support().shape[0])

[0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00 1.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 1.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 1.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 1.0000e+00 1.1500e+02 7.5000e+01 1.8740e+03
 2.4100e+03 6.9889e+04 5.9536e+01 8.5780e+01 6.3290e+03]
[ True  True False False False False False  True False  True  True  True
 False False  True  True  True  True  True False False False False False
  True  True False  True  True False False  True  True  True  True  True
  True  True  True  True  True]
41
41


In [125]:
reduced_test_features = np.array([])

for i,x in enumerate(sel.get_support()):
    if x:
        if reduced_test_features.shape[0]==0:
            reduced_test_features = test_features[:,i].reshape(-1,1)
        else:
            reduced_test_features = np.concatenate([reduced_test_features, test_features[:,i].reshape(-1,1)], axis=1)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
(500000, 25)


In [127]:
test_result = model_rfc.predict(reduced_test_features)

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            499990, 499991, 499992, 499993, 499994, 499995, 499996, 499997,
            499998, 499999],
           dtype='int64', name='row_id', length=500000)


In [150]:
series = pd.Series(test_output)

In [156]:
test_output = pd.DataFrame(columns=['row_id', 'accepted'])

In [157]:
test_output['accepted'] = series

In [158]:
test_output['row_id'] = test_data.index.copy()
test_output.index = test_output['row_id']
test_output.drop(columns=['row_id'])

In [166]:
test_output.drop(columns=['row_id'], inplace=True)

In [167]:
test_output.head()

Unnamed: 0_level_0,accepted
row_id,Unnamed: 1_level_1
0,0
1,1
2,2
3,3
4,4


In [168]:
test_output.to_csv('submission-v1.csv')

In [161]:
test_output.index = test_output['row_id']