In [3]:
#!/usr/bin/env python3

# This python file is to calculate SVC classification using Fannie Mae data

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn
import re
import datetime
import dill
import sklearn as sk
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cross_validation import train_test_split, KFold, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
#from definitions import summaryfile_type

import warnings
warnings.filterwarnings('ignore')

In [6]:
# Definitions

summaryfile_type = {
                    'LOAN_SEQUENCE_NUMBER': object,
                    'CHANNEL': object,
                    'SELLER_NAME': object,
                    'ORIGINAL_INTEREST_RATE': object,
                    'ORIGINAL_UPB': object,
                    'ORIGINAL_LOAN_TERM': object,
                    #'ORIG_DTE': object,
                    'FIRST_PAYMENT_DATE': object,
                    'ORIGINAL_LOAN_TO_VALUE': object,
                    'ORGINAL_COMBINED_LOAN_TO_VALUE': object,
                    'NUMBER_OF_BORROWERS': object,
                    'ORIGINAL_DEBT_TO_INCOME_RATIO': object,
                    'CREDIT_SCORE': object,
                    'FIRST_TIME_HOMEBUYER_FLAG': object,
                    'LOAN_PURPOSE': object,
                    'PROPERTY_TYPE': object,
                    'NUMBER_OF_UNITS': object,
                    'OCCUPANCY_STATUS': object,
                    'PROPERTY_STATE': object,
                    'POSTAL_CODE': object,
                    #'MI_PCT': float,
                    'PROPERTY_TYPE': object,
                    'MONTHLY_REPORTING_PERIOD': object,
                    'SERVICER_NAME': object,
                    'CURRENT_INTEREST_RATE': object,
                    'CURENT_ACTUAL_UPB': object,
                    'LOAN_AGE': object,
                    'REMAINING_MONTHS_TO_LEAGL_MATURITY': object,
                    #'Adj.Month.To.Mat': float,
                    'MATURITY_DATE': object,
                    'MSA': object,
                    'CURRENT_LOAN_DELINQUENCY_STATUS': object,
                    'MODIFICATION_FLAG': object,
                    'ZERO_BALANCE_CODE': float,
                    'ZERO_BALANCE_EFFECTIVE_DATE': object,
                    'DUE_DATE_OF_LAST_PAID_INSTALLMENT': object,
                    #'FCC_DTE': object,
                    #'DISP_DT': object
                    }

In [8]:
# Part 1 Read the data file into memory
fannie1 = pd.read_csv('./processed/total_12005.csv', dtype=summaryfile_type,
                      nrows=9000000)
fannie1.drop(fannie1.columns[:1], axis=1, inplace=True)
fannie1.head()



Unnamed: 0,CREDIT_SCORE,FIRST_PAYMENT_DATE,FIRST_TIME_HOMEBUYER_FLAG,MATURITY_DATE,MSA,MORTAGAGE_INSURANCE_PERCENTAGE,NUMBER_OF_UNITS,OCCUPANCY_STATUS,ORGINAL_COMBINED_LOAN_TO_VALUE,ORIGINAL_DEBT_TO_INCOME_RATIO,...,MI_RECOVERIES,NET_SALES_PROCEEDS,NON_MI_RECOVERIES,EXPENSES,LEGAL_COSTS,MAINTAINENCE_PRESERVATION_COSTS,TAXES_AND_INSURANCE,MISC_EXPENSES,ACTUAL_LOSS_CALCULATION,MODIFICATION_COST
0,699,2005-05-01,N,203504,39300.0,0,1,O,56,42,...,,,,,,,,,,0.0
1,691,2005-04-01,N,203503,36420.0,25,1,O,90,36,...,,,,,,,,,,0.0
2,713,2005-03-01,N,203502,28740.0,0,1,O,72,45,...,,,,,,,,,,0.0
3,719,2005-05-01,N,203504,,0,1,S,85,47,...,,,,,,,,,,0.0
4,656,2005-03-01,N,203502,40340.0,0,1,O,68,30,...,,,,,,,,,,0.0


In [9]:

fannie1_filtered = fannie1.dropna(subset=('ORIGINAL_LOAN_TO_VALUE', 'ORGINAL_COMBINED_LOAN_TO_VALUE', 'ORIGINAL_DEBT_TO_INCOME_RATIO', 'CREDIT_SCORE'))
fannie1_filtered.head()

Unnamed: 0,CREDIT_SCORE,FIRST_PAYMENT_DATE,FIRST_TIME_HOMEBUYER_FLAG,MATURITY_DATE,MSA,MORTAGAGE_INSURANCE_PERCENTAGE,NUMBER_OF_UNITS,OCCUPANCY_STATUS,ORGINAL_COMBINED_LOAN_TO_VALUE,ORIGINAL_DEBT_TO_INCOME_RATIO,...,MI_RECOVERIES,NET_SALES_PROCEEDS,NON_MI_RECOVERIES,EXPENSES,LEGAL_COSTS,MAINTAINENCE_PRESERVATION_COSTS,TAXES_AND_INSURANCE,MISC_EXPENSES,ACTUAL_LOSS_CALCULATION,MODIFICATION_COST
0,699,2005-05-01,N,203504,39300.0,0,1,O,56,42,...,,,,,,,,,,0.0
1,691,2005-04-01,N,203503,36420.0,25,1,O,90,36,...,,,,,,,,,,0.0
2,713,2005-03-01,N,203502,28740.0,0,1,O,72,45,...,,,,,,,,,,0.0
3,719,2005-05-01,N,203504,,0,1,S,85,47,...,,,,,,,,,,0.0
4,656,2005-03-01,N,203502,40340.0,0,1,O,68,30,...,,,,,,,,,,0.0


In [10]:
fannie1_known = fannie1_filtered[fannie1_filtered['ZERO_BALANCE_CODE'] > 0]
fannie1_known.head()

Unnamed: 0,CREDIT_SCORE,FIRST_PAYMENT_DATE,FIRST_TIME_HOMEBUYER_FLAG,MATURITY_DATE,MSA,MORTAGAGE_INSURANCE_PERCENTAGE,NUMBER_OF_UNITS,OCCUPANCY_STATUS,ORGINAL_COMBINED_LOAN_TO_VALUE,ORIGINAL_DEBT_TO_INCOME_RATIO,...,MI_RECOVERIES,NET_SALES_PROCEEDS,NON_MI_RECOVERIES,EXPENSES,LEGAL_COSTS,MAINTAINENCE_PRESERVATION_COSTS,TAXES_AND_INSURANCE,MISC_EXPENSES,ACTUAL_LOSS_CALCULATION,MODIFICATION_COST
0,699,2005-05-01,N,203504,39300.0,0,1,O,56,42,...,,,,,,,,,,0.0
1,691,2005-04-01,N,203503,36420.0,25,1,O,90,36,...,,,,,,,,,,0.0
3,719,2005-05-01,N,203504,,0,1,S,85,47,...,,,,,,,,,,0.0
5,641,2005-04-01,N,203503,19500.0,30,1,O,94,41,...,,,,,,,,,,0.0
6,646,2005-05-01,N,203504,17140.0,0,1,O,77,43,...,,,,,,,,,,0.0


In [11]:
fannie1_known=fannie1_known.convert_objects(convert_numeric=True)
state_mean = fannie1_known.groupby('PROPERTY_STATE')[('ORIGINAL_UPB', 'ORGINAL_COMBINED_LOAN_TO_VALUE', 'ORIGINAL_DEBT_TO_INCOME_RATIO')].mean()
#state_mean


In [12]:
state_std = fannie1_known.groupby('PROPERTY_STATE')[('ORIGINAL_UPB', 'ORGINAL_COMBINED_LOAN_TO_VALUE', 'ORIGINAL_DEBT_TO_INCOME_RATIO')].std()
#state_std

In [13]:
# Helper function


class ExtractOrigYear(sk.base.BaseEstimator, sk.base.TransformerMixin):

    def __init__(self):
        self.int = np.vectorize(int)
        pass

    def fit(self, x, y=None):
        return self

    def transform(self, x):
        return self.int(x['FIRST_PAYMENT_DATE'].apply(lambda x: x.split('-')[0])).reshape(-1, 1)


class extractfeatures(sk.base.BaseEstimator, sk.base.TransformerMixin):

    def __init__(self, column='ORGINAL_COMBINED_LOAN_TO_VALUE'):
        '''
        We use the colname as the selection rule to judge the
        '''
        self.column = column

    def fit(self, x, y):
        return self

    def transform(self, x):
        return x.loc[:, self.column].values.reshape(-1, 1)


class ExtractLoanStatus(sk.base.BaseEstimator, sk.base.TransformerMixin):

    def __init__(self):
        '''
        Initialize the class with bisection of the loan status: Default or Healthy
        '''
        pass

    def fit(self, x):
        return self

    def transform(self, x):
        '''
        Transform the loan status to a tertiary status: Healthy (0), Default (1)
        '''
        status = x['ZERO_BALANCE_CODE'].apply(lambda x: 0 if x <= 1 else 1)
        return status


class ExtractCreditScore(sk.base.BaseEstimator, sk.base.TransformerMixin):

    def __init__(self, is_take_minimum=True):
        self.take_minimum = is_take_minimum
        pass

    def fit(self, x, y):
        return self

    def transform(self, x):
        
        result=x['ZERO_BALANCE_CODE']
        return result.reshape(-1, 1)


class ExtractCategory(sk.base.BaseEstimator, sk.base.TransformerMixin):

    def __init__(self, colname):
        self.colname = colname
        self.transformer = LabelEncoder()
        pass

    def fit(self, x, y):
        self.transformer.fit(x[self.colname])
        return self

    def transform(self, x):
        return self.transformer.transform(x[self.colname]).reshape(-1, 1)


class ExtractNormalized(sk.base.BaseEstimator, sk.base.TransformerMixin):

    def __init__(self, groupby, target, total_mean=state_mean, total_std=state_std):
        self.groupby = groupby
        self.target = target
        self.total_mean = total_mean
        self.total_std = total_std
        pass

    def fit(self, x, y):
        return self

    def transform(self, x):
        temp1 = x.groupby(self.groupby)[[self.groupby, self.target]
                                        ].apply(lambda x: (x[self.target])).values
        temp2 = x.groupby(self.groupby)[[self.groupby, self.target]].apply(
            lambda x: self.total_mean.ix[x[self.groupby].values, self.target]).values
        temp3 = x.groupby(self.groupby)[[self.groupby, self.target]].apply(
            lambda x: self.total_std.ix[x[self.groupby].values, self.target]).values
        return ((temp1 - temp2) / temp3).reshape(-1, 1)






In [36]:
# Logistic Regression
features = FeatureUnion([
    ('Loan_Amount', ExtractNormalized('PROPERTY_STATE', 'ORIGINAL_UPB')),
    #('Interest_Rate', ExtractNormalized('PROPERTY_STATE','ORIGINAL_INTEREST_RATE')),
    ('credit score', ExtractCreditScore()),
    ('Loan_to_Value', ExtractNormalized('STATE', 'ORGINAL_COMBINED_LOAN_TO_VALUE')),
    ('Debt_to_income', ExtractNormalized('PROPERTY_STATE', 'ORIGINAL_DEBT_TO_INCOME_RATIO')),
    ('Loan_purpose', ExtractCategory('LOAN_PURPOSE')),
    ('Property_Type', ExtractCategory('PROPERTY_TYPE')),
    ('Occupancy_Status', ExtractCategory('OCCUPANCY_STATUS'))

])
#features

In [37]:
sss = StratifiedShuffleSplit(ExtractLoanStatus().fit_transform(fannie1_known), 1, test_size=0.15)
for train_index, test_index in sss:
    fannie_train = fannie1_known.iloc[train_index, ]
    fannie_test = fannie1_known.iloc[test_index, ]
    status_train = ExtractLoanStatus().fit_transform(fannie1_known).iloc[train_index, ]
    status_test = ExtractLoanStatus().fit_transform(fannie1_known).iloc[test_index, ]


# fannie_train, fannie_test, status_train, status_test = train_test_split(fannie1_known,
#                                                                         ExtractLoanStatus().fit_transform(fannie1_known),
#                                                                         test_size=0.15)

print('Here is the Logistic regression results...')


Here is the Logistic regression results...


In [38]:
model2 = Pipeline([
    ('features', features),
    ('Logistic', LogisticRegression(C=0.00077426, class_weight='balanced'))
])



fannie_train.head()
#status_train.head()

Unnamed: 0,CREDIT_SCORE,FIRST_PAYMENT_DATE,FIRST_TIME_HOMEBUYER_FLAG,MATURITY_DATE,MSA,MORTAGAGE_INSURANCE_PERCENTAGE,NUMBER_OF_UNITS,OCCUPANCY_STATUS,ORGINAL_COMBINED_LOAN_TO_VALUE,ORIGINAL_DEBT_TO_INCOME_RATIO,...,MI_RECOVERIES,NET_SALES_PROCEEDS,NON_MI_RECOVERIES,EXPENSES,LEGAL_COSTS,MAINTAINENCE_PRESERVATION_COSTS,TAXES_AND_INSURANCE,MISC_EXPENSES,ACTUAL_LOSS_CALCULATION,MODIFICATION_COST
67372,687.0,2005-04-01,N,203503,41180.0,0.0,1.0,O,79,41.0,...,,,,,,,,,,0.0
174001,699.0,2005-05-01,N,203504,39300.0,0.0,1.0,O,90,42.0,...,,,,,,,,,,0.0
121480,717.0,2005-05-01,N,203504,16740.0,0.0,1.0,O,72,21.0,...,,,,,,,,,,0.0
290845,729.0,2005-04-01,,202503,,0.0,1.0,O,79,46.0,...,,,,,,,,,,0.0
1177,630.0,2005-04-01,N,203503,33460.0,30.0,1.0,O,95,39.0,...,51810.0,56000.0,1876.0,-11925.0,-2441.0,-4266.0,-4647.0,-572.0,-65110.0,5634.66


In [48]:

model2.fit(fannie_train, status_train)

TypeError: Series.name must be a hashable type