In [9]:
# coding: utf-8

#!/usr/bin/env python3

import pandas as pd
import os
import os.path
import numpy as np


#%% This is for Spyder IDE

## Define global label

Acquisition_label = {"CREDIT_SCORE": np.dtype(object),
                     "FIRST_PAYMENT_DATE": object,
                     "FIRST_TIME_HOMEBUYER_FLAG": object,
                     "MATURITY_DATE": object,
                     "MSA": np.dtype(object),
                     "MORTAGAGE_INSURANCE_PERCENTAGE": np.dtype(object),
                     "NUMBER_OF_UNITS": np.dtype(object),
                     "OCCUPANCY_STATUS": object,
                     "ORGINAL_COMBINED_LOAN_TO_VALUE": np.dtype(object),
                     "ORIGINAL_DEBT_TO_INCOME_RATIO": np.dtype(object),
                     "ORIGINAL_UPB": np.dtype(object),
                     "ORIGINAL_LOAN_TO_VALUE": np.dtype(object),
                     "ORIGINAL_INTEREST_RATE": np.dtype(object),
                     "CHANNEL": object,
                     "PREPAYMENT_PENALTY_MORTGAGE_FLAG":np.dtype(object),
                     "PRODUCT_TYPE": np.dtype(object),
                     "PROPERTY_STATE":np.dtype(object),
                     "PROPERTY_TYPE": np.dtype(object),
                     "POSTAL_CODE": np.dtype(object),
                     "LOAN_SEQUENCE_NUMBER": np.dtype(object),
                     "LOAN_PURPOSE": np.dtype(object),
                     "ORIGINAL_LOAN_TERM": np.dtype(object),
                     "NUMBER_OF_BORROWERS": np.dtype(object),
                     "SELLER_NAME": np.dtype(object),
                     "SERVICER_NAME": np.dtype(object),
                     "SUPER_CONFORMING_FLAG": np.dtype(object)
                     
}

Acquisition_names = ["CREDIT_SCORE",
                     "FIRST_PAYMENT_DATE",
                     "FIRST_TIME_HOMEBUYER_FLAG",
                     "MATURITY_DATE",
                     "MSA",
                     "MORTAGAGE_INSURANCE_PERCENTAGE",
                     "NUMBER_OF_UNITS",
                     "OCCUPANCY_STATUS",
                     "ORGINAL_COMBINED_LOAN_TO_VALUE",
                     "ORIGINAL_DEBT_TO_INCOME_RATIO",
                     "ORIGINAL_UPB",
                     "ORIGINAL_LOAN_TO_VALUE",
                     "ORIGINAL_INTEREST_RATE",
                     "CHANNEL",
                     "PREPAYMENT_PENALTY_MORTGAGE_FLAG",
                     "PRODUCT_TYPE",
                     "PROPERTY_STATE",
                     "POSTAL_CODE",
                     "LOAN_SEQUENCE_NUMBER",
                     "LOAN_PURPOSE",
                     "ORIGINAL_LOAN_TERM",
                     "NUMBER_OF_BORROWERS",
                     "SELLER_NAME",
                     "SERVICER_NAME",
                     "SUPER_CONFORMING_FLAG"
                    ]


Performance_label ={"LOAN_SEQUENCE_NUMBER": object,
                    "MONTHLY_REPORTING_PERIOD": object,
                    "CURENT_ACTUAL_UPB": float,
                    "CURRENT_LOAN_DELINQUENCY_STATUS": object,
                    "LOAN_AGE": int,
                    "REMAINING_MONTHS_TO_LEAGL_MATURITY": float,
                    "REPURCHASE_FLAG": object,
                    "MODIFICATION_FLAG": object,
                    "ZERO_BALANCE_CODE":object,
                    "ZERO_BALANCE_EFFECTIVE_DATE": object,
                    "CURRENT_INTEREST_RATE": float,
                    "CURRENT_DEFEREED_UPB": np.dtype(object),
                    "DUE_DATE_OF_LAST_PAID_INSTALLMENT":np.dtype(object),
                    "MI_RECOVERIES":np.dtype(object),
                    "NET_SALES_PROCEEDS":np.dtype(object),
                    "NON_MI_RECOVERIES":np.dtype(object),
                    "EXPENSES":np.dtype(object),
                    "LEGAL_COSTS":np.dtype(object),
                    "MAINTAINENCE_PRESERVATION_COSTS":np.dtype(object),
                    "TAXES_AND_INSURANCE":np.dtype(object),
                    "MISC_EXPENSES":np.dtype(object),
                    "ACTUAL_LOSS_CALCULATION":np.dtype(object),
                    #"MODIFICATION_COST":np.dtype(object)
}

Performance_names =["LOAN_SEQUENCE_NUMBER",
                    "MONTHLY_REPORTING_PERIOD",
                    "CURENT_ACTUAL_UPB",
                    "CURRENT_LOAN_DELINQUENCY_STATUS",
                    "LOAN_AGE",
                    "REMAINING_MONTHS_TO_LEAGL_MATURITY",
                    "REPURCHASE_FLAG",
                    "MODIFICATION_FLAG",
                    "ZERO_BALANCE_CODE",
                    "ZERO_BALANCE_EFFECTIVE_DATE",
                    "CURRENT_INTEREST_RATE",
                    "CURRENT_DEFEREED_UPB",
                    "DUE_DATE_OF_LAST_PAID_INSTALLMENT",
                    "MI_RECOVERIES",
                    "NET_SALES_PROCEEDS",
                    "NON_MI_RECOVERIES",
                    "EXPENSES",
                    "LEGAL_COSTS",
                    "MAINTAINENCE_PRESERVATION_COSTS",
                    "TAXES_AND_INSURANCE",
                    "MISC_EXPENSES",
                    "ACTUAL_LOSS_CALCULATION",
                    #"MODIFICATION_COST"
]



def processData(Pfile, Afile, year, quarter, folder = 'processed'):
    '''
    This function is to process the Fannie raw data and condense it to the
    summary dataset.
    Usage:
    processData("Performance_2014Q2.txt", "Acquisition_2014Q2.txt", '2014', '2')
    return True if everything is successful.
    Otherwise return False, possible with any exceptions.
    
    '''
    global Performance_label, Acquisition_label, Performance_names, Acquisition_names
    print('Start to process data from {year} {quarter}...\n'.format(year=year,
                                                               quarter = quarter))

    perform = pd.read_csv(Pfile, header = None, sep = '|', names =
                          Performance_names, na_values = "NaN",
                          index_col = False, dtype = Performance_label,
                          usecols=range(22))
    #perform.to_csv('time.csv')
    print('Performance Reading Finished!')
    acquisition = pd.read_csv(Afile, header = None, sep='|', names =
                              Acquisition_names,na_values = "NaN",
                              index_col = False, dtype=Acquisition_label,
                              error_bad_lines = False)
    #acquisition.to_csv('data.csv')
    print('All Reading Finished!')
#     #%% In[21]:
#     ## Massage the data
#     ## 1) Convert LOAN_ID field into characater field.
    perform['LOAN_SEQUENCE_NUMBER'] = perform['LOAN_SEQUENCE_NUMBER'].astype('<U20')
    acquisition['LOAN_SEQUENCE_NUMBER'] = acquisition['LOAN_SEQUENCE_NUMBER'].astype('<U20')
    print("LOAN_ID conversion complete!")
#     ## Change the NaN Zero Balance Code into 0 for the convenience of later processing.
#     #perform['Monthly.Rpt.Prd'] = pd.to_datetime(perform['Monthly.Rpt.Prd'],
#     #                                            format ='%m/%d/%Y')
#     #acquisition['ORIG_DTE'] = pd.to_datetime(acquisition['ORIG_DTE'], format='%m/%Y')
#     #print("Montly Report Date conversion complete!")
#     perform.loc[perform['Zero.Bal.Code'].isnull(), 'Zero.Bal.Code'] = 0
#     print('Processing the performance data...\n')
    perform_byid = perform.groupby(["LOAN_SEQUENCE_NUMBER"], sort = False).last()
    print('Groupby Performance data is done')
    
#     ## Merge the processed data with acquisition data together.
    print('Merging performance and acquisition data...\n')
    res = acquisition.merge(perform_byid.reset_index(), on = 'LOAN_SEQUENCE_NUMBER', how = 'outer')
    
    print('Writing summary file summary_{0}Q{1}.csv...\n'.format(year,quarter))
    SFile_prefix = 'summary_'
    filename = SFile_prefix + str(year) + 'Q' + str(quarter) + '.csv'
    cwd = os.getcwd()
    fullpath = os.path.join(cwd, folder, filename)
    #res.to_csv(fullpath)
    return res 

def analysis(res):
    print(res.head())
    print("Columns with all NaN values",pd.isnull(res).all())
    
    #drop all the columns with only NaN values
    res_new=res.dropna(axis=1,how='all')
    print("New datafarme created, columns with all NaN values removed")
    #print(res_new.head())
    
    #create a new column credit bucket
    res_new.insert(1, "CREDIT_BUCKET", "")
    
    #categorize the user in medium/good credit standing on the basis of their credit score (threshold set as 650)
    count=0
    for i in res_new:
        if(res_new["CREDIT_SCORE"][count] <=650):
            res_new["CREDIT_BUCKET"][count]="Medium Credit"
        else:
            res_new["CREDIT_BUCKET"][count]="Good Credit"
        count=count+1
        
    print(res_new.head())
    
if (__name__ == "__main__"):
    years = range(2015,2016)
    quarters = range(1,2)
    cwd = os.getcwd()
    #dest = os.path.join(cwd,'raw')
    Afile_prefix = 'sample_orig_'
    Pfile_prefix = 'sample_svcg_'
    

    for year in years:
        for quarter in quarters:
            Afile = os.path.join(cwd,
                                 Afile_prefix+str(year)+'.txt')
            print(Afile)
            Pfile = os.path.join(cwd,
                                 Pfile_prefix+str(year)+'.txt')
            print(Pfile)
            df=processData(Pfile, Afile, year, quarter)
            analysis(df)
    
    print("All finished! Enjoy!")

C:\Users\Vasanti\Desktop\NEUdocs\Studymaterial\ADS\Github\MachineLearning_US_Housing_Urban_Development_CaseStudy\sample_orig_2015.txt
C:\Users\Vasanti\Desktop\NEUdocs\Studymaterial\ADS\Github\MachineLearning_US_Housing_Urban_Development_CaseStudy\sample_svcg_2015.txt
Start to process data from 2015 1...

Performance Reading Finished!
All Reading Finished!
LOAN_ID conversion complete!
Groupby Performance data is done
Merging performance and acquisition data...

Writing summary file summary_2015Q1.csv...

  CREDIT_SCORE FIRST_PAYMENT_DATE FIRST_TIME_HOMEBUYER_FLAG MATURITY_DATE  \
0          703             201503                       NaN        203002   
1          813             201503                       NaN        204502   
2          742             201504                       NaN        204503   
3          696             201503                       NaN        204502   
4          820             201505                       NaN        203004   

     MSA MORTAGAGE_INSURANCE

TypeError: unorderable types: str() <= int()

In [None]:
historical_data1_time_Q12005