In [1]:
import pandas

In [2]:
def correlation(data, method, caption):
    """Prints the pairwise correlation coefficients between columns.

    Every unordered pair of columns is listed once, in descending order of
    the absolute value of its coefficient. Pairs whose coefficient is
    undefined (NaN, e.g. when one of the columns is constant) are listed
    last, in column order, instead of being mixed into the ranking.

    Args:
        data: pandas DataFrame; every column must be convertible to float.
        method: correlation method name, passed through unchanged to
            DataFrame.corr (e.g. 'pearson' or 'spearman').
        caption: label printed in the header line above the results.

    Returns:
        None. The header and one line per column pair are written to
        standard output.
    """
    columns = list(data)
    coefficients = data.astype(float).corr(method=method)
    ranked = []     # (abs value, value, label) for well-defined coefficients
    undefined = []  # (value, label) for NaN coefficients
    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            # Row j, column i of the correlation matrix. A positional lookup
            # still yields a scalar when column labels are repeated.
            coefficient = coefficients.iloc[j, i]
            columns_pair = columns[i] + ' x ' + columns[j]
            if pandas.isnull(coefficient):
                # NaN compares False against everything, so letting it into
                # the sort would leave the whole ordering unspecified.
                undefined.append((coefficient, columns_pair))
            else:
                ranked.append((abs(coefficient), coefficient, columns_pair))
    ranked.sort(reverse=True)
    print('# ' + caption + ', ' + method)
    for abs_coefficient, coefficient, columns_pair in ranked:
        print(coefficient, columns_pair)
    for coefficient, columns_pair in undefined:
        print(coefficient, columns_pair)

In [3]:
# Training dataset: read the CSV, discard its first column, then print the
# column-pair correlations once per correlation method.
training_data = pandas.read_csv('in/cs-training.csv')
training_data = training_data.drop(training_data.columns[0], axis=1)
for method in ('pearson', 'spearman'):
    correlation(training_data, method, 'training')


# training, pearson
(0.99279618259160096, 'NumberOfTimes90DaysLate x NumberOfTime60-89DaysPastDueNotWorse')
(0.98700544747994545, 'NumberOfTime30-59DaysPastDueNotWorse x NumberOfTime60-89DaysPastDueNotWorse')
(0.98360268128207107, 'NumberOfTime30-59DaysPastDueNotWorse x NumberOfTimes90DaysLate')
(0.43395860305532008, 'NumberOfOpenCreditLinesAndLoans x NumberRealEstateLoansOrLines')
(-0.21330257804499639, 'age x NumberOfDependents')
(0.14770531827119118, 'age x NumberOfOpenCreditLinesAndLoans')
(0.12558696457286117, 'SeriousDlqin2yrs x NumberOfTime30-59DaysPastDueNotWorse')
(0.1249589610947737, 'MonthlyIncome x NumberRealEstateLoansOrLines')
(0.12468428521270425, 'NumberRealEstateLoansOrLines x NumberOfDependents')
(0.12004602812504976, 'DebtRatio x NumberRealEstateLoansOrLines')
(0.11717461344362123, 'SeriousDlqin2yrs x NumberOfTimes90DaysLate')
(-0.11538551846889157, 'SeriousDlqin2yrs x age')
(0.10226086105037467, 'SeriousDlqin2yrs x NumberOfTime60-89DaysPastDueNotWorse')
(0.091454657

In [4]:
# Test dataset: read the CSV, discard its first column, then print the
# column-pair correlations once per correlation method.
test_data = pandas.read_csv('in/cs-test.csv')
test_data = test_data.drop(test_data.columns[0], axis=1)
for method in ('pearson', 'spearman'):
    correlation(test_data, method, 'test')

# test, pearson
(0.99411105422498003, 'NumberOfTimes90DaysLate x NumberOfTime60-89DaysPastDueNotWorse')
(0.98879970539148254, 'NumberOfTime30-59DaysPastDueNotWorse x NumberOfTime60-89DaysPastDueNotWorse')
(0.98568486730861449, 'NumberOfTime30-59DaysPastDueNotWorse x NumberOfTimes90DaysLate')
(0.43114131547739243, 'NumberOfOpenCreditLinesAndLoans x NumberRealEstateLoansOrLines')
(-0.2171539756251786, 'age x NumberOfDependents')
(0.14378766275773319, 'age x NumberOfOpenCreditLinesAndLoans')
(0.12923357869659158, 'DebtRatio x NumberRealEstateLoansOrLines')
(0.12112577406439484, 'NumberRealEstateLoansOrLines x NumberOfDependents')
(-0.084974355629491988, 'NumberOfOpenCreditLinesAndLoans x NumberOfTimes90DaysLate')
(-0.076187909462431244, 'NumberOfOpenCreditLinesAndLoans x NumberOfTime60-89DaysPastDueNotWorse')
(-0.069226597739272283, 'age x NumberOfTime30-59DaysPastDueNotWorse')
(-0.067605503480325929, 'age x NumberOfTimes90DaysLate')
(0.065111561734806436, 'DebtRatio x NumberOfOpenCreditL