In [84]:
import numpy as np
from collections import defaultdict
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from me388 import QuantumBlackPCA
from feature_scaling import scale_features
from sklearn.model_selection import train_test_split
from bayes import bayes_output
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

## NO FILTER, MEANS AND NO PCA

In [96]:
# Baseline: random forest on the raw features, with missing values replaced by
# the column mean; no scaling and no PCA.
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True reproduces the defaults from_csv used.
data = pd.read_csv('../DataFiles/train.csv', index_col=0, parse_dates=True)
data = data.fillna(data.mean())
# Every column except the 'class' label is used as a feature.
cols = [col for col in data.columns if col != 'class']
features = data[cols]
bankrupt = data['class']
feature_train, feature_test, class_train, class_test = train_test_split(
    features, bankrupt, test_size=0.2, random_state=1)

# NOTE(review): class 0 gets the larger weight (0.9 vs 0.1) — confirm this is
# the intended direction.
# random_state pins the forest so re-running the cell reproduces the result.
forest = RandomForestClassifier(n_estimators=1000, class_weight={0:0.9, 1:0.1}, max_depth=5, n_jobs=6,
                                random_state=1)
forest = forest.fit(feature_train, class_train)
output = forest.predict(feature_test)
score = forest.score(feature_test, class_test)

# Project helper; reports the prediction counts and the score.
bayes_output(output, score, class_test)

sum:  6946
true positives:  0
true negatives:  6620
false positives:  1
false negatives:  325
ratio pos/neg:  0.003076923076923077
test ratio pos/neg:  0.0490862407491
bs score 0.953066513101


## NO FILTER, MEANS, NORMALIZED AND NO PCA

In [33]:
# Random forest on mean-imputed features that are passed through
# scale_features before fitting; no PCA.
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True reproduces the defaults from_csv used.
data = pd.read_csv('../DataFiles/train.csv', index_col=0, parse_dates=True)
data = data.fillna(data.mean())
cols = [col for col in data.columns if col != 'class']
features = data[cols]
bankrupt = data['class']
# NOTE(review): test_size=0.8 holds out 80% of the rows, whereas most other
# cells in this notebook hold out 20% — confirm this is intentional.
feature_train, feature_test, class_train, class_test = train_test_split(
    features, bankrupt, test_size=0.8, random_state=1)

# NOTE(review): the two splits are scaled by separate scale_features calls; if
# the helper derives its statistics from its argument, train and test end up on
# different scales — verify against the helper.
feature_train = scale_features(np.asarray(feature_train))
feature_test = scale_features(np.asarray(feature_test))
# random_state pins the forest so re-running the cell reproduces the score.
forest = RandomForestClassifier(n_estimators=100, n_jobs=6, random_state=1)
forest = forest.fit(feature_train, class_train)
output = forest.predict(feature_test)
score = forest.score(feature_test, class_test)
print(score)

0.952089557611


## NO FILTER, MEANS, NaN FEATURES, NOT NORMALIZED AND NO PCA

In [97]:
# Shallow random forest on the train_withNaNinfo.csv data set, with remaining
# missing values replaced by the column mean.
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True reproduces the defaults from_csv used.
data = pd.read_csv('../DataFiles/train_withNaNinfo.csv', index_col=0, parse_dates=True)
data = data.fillna(data.mean())
cols = [col for col in data.columns if col != 'class']
features = data[cols]
bankrupt = data['class']
feature_train, feature_test, class_train, class_test = train_test_split(
    features, bankrupt, test_size=0.2, random_state=1)

# NOTE(review): class 0 gets the larger weight (0.9 vs 0.1) — confirm this is
# the intended direction.
# random_state pins the forest so re-running the cell reproduces the result.
forest = RandomForestClassifier(n_estimators=100, n_jobs=6, class_weight={0:0.9, 1:0.1}, max_depth=3,
                                random_state=1)
forest = forest.fit(feature_train, class_train)
output = forest.predict(feature_test)
score = forest.score(feature_test, class_test)

# Project helper; reports the prediction counts and the score.
bayes_output(output, score, class_test)

sum:  6946
true positives:  0
true negatives:  6621
false positives:  0
false negatives:  325
ratio pos/neg:  0.0
test ratio pos/neg:  0.0490862407491
bs score 0.953210480852


In [83]:
# Extra-trees classifier on the train_withNaNinfo.csv data set, with remaining
# missing values replaced by the column mean.
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True reproduces the defaults from_csv used.
data = pd.read_csv('../DataFiles/train_withNaNinfo.csv', index_col=0, parse_dates=True)
data = data.fillna(data.mean())
cols = [col for col in data.columns if col != 'class']
features = data[cols]
bankrupt = data['class']
feature_train, feature_test, class_train, class_test = train_test_split(
    features, bankrupt, test_size=0.2, random_state=1)

# NOTE(review): class 0 gets the larger weight (0.9 vs 0.1) — confirm this is
# the intended direction.
# random_state pins the ensemble so re-running the cell reproduces the result.
forest = ExtraTreesClassifier(n_estimators=100, n_jobs=6, class_weight={0:0.9, 1:0.1},
                              random_state=1)
forest = forest.fit(feature_train, class_train)
output = forest.predict(feature_test)
score = forest.score(feature_test, class_test)

# Project helper; reports the prediction counts and the score.
bayes_output(output, score, class_test)

sum:  6946
true positives:  118
true negatives:  6590
false positives:  31
false negatives:  207
ratio pos/neg:  0.1497584541062802
test ratio pos/neg:  0.0490862407491
bs score 0.965735675209


In [93]:
# Gradient boosting on the train_withNaN27.csv data set, with remaining
# missing values replaced by the column mean.
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True reproduces the defaults from_csv used.
data = pd.read_csv('../DataFiles/train_withNaN27.csv', index_col=0, parse_dates=True)
data = data.fillna(data.mean())
cols = [col for col in data.columns if col != 'class']
features = data[cols]
bankrupt = data['class']
feature_train, feature_test, class_train, class_test = train_test_split(
    features, bankrupt, test_size=0.2, random_state=1)

# NOTE(review): 100000 boosting stages at learning_rate=0.5 is a very long fit;
# confirm these values are intended.
# random_state pins the model so re-running the cell reproduces the result.
forest = GradientBoostingClassifier(n_estimators=100000, learning_rate=0.5, random_state=1)
forest = forest.fit(feature_train, class_train)
output = forest.predict(feature_test)
score = forest.score(feature_test, class_test)

# Project helper; reports the prediction counts and the score.
bayes_output(output, score, class_test)

sum:  6946
true positives:  195
true negatives:  6562
false positives:  59
false negatives:  130
ratio pos/neg:  0.45384615384615384
test ratio pos/neg:  0.0490862407491
bs score 0.972790095019


## PCA, MEANS, No NaN FEATURES, NORMALIZED, 20 COMPONENTS

In [35]:
# Random forest on PCA-projected features: mean-impute, scale, project onto the
# first n_components[0] components, then fit and tabulate the prediction counts.
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True reproduces the defaults from_csv used.
data = pd.read_csv('../DataFiles/train.csv', index_col=0, parse_dates=True)
data = data.fillna(data.mean())
cols = [col for col in data.columns if col != 'class']
features = data[cols]
bankrupt = data['class']
# QuantumBlackPCA takes a list of component counts; only the first is used here.
n_components = [20]
feature_train, feature_test, class_train, class_test = train_test_split(
    features, bankrupt, test_size=0.2, random_state=1)

# NOTE(review): the two splits are scaled by separate scale_features calls; if
# the helper derives its statistics from its argument, train and test end up on
# different scales — verify against the helper.
feature_train = scale_features(np.asarray(feature_train))
feature_test = scale_features(np.asarray(feature_test))

pca = QuantumBlackPCA(feature_train, n_components=n_components, columns=cols)

# Project every row onto every component with one matrix product. Entry (i, j)
# is the dot product of row i with component j, i.e. the same quantity the
# element-by-element loop computed, without the Python-level double loop.
components = np.asarray(pca.pca[n_components[0]].components_)
feature_train_pca = np.dot(np.asarray(feature_train), components.T)
feature_test_pca = np.dot(np.asarray(feature_test), components.T)

# random_state pins the forest so re-running the cell reproduces the result.
forest = RandomForestClassifier(n_estimators=100, n_jobs=6, random_state=1)
forest = forest.fit(feature_train_pca, class_train)
output = forest.predict(feature_test_pca)
score = forest.score(feature_test_pca, class_test)

# Index sets for the four prediction outcomes; the comparisons rely on the
# labels being 0/1 (false positive: truth < prediction, false negative: truth > prediction).
true_positives  = np.where(np.logical_and(class_test,output))
true_negatives  = np.where(np.logical_not(np.logical_or(class_test,output)))
false_positives = np.where(class_test<output)
false_negatives = np.where(class_test>output)

n_true_positives = len(true_positives[0])
n_true_negatives = len(true_negatives[0])
n_false_negatives = len(false_negatives[0])
n_false_positives = len(false_positives[0])

# Guard the ratio: with zero false negatives the plain division would raise
# ZeroDivisionError and abort the cell before the score is printed.
if n_false_negatives:
    false_pos_neg_ratio = n_false_positives/n_false_negatives
else:
    false_pos_neg_ratio = float('nan')

print('sum: ', n_true_positives+n_false_positives+n_false_negatives+n_true_negatives)
print('true positives: ' , n_true_positives)
print('true negatives: ' , n_true_negatives)
print('false positives: ' , n_false_positives)
print('false negatives: ' , n_false_negatives)
print('ratio pos/neg: ', false_pos_neg_ratio)
print('test ratio pos/neg: ', np.sum(class_test)/(len(class_test)-np.sum(class_test)))

print(score)

sum:  6946
true positives:  6
true negatives:  6618
false positives:  3
false negatives:  319
ratio pos/neg:  0.009404388714733543
test ratio pos/neg:  0.0490862407491
0.953642384106


## NaNs, PCA, 10 components

In [116]:
# Build PCA-projected train/test features from train_withNaNinfo.csv:
# median-impute, scale, then project onto the first n_components[0] components.
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True reproduces the defaults from_csv used.
data = pd.read_csv('../DataFiles/train_withNaNinfo.csv', index_col=0, parse_dates=True)
data = data.fillna(data.median())
cols = [col for col in data.columns if col != 'class']
features = data[cols]
bankrupt = data['class']
# QuantumBlackPCA takes a list of component counts; only the first is used here.
n_components = [10]
# NOTE(review): test_size=0.01 keeps almost every row in the training split,
# unlike the 0.2 used by most other cells — confirm this is intentional.
feature_train, feature_test, class_train, class_test = train_test_split(
    features, bankrupt, test_size=0.01, random_state=1)

# NOTE(review): the two splits are scaled by separate scale_features calls; if
# the helper derives its statistics from its argument, train and test end up on
# different scales — verify against the helper.
feature_train = scale_features(np.asarray(feature_train))
feature_test = scale_features(np.asarray(feature_test))

pca = QuantumBlackPCA(feature_train, n_components=n_components, columns=cols)

# Project every row onto every component with one matrix product. Entry (i, j)
# is the dot product of row i with component j, i.e. the same quantity the
# element-by-element loop computed, without the Python-level double loop.
components = np.asarray(pca.pca[n_components[0]].components_)
feature_train_pca = np.dot(np.asarray(feature_train), components.T)
feature_test_pca = np.dot(np.asarray(feature_test), components.T)
        




In [117]:
# Dump the projected training features: one line per row, values joined by
# commas using their plain str() form.
with open('../DataFiles/train_PCA_10.csv', 'w') as csv_out:
    csv_out.writelines(
        ','.join(map(str, projected_row)) + '\n'
        for projected_row in feature_train_pca
    )

In [62]:
# Random forest on the PCA-projected features. Relies on feature_train_pca,
# feature_test_pca, class_train and class_test already existing in the kernel.
# NOTE(review): class 0 gets the larger weight (0.9 vs 0.1) — confirm this is
# the intended direction.
# random_state=1 matches the seeded forest used elsewhere in this notebook and
# makes re-runs of the cell reproducible.
forest = RandomForestClassifier(n_estimators=100, n_jobs=6, class_weight={0: 0.9, 1: 0.1}, random_state=1)
forest = forest.fit(feature_train_pca, class_train)
output = forest.predict(feature_test_pca)
score = forest.score(feature_test_pca, class_test)

# Project helper; reports the prediction counts and the score.
bayes_output(output, score, class_test)

sum:  6946
true positives:  101
true negatives:  6533
false positives:  88
false negatives:  224
ratio pos/neg:  0.39285714285714285
test ratio pos/neg:  0.0490862407491
bs score 0.955082061618


In [74]:
# Seeded random forest on the PCA-projected features with class weights
# 0.8 / 0.2. Uses feature_train_pca, feature_test_pca, class_train and
# class_test as they currently exist in the kernel.
forest = RandomForestClassifier(
    n_estimators=100,
    n_jobs=6,
    class_weight={0: 0.8, 1: 0.2},
    random_state=1,
).fit(feature_train_pca, class_train)  # fit() returns the estimator itself

# Scoring and predicting are independent read-only calls on the fitted forest.
score = forest.score(feature_test_pca, class_test)
output = forest.predict(feature_test_pca)

bayes_output(output, score, class_test)

sum:  6946
true positives:  98
true negatives:  6554
false positives:  67
false negatives:  227
ratio pos/neg:  0.29515418502202645
test ratio pos/neg:  0.0490862407491
bs score 0.95767348114
