In [1]:
from library import *

In [2]:
# Root folder that every input and output path in this notebook is built from.
data_location = '../DSI_output/'

# Experiment label (it becomes part of the results folder name) and a dev switch.
name, dev = 'binary', False

# Number of documents used for prediction: 1 or 3.
n = 3

# Re-balancing of the training data by under-sampling the majority class:
# balance_prop is the target % of failed firm-year samples (e.g. 5, 10 or 50).
balance, balance_prop = True, 10

In [3]:
# Candidate values for the number of selected features
# (presumably meant for SelectKBest's `k` -- confirm against later cells).
k_range = [
    10000,
    25000,
    'all',
]
# Candidate inverse-regularisation strengths for LogisticRegression.
C_range = [
    1e-05, 1e-03, 0.05, 0.1, 0.15, 0.5,
    1, 10, 100, 1000, 5000,
]

In [4]:
# the range of n_grams to use (e.g. (1,2) means uni- and bi-grams)
n_grams = (1, 2)
# the feature extraction method, CountVectorizer creates binary features, TfidfVectorizer creates TFIDF features.
feature = CountVectorizer(binary=True, ngram_range=n_grams)

In [5]:
# Firm-year sample tables for the chosen number of documents `n`.
train = pd.read_csv(
    data_location + 'model_data/bow/train_full_bow_' + str(n) + '.csv',
    index_col=0,
)
holdout = pd.read_csv(
    data_location + 'model_data/bow/holdout_bow_' + str(n) + '.csv',
    index_col=0,
)
# One subset per holdout year.
# NOTE(review): these slices are taken before the year_* document ids are replaced
# by text further down, so on their own they contain ids, not text.
holdout1 = holdout.loc[holdout['holdout_year'].eq(2019)]
holdout2 = holdout.loc[holdout['holdout_year'].eq(2020)]

In [6]:
# Preview the first five training samples.
train.head(5)

Unnamed: 0,cik,year_1,year_2,year_3,label
0,1310724,missing,missing,19208,0
1,1310738,missing,missing,19209,0
2,1343512,missing,missing,20131,0
3,1081369,14182,14181,14180,0
4,1081369,missing,14182,14181,0


In [7]:
# Folder from which the pre-processed document tables are read.
store_location = ''.join([data_location, 'intermediate_processed/'])
# Results folder for this experiment, named after `name`.
save_location = ''.join([data_location, 'results_', name, '/'])

In [8]:
# Load both document tables the same way: first CSV column as index, then
# replace that index with a fresh 0..n-1 range.
healthy, failed = (
    pd.read_csv(store_location + file_name, index_col=0).reset_index(drop=True)
    for file_name in ('healthy_bow.csv', 'failed_bow.csv')
)

In [9]:
# Show the item_7 text of the first healthy document.
healthy['item_7'].iat[0]

"item 7. management 's discussion analysis financial condition result operation fiscal year ended april 30 2000 1999 1998. result operation revenue revenue continuing operation 2000 _UNK_ compared _UNK_ _UNK_ 1999 1998 respectively represents increase revenue 1 2000 5 1999. revenue include interest income _UNK_ _UNK_ _UNK_ 2000 1999 1998 respectively income _UNK_ _UNK_ _UNK_ 2000 1999 1998 respectively figure chart include interest income income intersegment revenue one segment involved revenue reported segment sell product service unaffiliated purchaser cost applicable segment revenue percentage total segment revenue see chart applicable total segment cost see chart b _UNK_ 2000 _UNK_ 1999 _UNK_ 1998 89 92 89 respectively selling shipping general administrative expense year 2000 1999 1998 selling shipping general administrative expense see chart c _UNK_ _UNK_ _UNK_ respectively percentage consolidated revenue expense 6 three year reviewing chart c reader recognize volume revenue gener

In [None]:
# Stack healthy and failed documents into one lookup table of (text, id),
# with doc_id stored as a string.
all_docs = (
    pd.concat([healthy, failed], ignore_index=True)
    .astype({'doc_id': str})
    .loc[:, ['item_7', 'doc_id']]
)
# Add a placeholder document so that samples whose year_* id is 'missing'
# still find a match (with the text 'missing') in the merge later on.
# pd.concat is used instead of the private DataFrame._append.
missing_doc = pd.DataFrame({'item_7': ['missing'], 'doc_id': ['missing']})
all_docs = pd.concat([all_docs, missing_doc], ignore_index=True)
# Free the two source tables; they are no longer needed.
del healthy, failed

In [None]:
# Preview the first five rows of the document lookup table.
all_docs.head(5)

In [None]:
# Sample columns that hold document ids and get replaced by the document text.
variables_to_swap = [
    'year_1',
    'year_2',
    'year_3',
]

In [None]:
# First two training samples, for comparison with the document table.
train.iloc[:2]

In [None]:
# First two rows of the document lookup table.
all_docs.iloc[:2]

In [None]:
# Trial run of the id -> text join on year_1 only; the result is displayed, not stored.
train.merge(all_docs, how='left', left_on='year_1', right_on='doc_id')

In [None]:
def _ids_to_text(samples, documents, id_column):
    """Replace the document ids in `id_column` of `samples` with the document text.

    The id column is cast to str before merging: `documents['doc_id']` holds
    strings, and a year_* column without any 'missing' placeholder is read from
    CSV as integers, which makes pd.merge raise on the mismatching key dtypes.

    Ids without a matching document end up as the string 'nan' (left merge
    followed by the cast to str).
    """
    keyed = samples.astype({id_column: str})
    # left merge the firm-year samples with the document table on doc_id
    merged = pd.merge(keyed, documents, left_on=id_column, right_on='doc_id', how='left')
    # overwrite the id with the corresponding text, stored as str
    merged[id_column] = merged['item_7'].astype(str)
    # remove the helper columns brought in by the merge
    return merged.drop(['item_7', 'doc_id'], axis=1)


for variable in variables_to_swap:
    train = _ids_to_text(train, all_docs, variable)
    holdout = _ids_to_text(holdout, all_docs, variable)

In [None]:
# Shuffle the training rows. The seed is fixed so that a fresh
# "Restart & Run All" reproduces the same order (and the same fitted models).
train = train.sample(frac=1, random_state=42)

In [None]:
def _concat_year_texts(samples):
    """Return one space-joined string per row from the year_1..year_3 columns."""
    return np.array(samples['year_1'] + ' ' + samples['year_2'] + ' ' + samples['year_3'])


# Split the holdout set again at this point: the holdout1/holdout2 frames made
# right after loading were sliced before the merge loop replaced the year_*
# document ids with the item_7 text, so they still hold ids rather than text.
holdout1 = holdout[holdout['holdout_year'] == 2019]
holdout2 = holdout[holdout['holdout_year'] == 2020]

X_train = _concat_year_texts(train)
y_train = np.array(train['label'])

X_hol1 = _concat_year_texts(holdout1)
y_hol1 = np.array(holdout1['label'])

X_hol2 = _concat_year_texts(holdout2)
y_hol2 = np.array(holdout2['label'])

# also store the company IDs for analysis later on
cik_1 = np.array(holdout1['cik'])
cik_2 = np.array(holdout2['cik'])

In [None]:
# Fit the vectoriser on the training texts and build the training document-term matrix.
# NOTE(review): despite its name, `feature_constructor` holds the transformed
# matrix returned by fit_transform, not the vectoriser itself.
feature_constructor = feature.fit_transform(X_train)

In [None]:
# Scale each feature to unit variance; with_mean=False skips centring, which
# would otherwise destroy the sparsity of the matrix.
scaler = StandardScaler(with_mean=False).fit(feature_constructor)
data = scaler.transform(feature_constructor)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
# Scratch cell: builds a selector only to display its repr; the object is not stored.
SelectKBest(k='all')

In [None]:
# k='all' keeps every column, so this step currently performs no actual selection.
selector = SelectKBest(k='all')
x = selector.fit_transform(data, y_train)

In [None]:
# Densify and print row 9 of the selected feature matrix for a quick look.
print(x[9, :].toarray())

In [None]:
# Sweep the regularisation grid. C_range comes from the configuration cell near
# the top of the notebook; it is deliberately not redefined here so that there is
# a single place to edit it.
# NOTE(review): the score is computed on the same data the model was fitted on,
# so it measures fit, not generalisation.
for C in C_range:
    clf = LogisticRegression(C=C, penalty='l2', solver='lbfgs', max_iter=100).fit(x, y_train)
    predictions = clf.predict_proba(x)
    # Column 1 is the probability of the positive class (label 1); the name
    # `predictions_0` is kept only for compatibility with the rest of the notebook.
    predictions_0 = predictions[:, 1]

    print(f'C={C}: train ROC AUC = {roc_auc_score(y_train, predictions_0)}')

In [None]:
# Hard class predictions of the last fitted model for the first 100 training rows.
clf.predict(x[:100])

In [None]:
# roc_auc_score expects (y_true, y_score): the true labels go first. The score is
# the predicted probability of class 1, consistent with the sweep above, rather
# than hard 0/1 predictions, which collapse the ROC curve to a single point.
roc_auc_score(y_train, clf.predict_proba(x)[:, 1])

In [None]:
# True labels of the first 100 training rows, to compare with the predictions above.
y_train[0:100]