<a href="https://colab.research.google.com/github/zchuning/PublicSchoolDataAnalysis/blob/master/project_bankruptcy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup and Data Loading



In [1]:
# Install the third-party packages that the import cell below relies on.
!pip install numpy
!pip install pandas
!pip install scipy
!pip install scikit-learn



### Loading data into Colab

1.   Open the file browsing menu by clicking on the tab with a right-pointing
arrow.
2.   Navigate to each directory's parent until you reach the root.
3.   Upload "2year.arff" (the file that the loading cell below reads from the root directory).
4.   Run the code blocks below.



In [0]:
# Relevant module imports are all located here.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from scipy.io import arff

In [0]:
# Load the bankruptcy data from the ARFF file into a single float NumPy array.
# arff.loadarff returns the records together with the file's metadata; going
# through tolist() turns the records into a plain 2-D array.
data, meta = arff.loadarff('/2year.arff')
# np.float was only an alias of the builtin float and was removed in
# NumPy 1.24; np.float64 is the same dtype and works on every version.
data = np.array(data.tolist(), dtype=np.float64)

# Every column but the last goes into X; the last column becomes y
# (presumably the class label, given how y is compared to 0/1 further down).
X = data[:, :-1]
y = data[:, -1]

# Data Imputation

In [0]:
# Zero imputation function
def zeroImpute(X_miss):
  '''
  Builds a zero-filled version of X_miss without modifying X_miss itself.

  Returns :
  X_imputed, same shape as X_miss, where every NaN has been replaced by 0.
  Because np.nan_to_num is used, infinite entries are also replaced (by very
  large finite numbers of the matching sign).
  '''
  X_copy = X_miss.copy()
  X_imputed = np.nan_to_num(X_copy)
  return X_imputed

In [0]:
# Regression imputation function
def regressedImpute(X_baseImputed, X_miss):
  '''
  Replaces the missing entries of each column with linear-regression predictions.

  For every column j that has at least one missing entry in X_miss, a
  LinearRegression is fit on the rows where column j is observed, using all
  other columns of X_baseImputed as predictors and column j as the target.
  The fitted model then predicts column j for the rows where it is missing.
  Predictions are always computed from X_baseImputed, so the result does not
  depend on the order in which columns are processed.

  Parameters :
    X_baseImputed : 2-D array holding a previous imputation of X_miss (no
      NaNs), same shape as X_miss. It is not modified.
    X_miss : 2-D array in which NaN marks a missing entry.

  Returns :
    X_imputed, a copy of X_baseImputed in which the entries that are missing
    in X_miss hold the regressed values. Same shape as X_miss.
  '''
  X_imputed = X_baseImputed.copy()
  missing = np.isnan(X_miss)
  n_features = X_baseImputed.shape[1]

  for j in range(n_features):
    missing_j = missing[:, j]

    # Nothing to impute in this column, so fitting a model would be wasted work.
    if not missing_j.any():
      continue
    # No observed value to learn from: fitting on an empty training set would
    # fail, so the base-imputed values are kept for this column.
    if missing_j.all():
      continue

    # Build model for current column from the rows where it is observed
    selector = [x for x in range(n_features) if x != j]
    predictors = X_baseImputed[:, selector]
    observed_j = ~missing_j
    col_reg = LinearRegression().fit(predictors[observed_j],
                                     X_baseImputed[observed_j, j])

    # One batched predict per column instead of one call per missing cell
    X_imputed[missing_j, j] = col_reg.predict(predictors[missing_j])

  return X_imputed

In [0]:
# Epoch regression imputation
def impute(X_miss, epochs=2):
  '''
  Imputes the missing values of X_miss by repeated regression passes.

  The starting point is a zero-imputed copy of X_miss; each epoch feeds the
  current estimate back into regressedImpute together with the original
  X_miss, so the missing positions are re-estimated from the latest values.

  Returns :
  The estimate after `epochs` passes (the zero-imputed data if epochs is 0).
  '''
  estimate = zeroImpute(X_miss.copy())
  for _epoch in range(epochs):
    estimate = regressedImpute(estimate, X_miss)
  return estimate

In [0]:
# Fill in the missing entries of X, using impute's default number of epochs.
X_imputed = impute(X)

In [0]:
# Per-class 70/30 split: the rows of each class are cut separately, in their
# existing order (no shuffling), so both classes appear in train and test.
# NOTE(review): this splits the raw X, not X_imputed -- confirm that is intended.
pos_mask = (y == 1)
neg_mask = (y == 0)

X_pos, y_pos = X[pos_mask], y[pos_mask]
X_neg, y_neg = X[neg_mask], y[neg_mask]

# Index of the first test row within each class.
pos_split = int(len(y_pos) * 0.7)
neg_split = int(len(y_neg) * 0.7)

# Negatives come first, then positives, in both the train and the test set.
X_train = np.concatenate([X_neg[:neg_split], X_pos[:pos_split]])
y_train = np.concatenate([y_neg[:neg_split], y_pos[:pos_split]])

X_test = np.concatenate([X_neg[neg_split:], X_pos[pos_split:]])
y_test = np.concatenate([y_neg[neg_split:], y_pos[pos_split:]])