In [None]:
from scipy import io as sio
import pandas as pd
import numpy as np

# The Q and B matrices are processed one at a time: point filePath at the one to handle.
filePath = "./data/QMatrix_label.csv"

# Tag reused in the names of the files read and written below.
# NOTE(review): the test scans the whole path string, so a 'B' in any
# directory name also selects 'B' - confirm that is acceptable.
dataType = 'B' if 'B' in filePath else 'Q'

# Strategy name passed to the imputer further down.
impute_strategy = 'mean'

df = pd.read_csv(filePath)

In [None]:
# Preview the first few rows to see what the data looks like
# (head() keeps the stored notebook output small).
df.head()

In [None]:
# Read the labels from the second column (position 1).
# .iloc is used because .ix was deprecated in pandas 0.20 and removed in 1.0.
labels = df.iloc[:, 1].values

In [None]:
# Encode the alphabetic smoking sub-groups as integers.
# np.unique returns the distinct values sorted, so codes follow that sorted order.
subgroupColumn = df['Smoking_Sub_Group']
sortedSubgroups = np.unique(subgroupColumn)
mappingSub = dict(zip(sortedSubgroups, range(len(sortedSubgroups))))

# Replace the column with its integer codes.
numericSub = np.array([mappingSub[subgroup] for subgroup in subgroupColumn])
df['Smoking_Sub_Group'] = numericSub

In [None]:
# Encode the string labels as integers; codes follow the sorted order
# of the distinct label values returned by np.unique.
sortedLabelNames = np.unique(labels)
mappingLabels = dict(zip(sortedLabelNames, range(len(sortedLabelNames))))
numericLabels = np.array([mappingLabels[name] for name in labels])

In [None]:
# Remove the first two columns (the second one was already captured as `labels`).
# The drop mutates df in place, so re-running this cell removes two MORE columns;
# re-run from the cell that loads the CSV instead.
leadingColumns = df.columns.take([0, 1])
df.drop(labels=leadingColumns, axis=1, inplace=True)

In [None]:
# Names of the remaining columns, converted to a unicode string array
featureIndex = df.columns
columnNames = featureIndex.values.astype('U')

# Raw values of the remaining columns; this is the matrix the imputer receives
dataToBeImputed = df.values

## Summarise the data: for each feature and each label, compute the NA rate and keep the non-NA values for further analysis (e.g. box plots)

In [None]:
# Build `info`: for every feature column and every label code (0, 1, 2) store
#   'NA rate'     - fraction of that label's rows where the feature is missing
#   'non-NA data' - the observed values of the feature for that label
# plus a 'labelMap' entry naming the label codes.

# Row positions of each label. They do not depend on the column, so they are
# computed once here instead of once per column.
rowsByLabel = {label: np.where(numericLabels == label)[0] for label in range(3)}

info = {}
for column, name in enumerate(columnNames):
    if column % 100 == 0:
        print(column)  # progress indicator
    # Positional access (.iloc) replaces .ix, which was removed in pandas 1.0.
    featureValues = df.iloc[:, column]
    info[name] = {}
    for label in range(3):
        temp = featureValues.iloc[rowsByLabel[label]]
        isMissing = temp.isnull().values
        info[name][label] = {
            # Mean of a boolean mask == missing count / row count.
            'NA rate': isMissing.mean(),
            'non-NA data': temp.values[~isMissing],
        }

# NOTE(review): these names are hard-coded while the codes come from
# np.unique(labels); confirm the sorted label strings match this order.
info['labelMap'] = {0:'Acos',1:'Asthma',2:'COPD'}

In [None]:
# Persist the summary dictionary. np.save pickles non-array objects such as
# this dict, so reading it back needs np.load(..., allow_pickle=True).item().
infoPath = './data/Info_' + dataType + '.npy'
np.save(infoPath, info)

## Data imputation with a simple mean/median strategy (more advanced methods will be added as well)

In [None]:
from sklearn.impute import SimpleImputer

# SimpleImputer replaces sklearn.preprocessing.Imputer (deprecated in
# scikit-learn 0.20, removed in 0.22). It always imputes per column, which is
# what axis=0 selected in the old API.
imp = SimpleImputer(missing_values=np.nan, strategy=impute_strategy)
imp.fit(dataToBeImputed)
imputedData = imp.transform(dataToBeImputed)

# By default a column with no observed value at all is discarded by the imputer.
# That would silently shift every later feature relative to columnNames, so
# fail loudly instead of saving misaligned data.
if imputedData.shape[1] != dataToBeImputed.shape[1]:
    raise ValueError(
        "Imputation dropped %d all-NaN column(s); remove them from the input "
        "so that the feature names stay aligned with the data."
        % (dataToBeImputed.shape[1] - imputedData.shape[1])
    )

## Scale the data from 0 to 1

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the imputed matrix, then rescale that
# same matrix into the (0, 1) feature range.
mm = MinMaxScaler(feature_range=(0, 1))
mm.fit(imputedData)
scaledData = mm.transform(imputedData)

In [None]:
# Load the patient list written by the previous notebook.
# presumably one entry per row of the data matrix, in the same order - verify.
patientListPath = "./data/patientList" + dataType + ".npy"
patientList = np.load(patientListPath)

In [None]:
# Write the full three-label dataset to a MATLAB file.
# NOTE(review): the file name always says "mean", even when impute_strategy
# is set to something else.
threeLabelPath = "./data/" + dataType + "_3labels_mean_scaled.mat"
threeLabelPayload = {
    'X': scaledData,
    'Y': numericLabels,
    'patients': patientList,
    'columnNames': columnNames,
}
sio.savemat(threeLabelPath, threeLabelPayload)

In [None]:
# Build one two-class dataset per pair of labels by leaving one label out.
# labels2_list[i] names the pair that remains once label code i is removed
# (codes 0/1/2 are presumably Acos/Asthma/COPD - verify against the label mapping).
labels2_list = ['AsthmaCOPD','AcosCOPD','AcosAsthma']
for i, pairName in enumerate(labels2_list):
    # np.where returns a 1-tuple of arrays; take the array itself. Indexing the
    # 2-D matrix with the tuple (scaledData[(idx,), :]) would add a leading axis
    # and save X with shape (1, n_rows, n_features) instead of (n_rows, n_features).
    indexes = np.where(numericLabels != i)[0]

    # Re-code the two remaining labels as 0 and 1.
    Y = numericLabels[indexes]
    if i == 0:
        Y = Y - 1                       # {1, 2} -> {0, 1}
    elif i == 1:
        Y = (Y != 0).astype(Y.dtype)    # {0, 2} -> {0, 1}
    # i == 2 leaves {0, 1} unchanged.

    X, p = scaledData[indexes, :], patientList[indexes]

    # NOTE(review): the file name always says "mean", whatever impute_strategy is.
    sio.savemat("./data/" + dataType + "_" + pairName + "_mean_scaled.mat",
                {'X': X, 'Y': Y, 'patients': p, 'columnNames': columnNames})