In [3]:
%matplotlib inline
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.cross_validation import LabelKFold
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.linear_model import RandomizedLogisticRegression

# models
from sklearn import linear_model

from FeatureExtractor.fraser_feature_set import get_top_50
from FeatureExtractor.domain_adaptation import expand_feature_space
# hide depreciation warning (sklearn)
# ====================
import warnings
warnings.filterwarnings('ignore')

# ====================
# globals
REGULARIZATION_CONSTANT = 1

# ------------------
# Diagnosis keys
# - Control
# - MCI
# - Memory
# - Other
# - PossibleAD
# - ProbableAD
# - Vascular
# ------------------

ALZHEIMERS     = ["PossibleAD", "ProbableAD"]
CONTROL        = ['Control']
NON_ALZHEIMERS = ["MCI", "Memory", "Other", "Vascular"]

# ======================
# setup mysql connection
# ----------------------
USER   = 'dementia'
PASSWD = 'Dementia123!'
DB     = 'dementia'
# url = 'mysql://%s:%s@127.0.0.1/%s' % (USER, PASSWD, DB) 
url = 'mysql://%s:%s@localhost/%s' % (USER, PASSWD, DB) 
engine = create_engine(url)
cnx = engine.connect()
# ======================

# ------------------
# Helper functions
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')
    

def get_top_pearson_features(X,y,n):
    df = pd.DataFrame(X).apply(pd.to_numeric)
    df['y'] = y
    corr_coeff = df.corr()['y'].abs().sort(inplace=False, ascending=False)
    return corr_coeff.index.values[1:n+1].astype(int)

    
def get_data(lexical_table_name, diagnosis, features=None):
    # Read from sql
    lexical  = pd.read_sql_table(lexical_table_name, cnx)
    acoustic = pd.read_sql_table("dbank_acoustic", cnx)
    diag     = pd.read_sql_table("diagnosis", cnx)
    demo     = pd.read_sql_table("demographic_imputed", cnx)

    # Merge
    fv = pd.merge(lexical, acoustic, on=['interview'])

    # Select diagnosis
    diag = diag[diag['diagnosis'].isin(diagnosis)]
    fv = pd.merge(fv,diag)
    # Impute
    # demo['age'].fillna(demo['age'].mean(), inplace=True)

    # # Add demographics
    fv = pd.merge(fv,demo)
    
    # Randomize
    fv = fv.sample(frac=1,random_state=20)

    # Collect Labels 
    labels = [label[:3] for label in fv['interview']]
    # Split 
    y = fv['dementia'].astype('bool')
    # Clean 
    X = fv.drop(['dementia', 'level_0', 'interview', 'diagnosis', 'gender'], 1)

    X = X.apply(pd.to_numeric, errors='ignore')

    if features:
        X = X[features]
    # Return
    return X, y, labels

In [4]:
target = ALZHEIMERS + CONTROL
dbname = "dbank_lexical"
X, y, labels  = get_data(dbname, target)

In [5]:
# Split into folds using labels 
label_kfold = LabelKFold(labels, n_folds=10)

In [6]:
model = linear_model.LogisticRegression(penalty='l2', C=REGULARIZATION_CONSTANT)

In [9]:
model_fs = RandomizedLogisticRegression(C=1)
scores      = []
feat_number = []
for train_index, test_index in label_kfold:
    # Split
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]

    indices = get_top_pearson_features(X_train, y_train, 100)

#     # Select k features 
#     X_train = X_train[:,indices]
#     X_test  = X_test[:,indices]

    model_fs = model_fs.fit(X_train, y_train)
#     print model_fs.scores_
#     print X.columns[model_fs.get_support()]
#     print model_fs.scores_[model_fs.get_support(indices=True)]

    X_train = model_fs.transform(X_train)
    X_test  = model_fs.transform(X_test)

    feat_number.append(X_train.shape[1])

    model.fit(X_train, y_train)
    yhat  = model.predict(X_test)              
    scores.append(accuracy_score(y_test, yhat))
    import pdb; pdb.set_trace()
    print "randomizedLR"
    for f in sorted(zip(map(lambda x: round(x, 4), model_fs.scores_), X.columns), reverse=True):
        print f

    print "Pearson"
    for f in X.columns[indices]:
        print f
    
    

print "Avg. features: %f" % np.mean(feat_number)
print np.mean(scores)


> <ipython-input-9-9ed60eaf02e5>(29)<module>()
-> print "randomizedLR"
(Pdb) model_fs.scores_
array([ 0.175,  0.54 ,  0.   ,  0.   ,  0.   ,  0.225,  0.425,  0.   ,
        0.135,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.035,  0.005,  0.005,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.63 ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.085,  0.545,  0.   ,  0.   ,  0.17 ,  0.   ,  0.005,
        0.38 ,  0.   ,  0.035,  0.145,  0.   ,  0.   ,  0.09 ,  0.03 ,
        0.115,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.22 ,  0.   ,  0.065,  0.01 ,  0.005,  0.   ,  0.   ,  0.   ,
        0.205,  0.12 ,  0.   ,  0.   ,  0.06 ,  0.425,  0.315,  0.   ,
        0.345,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.02 ,  0.065,  0.535,  0.105,  0.02 ,  0.135,  0.   ,  0.41 ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.43 ,  0.49 ,

KeyboardInterrupt: 

In [10]:
model_fs.scores_

array([ 0.175,  0.54 ,  0.   ,  0.   ,  0.   ,  0.225,  0.425,  0.   ,
        0.135,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.035,  0.005,  0.005,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.63 ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.085,  0.545,  0.   ,  0.   ,  0.17 ,  0.   ,  0.005,
        0.38 ,  0.   ,  0.035,  0.145,  0.   ,  0.   ,  0.09 ,  0.03 ,
        0.115,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.22 ,  0.   ,  0.065,  0.01 ,  0.005,  0.   ,  0.   ,  0.   ,
        0.205,  0.12 ,  0.   ,  0.   ,  0.06 ,  0.425,  0.315,  0.   ,
        0.345,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.02 ,  0.065,  0.535,  0.105,  0.02 ,  0.135,  0.   ,  0.41 ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.43 ,  0.49 ,  0.   ,  0.49 ,
        0.305,  0.   ,  0.   ,  0.515,  0.335,  0.   ,  0.02 ,  0.065,
      

In [34]:
f = sorted(zip(X.columns, map(lambda x: round(x, 4), model_fs.scores_)), reverse=True,  key=lambda x: x[1])
df = pd.DataFrame(f)
df

Unnamed: 0,0,1
0,age,0.925
1,MeanWordLength,0.630
2,NP_to_PRP,0.545
3,ADVP,0.540
4,binaryIUActionStoolFalling,0.535
5,binaryIUSubjectGirl,0.515
6,keywordIUSubjectWoman,0.515
7,binaryIUObjectStool,0.490
8,binaryIUObjectWindow,0.490
9,mfcc5_kurtosis,0.490
