In [5]:
import pandas as pd
import numpy as np

data = pd.DataFrame()

# reading in each of our ten files
for i in range(1, 11):
    filename = f"0001-Data-{i}.tsv"
    
    # reading visit file into a dataframe
    with open(filename, "r") as infile:
        visit_data = pd.read_csv(infile, sep = "\t")
        
        # creating a column for the visit number
        visit_data["visit"] = i
        
        # removing the visit number from the end of each attribute name
        for col in visit_data.columns:
            if col[-len(str(i)):] == str(i):
                visit_data.rename(columns={col:col[:-len(str(i))]}, inplace=True)
        
        # adding this visit data back to the merged dataframe
        data = pd.concat([data, visit_data], ignore_index = True)

In [6]:
null_cap = 5000

# replacing blank cells with NaN
data.replace(r'^\s*$', np.nan, regex=True, inplace = True)
data.replace("-1", np.nan, inplace = True)

for col in data.columns:
    # removing columns about family age
    if "AGE" in col and col != "AGE":
        del data[col]

    # removing columns with > 75% NaN values
    if col in data:
        nas = data[col].isna().sum()
        if nas > null_cap:
            del data[col]

       SWANID  VISIT INTDAY AGE LANGINT  RACE PREGNAN PREVBLO EATDRIN STRTPER  \
0       10046      1    413  53       1     2       1       1       1       2   
1       10056      1    357  52       1     4       1               1       2   
2       10092      1    364  46       1     4       1               1       1   
3       10126      1    442  50       1     1       1               1       2   
4       10153      1    374  52       1     3       1               1       2   
...       ...    ...    ...  ..     ...   ...     ...     ...     ...     ...   
25482   99805     10   3661  52       1     1       1     NaN       1       1   
25483   99809     10   3624  53       1     4       1     NaN       1       2   
25484   99888     10   3661  58       1     3       1     NaN       1       1   
25485   99898     10   3626  55       1     4       1     NaN       1       1   
25486   99962     10   3647  57       3     2       1     NaN       1       1   

       ... MEALBAR INSTSHK 

In [7]:
data.head()

Unnamed: 0,SWANID,VISIT,INTDAY,AGE,LANGINT,RACE,PREGNAN,EATDRIN,STRTPER,BLDRWAT,...,T,FLGCV,FLGDIF,SPSCTIM,HPSCTIM,STATUS,visit,ENDO,PROLAPS,ABBLEED
0,10046,1,413,53,1,2,1,1,2,1,...,41.9,0,0,0:10:08,0:09:52,7,1,,,
1,10056,1,357,52,1,4,1,1,2,1,...,26.8,0,0,0:12:55,0:13:03,4,1,,,
2,10092,1,364,46,1,4,1,1,1,1,...,57.9,0,0,0:18:01,0:17:39,4,1,,,
3,10126,1,442,50,1,1,1,1,2,1,...,40.9,0,0,.,.,4,1,,,
4,10153,1,374,52,1,3,1,1,2,1,...,38.6,0,0,0:10:47,0:10:40,4,1,,,


In [8]:
# dropping rows with nulls as lasso cannot use them
data.dropna(axis = 0, how = "any", inplace=True)

In [9]:
# convert nums stored as strings to numbers!!!!!!!!!
non_num = ["READSPE", "THNKSPE", "LANGSPE", "SPEKSPE", "PROGSPE", "SPORT1", "STPTIM1", 
        "SPORT2", "LANGFFQ", "SPSCTIM", "HPSCTIM", "STRTIM1", "STRTIM2", "STRTIM3", "STPTIM2", "STPTIM3"]
for col in non_num:
    if col in data.columns:
        del data[col]
    
"""
# finding non numeric columns

for col in list(data.columns):
    print(col)
    for item in data[col]:
        it = float(item)
"""        
        
data = data.apply(pd.to_numeric)

In [10]:
import numpy as np

# store age and ids in separate dataframe
y = pd.DataFrame({"AGE": data["AGE"], "SWANID": data["SWANID"], "VISIT": data["VISIT"]})

x = data

# remove predicted feature
x = x.drop(["AGE"], axis=1)
x = x.drop(["VISIT"], axis=1)

# select only numeric columns to use for lasso
x = x.select_dtypes(include = np.number)
x

Unnamed: 0,SWANID,INTDAY,LANGINT,RACE,PREGNAN,EATDRIN,STRTPER,BLDRWAT,BLDDRAW,ANTICO1,...,FSH,SHBG,T,FLGCV,FLGDIF,STATUS,visit,ENDO,PROLAPS,ABBLEED
2881,10046,777,1,2,1,1,2,1,2,1,...,30.4,44.9,61.1,1,0,7,2,1,1,1
2883,10126,757,1,1,1,1,1,2,2,1,...,144.4,53.5,39.8,0,0,4,2,1,1,1
2884,10153,729,1,3,1,1,1,1,2,1,...,95.3,24.5,43.0,0,0,3,2,1,1,1
2885,10196,726,1,2,1,1,1,1,2,1,...,87.7,47.1,25.0,0,0,4,2,1,1,1
2886,10245,661,1,4,1,1,1,2,2,1,...,48.5,40.5,32.3,0,0,4,2,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23237,99805,3389,1,1,1,1,1,2,2,1,...,90.5,10.8,76.4,0,0,1,9,1,1,1
23238,99809,3240,1,4,1,1,2,1,2,1,...,35.1,48.5,25.0,0,0,4,9,1,1,1
23239,99888,3307,1,3,1,1,1,1,2,1,...,65.7,41.8,49.2,0,0,2,9,1,1,1
23240,99898,3296,1,4,1,1,1,2,2,1,...,40.1,27.0,42.4,0,0,1,9,1,1,1


In [11]:
from sklearn.model_selection import train_test_split

# split data by train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=0)

In [12]:
#from sklearn.preprocessing import StandardScaler

#list_numerical = x.columns

#scaler = StandardScaler().fit(x_train[list_numerical]) 
#x_train[list_numerical] = scaler.transform(x_train[list_numerical])
#x_test[list_numerical] = scaler.transform(x_test[list_numerical])

In [13]:
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

alphas = np.linspace(0.01,500,100)
coefs = []
r2s = []

for i in alphas:
    lasso = Lasso(alpha = i)

    # fit lasso to data
    y_pred_lasso = lasso.fit(x_train, y_train).predict(x_test)
    r2_score_lasso = r2_score(y_test, y_pred_lasso)
    r2s.append(r2_score_lasso)
    
    coefs.append(lasso.coef_)

    #print(lasso)
    #print("r^2 on test data : %f" % r2_score_lasso, "\n")
"""
ax = plt.gca()

ax.plot(alphas, coefs)
ax.set_xscale("log")
plt.axis("tight")
plt.xlabel("alpha")
plt.ylabel("Standardized Coefficients")
plt.title("Lasso coefficients as a function of alpha")
"""

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


'\nax = plt.gca()\n\nax.plot(alphas, coefs)\nax.set_xscale("log")\nplt.axis("tight")\nplt.xlabel("alpha")\nplt.ylabel("Standardized Coefficients")\nplt.title("Lasso coefficients as a function of alpha")\n'

In [14]:
from sklearn.linear_model import MultiTaskLassoCV

# Lasso with 5 fold cross-validation
model = MultiTaskLassoCV(cv=5, random_state=0, max_iter=10000)

# Fit model
model.fit(x_train, y_train)

MultiTaskLassoCV(cv=5, max_iter=10000, random_state=0)

In [15]:
model.alpha_

659425.3891152552

In [16]:
lasso_best = Lasso(alpha = model.alpha_)
lasso_best.fit(x_train, y_train)
print(lasso_best.coef_)
print(list(zip(lasso_best.coef_, x)))
x

[[-0.     0.     0.    -0.     0.    -0.    -0.    -0.     0.     0.
   0.     0.     0.     0.     0.     0.     0.    -0.    -0.    -0.
   0.     0.     0.     0.    -0.     0.     0.     0.     0.    -0.
  -0.     0.     0.     0.    -0.     0.     0.     0.     0.     0.
   0.     0.     0.    -0.    -0.     0.    -0.     0.    -0.    -0.
   0.    -0.    -0.    -0.     0.    -0.    -0.    -0.     0.    -0.
  -0.     0.     0.    -0.    -0.    -0.     0.    -0.    -0.    -0.
  -0.    -0.    -0.     0.    -0.     0.     0.    -0.     0.     0.
   0.    -0.     0.     0.     0.     0.    -0.    -0.     0.     0.
   0.     0.     0.     0.    -0.    -0.    -0.    -0.     0.    -0.
   0.    -0.    -0.     0.     0.     0.     0.    -0.    -0.    -0.
  -0.    -0.    -0.    -0.    -0.    -0.     0.     0.    -0.    -0.
  -0.    -0.     0.     0.     0.     0.    -0.     0.     0.     0.
   0.    -0.    -0.    -0.     0.     0.     0.     0.     0.     0.
   0.    -0.    -0.     0.     0. 

Unnamed: 0,SWANID,INTDAY,LANGINT,RACE,PREGNAN,EATDRIN,STRTPER,BLDRWAT,BLDDRAW,ANTICO1,...,FSH,SHBG,T,FLGCV,FLGDIF,STATUS,visit,ENDO,PROLAPS,ABBLEED
2881,10046,777,1,2,1,1,2,1,2,1,...,30.4,44.9,61.1,1,0,7,2,1,1,1
2883,10126,757,1,1,1,1,1,2,2,1,...,144.4,53.5,39.8,0,0,4,2,1,1,1
2884,10153,729,1,3,1,1,1,1,2,1,...,95.3,24.5,43.0,0,0,3,2,1,1,1
2885,10196,726,1,2,1,1,1,1,2,1,...,87.7,47.1,25.0,0,0,4,2,1,1,1
2886,10245,661,1,4,1,1,1,2,2,1,...,48.5,40.5,32.3,0,0,4,2,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23237,99805,3389,1,1,1,1,1,2,2,1,...,90.5,10.8,76.4,0,0,1,9,1,1,1
23238,99809,3240,1,4,1,1,2,1,2,1,...,35.1,48.5,25.0,0,0,4,9,1,1,1
23239,99888,3307,1,3,1,1,1,1,2,1,...,65.7,41.8,49.2,0,0,2,9,1,1,1
23240,99898,3296,1,4,1,1,1,2,2,1,...,40.1,27.0,42.4,0,0,1,9,1,1,1


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# retrieving x and y features from dataframe
x = data
x = x.drop(["HIGHBP"], axis = 1)

y = data.loc[:, "HIGHBP"].values

# initializing random forest classifier
rf_clf = RandomForestClassifier(n_estimators = 100, max_depth = 3)

# split data into training / test data
kfold = StratifiedKFold(n_splits = 10)
y_pred = np.empty_like(y)
for train_idx, test_idx in kfold.split(x, y):
    
        # testing and training speficied parts of data
        x_train = x[train_idx, :]
        x_test = x[test_idx, :]
        y_true_train = y[train_idx]

        # fitting classifier to training data
        rf_clf.fit(x_train, y_true_train)

        # estimate each test properties price range
        y_pred[test_idx] = rf_clf.predict(x_test)

TypeError: '(array([ 1153,  1154,  1155, ..., 12222, 12223, 12224]), slice(None, None, None))' is an invalid key

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

sns.set()

# creating confusion matrix for forest classifier
conf_mat = confusion_matrix(y_true = y, y_pred = y_pred)

# displaying confusion matrix
conf_mat_disp = ConfusionMatrixDisplay(conf_mat, display_labels = np.unique(y))
conf_mat_disp.plot()

# adjusting graph
plt.gcf().set_size_inches(10, 10)
plt.grid(False)

In [None]:
def plot_feat_import(feat_list, feat_import, sort=True, limit=None):
    """ plots feature importances in a horizontal bar chart
    
    Args:
        feat_list (list): str names of features
        feat_import (np.array): feature importances (mean gini reduce)
        sort (bool): if True, sorts features in decreasing importance
            from top to bottom of plot
        limit (int): if passed, limits the number of features shown
            to this value    
    """
    
    if sort:
        # sort features in decreasing importance
        idx = np.argsort(feat_import).astype(int)
        feat_list = [feat_list[_idx] for _idx in idx]
        feat_import = feat_import[idx] 
        
    if limit is not None:
        # limit to the first limit feature
        feat_list = feat_list[:limit]
        feat_import = feat_import[:limit]
    
    # plot and label feature importance
    plt.barh(feat_list, feat_import)
    plt.gcf().set_size_inches(5, len(feat_list) / 2)
    plt.xlabel("Feature importance\n(Mean decrease in Gini across all Decision Trees)")

In [None]:
# plotting property feature importance graph
plot_feat_import(x_feat_list, rf_clf.feature_importances_, limit=10)