In [3]:
import pandas as pd
import numpy as np

from dsbox.datapreprocessing.cleaner import Imputation
from dsbox.datapreprocessing.cleaner import encoder
from dsbox.datapreprocessing.cleaner import missing_value_pred as mvp

# PART1: get mutual information
# Load the original training features and the class labels.
data_path = "../dsbox-data/o_38/original/data/"
data_name = data_path + "trainData.csv"
label_name = data_path + "trainTargets.csv"  # the labels are taken from the "Class" column of this file

# Per the cell output, encode() one-hot encodes the object/categorical columns.
data = encoder.encode(data_name)
label = encoder.text2int(pd.read_csv(label_name)["Class"])

# Remove columns that should not take part in the analysis:
#   TBG_measured - holds the same value in every row, so it is useless
#   d3mIndex     - presumably only a row identifier, not a feature (TODO confirm)
data = data.drop(["TBG_measured", "d3mIndex"], axis=1)

from dsbox.datapreprocessing.cleaner import bayesTree

def top_weighted_edges(g, node, k=5):
    """Return the `k` heaviest edges incident to `node`.

    Parameters
    ----------
    g : graph object exposing ``edges(node, data=True)`` that yields
        ``(u, v, attrs)`` tuples whose ``attrs`` dict has a 'weight' entry.
    node : label of the node whose incident edges are ranked.
    k : int, number of edges to keep (default 5).

    Returns
    -------
    list of ``(u, v, attrs)`` tuples ordered by ``attrs['weight']``, largest first.
    """
    incident = g.edges(node, data=True)
    return sorted(incident, key=lambda edge: edge[2]['weight'], reverse=True)[:k]


graph = bayesTree.build_complete_graph(data)

# Columns inspected in PART1, in the order their results are printed.
part1_columns = ["FTI", "TSH", "T4U", "T3", "TT4", "age"]

# print all
# One loop replaces six copy-pasted stanzas; the top edges of every column
# stay available afterwards in `part1_edges`, keyed by column name.
# NOTE(review): the edge weight is presumably the pairwise mutual information
# computed by bayesTree - confirm against the dsbox implementation.
part1_edges = {}
for column in part1_columns:
    part1_edges[column] = top_weighted_edges(graph, column)
    # Single-argument print(...) behaves identically on Python 2 and Python 3.
    print(part1_edges[column])


  age
0  55
1  60
2  53
3  38
4  45


...Delete *sex* column: object/other category.
...Insert columns to onehot encode *sex*.
...Delete *on_thyroxine* column: object/other category.
...Insert columns to onehot encode *on_thyroxine*.
...Delete *query_on_thyroxine* column: object/other category.
...Insert columns to onehot encode *query_on_thyroxine*.
...Delete *on_antithyroid_medication* column: object/other category.
...Insert columns to onehot encode *on_antithyroid_medication*.
...Delete *sick* column: object/other category.
...Insert columns to onehot encode *sick*.
...Delete *pregnant* column: object/other category.
...Insert columns to onehot encode *pregnant*.
...Delete *thyroid_surgery* column: object/other category.
...Insert columns to onehot encode *thyroid_surgery*.
...Delete *I131_treatment* column: object/other category.
...Insert columns to onehot encode *I131_treatment*.
...Delete *query_hypothyroid* column: object/other category.
...Insert columns to onehot encode *qu

In [13]:
# PART2: pearson correlation
import pandas as pd
from sklearn.preprocessing import scale

# Pairwise Pearson correlation between all columns of `data`.
pearson_correlation = data.corr("pearson")

# part2_result maps each column of interest to the names of the 5 other
# columns with the largest absolute Pearson correlation.
part2_result = {}
missing_col_name = ["FTI", "TSH", "T4U", "T3", "TT4", "age"]
for each in missing_col_name:
    # Remove the column's own entry by label before ranking. Skipping the
    # first sorted row ([1:6]) is only right when the self-correlation sorts
    # strictly first; a perfectly collinear column (|r| == 1) ties with it and
    # could leave `each` inside its own top-5 while pushing a real neighbour out.
    each_result = (
        pearson_correlation[each]
        .drop(each)
        .abs()
        .sort_values(ascending=False)
        .head(5)
    )
    part2_result[each] = each_result.index.tolist()
    # Single-argument print(...) behaves identically on Python 2 and Python 3.
    print(each_result)

TT4             0.790391
T3              0.342407
TSH             0.318614
on_thyroxine    0.201717
T4U             0.187939
Name: FTI, dtype: float64
FTI    0.318614
TT4    0.281660
T3     0.171399
T4U    0.082166
age    0.064178
Name: TSH, dtype: float64
T3                      0.451123
TT4                     0.424056
pregnant                0.338915
referral_source_STMW    0.310362
sex_M                   0.235473
Name: T4U, dtype: float64
TT4                    0.551944
T4U                    0.451123
FTI                    0.342407
referral_source_SVI    0.300079
age                    0.229938
Name: T3, dtype: float64
FTI             0.790391
T3              0.551944
T4U             0.424056
TSH             0.281660
on_thyroxine    0.222697
Name: TT4, dtype: float64
referral_source_SVI      0.285744
T3                       0.229938
referral_source_STMW     0.179420
T4U                      0.162862
referral_source_other    0.132033
Name: age, dtype: float64


In [7]:
# PART3: model coeff
# todo: maybe need to scale the data first?
# NOTE(review): the file loaded below is named "trainData_scaled.csv", which
# suggests the features are already scaled - confirm and drop the todo if so.
# NOTE: this cell rebinds `data` and `label`, so any cell executed afterwards
# works on this version of the data.
data_path = "../dsbox-data_my/o_38/data/"
data_name = data_path + "trainData_scaled.csv"
label_name = data_path + "trainTargets.csv"  # the labels are taken from the "Class" column of this file

# Per the cell output, encode() one-hot encodes the object/categorical columns.
data = encoder.encode(data_name)
label = encoder.text2int(pd.read_csv(label_name)["Class"])

# Remove columns that should not take part in the analysis:
#   TBG_measured - holds the same value in every row, so it is useless
#   d3mIndex     - presumably only a row identifier, not a feature (TODO confirm)
data = data.drop(["TBG_measured", "d3mIndex"], axis=1)


from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score, make_scorer

# Macro-averaged F1 is the score handed to the imputer.
# (make_scorer multiplies the score by -1 only when greater_is_better=False;
# the default is kept here.)
scorer = make_scorer(f1_score, average="macro")
clf = LogisticRegression()

imputer = Imputation(
    model=clf,
    scorer=scorer,
    strategy="iteratively_regre",
)
imputer.fit(data)

# The value returned by transform() is indexed per column and read through
# `.coef_` by the next cell, i.e. it is used as a list of fitted models.
model_list = imputer.transform(data, label)


               age
0   0.159280450632
1   0.404734082033
2  0.0610989980716
3  -0.675261896131
4   -0.33162681217


...Delete *sex* column: object/other category.
...Insert columns to onehot encode *sex*.
...Delete *on_thyroxine* column: object/other category.
...Insert columns to onehot encode *on_thyroxine*.
...Delete *query_on_thyroxine* column: object/other category.
...Insert columns to onehot encode *query_on_thyroxine*.
...Delete *on_antithyroid_medication* column: object/other category.
...Insert columns to onehot encode *on_antithyroid_medication*.
...Delete *sick* column: object/other category.
...Insert columns to onehot encode *sick*.
...Delete *pregnant* column: object/other category.
...Insert columns to onehot encode *pregnant*.
...Delete *thyroid_surgery* column: object/other category.
...Insert columns to onehot encode *thyroid_surgery*.
...Delete *I131_treatment* column: object/other category.
...Insert columns to onehot encode *I131_treatment*.
...Delete *query_hypo

number of imputated cells: 604
score is: 0.852798882884
number of imputated cells: 184
score is: 0.852798882884
number of imputated cells: 326
score is: 0.852798882884
number of imputated cells: 324
score is: 0.852798882884
changed distance: 1.34602000796
number of imputated cells: 1
score is: 0.852798882884
number of imputated cells: 295
score is: 0.852798882884
number of imputated cells: 604
score is: 0.852798882884
number of imputated cells: 184
score is: 0.852798882884
number of imputated cells: 326
score is: 0.852798882884
number of imputated cells: 324
score is: 0.852798882884
changed distance: 1.04790114648
number of imputated cells: 1
score is: 0.852798882884
number of imputated cells: 295
score is: 0.852798882884
number of imputated cells: 604
score is: 0.852798882884
number of imputated cells: 184
score is: 0.852798882884
number of imputated cells: 326
score is: 0.852798882884
number of imputated cells: 324
score is: 0.852798882884
changed distance: 0.817215779596
number of i

score is: 0.852798882884
number of imputated cells: 326
score is: 0.852798882884
number of imputated cells: 324
score is: 0.852798882884
changed distance: 0.0621938043023
number of imputated cells: 1
score is: 0.852798882884
number of imputated cells: 295
score is: 0.852798882884
number of imputated cells: 604
score is: 0.852798882884
number of imputated cells: 184
score is: 0.852798882884
number of imputated cells: 326
score is: 0.852798882884
number of imputated cells: 324
score is: 0.852798882884
changed distance: 0.0531213739779
number of imputated cells: 1
score is: 0.852798882884
number of imputated cells: 295
score is: 0.852798882884
number of imputated cells: 604
score is: 0.852798882884
number of imputated cells: 184
score is: 0.852798882884
number of imputated cells: 326
score is: 0.852798882884
number of imputated cells: 324
score is: 0.852798882884
changed distance: 0.0458283854998
number of imputated cells: 1
score is: 0.852798882884
number of imputated cells: 295
score is

In [15]:
# (column name, position) pairs. The position is used both as the index into
# `model_list` and as the position of the column inside `data`; the column at
# that position is removed from the labels so the remaining names line up
# with the model's coefficients.
# NOTE(review): assumes model_list[i] is the model fitted for column i of
# `data`, trained on all the other columns - confirm against Imputation.
part3_columns = [
    ("FTI", 5),
    ("TSH", 1),
    ("T4U", 4),
    ("T3", 2),
    ("TT4", 3),
    ("age", 0),
]

# part3_result   : column name -> names of the 5 predictors with the largest |coefficient|
# coef_by_column : column name -> full Series of |coefficient| indexed by predictor name
part3_result = {}
coef_by_column = {}
for target, position in part3_columns:
    # Fail loudly if the hard-coded position no longer points at the expected
    # column; otherwise the coefficients would be silently mislabelled.
    assert data.columns[position] == target, (
        "expected column {!r} at position {}, found {!r}".format(
            target, position, data.columns[position]
        )
    )
    predictors = data.columns.delete(position)
    coef = pd.Series(abs(model_list[position].coef_), index=predictors)
    coef_by_column[target] = coef
    part3_result[target] = coef.sort_values(ascending=False).head(5).index.tolist()


In [20]:
# part2_result -> top 5 attributes by absolute Pearson correlation (PART2)

# part3_result -> top 5 attributes by absolute regression model coefficient (PART3)

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(part2_result)
print("\n")
print(part3_result)
print("\n")

# Let's see what the intersection looks like.
for each in missing_col_name:
    # Attributes present in both top-5 lists, kept in part3's order.
    # A flat list is built directly: wrapping filter(...) in [...] produced a
    # nested list on Python 2 and would show a filter object on Python 3.
    intersection = [attr for attr in part3_result[each] if attr in part2_result[each]]
    print("{}: {}".format(each, intersection))

# As the results show, the two top-5 lists usually share 2~3 attributes.

{'TSH': ['FTI', 'TT4', 'T3', 'T4U', 'age'], 'age': ['referral_source_SVI', 'T3', 'referral_source_STMW', 'T4U', 'referral_source_other'], 'T3': ['TT4', 'T4U', 'FTI', 'referral_source_SVI', 'age'], 'T4U': ['T3', 'TT4', 'pregnant', 'referral_source_STMW', 'sex_M'], 'TT4': ['FTI', 'T3', 'T4U', 'TSH', 'on_thyroxine'], 'FTI': ['TT4', 'T3', 'TSH', 'on_thyroxine', 'T4U']}


{'TSH': ['TT4', 'T4U', 'FTI', 'pregnant', 'goitre'], 'age': ['referral_source_STMW', 'referral_source_SVI', 'I131_treatment', 'goitre', 'on_antithyroid_medication'], 'T3': ['T4U', 'FTI', 'tumor', 'referral_source_SVI', 'on_thyroxine'], 'T4U': ['TT4', 'FTI', 'pregnant', 'hypopituitary', 'thyroid_surgery'], 'TT4': ['FTI', 'T4U', 'hypopituitary', 'thyroid_surgery', 'goitre'], 'FTI': ['TT4', 'T4U', 'thyroid_surgery', 'pregnant', 'goitre']}


FTI: [['TT4', 'T4U']]
TSH: [['TT4', 'T4U', 'FTI']]
T4U: [['TT4', 'pregnant']]
T3: [['T4U', 'FTI', 'referral_source_SVI']]
TT4: [['FTI', 'T4U']]
age: [['referral_source_STMW', 'referral_sou