In [None]:
%run appV2_dataCleaning.ipynb

### step 1: RE manipulation needs to be done with SME (aka Amar) 
Tidy up and standardize the data with regular expressions, based on the data-exploration results.

In [None]:
%run appV2_data_exploration.ipynb

In [None]:
import pyforest

### step 2: model selection
Use Bag-of-Words and TF-IDF techniques to extract features from each document.

In [None]:
#lemmatize the comment text
from nltk.stem import WordNetLemmatizer

def lemma(text):
    """Tokenize ``text`` and return every token lemmatized as a verb.

    The text is split with ``nltk.word_tokenize`` and each token is passed
    through ``WordNetLemmatizer.lemmatize`` with the ``'v'`` part-of-speech
    tag. The lemmas are returned as a list, in token order.
    """
    words = nltk.word_tokenize(text)
    to_verb_lemma = WordNetLemmatizer().lemmatize
    return [to_verb_lemma(word, 'v') for word in words]

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
# Features are the 'final' text column, the target is the 'result' column.
X, y = df['final'], df['result']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
# scikit-learn TF-IDF features, fitted on the training text only.
# The vectorizer tokenizes with `lemma`, lowercases the text, removes English stop
# words, builds unigram + bigram features and ignores terms that occur in fewer than
# 5 training documents.
vect = TfidfVectorizer(
    tokenizer=lemma,
    min_df=5,
    lowercase=True,
    ngram_range=(1, 2),
    stop_words='english',
).fit(X_train)

In [None]:
# Project both splits into the feature space learned from the training text.
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

In [None]:
# Column indices ordered by each feature's maximum TF-IDF value over the training rows.
# NOTE(review): this result is neither stored nor displayed (only the last expression
# of a cell is shown), so the line currently has no visible effect.
X_train_vectorized.max(0).toarray().ravel().argsort()
# (rows, features) of the training matrix.
X_train_vectorized.shape

### SVC

In [None]:
from sklearn.svm import LinearSVC
# Linear support-vector classifier with regularization parameter C=10.
clf = LinearSVC(C=10)
# fit() returns the estimator itself, so `text_clf` refers to the same fitted object as `clf`.
text_clf =clf.fit(X_train_vectorized,y_train)

In [None]:
# Predict a class for every held-out test document with the fitted LinearSVC.
predictions = text_clf.predict(X_test_vectorized)

In [None]:
# Confusion matrix of the LinearSVC on the test split.
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# `ConfusionMatrixDisplay.from_estimator` and fall back only on older releases.
from sklearn.metrics import ConfusionMatrixDisplay

# NOTE(review): display_labels are applied in the classifier's class order — confirm that
# the encoded classes really map to Negative / Positive / Intermediate in this order.
class_names = np.array(['Negative', 'Positive','Intermediate'])
cm_plot_options = dict(display_labels=class_names, cmap=plt.cm.Blues, normalize=None, xticks_rotation=45)
if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
    mx = ConfusionMatrixDisplay.from_estimator(text_clf, X_test_vectorized, y_test, **cm_plot_options)
else:
    # scikit-learn < 1.0: the classmethod does not exist yet, the old helper still does.
    from sklearn.metrics import plot_confusion_matrix
    mx = plot_confusion_matrix(text_clf, X_test_vectorized, y_test, **cm_plot_options)

In [None]:
#len(X_test_vectorized)
# Number of test predictions produced above.
len(predictions)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Metric definitions:
#   Accuracy  = (TP + TN) / (TP + TN + FP + FN)
#   Precision = TP / (TP + FP)
#   Recall    = TP / (TP + FN)   (also known as sensitivity or true positive rate)
#   F1        = 2 * Precision * Recall / (Precision + Recall)
print(f'Accuracy: {accuracy_score(y_test, predictions):.2f}')
# average='macro' gives every class equal weight, which suits small classes in imbalanced data.
print(f"F1: {f1_score(y_test, predictions, average='macro'):.2f}")

# Per-class precision / recall / F1 table.
target_names = ['Negative', 'Positive', 'Intermediate']
print(classification_report(y_test, predictions, target_names=target_names))

### step 2: Multi-Class Classifier: Features and Design

In [None]:
# Naive Bayes classifier: the multinomial variant is the one most suitable for word counts.
# It is fitted here on the TF-IDF training matrix. This rebinds `clf`; `text_clf` still
# refers to the LinearSVC fitted earlier.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_vectorized, y_train)

In [None]:
# Sanity check: classify one hand-written report with the Naive Bayes model.
clf.predict(vect.transform(['Bone left foot third metatarsal biopsy Acute osteomyelitis']))

In [None]:
# Widen pandas' column display to 200 characters so the test texts are readable, then show them.
pd.set_option('max_colwidth', 200)  
X_test 

In [None]:
# True labels of the test split, for comparison with the texts shown above.
y_test

### Model selection
We are now ready to experiment with different ML models and evaluate their accuracy. 
We will benchmark the following 6 models.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [None]:
import lightgbm as lgb
import xgboost as xgb 

In [None]:
# Candidate classifiers for the cross-validated benchmark. Each one uses the library
# defaults except for the arguments spelled out here.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(multi_class='ovr'),
    MultinomialNB(),
    LogisticRegression(multi_class='ovr', random_state=0),
    lgb.LGBMClassifier(objective='multiclass'),
    xgb.XGBClassifier(objective= 'multi:softmax')
]

In [None]:
# NOTE(review): `cv_df` is first assigned in the following cell, so on a fresh kernel
# (Restart & Run All) this cell raises NameError unless a %run notebook defines it.
# Confirm, and consider moving this display below the cross-validation cell.
cv_df

In [None]:
# 5-fold cross-validation of every candidate model, scored with macro-averaged F1
# (the unweighted mean of the per-class F1 scores, so each class counts equally).
import warnings
# NOTE: this silences *all* warnings for the rest of the kernel session, not only here.
warnings.filterwarnings('ignore')
CV = 5
entries = []
for model in models:
    model_name = model.__class__.__name__
    f1 = cross_val_score(model, features, labels, scoring='f1_macro', cv=CV)
    print(f1)
    # One record per fold: (model, fold number, macro-F1 on that fold).
    entries.extend((model_name, fold_idx, fv) for fold_idx, fv in enumerate(f1))
# Build the result frame once from the collected records (no placeholder frame needed).
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'f1_macro'])

In [None]:
# Distribution of the per-fold macro-F1 scores for each benchmarked model:
# a box plot with the individual fold scores drawn on top.
plt.figure(figsize = (15,8))
sns.boxplot(x='model_name', y='f1_macro', data=cv_df)
sns.stripplot(x='model_name', y='f1_macro', data=cv_df, size=8, jitter=True, edgecolor="gray", linewidth=2)
# Keep the tick labels under their own name: unpacking plt.xticks() into `labels` would
# overwrite the target vector that cross_val_score and train_test_split read.
tick_labels = plt.xticks()[1]
plt.setp(tick_labels, rotation=45)
plt.show()

In [None]:
# Mean macro-F1 across folds for each model, best first.
cv_df.groupby('model_name').f1_macro.mean().sort_values(ascending= False)

### step 3: Model Evaluation

In [None]:
# LightGBM classifier with the multiclass objective and otherwise default settings.
model = lgb.LGBMClassifier(objective='multiclass')

In [None]:
# Bind `labels` to the target column used by the train/test splits below.
labels = df.result

In [None]:
# Split features, labels and the DataFrame index together, so every test row can be
# traced back to its record in `df`; then train the model and predict the test split.
(
    X_train, X_test,
    y_train, y_test,
    indices_train, indices_test,
) = train_test_split(
    features,
    labels,
    df.index,
    test_size=0.20,
    random_state=0,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows of the matrix are the true classes, columns the predicted classes.
conf_mat = confusion_matrix(y_test, y_pred)
target_names = ['Negative', 'Positive', 'Intermediate']

# Draw the annotated heatmap on an explicit 5x5-inch axes.
fig, ax = plt.subplots(figsize=(5,5))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# The vast majority of predictions end up on the diagonal, as expected. However, there are
# some misclassifications, and it is worth looking at what causes them.
from IPython.display import display


In [None]:
# For every ordered pair of distinct classes with at least one misclassified test row,
# print the count and display the affected records (interpretation + cleaned text).
class_ids = list(interpretation_to_id.values())
for predicted in class_ids:
    for actual in class_ids:
        if predicted == actual or conf_mat[actual, predicted] < 1:
            continue
        print(f"'{id_to_interpretation[actual]}' predicted as '{id_to_interpretation[predicted]}' : {conf_mat[actual, predicted]} examples.")
        # Index labels of the test rows with this (actual, predicted) combination.
        wrong_rows = indices_test[(y_test == actual) & (y_pred == predicted)]
        display(
            df.loc[wrong_rows][['interpretation', 'final']]
            .style.set_properties(subset=['final'], **{'width': '800px'})
        )

In [None]:
# Alternative approach: boolean mask over the test rows, True where the prediction
# differs from the true label ...
temp = y_test != y_pred
# ... and the index labels of those misclassified rows.
temp[temp].index

In [None]:
# Index labels of the misclassified test rows (where the mask is True).
error_index = temp[temp].index
# y_pred is a plain array: wrap it in a Series that shares y_test's index, then keep
# only the misclassified rows.
pred = pd.Series(y_pred, index=y_test.index, name='pred')[error_index]
# Error records from df with the wrong prediction attached as a 'pred' column;
# assign() returns a new frame.
df_error = df.loc[error_index].assign(pred=pred)
df_error

### xgboost default 

In [None]:
# XGBoost classifier with the multi:softmax objective and otherwise default settings.
# This rebinds `model`, replacing the classifier used in the previous section.
model = xgb.XGBClassifier(objective= 'multi:softmax')

In [None]:
# Train on the existing X_train / y_train split and predict the test split.
# `y_pred` is overwritten with this model's predictions.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
%run Utility.ipynb

In [None]:
# NOTE(review): plot_conf_matrix is not defined in this notebook — presumably provided by
# the `%run Utility.ipynb` cell above; confirm.
plot_conf_matrix(y_test, y_pred)

In [None]:
# Use sklearn's confusion_matrix (as in the LightGBM evaluation) rather than pd.crosstab:
# crosstab drops any class that never occurs among the predictions, which yields a
# non-square matrix and breaks the positional conf_mat[actual, predicted] lookups below.
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(y_test, y_pred)
# Number of test rows whose true class is 1 but which were predicted as class 0.
conf_mat[1, 0]

In [None]:
# For every ordered pair of distinct classes with at least one misclassified test row,
# print the count and display the affected records (interpretation + cleaned text).
class_ids = list(interpretation_to_id.values())
for predicted in class_ids:
    for actual in class_ids:
        if predicted == actual or conf_mat[actual, predicted] < 1:
            continue
        print(f"'{id_to_interpretation[actual]}' predicted as '{id_to_interpretation[predicted]}' : {conf_mat[actual, predicted]} examples.")
        # Index labels of the test rows with this (actual, predicted) combination.
        wrong_rows = indices_test[(y_test == actual) & (y_pred == predicted)]
        display(
            df.loc[wrong_rows][['interpretation', 'final']]
            .style.set_properties(subset=['final'], **{'width': '800px'})
        )

## Grid search param dict for xgboost and light GBM

In [None]:
# Template of the LightGBM hyper-parameters to explore; the empty lists are placeholders.
# NOTE(review): GridSearchCV rejects empty candidate lists, so fill them in (or drop the
# keys) before using this dict as a grid. The next param_dict cell replaces it anyway.
# NOTE(review): LightGBM documents this metric as 'auc_mu' — confirm 'AUC-mu' is accepted.
#
# scale_pos_weight = number of negative samples / number of positive samples
# learning_rate: default = 0.1
# num_iterations: default = 100; aliases: n_estimators, num_iteration
# num_leaves: default = 31; max leaves for each trained tree. Typical: 255, usually {15, 31, 63, 127, 255, 511, 1023, 2047, 4095}.
# max_depth: default = -1; typical 6, range [3, 12]
# min_data_in_leaf: default = 20; typical 100; alias: min_child_samples
# min_sum_hessian_in_leaf: default = 0.001; alias: min_child_weight
# max_bin: default = 255
param_dict={'objective': ['multiclass'], 
            'is_unbalance': ['True'],
            'metric': ['AUC-mu'],  # the comma after this entry was missing (SyntaxError)
            'scale_pos_weight': [],
            'learning_rate': [],
            'num_iterations': [],
            'num_leaves': [],
            'min_data_in_leaf': [],
            'min_child_weight': [],
            'max_bin':[],
            'max_depth':[]
                        
            }

In [None]:
# Preview the learning-rate candidates that the tuning call below passes to the grid search.
np.arange(0.01, 0.6, 0.01)

In [None]:
# Starting grid for the one-parameter-at-a-time tuning: only the fixed settings.
param_dict = dict(
    objective=['multiclass'],
    is_unbalance=['True'],
)

In [None]:
from sklearn.model_selection import GridSearchCV
# Fresh LightGBM estimator for the grid search; its parameters come from param_dict.
model = lgb.LGBMClassifier()
# 80/20 split with random_state=0; paramTuning fits on X_train / y_train only.
# NOTE(review): X_valid / y_valid are not used by any visible cell.
X_train, X_valid, y_train, y_valid = train_test_split(features, labels, test_size=0.20, random_state=0)
def paramTuning(param_name, param_value_ls):
    """Grid-search one LightGBM hyper-parameter and plot its effect on macro-F1.

    Every candidate in ``param_value_ls`` is tried for ``param_name`` with 5-fold
    cross-validation (scoring ``f1_macro``) on the module-level ``X_train`` /
    ``y_train``, using the module-level ``model``. All other entries of the
    module-level ``param_dict`` are passed to the search unchanged. The mean CV
    score per candidate is plotted and the best candidate is printed.

    Parameters
    ----------
    param_name : str
        Name of the estimator parameter to tune.
    param_value_ls : sequence
        Candidate values; also used as the x-axis of the plot.

    Returns
    -------
    list
        One-element list holding the best candidate, ready to be stored back into
        ``param_dict[param_name]`` by the caller.
    """
    # Search over a copy of the shared grid: the global param_dict is no longer left
    # holding the full candidate list if the fit raises before the caller stores the winner.
    search_grid = {**param_dict, param_name: param_value_ls}
    gridsearch = GridSearchCV(model, search_grid, cv=5, scoring='f1_macro')
    gridsearch.fit(X_train, y_train)
    best_param_value = gridsearch.best_params_[param_name]
    # The x/y pairing assumes every other grid entry holds a single value, so that
    # cv_results_ has exactly one row per candidate, in candidate order.
    plt.plot(param_value_ls, gridsearch.cv_results_['mean_test_score'], label='f1_macro')
    plt.xlabel(param_name)
    plt.ylabel('f1_macro')
    plt.title('lightgbm ' + param_name + ' vs f1_macro')
    plt.legend(loc="best")
    plt.show()
    print('Best', param_name, '=', best_param_value, '| Best f1_macro = ', gridsearch.best_score_)
    return [best_param_value]

In [None]:
# Tune learning_rate and store the single best value back into the grid.
param_dict['learning_rate'] =paramTuning('learning_rate', np.arange(0.01, 0.6, 0.01))

In [None]:
# NOTE(review): arr1 / arr2 / arr are built here but never passed to paramTuning, which
# receives its own literal candidate array below.
arr1 = np.array([50, 75])
arr2 = np.arange(100, 300, 50)
arr = np.concatenate((arr1, arr2))
# num_iterations (alias: n_estimators): tune it and store the single best value in the grid.
param_dict['num_iterations'] =paramTuning('num_iterations',  np.array([75, 100, 125, 150, 200, 300, 400, 500, 600]))

In [None]:
# Tune num_leaves and store the single best value back into the grid.
param_dict['num_leaves'] =paramTuning('num_leaves', np.array([15, 31, 63, 127, 255, 511, 1023, 2047, 4095]))

In [None]:
# Tune max_depth and store the single best value back into the grid.
param_dict['max_depth'] =paramTuning('max_depth', np.array([1, 3, 5, 7, 10, 15, 20, 25, 30, 35, 40, 45]))

In [None]:
# min_data_in_leaf (alias: min_child_samples): tune it and store the single best value in the grid.
param_dict['min_data_in_leaf'] =paramTuning('min_data_in_leaf', np.array([5, 10, 15, 20, 25, 30, 35]))

In [None]:
# min_sum_hessian_in_leaf (alias: min_child_weight): tune it and store the single best value in the grid.
param_dict['min_sum_hessian_in_leaf'] =paramTuning('min_sum_hessian_in_leaf', np.array([0.0001, 0.0005, 0.001, 0.002, 0.003, 0.004, 0.005]))

In [None]:
# Tune max_bin and store the single best value back into the grid.
param_dict['max_bin'] =paramTuning('max_bin', np.array([100, 200, 240, 255, 300, 350, 400]))

In [None]:
# Author's tuning note: min_data_in_leaf, max_depth; do not mess around with
# min_sum_hessian_in_leaf (0.0001 seems to work).
# Show the grid after the tuning calls above (one best value per tuned parameter).
param_dict

In [None]:
# Hand-picked final LightGBM grid: exactly one value per parameter.
param_dict = {
    'objective': ['multiclass'],
    'is_unbalance': ['True'],
    'num_iterations': [300],
    'learning_rate': [0.4],
    'max_depth': [25],
    'num_leaves': [31],
    'min_data_in_leaf': [20],
    'max_bin': [255],
}

In [None]:
# Earlier manual attempt, kept commented out for reference:
#model = lgb.LGBMClassifier(objective= 'objective', is_unbalance=True, num_iterations=75, learning_rate=0.4, max_dept=9, min_data_in_leaf=5, min_sum_hessian_in_leaf= 0.0001, scale_pos_weight=0.1)
#model.fit(X_train, y_train)
#y_pred = model.predict(X_test)
# Re-create the 80/20 split (random_state=0), this time also splitting df.index so that
# test rows can be traced back to df.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.20, random_state=0)
# Display the container type of the training features.
type(X_train)

In [None]:
# NOTE(review): `best_model` is not assigned in any visible cell — presumably the best
# estimator of a grid search run in a removed cell or a %run notebook; confirm, otherwise
# this cell raises NameError on a fresh kernel.
y_pred = best_model.predict(X_test)

In [None]:
# NOTE(review): `gs` is not assigned in any visible cell — presumably a fitted
# GridSearchCV from a removed cell; confirm, otherwise this raises NameError.
gs.best_score_

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
# Macro-averaged F1 of the current y_pred on the test split (each class weighted equally).
f1_score(y_test, y_pred, average='macro')

In [None]:
# Confusion matrix of the current predictions: rows are true classes, columns predicted.
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(5,5))
target_names = ['Negative', 'Positive', 'Intermediate']
res=sns.heatmap(conf_mat, annot=True, fmt='d', cmap=plt.cm.Blues, xticklabels=target_names, yticklabels=target_names)
# Draw the frame around the heatmap (seaborn hides the spines by default).
for spine in res.spines.values():
    spine.set_visible(True)
plt.ylabel('Actual')
plt.xlabel('Predicted')
# Keep the tick labels under their own name: unpacking plt.xticks() into `labels` would
# overwrite the target vector that the train/test-split and tuning cells read.
tick_labels = plt.xticks()[1]
plt.setp(tick_labels, rotation=45)

plt.show()

In [None]:
import neptune

In [None]:
from neptunecontrib.monitoring.utils import axes2fig