In [None]:
# # memory footprint support libraries/code
# !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# !pip install gputil
# !pip install psutil
# !pip install humanize

# import psutil
# import humanize
# import os
# import GPUtil as GPU

# GPUs = GPU.getGPUs()
# # XXX: only one GPU on Colab and it isn't guaranteed
# gpu = GPUs[0]
# def printm():
#     process = psutil.Process(os.getpid())
#     print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
#     print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
# printm()

In [None]:
import pandas as pd
import numpy as np
import random
import codecs
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate
from sklearn import naive_bayes, svm
from sklearn.preprocessing import LabelEncoder, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import SVC
import pickle
import nltk
from nltk.corpus import stopwords


# Punkt tokenizer data, used by nltk.word_tokenize in the baseline feature functions.
nltk.download('punkt')
# Stop-word lists; the English list is read with stopwords.words('english') during text cleaning.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# load the data set
obtained from: https://conservancy.umn.edu/handle/11299/137703

In [None]:
# read txt file (save the file with utf-8 encoding on visual studio)
# The file is pipe-delimited and has no header row, so the column names are supplied here.
column=['short_form','long_form','represent_in_st','pos_start','pos_end','section','compt_st']
# The context manager closes the file handle once parsing is done (it used to stay open).
with codecs.open("AnonymizedClinicalAbbreviationsAndAcronymsDataSet_encoded.txt", "r", "utf-8") as f:
    # sep is a regular expression, which only the python parser supports; naming the engine
    # explicitly avoids the ParserWarning pandas emits when it has to fall back on its own.
    # keep_default_na=False + na_values: only empty fields and the literal 'null' become NaN.
    dt = pd.read_csv(f, sep='[|]', names=column, keep_default_na=False,
                     na_values=['', 'null'], engine='python')

  after removing the cwd from sys.path.


In [None]:
dt.head(100)


Unnamed: 0,short_form,long_form,represent_in_st,pos_start,pos_end,section,compt_st
0,AB,abortion,AB.,231,233,,_%#NAME#%_ _%#NAME#%_ is a 29-year-old gravida...
1,AB,abortion,AB.,249,251,,She is now bleeding quite heavily. Ultrasound ...
2,AB,abortion,AB,223,224,PAST OB HISTORY,ALLERGIES: Heparin and Imitrex. PAST OB HISTOR...
3,AB,abortion,AB.,194,196,HISTORY OF THE PRESENT ILLNESS,She had a pelvic ultrasound at Park Nicollet o...
4,AB,abortion,AB,114,115,PAST OB-GYN HISTORY,"On _%#MMDD2007#%_, normal anatomy with anterio..."
...,...,...,...,...,...,...,...
95,AB,abortion,AB,63,64,,_%#NAME#%_ _%#NAME#%_ is a 56-year-old female ...
96,AB,abortion,AB,65,66,,_%#NAME#%_ _%#NAME#%_ is a 30-year-old teacher...
97,AB,abortion,AB,250,251,OB HISTORY,"She received RhoGAM on _%#MM#%_ _%#DD#%_, 2006..."
98,AB,blood group in ABO system,AB,162,163,PRENATAL LABORATORY,CURRENT MEDICATIONS: Prenatal vitamins. ALLERG...


## observe the data

In [None]:
dt.head(2)

Unnamed: 0,short_form,long_form,represent_in_st,pos_start,pos_end,section,compt_st
0,AB,abortion,AB.,231,233,,_%#NAME#%_ _%#NAME#%_ is a 29-year-old gravida...
1,AB,abortion,AB.,249,251,,She is now bleeding quite heavily. Ultrasound ...


In [None]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37500 entries, 0 to 37499
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   short_form       37500 non-null  object
 1   long_form        37500 non-null  object
 2   represent_in_st  37500 non-null  object
 3   pos_start        37500 non-null  int64 
 4   pos_end          37500 non-null  int64 
 5   section          36479 non-null  object
 6   compt_st         37500 non-null  object
dtypes: int64(2), object(5)
memory usage: 2.0+ MB


In [None]:
dt.loc[dt['section'].isnull(), :]

Unnamed: 0,short_form,long_form,represent_in_st,pos_start,pos_end,section,compt_st
0,AB,abortion,AB.,231,233,,_%#NAME#%_ _%#NAME#%_ is a 29-year-old gravida...
1,AB,abortion,AB.,249,251,,She is now bleeding quite heavily. Ultrasound ...
9,AB,abortion,AB,65,66,,_%#NAME#%_ _%#NAME#%_ is a 25-year-old female ...
22,AB,abortion,AB,66,67,,"_%#NAME#%_ _%#NAME#%_ is a 40-year-old female,..."
23,AB,abortion,AB,65,66,,"_%#NAME#%_ _%#NAME#%_ is a 40-year-old female,..."
...,...,...,...,...,...,...,...
37396,VAD,ventricular assist device,VAD,255,257,,This has led to a sense of needing to vomit an...
37438,VAD,vincristine adriamycin and dexamethasone,VAD,215,217,,He was hospitalized _%#MM#%_ _%#DD#%_ to _%#MM...
37443,VAD,ventricular assist device,VAD,152,154,,His PA pressures have improved significantly f...
37444,VAD,ventricular assist device,VAD,144,146,,His BP is stable and has required NTG for bett...


In [None]:
# check whether the label distribution is imbalanced.
dt['long_form'].value_counts()

idiopathic thrombocytopenic purpura                   500
Narcotics Anonymous                                   474
(stage) IB                                            472
over the counter                                      469
extra strength                                        469
                                                     ... 
right                                                   1
pleural effusion                                        1
methicillin-susceptible Staphylococcus aureus:MSSA      1
left anterior descending:LAD                            1
(drug) DC                                               1
Name: long_form, Length: 351, dtype: int64

Since the long form is our label, we first check whether the data are imbalanced. Many labels appear only once, so to limit the imbalance we keep only the labels that have more than 125 records. (The most frequent label has 500 records, and we try to keep roughly an 80:20 ratio.)

In [None]:
# Count the records per long form, then keep the labels that occur more than 125 times.
label_dict = dt['long_form'].value_counts().to_dict()
filtered_label_dict = list(filter(lambda label: label_dict[label] > 125, label_dict))
# Peek at the first few retained labels (value_counts orders them by frequency).
filtered_label_dict[:5]

['idiopathic thrombocytopenic purpura',
 'Narcotics Anonymous',
 '(stage) IB',
 'over the counter',
 'extra strength']

In [None]:
# keep only the rows whose label has more than 125 records
dt = dt.loc[dt['long_form'].isin(filtered_label_dict)]

In [None]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32566 entries, 0 to 37499
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   short_form       32566 non-null  object
 1   long_form        32566 non-null  object
 2   represent_in_st  32566 non-null  object
 3   pos_start        32566 non-null  int64 
 4   pos_end          32566 non-null  int64 
 5   section          31787 non-null  object
 6   compt_st         32566 non-null  object
dtypes: int64(2), object(5)
memory usage: 2.0+ MB


In [None]:
# Renumber the surviving rows 0..n-1 and expose that row number as an explicit 'id' column.
dt = dt.reset_index(drop=True)
dt = dt.assign(id=dt.index)

In [None]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32566 entries, 0 to 32565
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   short_form       32566 non-null  object
 1   long_form        32566 non-null  object
 2   represent_in_st  32566 non-null  object
 3   pos_start        32566 non-null  int64 
 4   pos_end          32566 non-null  int64 
 5   section          31787 non-null  object
 6   compt_st         32566 non-null  object
 7   id               32566 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 2.0+ MB


In [None]:
dt.columns

Index(['short_form', 'long_form', 'represent_in_st', 'pos_start', 'pos_end',
       'section', 'compt_st', 'id'],
      dtype='object')

In [None]:
# Replace the surface form found in the sentence (represent_in_st, e.g. "M.O.M.") with the
# canonical short form ("MOM") so that later character filtering cannot split the abbreviation.
replace_list = ["compt_st", "represent_in_st", "short_form"]
# Column positions of the three fields above, looked up once against the current dt layout.
replace_index_list = [dt.columns.get_loc(c) for c in replace_list if c in dt]
def adv_clean(df, replace_index_list=replace_index_list):
  """Return the row's sentence with every surface-form occurrence replaced by the short form.

  Parameters
  ----------
  df : one row of ``dt`` (as passed by ``dt.apply(..., axis=1)``) or any positional sequence
      holding the same fields.
  replace_index_list : positions of, in order, the full sentence, the surface form as written
      in the sentence, and the canonical short form.

  Returns
  -------
  str
      The sentence after a plain (non-regex) ``str.replace`` of the surface form.
  """
  # A row Series is labelled by column name, so integer keys must go through .iloc; plain
  # df[int] relies on a positional fallback that pandas has deprecated. Sequences without
  # .iloc (lists, tuples) are indexed directly.
  cells = getattr(df, "iloc", df)
  full_sentence = cells[replace_index_list[0]]
  short_form_represent = cells[replace_index_list[1]]
  short_form = cells[replace_index_list[2]]
  return full_sentence.replace(short_form_represent, short_form)

In [None]:
#Text cleaning (keep all special character )
stop = stopwords.words('english')
# Start from the sentence with the abbreviation normalised to its short form.
dt['clean_compt_st']=dt.apply(adv_clean, axis=1)
# Every pattern below is a regular expression. pandas >= 2.0 treats str.replace patterns as
# literal text unless regex=True is passed, which silently turned these steps into no-ops.
#delete all nums in string of the dataframe
dt['clean_compt_st'] = dt['clean_compt_st'].str.replace(r'\b[0-9]+\b\s*', ' ', regex=True)
# Keep letters, digits and '&' only. The class used to read 1-9, which also blanked the digit 0
# and split tokens such as "MM2006" into "MM2 6".
dt['clean_compt_st'] = dt['clean_compt_st'].str.replace('[^a-zA-Z0-9&]', ' ', regex=True)
# De-identification placeholders; currently not removed (see `remove` below).
de_identify_list=['mmdd','name','street','city','country','prectinct','address','55400','00000','ddmm2000','mm','dd','2000','ddmm1914','tel','fax','email','ssn','mrn','hpbn','accountn','ln','vn','device']
remove = stop # change the code
pat = r'\b(?:{})\b'.format('|'.join(remove))
# NOTE(review): stop words are matched case-sensitively and before lower-casing, so capitalised
# stop words ("The", "She") survive. Lower-casing first would also delete any upper-case
# abbreviation that lower-cases to a stop word — confirm against the short forms before reordering.
dt['clean_compt_st'] = dt['clean_compt_st'].str.replace(pat, '    ', regex=True)
# Collapse the whitespace runs left behind by the substitutions above.
dt['clean_compt_st'] = dt['clean_compt_st'].str.replace(r'\s+', ' ', regex=True)
dt['clean_compt_st']=dt['clean_compt_st'].str.lower()
#dt.head()

In [None]:
# identify 10 words windows around the keyword(short_form)
name_list = ["compt_st", "clean_compt_st", "represent_in_st", "short_form", "pos_start", 'id']
# Column positions of the fields above, looked up once against the current dt layout.
index_list = [dt.columns.get_loc(c) for c in name_list if c in dt]
def extract_windows(data_frame, index_list=index_list):
    """Return the cleaned words around the target abbreviation (5 before, the keyword, 5 after).

    Parameters
    ----------
    data_frame : one row of ``dt`` (as passed by ``dt.apply(..., axis=1)``) or any positional
        sequence holding the same fields.
    index_list : positions of, in order, the raw sentence, the cleaned sentence, the surface
        form as written, the short form, the character offset of the target, and the row id.

    Returns
    -------
    str
        Up to 11 space-joined tokens of the cleaned sentence centred on the keyword.

    Raises
    ------
    ValueError
        If the lower-cased short form is not a token of the cleaned sentence. The row is
        printed first so it can be inspected.
    """
    # Integer keys on a label-indexed row Series must go through .iloc; plain sequences
    # (lists, tuples) are indexed directly.
    cells = getattr(data_frame, "iloc", data_frame)
    sentence = cells[index_list[0]]
    clean_sentence = cells[index_list[1]]
    raw_keyword = cells[index_list[2]]
    keyword = cells[index_list[3]].lower()
    pos = cells[index_list[4]]
    id_required_revised = cells[index_list[5]]

    # Number of identical surface forms that appear before the target occurrence.
    count = sentence[:int(pos)].split().count(raw_keyword)

    words = clean_sentence.split()
    # Whole-token matches only: the earlier str.replace masking also hit the keyword inside
    # longer words (e.g. "ab" in "abdominal"), so repeated abbreviations got the wrong window.
    matches = [i for i, word in enumerate(words) if word == keyword]
    if not matches:
        print(sentence, pos)
        print(id_required_revised)
        print(clean_sentence, keyword)
        print(count)
        print(words)
        raise ValueError(f"{keyword!r} is not in list (row id {id_required_revised})")
    # Skip the `count` earlier occurrences. If the raw and cleaned token counts disagree,
    # fall back to the last match rather than failing.
    windows_pos = matches[min(count, len(matches) - 1)]
    res = ' '.join(words[max(0, windows_pos - 5):windows_pos + 6]) # get the 11 words windows
    return res

dt["adv_window"] = dt.apply(extract_windows, axis=1)

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
dt.head(3)

Unnamed: 0,short_form,long_form,represent_in_st,pos_start,pos_end,section,compt_st,id,clean_compt_st,adv_window
0,AB,abortion,AB.,231,233,,"_%#NAME#%_ _%#NAME#%_ is a 29-year-old gravida 3, para 2-0-0-2, who presented to the Emergency Room complaining of increasing vaginal bleeding since approximately 6 a.m. The patient does have a known history of having had a missed AB. She had been followed at another clinic and was told that she had a missed AB shortly after Christmas. The patient at that time had been counseled to undergo a D&C and was even offered misoprostol to help complete a miscarriage, however, patient declined at that time to schedule a D&C or to take the misoprostol.",0,name name year old gravida para presented emergency room complaining increasing vaginal bleeding since approximately the patient known history missed ab she followed another clinic told missed ab shortly christmas the patient time counseled undergo d&c even offered misoprostol help complete miscarriage however patient declined time schedule d&c take misoprostol,the patient known history missed ab she followed another clinic told
1,AB,abortion,AB.,249,251,,"She is now bleeding quite heavily. Ultrasound this morning demonstrated a missed AB consistent with a 6 week pregnancy with blood clots in the uterine cavity, as well as continued bleeding from the cervical os. This is consistent with an incomplete AB. The patient presents now for a suction D&C. Medical history is negative. Surgical history is negative. CURRENT MEDICATIONS: Include prenatal vitamins.",1,she bleeding quite heavily ultrasound morning demonstrated missed ab consistent week pregnancy blood clots uterine cavity well continued bleeding cervical os this consistent incomplete ab the patient presents suction d&c medical history negative surgical history negative current medications include prenatal vitamins,heavily ultrasound morning demonstrated missed ab consistent week pregnancy blood clots
2,AB,abortion,AB,223,224,PAST OB HISTORY,"ALLERGIES: Heparin and Imitrex. PAST OB HISTORY: 1. 1992 full term primary section for breech presentation. 2. 1995 full term successful VBAC, no complications. 3. _%#1999#%_ full term repeat C-section. 4. 2005 spontaneous AB followed by suction D&C. PAST GYN HISTORY: The patient denies any history of abnormal Pap smears except for her most recent Pap performed _%#MM2006#%_.",2,allergies heparin imitrex past ob history full term primary section breech presentation full term successful vbac complications full term repeat c section spontaneous ab followed suction d&c past gyn history the patient denies history abnormal pap smears except recent pap performed mm2 6,term repeat c section spontaneous ab followed suction d&c past gyn


In [None]:
pd.reset_option('display.max_colwidth')

In [None]:
dt['long_form'].value_counts()

idiopathic thrombocytopenic purpura    500
Narcotics Anonymous                    474
(stage) IB                             472
extra strength                         469
over the counter                       469
                                      ... 
patent ductus arteriosus               138
blood group in ABO system              137
physician assistant certification      137
lower extremity                        134
delirium tremens                       129
Name: long_form, Length: 102, dtype: int64

In [None]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32566 entries, 0 to 32565
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   short_form       32566 non-null  object
 1   long_form        32566 non-null  object
 2   represent_in_st  32566 non-null  object
 3   pos_start        32566 non-null  int64 
 4   pos_end          32566 non-null  int64 
 5   section          31787 non-null  object
 6   compt_st         32566 non-null  object
 7   id               32566 non-null  int64 
 8   clean_compt_st   32566 non-null  object
 9   adv_window       32566 non-null  object
dtypes: int64(3), object(7)
memory usage: 2.5+ MB


## Feature Engineering

### clean categorical feature: Section

#### transform to numerical label

In [None]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32566 entries, 0 to 32565
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   short_form       32566 non-null  object
 1   long_form        32566 non-null  object
 2   represent_in_st  32566 non-null  object
 3   pos_start        32566 non-null  int64 
 4   pos_end          32566 non-null  int64 
 5   section          31787 non-null  object
 6   compt_st         32566 non-null  object
 7   id               32566 non-null  int64 
 8   clean_compt_st   32566 non-null  object
 9   adv_window       32566 non-null  object
dtypes: int64(3), object(7)
memory usage: 2.5+ MB


There are around 800 records missing section value

In [None]:
# fill out missing values for section
dt.loc[dt["section"].isnull(),:][:3]

Unnamed: 0,short_form,long_form,represent_in_st,pos_start,pos_end,section,compt_st,id,clean_compt_st,adv_window
0,AB,abortion,AB.,231,233,,_%#NAME#%_ _%#NAME#%_ is a 29-year-old gravida...,0,name name year old gravida para presented eme...,the patient known history missed ab she follow...
1,AB,abortion,AB.,249,251,,She is now bleeding quite heavily. Ultrasound ...,1,she bleeding quite heavily ultrasound morning ...,heavily ultrasound morning demonstrated missed...
8,AB,abortion,AB,65,66,,_%#NAME#%_ _%#NAME#%_ is a 25-year-old female ...,8,name name year old female gravida para ab she...,year old female gravida para ab she previous e...


In [None]:
# Work on a copy of 'section' so the original column keeps its missing values for reference.
dt['section_test'] = dt['section']

In [None]:
# since long_form is our prediction, we will use short_form to fill out the blank
def _most_common_section(sections):
    """Return the most frequent non-null section of a group, or 'ALL_NAN' if it has none."""
    modes = sections.mode()
    return modes.iloc[0] if not modes.empty else 'ALL_NAN'


# Per-row fill value: the most common section among rows sharing the same short_form.
mode = dt.groupby('short_form')['section_test'].transform(_most_common_section)
# Assign the result back. fillna(..., inplace=True) on a column selection is chained
# assignment and leaves dt unchanged when pandas copy-on-write is active.
dt['section_test'] = dt['section_test'].fillna(mode)

In [None]:
dt.loc[dt['section_test'] == 'ALL_NAN', :]['short_form'].value_counts()

AC    319
Name: short_form, dtype: int64

The short_form 'AC' has no section information available

In [None]:
# since the 'section' column contain long text as well as special characters, we will transform it to numerical label then
#possible section format 1
# Ids follow first-appearance order of the section names in dt['section_test'].
section_lst=dt['section_test'].unique().tolist()
dic_section = {word: i for i, word in enumerate(section_lst)}
print(dic_section)

# map() does one dictionary lookup per row. Series.replace with a large dict is much slower,
# and every value is a key here, so the result is the same integer column.
dt['section_id'] = dt['section_test'].map(dic_section)
#dt.head()

{'HISTORY OF PRESENT ILLNESS': 0, 'PAST OB HISTORY': 1, 'HISTORY OF THE PRESENT ILLNESS': 2, 'PAST OB-GYN HISTORY': 3, 'PATIENT IDENTIFICATION': 4, 'PAST MEDICAL HISTORY': 5, 'HISTORY': 6, 'PLAN': 7, 'HOSPITAL COURSE': 8, 'PREOPERATIVE STATUS AND JUDGMENT': 9, 'PAST SURGICAL HISTORY': 10, 'PREGNANCY HISTORY': 11, 'DATE OF DISCHARGE': 12, 'SUMMARY OF HOSPITAL COURSE': 13, 'PRENATAL LABS': 14, 'ASSESSMENT': 15, 'LABORATORY DATA': 16, 'PREGNANCY COMPLICATIONS': 17, 'GYNECOLOGIC HISTORY': 18, 'REVIEW OF SYSTEMS': 19, 'SURGEON': 20, 'LABORATORY  DATA': 21, 'LABORATORY': 22, 'COMPLICATIONS': 23, 'PAST OBSTETRICAL HISTORY': 24, '1. FEN': 25, 'OB-GYN HISTORY': 26, 'SUMMARY OF ADMISSION': 27, 'PRENATAL LABORATORIES': 28, 'IMPRESSION': 29, 'DOB': 30, 'PRENATAL CARE': 31, 'REASON FOR ADMISSION': 32, 'OB HISTORY': 33, 'PRENATAL LABORATORY': 34, 'PROBLEMS ENCOUNTERED IN THE HOSPITAL': 35, 'DISCHARGE MEDICATIONS': 36, 'GYN HISTORY': 37, 'PAST HISTORY': 38, 'PAST SURGERIES': 39, 'ASSESSMENT/PLAN': 40

In [None]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32566 entries, 0 to 32565
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   short_form       32566 non-null  object
 1   long_form        32566 non-null  object
 2   represent_in_st  32566 non-null  object
 3   pos_start        32566 non-null  int64 
 4   pos_end          32566 non-null  int64 
 5   section          31787 non-null  object
 6   compt_st         32566 non-null  object
 7   id               32566 non-null  int64 
 8   clean_compt_st   32566 non-null  object
 9   adv_window       32566 non-null  object
 10  section_test     32566 non-null  object
 11  section_id       32566 non-null  int64 
dtypes: int64(4), object(8)
memory usage: 3.0+ MB


In [None]:
dt['long_form'].value_counts()

idiopathic thrombocytopenic purpura    500
Narcotics Anonymous                    474
(stage) IB                             472
extra strength                         469
over the counter                       469
                                      ... 
patent ductus arteriosus               138
blood group in ABO system              137
physician assistant certification      137
lower extremity                        134
delirium tremens                       129
Name: long_form, Length: 102, dtype: int64

### Train_Test_split (80% and 20%)

In [None]:
# Hold out 20% of the rows as the test set, keeping the label proportions (stratified).
dt_train, dt_test = train_test_split(
    dt,
    test_size=.2,
    random_state=8,
    stratify=dt['long_form'],
)

# Modeling (Conventional Models)

split the dt_train data to train and validation set with validation size 0.25
  - train:validation:test = 60:20:20

In [None]:
# A quarter of the 80% training portion becomes the validation set (60:20:20 overall).
train, validation = train_test_split(
    dt_train,
    test_size=.25,
    random_state=8,
    stratify=dt_train['long_form'],
)

## Baseline Model

NLTK NaiveBayesClassifier
- Bag of words using the 2000 most frequent words in the training data.
- The abbreviation or acronym


In [None]:
def get_vocabulary(examples, vocab_size=2000):
    """Return the `vocab_size` most frequent lower-cased tokens of the examples' text field.

    Parameters
    ----------
    examples : iterable of row lists; the text is read from position 7 of each row.
    vocab_size : maximum number of words to return (default 2000).

    Returns
    -------
    list of str
        Tokens ordered from most to least frequent.
    """
    tokens = (word.lower() for example in examples
              for word in nltk.word_tokenize(example[7]))
    vocabulary = nltk.FreqDist(tokens)
    # FreqDist iterates in insertion order, so slicing list(vocabulary) returned the first
    # distinct words seen, not the most frequent ones; most_common() ranks by count.
    return [word for word, _ in vocabulary.most_common(vocab_size)]

In [None]:
def get_features(train_data, test_data):
    """Build one feature dict per example for the train and test rows.

    The short-form vocabulary and the bag-of-words vocabulary both come from `train_data`
    only. Each row contributes its short form (position 0) and its text (position 7).

    Returns a tuple ``(train_features, test_features)`` of lists of dicts.
    """
    known_short_forms = {row[0].lower() for row in train_data}
    vocabulary = get_vocabulary(train_data)

    def featurize(row):
        # Short-form indicators first, then bag-of-words flags; on a key clash the latter win.
        merged = dict(get_short_form_feature(row[0], known_short_forms))
        merged.update(get_bag_of_words_features(row[7], vocabulary))
        return merged

    return ([featurize(row) for row in train_data],
            [featurize(row) for row in test_data])

In [None]:
def get_short_form_feature(short_form, all_short_forms):
    """Encode the abbreviation as boolean indicator features.

    Parameters
    ----------
    short_form : the abbreviation of the current example (any case).
    all_short_forms : collection of lower-cased abbreviations seen in training.

    Returns
    -------
    dict
        ``"short_form(<sf>)"`` -> whether the example's abbreviation equals ``<sf>``, plus
        ``"UNK"`` -> True when the abbreviation was not seen in training.
    """
    target = short_form.lower()
    features = {f"short_form({sf})": (sf == target) for sf in all_short_forms}
    # "UNK" marks an unknown abbreviation; the flag used to be inverted (True for known ones).
    features["UNK"] = target not in all_short_forms
    return features

In [None]:
def get_bag_of_words_features(document, vocabulary):
    """Return ``{"contains(<word>)": bool}`` for every vocabulary word.

    The document is lower-cased and tokenised once; each flag says whether that word is
    among the resulting tokens.
    """
    present = set(nltk.word_tokenize(document.lower()))
    return {f"contains({term})": (term in present) for term in vocabulary}

In [None]:
# Dedicated, seeded generator so the shuffles below give the same order on every run
# (8 matches the random_state used for the splits); the global `random` state is untouched.
shuffle_rng = random.Random(8)

# get data and label for train set
# Rows become plain lists of every column except the label; labels are one-element lists.
train_data = train.loc[:, train.columns != 'long_form'].values.tolist()
train_labels = train.loc[:, train.columns == 'long_form'].values.tolist()
train_shuffled = shuffle_rng.sample(list(zip(train_data, train_labels)), k=len(train_data)) # shuffled copy, not in place
train_data = [elem[0] for elem in train_shuffled]
train_labels = [elem[1][0] for elem in train_shuffled]


# get data and label for validation set
validation_data = validation.loc[:, validation.columns != 'long_form'].values.tolist()
validation_labels = validation.loc[:, validation.columns == 'long_form'].values.tolist()
validation_shuffled = shuffle_rng.sample(list(zip(validation_data, validation_labels)), k=len(validation_data)) # shuffled copy, not in place
validation_data = [elem[0] for elem in validation_shuffled]
validation_labels = [elem[1][0] for elem in validation_shuffled]

# obtain features: 2000 most frequent words and the abbreviation or acronym 
train_feats, validation_feats = get_features(train_data, validation_data)
# train_feats, dt_test_data_feats = get_features(train_data, dt_test_data)
# Materialise the pairs as lists: a bare zip() is exhausted after one pass and would be
# silently empty if a later cell iterated it again.
train_examples = list(zip(train_feats, train_labels))
validation_set = list(zip(validation_feats, validation_labels))
# dt_test_set = zip(dt_test_data_feats, dt_test_labels)


# use nltk naive bayes classifier
naive_trained_classifier = nltk.NaiveBayesClassifier.train(train_examples)

# prediction
naive_validation = naive_trained_classifier.classify_many(validation_feats)  # predict on the validation set
# naive_predictions = naive_trained_classifier.classify_many(dt_test_data_feats)  # predict on the test set

In [None]:
print(classification_report(validation_labels, naive_validation, zero_division=0))

                                          precision    recall  f1-score   support

                (complement) component 3       0.96      0.96      0.96        48
                (complement) component 4       0.92      0.98      0.95        46
                               (drug) AC       0.96      0.75      0.84        32
                              (stage) IA       0.98      0.98      0.98        55
                              (stage) IB       1.00      0.98      0.99        94
                              BK (virus)       0.97      0.97      0.97        69
         Children's Depression Inventory       1.00      1.00      1.00        54
             Fairview Southdale Hospital       0.93      0.83      0.87        46
                         GENERAL ENGLISH       0.78      0.91      0.84        64
                     Narcotics Anonymous       1.00      0.97      0.98        95
         Pneumocystis jiroveci pneumonia       0.97      0.98      0.97        59
               



## TF-IDF Window


In [None]:
# Settings shared by both vectorizers: drop terms seen in fewer than 3 windows, use
# sublinear term-frequency scaling, and L2-normalise each row.
_tfidf_common = dict(min_df=3, sublinear_tf=True, norm='l2')
vectorizer_1 = TfidfVectorizer(ngram_range=(1, 2), **_tfidf_common)  # unigrams + bigrams
vectorizer_2 = TfidfVectorizer(ngram_range=(1, 3), **_tfidf_common)  # unigrams to trigrams


#final_features = vectorizer.fit_transform(dt['adv_window']).toarray()
#final_features.shape

#### Random Forest with n-grams

In [None]:
# Text-only inputs: the context window is the feature, the long form is the label.
X, y = dt_train['adv_window'], dt_train['long_form']
# 75/25 stratified split of the training portion into train and validation.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=8,
    stratify=y,
)

##### Random Forest with (1, 2)-grams

In [None]:
# Text-only model: TF-IDF over unigrams and bigrams feeding a random forest.
pipeline_rf = Pipeline(steps=[
    ('vect', vectorizer_1),
    ('clf', RandomForestClassifier(random_state=8)),
])

# Hyper-parameter grid: ten forest sizes spread over 1..100 and two split thresholds;
# every other setting is pinned to a single value.
param_grid = dict(
    clf__n_estimators=np.linspace(1, 100, 10, dtype=int),
    clf__min_samples_split=[3, 10],
    clf__min_samples_leaf=[2],
    clf__max_depth=[None],
    clf__criterion=['gini'],
    clf__bootstrap=[False],
)

# Cross-validation set-up: 5 stratified folds, two metrics, refit on the best macro-F1.
kfold = StratifiedKFold(n_splits=5)
scoring = dict(Accuracy='accuracy', F1='f1_macro')
refit = 'F1'

rf_model_1 = GridSearchCV(
    pipeline_rf,
    param_grid=param_grid,
    scoring=scoring,
    refit=refit,
    cv=kfold,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)
rf_model_1.fit(X_train, y_train)
rf_model_1_best = rf_model_1.best_estimator_

# Score the refitted best pipeline on the held-out validation split.
yvalidation = np.array(y_validation)
print(classification_report(yvalidation, rf_model_1.predict(X_validation)))
print(rf_model_1.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
                                          precision    recall  f1-score   support

                (complement) component 3       0.40      0.35      0.37        48
                (complement) component 4       0.45      0.43      0.44        46
                               (drug) AC       0.89      1.00      0.94        32
                              (stage) IA       1.00      0.96      0.98        55
                              (stage) IB       0.99      1.00      0.99        94
                              BK (virus)       0.92      1.00      0.96        69
         Children's Depression Inventory       1.00      0.96      0.98        54
             Fairview Southdale Hospital       0.98      0.89      0.93        46
                         GENERAL ENGLISH       0.92      0.88      0.90        64
                     Narcotics Anonymous       1.00      1.00      1.00        95
         Pneumocystis jiroveci pneu

##### Random Forest with (1, 2, 3)-grams

In [None]:
# Text-only model: TF-IDF over unigrams, bigrams and trigrams feeding a random forest.
pipeline_rf = Pipeline(steps=[
    ('vect', vectorizer_2),
    ('clf', RandomForestClassifier(random_state=8)),
])

# Hyper-parameter grid: ten forest sizes spread over 1..100 and two split thresholds;
# every other setting is pinned to a single value.
param_grid = dict(
    clf__n_estimators=np.linspace(1, 100, 10, dtype=int),
    clf__min_samples_split=[3, 10],
    clf__min_samples_leaf=[2],
    clf__max_depth=[None],
    clf__criterion=['gini'],
    clf__bootstrap=[False],
)

# Cross-validation set-up: 5 stratified folds, two metrics, refit on the best macro-F1.
kfold = StratifiedKFold(n_splits=5)
scoring = dict(Accuracy='accuracy', F1='f1_macro')
refit = 'F1'

rf_model_2 = GridSearchCV(
    pipeline_rf,
    param_grid=param_grid,
    scoring=scoring,
    refit=refit,
    cv=kfold,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)
rf_model_2.fit(X_train, y_train)
rf_best_2 = rf_model_2.best_estimator_

# Score the refitted best pipeline on the held-out validation split.
yvalidation = np.array(y_validation)
print(classification_report(yvalidation, rf_model_2.predict(X_validation)))
print(rf_model_2.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
                                          precision    recall  f1-score   support

                (complement) component 3       0.35      0.35      0.35        48
                (complement) component 4       0.38      0.33      0.35        46
                               (drug) AC       0.91      1.00      0.96        32
                              (stage) IA       1.00      0.96      0.98        55
                              (stage) IB       0.99      1.00      0.99        94
                              BK (virus)       0.90      1.00      0.95        69
         Children's Depression Inventory       1.00      0.98      0.99        54
             Fairview Southdale Hospital       0.97      0.74      0.84        46
                         GENERAL ENGLISH       0.97      0.88      0.92        64
                     Narcotics Anonymous       1.00      1.00      1.00        95
         Pneumocystis jiroveci pneu

#### Random Forest with n-grams + section

In [None]:
# Inputs for the "+ section" models: the context window plus the numeric section id.
X_section = dt_train.loc[:,['adv_window', 'section_id']]
y_section = dt_train['long_form']
# Stratify on this cell's own labels. `stratify=y` only worked because a different cell had
# already defined `y` with the same values.
X_train_section, X_validation_section, y_train_section, y_validation_section = train_test_split(X_section, y_section, test_size=0.25, stratify=y_section, random_state = 8)

##### Random Forest with (1, 2)-grams + section

In [None]:
def get_text_data(x):
    """Return the 'adv_window' column of `x` as a plain list, ready for the TF-IDF vectorizer."""
    return [record[:] for record in x['adv_window']]


def get_section_data(x):
    """Return `x` restricted to the 'section_id' column, as a one-column frame."""
    return x[['section_id']]


# A named function instead of a lambda: the standard pickle module cannot serialise lambdas,
# so a fitted pipeline holding one could not be saved with pickle.dump.
transfomer_category = FunctionTransformer(get_section_data, validate=False)
transformer_text = FunctionTransformer(get_text_data)

# Two feature branches joined side by side: the raw section id, and TF-IDF (1, 2)-grams
# of the context window.
pipeline_rf_with_section = Pipeline([
    ('features', FeatureUnion([
        ('numeric_features', Pipeline([
            ('selector', transfomer_category),
        ])),
        ('text_features', Pipeline([
            ('selector', transformer_text),
            ('vec', vectorizer_1),
        ])),
    ])),
    # Seeded like the text-only forests so repeated runs give the same model.
    ('clf', RandomForestClassifier(random_state=8)),
])

# Grid Search Parameters for RandomForest
param_grid = {'clf__n_estimators': np.linspace(1, 100, 10, dtype=int),
              'clf__min_samples_split': [3, 10],
              'clf__min_samples_leaf': [2],
              'clf__max_depth': [None],
              'clf__criterion': ['gini'],
              'clf__bootstrap': [False]}

# Training config: 5 stratified folds, two metrics, refit on the best macro-F1.
kfold = StratifiedKFold(n_splits=5)
scoring = {'Accuracy': 'accuracy', 'F1': 'f1_macro'}
refit = 'F1'

# Perform GridSearch
rf_model_section_1 = GridSearchCV(pipeline_rf_with_section, param_grid=param_grid, cv=kfold, scoring=scoring,
                         refit=refit, n_jobs=-1, return_train_score=True, verbose=1)
rf_model_section_1.fit(X_train_section, y_train_section)
rf_best_section_1 = rf_model_section_1.best_estimator_
# Score the refitted best pipeline on the held-out validation split.
yvalidation_section = np.array(y_validation_section)
print(classification_report(yvalidation_section, rf_model_section_1.predict(X_validation_section)))
print(rf_model_section_1.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
                                          precision    recall  f1-score   support

                (complement) component 3       0.41      0.44      0.42        48
                (complement) component 4       0.44      0.35      0.39        46
                               (drug) AC       0.91      1.00      0.96        32
                              (stage) IA       1.00      0.98      0.99        55
                              (stage) IB       0.99      1.00      0.99        94
                              BK (virus)       0.92      1.00      0.96        69
         Children's Depression Inventory       1.00      0.98      0.99        54
             Fairview Southdale Hospital       0.97      0.85      0.91        46
                         GENERAL ENGLISH       0.95      0.84      0.89        64
                     Narcotics Anonymous       1.00      1.00      1.00        95
         Pneumocystis jiroveci pneu

##### Random Forest with (1, 2, 3)-grams + section

In [None]:
def get_text_data(x):
    """Return the ``'adv_window'`` entries of ``x`` as a plain Python list.

    ``x`` is indexed with the key ``'adv_window'``; every entry found there is
    copied with a full slice (``record[:]``) and collected in iteration order.
    The result is what the text branch of the feature union hands to its
    vectorizer.
    """
    windows = []
    for record in x['adv_window']:
        windows.append(record[:])
    return windows


# Selectors for the two FeatureUnion branches: the section_id column (kept as
# a one-column frame) and the adv_window entries returned by get_text_data.
transfomer_category = FunctionTransformer(lambda frame: frame[['section_id']], validate=False)
transformer_text = FunctionTransformer(get_text_data)

# section_id is passed through untouched; adv_window goes through vectorizer_2.
# Both feature sets are concatenated and fed to a random forest.
pipeline_rf_with_section = Pipeline([
    ('features', FeatureUnion([
        ('numeric_features', Pipeline([
            ('selector', transfomer_category),
        ])),
        ('text_features', Pipeline([
            ('selector', transformer_text),
            ('vec', vectorizer_2),
        ])),
    ])),
    ('clf', RandomForestClassifier()),
])

# Training config: 5 stratified folds, two metrics tracked, best model refit on macro F1.
kfold = StratifiedKFold(n_splits=5)
scoring = dict(Accuracy='accuracy', F1='f1_macro')
refit = 'F1'

# Grid Search Parameters for RandomForest
# (10 tree counts x 2 min_samples_split values = 20 candidates)
param_grid = dict(
    clf__n_estimators=np.linspace(1, 100, 10, dtype=int),
    clf__min_samples_split=[3, 10],
    clf__min_samples_leaf=[2],
    clf__max_depth=[None],
    clf__criterion=['gini'],
    clf__bootstrap=[False],
)

# Perform GridSearch
rf_model_section_2 = GridSearchCV(
    pipeline_rf_with_section,
    param_grid=param_grid,
    cv=kfold,
    scoring=scoring,
    refit=refit,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)
rf_model_section_2.fit(X_train_section, y_train_section)
rf_best_section_2 = rf_model_section_2.best_estimator_

# Score the refit best model on the held-out validation split.
yvalidation_section_2 = np.array(y_validation_section)
print(classification_report(yvalidation_section_2, rf_model_section_2.predict(X_validation_section)))
print(rf_model_section_2.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
                                          precision    recall  f1-score   support

                (complement) component 3       0.37      0.40      0.38        48
                (complement) component 4       0.42      0.33      0.37        46
                               (drug) AC       0.91      1.00      0.96        32
                              (stage) IA       1.00      0.96      0.98        55
                              (stage) IB       0.99      1.00      0.99        94
                              BK (virus)       0.92      1.00      0.96        69
         Children's Depression Inventory       1.00      1.00      1.00        54
             Fairview Southdale Hospital       0.97      0.78      0.87        46
                         GENERAL ENGLISH       0.95      0.81      0.87        64
                     Narcotics Anonymous       1.00      1.00      1.00        95
         Pneumocystis jiroveci pneu

### TF-IDF with SVM

##### SVM with n-grams

In [None]:
# Text-only features and labels, with 25% held out as a validation set.
# The split is stratified on the label and seeded so it is repeatable.
X = dt_train['adv_window']
y = dt_train['long_form']
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=8,
)

###### SVM with (1, 2)-grams

In [None]:
# vectorizer_1 (defined earlier in the notebook) followed by an RBF-kernel SVM.
pipeline_svm_1 = Pipeline([
    ('vect', vectorizer_1),
    ('clf', SVC(kernel='rbf', probability=True)),
])

# Pipeline.fit returns the fitted pipeline, so svm_model_1 is pipeline_svm_1.
svm_model_1 = pipeline_svm_1.fit(X_train, y_train)

# Per-class precision / recall / F1 on the validation split.
yvalidation_1 = np.array(y_validation)
print(classification_report(yvalidation_1, svm_model_1.predict(X_validation)))

                                          precision    recall  f1-score   support

                (complement) component 3       0.50      0.56      0.53        48
                (complement) component 4       0.50      0.43      0.47        46
                               (drug) AC       1.00      0.94      0.97        32
                              (stage) IA       0.98      0.98      0.98        55
                              (stage) IB       0.99      0.97      0.98        94
                              BK (virus)       0.95      1.00      0.97        69
         Children's Depression Inventory       1.00      0.98      0.99        54
             Fairview Southdale Hospital       0.98      0.96      0.97        46
                         GENERAL ENGLISH       0.95      0.91      0.93        64
                     Narcotics Anonymous       1.00      1.00      1.00        95
         Pneumocystis jiroveci pneumonia       0.98      1.00      0.99        59
               

###### SVM with (1, 2, 3)-grams

In [None]:
# vectorizer_2 (defined earlier in the notebook) followed by an RBF-kernel SVM.
pipeline_svm_2 = Pipeline([
    ('vect', vectorizer_2),
    ('clf', SVC(kernel='rbf', probability=True)),
])

# Pipeline.fit returns the fitted pipeline, so svm_model_2 is pipeline_svm_2.
svm_model_2 = pipeline_svm_2.fit(X_train, y_train)

# Per-class precision / recall / F1 on the validation split.
yvalidation_2 = np.array(y_validation)
print(classification_report(yvalidation_2, svm_model_2.predict(X_validation)))

                                          precision    recall  f1-score   support

                (complement) component 3       0.46      0.54      0.50        48
                (complement) component 4       0.46      0.39      0.42        46
                               (drug) AC       1.00      0.94      0.97        32
                              (stage) IA       0.98      0.98      0.98        55
                              (stage) IB       0.99      0.97      0.98        94
                              BK (virus)       0.95      1.00      0.97        69
         Children's Depression Inventory       1.00      0.98      0.99        54
             Fairview Southdale Hospital       0.98      0.96      0.97        46
                         GENERAL ENGLISH       0.95      0.91      0.93        64
                     Narcotics Anonymous       1.00      1.00      1.00        95
         Pneumocystis jiroveci pneumonia       0.98      1.00      0.99        59
               

#### SVM with n-grams + section

In [None]:
# Features for the "+ section" models: the text window plus the section id.
X_section = dt_train.loc[:, ['adv_window', 'section_id']]
y_section = dt_train['long_form']

# Stratify on this cell's own labels (y_section) rather than on `y`, which
# only exists if the text-only split cell was run first. Same 25% hold-out
# and seed as the text-only split.
X_train_section, X_validation_section, y_train_section, y_validation_section = train_test_split(
    X_section,
    y_section,
    test_size=0.25,
    stratify=y_section,
    random_state=8,
)

###### SVM with (1, 2)-grams + section

In [None]:
def get_text_data(x):
    """Return the ``'adv_window'`` entries of ``x`` as a plain Python list.

    ``x`` is indexed with the key ``'adv_window'``; every entry found there is
    copied with a full slice (``record[:]``) and collected in iteration order.
    Same contract as the identically named helper in the random-forest
    section of this notebook.
    """
    windows = []
    for record in x['adv_window']:
        windows.append(record[:])
    return windows


# Selectors for the two FeatureUnion branches: the section_id column (kept as
# a one-column frame) and the adv_window entries returned by get_text_data.
transfomer_category = FunctionTransformer(lambda frame: frame[['section_id']], validate=False)
transformer_text = FunctionTransformer(get_text_data)

# section_id is rescaled and adv_window goes through vectorizer_1; the two
# feature sets are concatenated and fed to an RBF-kernel SVM.
pipeline_svm_1_section = Pipeline([
    ('features', FeatureUnion([
        ('numeric_features', Pipeline([
            ('selector', transfomer_category),
            # with_mean=False: scale only, no centering
            ('scaler', StandardScaler(with_mean=False)),
        ])),
        ('text_features', Pipeline([
            ('selector', transformer_text),
            ('vec', vectorizer_1),
        ])),
    ])),
    ('clf', SVC(kernel='rbf', probability=True)),
])

# Pipeline.fit returns the fitted pipeline itself.
svm_model_section_1 = pipeline_svm_1_section.fit(X_train_section, y_train_section)

# Per-class precision / recall / F1 on the validation split.
yvalidation_section = np.array(y_validation_section)
print(classification_report(yvalidation_section, svm_model_section_1.predict(X_validation_section)))

                                          precision    recall  f1-score   support

                (complement) component 3       0.44      0.60      0.51        48
                (complement) component 4       0.43      0.28      0.34        46
                               (drug) AC       1.00      1.00      1.00        32
                              (stage) IA       1.00      0.98      0.99        55
                              (stage) IB       0.99      0.98      0.98        94
                              BK (virus)       0.92      1.00      0.96        69
         Children's Depression Inventory       0.81      1.00      0.89        54
             Fairview Southdale Hospital       1.00      0.78      0.88        46
                         GENERAL ENGLISH       0.93      0.89      0.91        64
                     Narcotics Anonymous       0.90      0.97      0.93        95
         Pneumocystis jiroveci pneumonia       0.98      0.95      0.97        59
               

###### SVM with (1, 2, 3)-grams + section

In [None]:
# Same layout as the (1, 2)-gram + section SVM pipeline, but the text branch
# uses vectorizer_2. Reuses the selectors defined in the previous cell.
pipeline_svm_2_section = Pipeline([
    ('features', FeatureUnion([
        ('numeric_features', Pipeline([
            ('selector', transfomer_category),
            # with_mean=False: scale only, no centering
            ('scaler', StandardScaler(with_mean=False)),
        ])),
        ('text_features', Pipeline([
            ('selector', transformer_text),
            ('vec', vectorizer_2),
        ])),
    ])),
    ('clf', SVC(kernel='rbf', probability=True)),
])

# Pipeline.fit returns the fitted pipeline itself.
svm_model_section_2 = pipeline_svm_2_section.fit(X_train_section, y_train_section)

# Per-class precision / recall / F1 on the validation split.
yvalidation_section_2 = np.array(y_validation_section)
print(classification_report(yvalidation_section_2, svm_model_section_2.predict(X_validation_section)))


                                          precision    recall  f1-score   support

                (complement) component 3       0.44      0.62      0.52        48
                (complement) component 4       0.41      0.26      0.32        46
                               (drug) AC       1.00      1.00      1.00        32
                              (stage) IA       0.98      1.00      0.99        55
                              (stage) IB       0.99      0.98      0.98        94
                              BK (virus)       0.93      1.00      0.97        69
         Children's Depression Inventory       0.78      1.00      0.88        54
             Fairview Southdale Hospital       1.00      0.85      0.92        46
                         GENERAL ENGLISH       0.95      0.89      0.92        64
                     Narcotics Anonymous       0.84      0.95      0.89        95
         Pneumocystis jiroveci pneumonia       0.98      0.95      0.97        59
               

## Cross-validation

Create the cross-validation data set.

In [None]:
# Cross-validation runs on the full training frame (no hold-out split here).
y_cv = dt_train['long_form']                                   # labels shared by all models
X_cv = dt_train['adv_window']                                  # text-only models
X_cv_section = dt_train.loc[:, ['adv_window', 'section_id']]   # text + section models

In [None]:
# Collects one cross_validate result per model, in the order the following
# cells append them. Re-running this cell resets the collection.
accuracy = list()

In [None]:
# One shuffled, seeded 5-fold splitter shared by every cross-validated model.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

# Keyword arguments common to every cross_validate call below.
cv_options = dict(
    scoring=['accuracy', 'f1_weighted'],
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)

# Entry 0: random forest on vectorizer_1 text features only.
pipeline_rf_1 = Pipeline([
    ('vect', vectorizer_1),
    ('clf', RandomForestClassifier(n_estimators=56, min_samples_leaf=2, min_samples_split=3)),
])
accuracy.append(cross_validate(pipeline_rf_1, X_cv, y_cv, **cv_options))

# Entry 1: random forest on vectorizer_2 text features only.
pipeline_rf_2 = Pipeline([
    ('vect', vectorizer_2),
    ('clf', RandomForestClassifier(n_estimators=100, min_samples_leaf=2, min_samples_split=10)),
])
accuracy.append(cross_validate(pipeline_rf_2, X_cv, y_cv, **cv_options))

# Entry 2: random forest on section_id + vectorizer_1 text features.
pipeline_rf_with_section_1 = Pipeline([
    ('features', FeatureUnion([
        ('numeric_features', Pipeline([
            ('selector', transfomer_category),
        ])),
        ('text_features', Pipeline([
            ('selector', transformer_text),
            ('vec', vectorizer_1),
        ])),
    ])),
    ('clf', RandomForestClassifier(n_estimators=78, min_samples_leaf=2, min_samples_split=3)),
])
accuracy.append(cross_validate(pipeline_rf_with_section_1, X_cv_section, y_cv, **cv_options))

# Entry 3: random forest on section_id + vectorizer_2 text features.
pipeline_rf_with_section_2 = Pipeline([
    ('features', FeatureUnion([
        ('numeric_features', Pipeline([
            ('selector', transfomer_category),
        ])),
        ('text_features', Pipeline([
            ('selector', transformer_text),
            ('vec', vectorizer_2),
        ])),
    ])),
    ('clf', RandomForestClassifier(n_estimators=78, min_samples_leaf=2, min_samples_split=3)),
])
accuracy.append(cross_validate(pipeline_rf_with_section_2, X_cv_section, y_cv, **cv_options))


  

In [None]:
# Keyword arguments common to every cross_validate call below; `cv` is the
# splitter created in the random-forest cross-validation cell.
cv_options = dict(
    scoring=['accuracy', 'f1_weighted'],
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)

# Entry 4: RBF SVM on vectorizer_1 text features only
# (rebinds pipeline_svm_1 from the earlier SVM section).
pipeline_svm_1 = Pipeline([
    ('vect', vectorizer_1),
    ('clf', SVC(kernel='rbf', probability=True)),
])
accuracy.append(cross_validate(pipeline_svm_1, X_cv, y_cv, **cv_options))

# Entry 5: RBF SVM on vectorizer_2 text features only
# (rebinds pipeline_svm_2 from the earlier SVM section).
pipeline_svm_2 = Pipeline([
    ('vect', vectorizer_2),
    ('clf', SVC(kernel='rbf', probability=True)),
])
accuracy.append(cross_validate(pipeline_svm_2, X_cv, y_cv, **cv_options))

# Entry 6: RBF SVM on scaled section_id + vectorizer_1 text features.
pipeline_svm_with_section_1 = Pipeline([
    ('features', FeatureUnion([
        ('numeric_features', Pipeline([
            ('selector', transfomer_category),
            ('scaler', StandardScaler(with_mean=False)),
        ])),
        ('text_features', Pipeline([
            ('selector', transformer_text),
            ('vec', vectorizer_1),
        ])),
    ])),
    ('clf', SVC(kernel='rbf', probability=True)),
])
accuracy.append(cross_validate(pipeline_svm_with_section_1, X_cv_section, y_cv, **cv_options))

# Entry 7: RBF SVM on scaled section_id + vectorizer_2 text features.
pipeline_svm_with_section_2 = Pipeline([
    ('features', FeatureUnion([
        ('numeric_features', Pipeline([
            ('selector', transfomer_category),
            ('scaler', StandardScaler(with_mean=False)),
        ])),
        ('text_features', Pipeline([
            ('selector', transformer_text),
            ('vec', vectorizer_2),
        ])),
    ])),
    ('clf', SVC(kernel='rbf', probability=True)),
])
accuracy.append(cross_validate(pipeline_svm_with_section_2, X_cv_section, y_cv, **cv_options))


In [None]:
# Show the raw cross_validate results collected above: one entry per model,
# in the order they were appended (random forests first, then SVMs).
accuracy

Unnamed: 0,fit_time,score_time,test_accuracy,test_f1_weighted,train_accuracy,train_f1_weighted
0,"[7.463834047317505, 7.54076623916626, 7.153670...","[0.5900702476501465, 0.5788319110870361, 0.602...","[0.9439646900786797, 0.9466513145269622, 0.944...","[0.9404583163141017, 0.9432473294086345, 0.939...","[0.9625737728515906, 0.9649728899764887, 0.965...","[0.9604356711576487, 0.9626830346813915, 0.963..."
1,"[13.108197689056396, 13.029526710510254, 12.77...","[0.9428575038909912, 0.9283778667449951, 0.964...","[0.9472270197658799, 0.9408942621377855, 0.944...","[0.9441337293121146, 0.9372898967215808, 0.939...","[0.9655006957439662, 0.9625257905090927, 0.963...","[0.9634474837830358, 0.96014285148193, 0.96158..."
2,"[9.863546371459961, 9.880082130432129, 9.60763...","[0.7694454193115234, 0.7533588409423828, 0.751...","[0.9451161005565151, 0.9468432162732681, 0.943...","[0.9419362125946835, 0.9436395181058662, 0.939...","[0.9651168370039825, 0.9665083249364234, 0.964...","[0.962948653807088, 0.9646345984520984, 0.9620..."
3,"[10.798992156982422, 10.909960985183716, 10.78...","[0.8491432666778564, 0.8676283359527588, 0.839...","[0.945308002302821, 0.945308002302821, 0.94433...","[0.9417235194508795, 0.9418744921416711, 0.939...","[0.9629096492490763, 0.9623338611391008, 0.963...","[0.9607025458125964, 0.9603035246565451, 0.961..."
4,"[882.2351791858673, 885.651451587677, 887.3583...","[39.989128828048706, 39.735318422317505, 40.19...","[0.9604682402609864, 0.9625791594703512, 0.965...","[0.9601698006544878, 0.9619564855947225, 0.964...","[0.9973609711626121, 0.9974089535051102, 0.996...","[0.9973596039112126, 0.9974070706062603, 0.996..."
5,"[986.4656262397766, 987.6787304878235, 1008.43...","[44.27550029754639, 41.515896797180176, 43.546...","[0.9602763385146805, 0.9618115524851276, 0.962...","[0.9598980863562986, 0.9610978050783582, 0.962...","[0.9973609711626121, 0.997504918190106, 0.9968...","[0.9973602348264609, 0.9975031685484838, 0.996..."
6,"[807.0318870544434, 806.8023536205292, 787.596...","[41.647300004959106, 39.51086902618408, 40.832...","[0.9422375743619267, 0.9395509499136442, 0.945...","[0.9419089607028791, 0.9396231486126924, 0.944...","[0.989012043567967, 0.9882443260879996, 0.9887...","[0.9889670387521904, 0.9882154402963218, 0.988..."
7,"[888.0014510154724, 886.6452074050903, 876.490...","[44.11907362937927, 40.97827959060669, 43.0937...","[0.939934753406256, 0.9376319324505853, 0.9418...","[0.9396125677057665, 0.9378235188356466, 0.940...","[0.9881483614030037, 0.9880044143755098, 0.988...","[0.9880949941907136, 0.9879877430071982, 0.988..."


In [None]:
# One row per model, one column per cross_validate key. Each cell holds the
# per-fold values for that model and metric.
df = pd.DataFrame(accuracy)

# Collapse every cell to its mean over the folds. The original read from
# `df_a`, which this notebook never defines; `df` is the frame built above.
result_table = pd.DataFrame()
col_names = df.columns
for name in col_names:
    # tolist() gives one per-fold sequence per model; axis=1 averages within each model.
    result_table[name] = np.mean(df[name].tolist(), axis=1)

In [None]:
# Show the fold-averaged timings and scores, one row per model in append order.
result_table

Unnamed: 0,fit_time,score_time,test_accuracy,test_f1_weighted,train_accuracy,train_f1_weighted
0,6.742147,0.555332,0.944995,0.941263,0.963458,0.961232
1,11.9845,0.897091,0.944304,0.940742,0.963477,0.961269
2,9.048508,0.710682,0.945877,0.942531,0.965243,0.96318
3,10.013789,0.794977,0.94392,0.940042,0.962987,0.960775
4,836.192251,37.830103,0.962421,0.961986,0.99715,0.997148
5,945.550765,41.342226,0.961385,0.960853,0.997035,0.997033
6,751.663976,38.6738,0.942001,0.941751,0.988696,0.988673
7,833.815899,40.786686,0.939506,0.939407,0.988216,0.988194


Based on the runtime, weighted average F1 score, and accuracy, we choose Random Forest with unigrams, bigrams and section information.

## Model Performance

Fit the chosen model on the full training set, and then apply it to the test set.

In [None]:
# Final evaluation data: the whole training frame for fitting and the test
# frame for scoring. Note that X_train / y_train are rebound here and no
# longer refer to the earlier validation split.
X_train = dt_train.loc[:, ['adv_window', 'section_id']]
X_test = dt_test.loc[:, ['adv_window', 'section_id']]
y_train = dt_train['long_form']
y_test = dt_test['long_form']

In [None]:
# Fit the selected pipeline (random forest on section_id + vectorizer_1 text
# features) on the full training set.
pipeline_rf_with_section_1.fit(X_train, y_train)

# classification report (precision, recall, F1-score) on the test set
ytest = np.array(y_test)
print(classification_report(ytest, pipeline_rf_with_section_1.predict(X_test)))

                                          precision    recall  f1-score   support

                (complement) component 3       0.40      0.43      0.41        49
                (complement) component 4       0.45      0.30      0.36        46
                               (drug) AC       0.94      1.00      0.97        32
                              (stage) IA       1.00      1.00      1.00        55
                              (stage) IB       0.99      1.00      0.99        94
                              BK (virus)       0.96      0.99      0.97        69
         Children's Depression Inventory       1.00      0.98      0.99        54
             Fairview Southdale Hospital       0.98      0.89      0.93        46
                         GENERAL ENGLISH       0.94      0.72      0.81        64
                     Narcotics Anonymous       1.00      1.00      1.00        95
         Pneumocystis jiroveci pneumonia       1.00      0.97      0.98        59
               