###### Project: Adverse Medical Outcomes Prediction 
##### Data Scientist: Victoria M. Ng 

# Import libraries

In [1]:
# General system libraries
import os
import sys
from IPython.display import Image, Markdown
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dataframe libraries
import pandas as pd
from pandas import DataFrame, read_csv

# Number manipulation
import scipy.sparse
from scipy.ndimage.filters import generic_filter
import patsy
import numpy as np

# Plotting libaries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

# Data type libaries
from datetime import datetime as dt

# File manipulation
import pickle
import pandas.io.sql as pd_sql
import psycopg2 as pg

# NLP libraries
import wikipedia as wiki
from nltk import word_tokenize, sent_tokenize,FreqDist
from nltk.corpus import stopwords
import gensim as gn
from gensim import corpora, models, similarities
from collections import defaultdict
from six import iteritems
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS

# Scraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
chromedriver = "/home/victoria/projects/metis/Project3/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

# Stats libaries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model, metrics
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


# Other libaries
import geopy


  """)
2018-05-11 15:36:36,981 : INFO : 'pattern' package not found; tag filters are not available for English
  from pandas.core import datetools


# Create a list of stop words

In [2]:
stop_words = list(STOP_WORDS)
stop_words.append('a')
stop_words

['in',
 'almost',
 'onto',
 'neither',
 'three',
 'so',
 'yet',
 'go',
 'hundred',
 'would',
 'fifteen',
 'some',
 'using',
 'because',
 'whereas',
 'nothing',
 'on',
 'too',
 'well',
 'toward',
 'who',
 'still',
 'everything',
 'last',
 'same',
 'much',
 'make',
 'meanwhile',
 'fifty',
 'only',
 'see',
 'whither',
 'wherein',
 'third',
 'nobody',
 'several',
 'no',
 'ours',
 'else',
 'say',
 'done',
 'quite',
 'someone',
 'become',
 'her',
 'that',
 'them',
 'whereby',
 'their',
 'during',
 'him',
 'becoming',
 'back',
 'next',
 'eight',
 'nine',
 'yours',
 'off',
 'been',
 'we',
 'has',
 'every',
 'unless',
 'themselves',
 'us',
 'me',
 'somehow',
 'there',
 'everywhere',
 'regarding',
 'per',
 'before',
 'whom',
 'how',
 'further',
 'sixty',
 'then',
 'which',
 'less',
 'elsewhere',
 'became',
 'latterly',
 'many',
 'sometime',
 'whether',
 'anyone',
 'an',
 'due',
 'just',
 'but',
 'must',
 'thus',
 'wherever',
 'whole',
 'such',
 'thereafter',
 'a',
 'four',
 'moreover',
 'where',

# Define a function to make each wiki page one long string

In [3]:
from string import punctuation

for char in punctuation:
    print(char)

!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
:
;
<
=
>
?
@
[
\
]
^
_
`
{
|
}
~


In [4]:
def clean_page(page_list):
    clean_list = []
    clean_page = []
    for string in page_list: 
        string=string.lower()
        for char in punctuation:
            string = string.replace(char,'')
        string=string.replace('0','')
        string=string.replace('1','')
        string=string.replace('2','')
        string=string.replace('3','')
        string=string.replace('4','')
        string=string.replace('5','')
        string=string.replace('6','')
        string=string.replace('7','')
        string=string.replace('8','')
        string=string.replace('9','')
        clean_list.append(string)
    clean_page = ''.join(clean_list)
    return clean_page

# Import the food_df symptoms list and dict 

Remember to maintain the order of the dict since you're making a list out of the keys and iterating on that list

In [5]:
with open('unique_food_df_symptoms_list.pkl', 'rb') as picklefile: 
    unique_food_df_symptoms_list = pickle.load(picklefile)

In [6]:
len(unique_food_df_symptoms_list)

3004

In [7]:
with open('symptoms_food_df_dict.pkl', 'rb') as picklefile: 
    symptoms_food_df_dict = pickle.load(picklefile)

In [17]:
symptoms_food_df_dict['diarrhoea']

852

In [9]:
list_of_instance_counts = list(symptoms_food_df_dict.values())
list_of_instance_counts

[2660,
 2336,
 2983,
 757,
 1430,
 923,
 643,
 707,
 2637,
 798,
 1826,
 620,
 1343,
 874,
 441,
 2087,
 2109,
 2179,
 2056,
 638,
 1047,
 2917,
 92,
 209,
 835,
 1460,
 1336,
 1506,
 1888,
 1143,
 1024,
 2075,
 2677,
 1156,
 1753,
 763,
 1614,
 12,
 2963,
 852,
 1018,
 916,
 1061,
 2594,
 2703,
 2937,
 2531,
 973,
 1064,
 1058,
 538,
 1915,
 14,
 903,
 2701,
 2537,
 2538,
 327,
 540,
 2449,
 2110,
 1523,
 221,
 1667,
 822,
 1864,
 1116,
 2670,
 603,
 1710,
 1137,
 432,
 1109,
 2866,
 2633,
 18,
 351,
 1442,
 821,
 449,
 176,
 2777,
 1493,
 1279,
 1065,
 1063,
 2486,
 1067,
 167,
 2694,
 1448,
 492,
 125,
 1817,
 1773,
 652,
 1616,
 217,
 81,
 2343,
 2285,
 2659,
 66,
 2097,
 1837,
 133,
 1879,
 855,
 866,
 2339,
 823,
 2436,
 706,
 1861,
 1104,
 2992,
 1894,
 373,
 1121,
 869,
 2218,
 2114,
 800,
 1564,
 2695,
 2421,
 2976,
 740,
 1184,
 1274,
 1432,
 1954,
 2873,
 2872,
 1290,
 625,
 2241,
 2807,
 1838,
 479,
 2102,
 76,
 920,
 2123,
 2455,
 1507,
 898,
 2086,
 2404,
 1235,
 252,
 26

# Define a function to clean the symptoms to categorize for easier wiki searching

In [10]:
def clean_symptoms_to_categorize(symptoms_to_categorize):
    symptoms_to_wiki_search_list = []
    for symptom in symptoms_to_categorize:
        symptom = symptom.replace('_', ' ')
        symptoms_to_wiki_search_list.append(symptom)
    return symptoms_to_wiki_search_list

# Clean the unique symptoms list to be used for wiki querying (symptoms_to_wiki_search_list)

In [11]:
symptoms_to_wiki_search_list = clean_symptoms_to_categorize(unique_food_df_symptoms_list)

In [12]:
with open('symptoms_to_wiki_search_list.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_to_wiki_search_list, picklefile)

# Define functions to create a corpus of wiki symptom pages

In [13]:
'''
This function attempts to scrape the text from a wiki page based on a supplied term 
using a function in the wikipedia api. If the page cannot be sourced, it will pass 
the resulting DisambiguationError.
''' 
def attempt_to_create_wiki_page(search_term):
    wiki_page = None
    try:
        wiki_page = wiki.page(search_term)
    except wiki.exceptions.DisambiguationError as e:
        pass
    
    return wiki_page

In [19]:
'''
This function attempts to create a full corpus out of each of the scraped page text.
If the page cannot be sourced based on the provided search term, it will attempt to append
the text 'medical' onto the search term to see if that results in a page. If that does not 
work then the it will input placeholder text for that missing page so that all indices
of the resulting corpus list match the indices of the supplied symptoms list. That way,
I will be able to properly identify which corpus associates to which symptom.
This function also prints a list of tuples of the symptoms that did not have a sourced page
and their associated indices. Therefore, I could manually search for those pages and replace
the content within the corpus list at the correct index
'''

def create_wiki_symptom_corpus(symptoms_to_wiki_search_list):
    symptoms_wiki_corpus = []
    missing_symptoms_list = []
    for index, symptom in enumerate(symptoms_to_wiki_search_list):
        try:
            symptom_wiki_page = attempt_to_create_wiki_page(symptom)
            if symptom_wiki_page is None:
                symptom_wiki_page = attempt_to_create_wiki_page(symptom + ' (medical)')
                if symptom_wiki_page is None:
                    symptom_wiki_page = '{} unavailable_page'.format(symptom)
                    symptoms_wiki_corpus.append(symptom_wiki_page)
                    missing_symptoms_list.append((symptom, index))
                    continue
                
            symptom_wiki_page = symptom_wiki_page.content.splitlines()
            symptom_wiki_page = clean_page(symptom_wiki_page)
            symptoms_wiki_corpus.append(symptom_wiki_page)
        except:
            symptom_wiki_page = '{} unavailable_page'.format(symptom)
            symptoms_wiki_corpus.append(symptom_wiki_page)
            missing_symptoms_list.append((symptom, index))
            continue

    print(missing_symptoms_list)
    return symptoms_wiki_corpus

# Define a function to replace the wiki symptom pages that weren't originally searchable

In [16]:
'''Since there might be many missing pages that I had to manually source, this function attempts
to replace all of them at their associated indices. And if the page titles I manually sourced
still do not result in a page, this function prints those still missing pages so I can try 
again.
'''

def replace_missing_wiki(missing_symptom_redo_list, symptoms_to_wiki_search_list):
    still_missing_list = []
    for missing_symptom in missing_symptom_redo_list:
        try:
            wiki_symptom = wiki.page(missing_symptom[0])
            wiki_symptom = wiki_symptom.content.splitlines()
            wiki_symptom = clean_page(wiki_symptom)
            symptoms_to_wiki_search_list[missing_symptom[1]] = wiki_symptom
        except:
            wiki_symptom = 'Page_Unavailable'
            symptoms_to_wiki_search_list[missing_symptom[1]] = wiki_symptom
            still_missing_list.append(missing_symptom)
    print(still_missing_list)
    return symptoms_to_wiki_search_list

# Test the create wiki symptom corpus function on subset of 10 symptoms

In [15]:
symptoms_to_wiki_search_list

['swelling face',
 'rash',
 'wheezing',
 'cough',
 'hospitalisation',
 'dyspnoea',
 'choking',
 'completed suicide',
 'stress symptoms',
 'death',
 'mitral valve incompetence',
 'cerebrovascular accident',
 'heart rate increased',
 'dizziness',
 'blood pressure increased',
 'palpitations',
 'paraesthesia',
 'physical examination',
 'orthostatic hypotension',
 'chest pain',
 'extrasystoles',
 'ventricular extrasystoles',
 'alopecia',
 'arthralgia',
 'dermatologic examination',
 'hypersensitivity',
 'headache',
 'hypotrichosis',
 'myocardial infarction',
 'flushing',
 'erythema',
 'pain',
 'tenderness',
 'food poisoning',
 'malaise',
 'creutzfeldt',
 'jakob disease',
 'abdominal pain',
 'vomiting',
 'diarrhoea',
 'epistaxis',
 'dyspepsia',
 'eye oedema',
 'sneezing',
 'throat tightness',
 'vision blurred',
 'sinus pain',
 'emergency care examination',
 'eye pruritus',
 'eye irritation',
 'burning sensation',
 'nausea',
 'abdominal pain upper',
 'dry throat',
 'throat irritation',
 'skin 

In [16]:
# Seems like it fails to scrape pages for about half of the symptoms
test_10_wiki_corpus = create_wiki_symptom_corpus(symptoms_to_wiki_search_list[0:11])

[]


In [17]:
test_10_wiki_corpus

['periorbital puffiness also known as puffy eyes or swelling around the eyes is the appearance of swelling in the tissues around the eyes called the orbits it is almost exclusively caused by fluid buildup around the eyes or periorbital edema minor puffiness usually detectable below the eyes only although at times they could be present all around is often called eye bags such transient puffiness is distinct from the age related and gradual increase in the size of the fat pad lying below the lower eyelids suborbicularis oculi fat – soof which can also be colloquially referred to as eye bags causes while some degree of puffiness may be normal for a given individual factors such as age and fatigue may make the swelling more prominent the periorbital tissues are most noticeably swollen immediately after waking perhaps due to the gravitational redistribution of fluid in the horizontal positioneye puffiness may also be caused bymononucleosis – with supraorbital oedema the eyes become puffy an

# Scrape subsets of the 3k symptoms at a time 

# First 200

In [40]:
full_corpus_200 = create_wiki_symptom_corpus(symptoms_to_wiki_search_list[0:200])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


loose stools unavailable_page
Failed at: loose stools, at index 69
stools watery unavailable_page
Failed at: stools watery, at index 74
mood altered unavailable_page
Failed at: mood altered, at index 104
disturbance in attention unavailable_page
Failed at: disturbance in attention, at index 119
chest discomfort unavailable_page
Failed at: chest discomfort, at index 162
hypertonic bladder unavailable_page
Failed at: hypertonic bladder, at index 172
computerised tomogram abnormal unavailable_page
Failed at: computerised tomogram abnormal, at index 175
sinus congestion unavailable_page
Failed at: sinus congestion, at index 194
[('loose stools', 69), ('stools watery', 74), ('mood altered', 104), ('disturbance in attention', 119), ('chest discomfort', 162), ('hypertonic bladder', 172), ('computerised tomogram abnormal', 175), ('sinus congestion', 194)]


In [47]:
replacement_200 = [('Diarrhea', 69), ('Diarrhea', 74), ('Mood swing', 104), ('Attention span', 119), ('Acute bronchitis', 162), ('Overactive bladder', 172), ('CT scan', 175), ('Nasal congestion', 194)]

In [48]:
full_corpus_200_fixed = replace_missing_wiki(replacement_200, full_corpus_200)

[]


In [49]:
with open('full_corpus_200_fixed.pkl', 'wb') as picklefile:
        pickle.dump(full_corpus_200_fixed, picklefile)

# Next 200 (400)

In [50]:
full_corpus_400 = create_wiki_symptom_corpus(symptoms_to_wiki_search_list[200:400])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


[('pupil fixed', 18), ('chromaturia', 75), ('faeces pale', 93), ('hair colour changes', 121), ('coronary arterial stent insertion', 129), ('change of bowel habit', 145), ('skin warm', 167), ('faeces discoloured', 172), ('transaminases increased', 185)]


In [53]:
replacement_400 = [('Mydriasis', 18), ('Urine', 75), ('Common bile duct stone', 93), ('Human hair color', 121), ('Coronary catheterization', 129), ('Rectal discharge', 145), ('Desquamation', 167), ('Human feces', 172), ('Elevated transaminases', 185)]

In [54]:
full_corpus_400_fixed = replace_missing_wiki(replacement_400, full_corpus_400)

[]


In [None]:
with open('/Corpus/full_corpus_400_fixed.pkl', 'wb') as picklefile:
        pickle.dump(full_corpus_400_fixed, picklefile)

# Next 200 (600)

In [65]:
full_corpus_600 = create_wiki_symptom_corpus(symptoms_to_wiki_search_list[400:600])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


[('scab', 4), ('international normalised ratio increased', 21), ('catheterisation cardiac', 29), ('culture wound positive', 33), ('computerised tomogram', 43), ('shock', 66), ('bone cancer metastatic', 79), ('dyspnoea exertional', 99), ('pruritus generalised', 102), ('lip blister', 108), ('vaginal polyp', 119), ('genital rash', 120), ('apnoea', 152), ('defaecation urgency', 168), ('genital pruritus female', 169), ('lip neoplasm malignant stage unspecified', 180), ('tonic clonic movements', 198)]


In [66]:
replacement_600 = [('Fibroblast', 4), ('Prothrombin time', 21), ('Cardiac catheterization', 29), ('Open fracture', 33), ('CT scan', 43), ('Shock (circulatory)', 66), ('Metastatic breast cancer', 79), ('Shortness of breath', 99), ('Itch', 102), ('Blister', 108), ('Endometrial polyp', 119), ('Genital herpes', 120), ('Apnea', 152), ('Diarrhea', 168), ('Vulva', 169), ('Oral cancer', 180), ('Clonus', 198)]


In [67]:
full_corpus_600_fixed = replace_missing_wiki(replacement_600, full_corpus_600)

[]


In [68]:
with open('/Corpus/full_corpus_600_fixed.pkl', 'wb') as picklefile:
        pickle.dump(full_corpus_600_fixed, picklefile)

# Next 200 (800)

In [69]:
full_corpus_800 = create_wiki_symptom_corpus(symptoms_to_wiki_search_list[600:800])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


[('meige', 31), ('hepatic neoplasm malignant', 33), ('corneal scar', 41), ('arteriospasm coronary', 74), ('uterine spasm', 101), ('brain oedema', 103), ('arterial stenosis limb', 105), ('oedema mouth', 133), ('international normalised ratio decreased', 134), ('pco2 abnormal', 136), ('basophil count decreased', 163), ('eosinophil count abnormal', 165), ('body temperature decreased', 182), ('skin chapped', 194)]


In [71]:
replacement_800 = [('Meige lymphedema', 31), ('Liver cancer', 33), ('Corneal ulcer', 41), ('Coronary vasospasm', 74), ('Braxton Hicks contractions', 101), ('Cerebral edema', 103), ('Carotid artery stenosis', 105), ('Angioedema', 133), ('Prothrombin time', 134), ('pCO2', 136), ('White blood cell', 163), ('Eosinophilia', 165), ('Thermoregulation', 182), ('Xeroderma', 194)]

In [72]:
full_corpus_800_fixed = replace_missing_wiki(replacement_800, full_corpus_800)

[]


In [73]:
with open('/Corpus/full_corpus_800_fixed.pkl', 'wb') as picklefile:
        pickle.dump(full_corpus_800_fixed, picklefile)

# Next 200 (1000)

In [74]:
full_corpus_1000 = create_wiki_symptom_corpus(symptoms_to_wiki_search_list[800:1000])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


[('mental retardation severity unspecified', 15), ('prostatic specific antigen increased', 54), ('creatinine renal clearance decreased', 59), ('blood electrolytes decreased', 64), ('abnormal faeces', 80), ('pco2 decreased', 87), ('poverty of thought content', 93), ('increased bronchial secretion', 103), ('respiratory tract congestion', 104), ('peripheral coldness', 111), ('nasal dryness', 112), ('gastroenteritis proteus', 124), ('dermatitis bullous', 139), ('intercostal retraction', 143), ('vulvovaginal dryness', 147), ('bile output increased', 185)]


In [75]:
replacement_1000 = [('Intellectual disability', 15), ('Prostate-specific antigen', 54), ('Renal function', 59), ('Hypovolemia', 64), ('Human feces', 80), ('pCO2', 87), ('Thought disorder', 93), ('Bronchial challenge test', 103), ('Upper respiratory tract infection', 104), ('Frostbite', 111), ('Dried nasal mucus', 112), ('Aeromonas hydrophila', 124), ('Dermatitis herpetiformis', 139), ('Labored breathing', 143), ('Vulvovaginal health', 147), ('Dehydrocholic acid', 185)]

In [76]:
full_corpus_1000_fixed = replace_missing_wiki(replacement_1000, full_corpus_1000)

[]


In [77]:
with open('/Corpus/full_corpus_1000_fixed.pkl', 'wb') as picklefile:
        pickle.dump(full_corpus_1000_fixed, picklefile)

# Next 500 (1500)

In [78]:
full_corpus_1500 = create_wiki_symptom_corpus(symptoms_to_wiki_search_list[1000:1500])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


[('prothrombin level increased', 13), ('saliva altered', 18), ('frequent bowel movements', 48), ('limb injury', 57), ('blood creatine decreased', 71), ('caecal lesion excision', 75), ('metastases to kidney', 95), ('resistant staphylococcal aureus test positive', 99), ('urine arsenic increased', 105), ('elevated pacing threshold', 123), ('genital burning sensation', 128), ('operative haemorrhage', 141), ('retinal injury', 143), ('aspiration', 145), ('cardiac flutter', 154), ('respiration abnormal', 165), ('intentional drug misuse', 166), ('rash maculo', 169), ('papular', 170), ('increased upper airway secretion', 201), ('prealbumin decreased', 217), ('unevaluable event', 239), ('oedema genital', 240), ('onychalgia', 257), ('nasal oedema', 261), ('arthropod bite', 287), ('incorrect dose administered', 297), ('bile culture positive', 325), ('social avoidant behaviour', 332), ('cardiac discomfort', 341), ('bronchial irritation', 347), ('rash erythematous', 349), ('psychological factor affe

In [79]:
replacement_1500 = [('Prothrombin time', 13), ('Saliva testing', 18), ('Diarrhea', 48), ('Repetitive strain injury', 57), ('Creatine transporter defect', 71), ('Horse colic', 75), ('Renal cell carcinoma', 95), ('Methicillin-resistant Staphylococcus aureus', 99), ('Arsenic poisoning', 105), ('Pacemaker syndrome', 123), ('Genital herpes', 128), ('Bleeding', 141), ('Retinal detachment', 143), ('Aspiration pneumonia', 145), ('Atrial flutter', 154), ('Agonal respiration', 165), ('Substance abuse', 166), ('Maculopapular rash', 169), ('Maculopapular rash', 170), ('Upper respiratory tract infection', 201), ('Transthyretin', 217), ('unevaluable event', 239), ('Genital herpes', 240), ('Ingrown nail', 257), ('Nasal congestion', 261), ('Arthropod bites and stings', 287), ('Drug overdose', 297), ('Bile', 325), ('Anti-social behaviour', 332), ('Angina', 341), ('Acute bronchitis', 347), ('Erythema', 349), ('DSM-IV codes', 352), ('Insulin (medication)', 357), ('Acidosis', 372), ('Euphoria', 382), ('Topiramate', 385), ('Heavy legs', 398), ('drug overdose', 404), ('Secretin-cholecystokinin test', 412), ('Genital herpes', 416), ('Cutaneous condition', 432), ('Edema', 439), ('Rectal tenesmus', 444), ('Diarrhea', 450)]

## Notice that there is one "symptom" that is basically 'unknown'

In [80]:
'''
Since this "symptom" is basically an unknown, 
I'm going to leave it as "Page_Unavailable" and take note of its index
'''
full_corpus_1500_fixed = replace_missing_wiki(replacement_1500, full_corpus_1500)

[('unevaluable event', 239)]


In [81]:
with open('/Corpus/full_corpus_1500_fixed.pkl', 'wb') as picklefile:
        pickle.dump(full_corpus_1500_fixed, picklefile)

# Next 500 (2000)

In [17]:
full_corpus_2000 = create_wiki_symptom_corpus(symptoms_to_wiki_search_list[1500:2000])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


[('anaesthetic complication', 12), ('ear tube insertion', 16), ('related event', 32), ('nasal inflammation', 35), ('respiratory tract irritation', 39), ('cross sensitivity reaction', 45), ('cardiac infection', 56), ('tachyphrenia', 58), ('colitis ischaemic', 61), ('feeling of relaxation', 71), ('nasal discomfort', 74), ('pulmonary vascular disorder', 82), ('atrial conduction time prolongation', 88), ('mouth ulceration', 92), ('therapeutic product ineffective', 97), ('post procedural haemorrhage', 136), ('faeces hard', 142), ('faecal volume increased', 143), ('blood gonadotrophin increased', 167), ('large intestinal haemorrhage', 168), ('clamping of blood vessel', 215), ('toxicologic test abnormal', 221), ('skin induration', 222), ('pco2 increased', 229), ('oropharyngeal spasm', 234), ('not coded', 242), ('early satiety', 247), ('cardioactive drug level increased', 255), ('post', 269), ('wound haemorrhage', 271), ('ear congestion', 278), ('beta haemolytic streptococcal infection', 300),

In [22]:
replacement_2000 = [('Complications of anesthetic gases', 12), ('Tympanostomy tube', 16), ('related event', 32), ('Nasal polyp', 35), ('Upper respiratory tract infection', 39), ('Sensitivity analysis', 45), ('Cardiac tamponade', 56), ('Euphoria', 58), ('Ischemic colitis', 61), ('relaxation technique', 71), ('Nasal congestion', 74), ('Vascular disease', 82), ('Heart arrhythmia', 88), ('Mouth ulcer', 92), ('therapeutic product ineffective', 97), ('Hemorrhagic cystitis', 136), ('Human feces', 142), ('Fecal impaction', 143), ('Gonadotropin', 167), ('Gastrointestinal bleeding', 168), ('Hemostat', 215), ('Adverse drug reaction', 221), ('Cutaneous condition', 222), ('pco2', 229), ('Cricopharyngeal spasm', 234), ('not coded', 242), ('Expected satiety', 247), ('Pharmacokinetics', 255), ('post', 269), ('Bleeding', 271), ('Nasal congestion', 278), ('Streptococcus', 300), ('Torticollis', 320), ('Small intestinal bacterial overgrowth', 338), ('Bowel infarction', 340), ('Thrombolysis', 350), ('Urinary retention', 357), ('Hypovolemia', 359), ('Drug overdose', 363), ('Spinal cord injury', 368), ('Diaphragmatic rupture', 371), ('Dysbiosis', 373), ('Gastric bypass surgery', 418), ('Gunshot wound', 444), ('Suicidal ideation', 446), ('ICD-10 Procedure Coding System', 474), ('Hyper IgM syndrome', 476), ('Copper in health', 477), ('Nasal septum deviation', 488), ('Pharmacotherapy', 497)]

In [23]:
full_corpus_2000_fixed = replace_missing_wiki(replacement_2000, full_corpus_2000)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


[('related event', 32), ('therapeutic product ineffective', 97), ('not coded', 242), ('post', 269)]


## Notice that there are a few 'symptoms' that I'm keeping as unavailable, as they aren't actually symptoms or I can't classify them yet.

In [24]:
with open('/Corpus/full_corpus_2000_fixed.pkl', 'wb') as picklefile:
        pickle.dump(full_corpus_2000_fixed, picklefile)

# Next 500 (2500)

In [20]:
full_corpus_2500 = create_wiki_symptom_corpus(symptoms_to_wiki_search_list[2000:2500])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


[('bile duct stent insertion', 8), ('performance status decreased', 25), ('bloody discharge', 37), ('basophil percentage increased', 51), ('accidental exposure', 53), ('retinal exudates', 61), ('creatine urine decreased', 63), ('cryptogenic cirrhosis', 78), ('buccal mucosal roughening', 84), ('ceruloplasmin decreased', 89), ('cholelithotomy', 91), ('albumin globulin ratio decreased', 105), ('globulins increased', 106), ('accidental drug intake by child', 108), ('failure of child resistant mechanism for pharmaceutical product', 109), ('oesophageal hypomotility', 121), ('urine arsenic', 162), ('throat lesion', 182), ('eye operation', 185), ('neutrophil percentage increased', 201), ('barrett', 209), ('contracted bladder', 211), ('oesophagogastric fundoplasty', 217), ('gastrointestinal sounds abnormal', 227), ('amniorrhoea', 232), ('reproductive tract disorder', 288), ('epiglottis ulcer', 292), ('pharyngeal hypoaesthesia', 308), ('urine amino acid level increased', 318), ('venous occlusion

In [30]:
replacement_2500 = [('Endoscopic stenting', 8), ('Performance status', 25), ('Bleeding', 37), ('Basophil activation', 51), ('Blood agent', 53), ('Hypertensive retinopathy', 61), ('Creatine transporter defect', 63), ('Kayser–Fleischer ring', 78), ('Heck\'s disease', 84), ('Wilson\s disease', 89), ('lithotomy', 91), ('Sex hormone-binding globulin', 105), ('globulins increased', 106), ('Drug overdose', 108), ('Drug overdose', 109), ('Achalasia', 121), ('Arsenic poisoning', 162), ('Herpes labialis', 182), ('Eye surgery', 185), ('Absolute neutrophil count', 201), ('Barrett\'s esophagus', 209), ('Urinary incontinence', 211), ('Nissen fundoplication', 217), ('Stomach rumble', 227), ('Amenorrhea', 232), ('Reproductive system disease', 288), ('Human digestive system', 292), ('Hypoesthesia', 308), ('Congenital disorders of amino acid metabolism', 318), ('Central venous catheter', 356), ('Overactive bladder', 359), ('Absolute neutrophil count', 384), ('Crackles', 387), ('Alzheimer\'s disease', 390), ('Urine test strip', 418), ('Wound healing', 451), ('Cholesterol', 467), ('Scrotal ultrasound', 470), ('Serine protease', 486), ('Stye', 490), ('Anisocoria', 492)]

In [31]:
# I forgot to add Globulins increased before moving on 
full_corpus_2500_fixed = replace_missing_wiki(replacement_2500, full_corpus_2500)

[('globulins increased', 106)]


In [32]:
with open('/Corpus/full_corpus_2500_fixed.pkl', 'wb') as picklefile:
        pickle.dump(full_corpus_2500_fixed, picklefile)

# Next 500 (3000)

In [21]:
full_corpus_3000 = create_wiki_symptom_corpus(symptoms_to_wiki_search_list[2500:3000])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


[('uraemia odour', 5), ('vocal cord disorder', 10), ('monocyte count decreased', 13), ('myolipoma', 57), ('crohn', 64), ('oesophageal irritation', 74), ('blood corticotrophin increased', 99), ('hla marker study positive', 109), ('hepatic congestion', 119), ('upper respiratory tract irritation', 121), ('gastrointestinal tract irritation', 123), ('anoxia', 132), ('thyroxin binding globulin increased', 139), ('urine iron increased', 157), ('pneumonia lipoid', 169), ('biopsy bronchus abnormal', 170), ('short', 176), ('injurious ideation', 195), ('eczema weeping', 212), ('medical device complication', 216), ('foetal exposure timing unspecified', 225), ('genital disorder male', 236), ('blood electrolytes normal', 240), ('positron emission tomogram abnormal', 253), ('lung carcinoma cell type unspecified stage iv', 254), ('maternal exposure during pregnancy', 257), ('hyperproteinaemia', 270), ('device pacing issue', 271), ('medical induction of coma', 275), ('amino acid level increased', 276),

In [34]:
replacement_3000 = [('Uremic fetor', 5), ('Vocal fold paresis', 10), ('Leukocytosis', 13), ('Blood vessel', 57), ('Crohn\'s disease', 64), ('Esophageal dysphagia', 74), ('blood corticotrophin increased', 99), ('hla marker study positive', 109), ('hepatic congestion', 119), ('upper respiratory tract irritation', 121), ('Desmopressin', 123), ('Hypoxia (medical)', 132), ('Thyroid hormones', 139), ('Urine test strip', 157), ('Lipid pneumonia', 169), ('Squamous cell carcinoma', 170), ('SHORT syndrome', 176), ('Suicide terminology', 195), ('Dermatitis', 212), ('Medical device', 216), ('Fetal alcohol spectrum disorder', 225), ('Male genital disease', 236), ('Blood', 240), ('PET-CT', 253), ('Lung cancer', 254), ('Drugs in pregnancy', 257), ('hyperproteinemia', 270), ('Artificial cardiac pacemaker', 271), ('Sodium thiopental', 275), ('Branched-chain amino acid', 276), ('Macular edema', 279), ('Human feces', 285), ('Bone fracture', 312), ('Exocrine pancreatic insufficiency', 339), ('Epileptic seizure', 343), ('Black eye', 377), ('Peripheral neuropathy', 380), ('Entamoeba histolytica', 391), ('Radiography', 394), ('Scarring hair loss', 397), ('Thoracotomy', 403), ('Pyogenic liver abscess', 405), ('CT scan', 409), ('Megaloblastic anemia', 411), ('Fetal viability', 425), ('Blurred vision', 431), ('Bruise', 443), ('Urination', 451), ('Nephrotoxicity', 453), ('Medical ultrasound', 459), ('Exfoliation (cosmetology)', 461), ('Gastric erosion', 463), ('Small fiber peripheral neuropathy', 467), ('Medical device connectivity', 488), ('Antigen', 493), ('Carotenosis', 496)]

In [35]:
full_corpus_3000_fixed = replace_missing_wiki(replacement_3000, full_corpus_3000)

[('blood corticotrophin increased', 99), ('hla marker study positive', 109), ('hepatic congestion', 119), ('upper respiratory tract irritation', 121)]


In [36]:
replacement_3000_redo = [('Desmopressin', 99), ('Human leukocyte antigen', 109), ('Congestive hepatopathy', 119), ('Upper respiratory tract infection', 121)]


In [37]:
full_corpus_3000_fixed_again = replace_missing_wiki(replacement_3000_redo, full_corpus_3000_fixed)

[]


In [38]:
with open('/Corpus/full_corpus_3000_fixed_again.pkl', 'wb') as picklefile:
        pickle.dump(full_corpus_3000_fixed_again, picklefile)

# Obtain remainder of corpus (3004)

In [26]:
len(symptoms_to_wiki_search_list)

3004

In [25]:
full_corpus_remainder = create_wiki_symptom_corpus(symptoms_to_wiki_search_list[3000:3050])

[]


In [27]:
len(full_corpus_remainder)

4

In [28]:
with open('/Corpus/full_corpus_remainder.pkl', 'wb') as picklefile:
        pickle.dump(full_corpus_remainder, picklefile)

# Summary

### What I did
1. Scraped Wikipedia symptom page text

### What I will do next
1. Compile all subsets of the corpus that was scraped here into a full Wikipedia symptoms corpus
2. Vectorize the full corpus
3. Apply a KNN model to reduce the number of symptoms that will become the features in my final model