###### Project: Adverse Medical Outcomes Prediction 
##### Data Scientist: Victoria M. Ng 

# Import libraries

In [2]:
# General system libraries
import os
import sys
from IPython.display import Image, Markdown
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dataframe libraries
import pandas as pd
from pandas import DataFrame, read_csv

# Number manipulation
import scipy.sparse
from scipy.ndimage.filters import generic_filter
import patsy
import numpy as np

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

# Data type libraries
from datetime import datetime as dt

# File manipulation
import pickle
import pandas.io.sql as pd_sql
import psycopg2 as pg

# NLP libraries
import wikipedia as wiki
from nltk import word_tokenize, sent_tokenize,FreqDist
from nltk.corpus import stopwords
import gensim as gn
from gensim import corpora, models, similarities
from collections import defaultdict
from six import iteritems
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS

# Scraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
# Location of the ChromeDriver binary. The default is the path used on the
# original author's machine; set the CHROMEDRIVER_PATH environment variable
# to point at a different binary without editing this cell.
chromedriver = os.environ.get(
    "CHROMEDRIVER_PATH",
    "/home/victoria/projects/metis/Project3/chromedriver",
)
# Record the chosen path under the "webdriver.chrome.driver" environment key.
os.environ["webdriver.chrome.driver"] = chromedriver

# Stats libraries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model, metrics
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


# Other libraries
import geopy

  """)
2018-05-11 15:28:09,780 : INFO : 'pattern' package not found; tag filters are not available for English
  from pandas.core import datetools


# Load the train/test split of the vectorized and PCA-reduced wiki symptom corpus

In [3]:
# Training feature matrix that the KNN model is fitted on.
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open('wiki_x_train.pkl', mode='rb') as train_features_file:
    X_train = pickle.load(train_features_file)

In [4]:
# Training labels (the bin of each training row) that the KNN model is fitted on.
with open('wiki_y_train.pkl', mode='rb') as train_labels_file:
    y_train = pickle.load(train_labels_file)

In [5]:
# Feature matrix of the rows that the fitted model will assign to bins.
with open('wiki_x_test.pkl', mode='rb') as test_features_file:
    X_test = pickle.load(test_features_file)

In [6]:
# Full wiki symptom corpus. It is loaded here but not referenced by the
# cells visible in this section of the notebook.
with open('wiki_symptom_full_corpus.pkl', mode='rb') as corpus_file:
    wiki_symptom_full_corpus = pickle.load(corpus_file)

# Review the bins I chose

Below are the bins I want to categorize my symptoms into, along with each bin's index in the unique symptoms list. I tried to include a combination of both severe and common symptoms. I also tried to pick symptoms that were generalizable, but not so general that both severe and common reactions could be mapped to the same symptom.
1. Diarrhea (69)
2. Mood_swing (104)
3. Renal function (859)
4. Upper respiratory tract infection (904)
5. Prothrombin time (1013) 
6. Bleeding (1141)
7. Nasal congestion (1261)
8. Drug overdose (1297)
9. Angina (1341)
10. Dysbiosis (1873)
11. Overactive bladder (2359)
12. Suicide terminology (2695)
13. Epileptic seizure (2843)
14. Nephrotoxicity (2953)

# Fit a KNN (K=1) model to the training matrix

In [7]:
# Nearest-neighbour classifier that looks at a single neighbour (K=1).
# n_jobs=-1 asks scikit-learn to use all available processors.
KNN_symptoms = KNeighborsClassifier(
    n_neighbors=1,
    n_jobs=-1,
)

In [8]:
# Fit the K=1 classifier on the training matrix and its bin labels.
# fit() returns the estimator, so the cell displays the model's parameters.
KNN_symptoms.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=1, p=2,
           weights='uniform')

# Get the categorizations (predictions) 

In [9]:
# Predict a bin label for every row of X_test. With K=1, each row receives
# the label of its single nearest training row.
predictions = KNN_symptoms.predict(X_test)

In [10]:
# Take a look at the distribution of the docs to bins:
# entry i of the result is the number of predictions equal to bin label i.
np.bincount(predictions)

array([ 82, 117,  36, 303, 381,  97, 379,  61,  90, 743, 136,  96, 262,
       207])

In [11]:
# Display the predicted bin labels (NumPy abbreviates the repr of long arrays).
predictions

array([9, 9, 9, ..., 6, 3, 0])

In [30]:
# Persist the predicted bin labels to disk.
with open('symptoms_categorizations.pkl', mode='wb') as categorizations_file:
    pickle.dump(predictions, categorizations_file)

# Create a lookup table for all of the symptoms to their respective bins

In [12]:
# Positions, within the unique symptoms list, of the symptoms chosen as bins.
target_indices = [
    69,    # Diarrhea
    104,   # Mood_swing
    859,   # Renal function
    904,   # Upper respiratory tract infection
    1013,  # Prothrombin time
    1141,  # Bleeding
    1261,  # Nasal congestion
    1297,  # Drug overdose
    1341,  # Angina
    1873,  # Dysbiosis
    2359,  # Overactive bladder
    2695,  # Suicide terminology
    2843,  # Epileptic seizure
    2953,  # Nephrotoxicity
]

In [13]:
# Symptoms list that target_indices refers into.
with open('symptoms_to_wiki_search_list.pkl', mode='rb') as symptoms_file:
    symptoms_to_wiki_search_list = pickle.load(symptoms_file)

In [26]:
# Drop the symptoms that serve as bins, leaving only the symptoms that still
# need to be assigned to a bin. target_indices is passed directly; the extra
# list nesting used previously was unnecessary.
symptoms_to_bin = list(np.delete(symptoms_to_wiki_search_list, target_indices))

# Sanity check: each distinct target index should have removed exactly one
# entry. A mismatch means target_indices no longer matches the symptoms list.
assert len(symptoms_to_bin) == len(symptoms_to_wiki_search_list) - len(set(target_indices)), \
    "unexpected number of symptoms removed; check target_indices"

print(type(symptoms_to_bin))
print(symptoms_to_bin[0])

<class 'list'>
swelling face


In [1]:
# Build the symptom -> predicted-bin lookup table.
# This cell needs symptoms_to_bin and predictions from the cells above; the
# NameError recorded for it came from running it first on a fresh kernel.
# zip() stops at the shorter input, so check the lengths agree up front
# rather than silently dropping entries.
assert len(symptoms_to_bin) == len(predictions), (
    "symptoms_to_bin and predictions must have the same length"
)
# NOTE(review): assumes the rows of X_test are in the same order as
# symptoms_to_bin, and that symptom names are unique (duplicate keys would
# collapse to the last prediction) -- TODO confirm.
lookup_symptoms_dict = dict(zip(symptoms_to_bin, predictions))

NameError: name 'symptoms_to_bin' is not defined

# Pickle the lookup dict for wiki symptom feature engineering

In [29]:
# Persist the symptom -> bin lookup table to disk.
with open('lookup_symptoms_dict.pkl', mode='wb') as lookup_file:
    pickle.dump(lookup_symptoms_dict, lookup_file)

# Summary

### What I did
1. Fit a KNN (K=1) model to the training matrix of the Wikipedia corpus
2. Got the categorizations (or predictions) for the symptoms that needed to be binned
3. Created a lookup table to map the symptoms to their category (bin)

### What I will do next
1. Update the values of the symptoms bins within the symptoms matrix per row