###### Project: Adverse Medical Outcomes Prediction 
##### Data Scientist: Victoria M. Ng 

# Import libraries

In [8]:
# General system libraries
import os
import sys
from IPython.display import Image, Markdown
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dataframe libraries
import pandas as pd
from pandas import DataFrame, read_csv

# Number manipulation
import scipy.sparse
from scipy.ndimage.filters import generic_filter
import patsy
import numpy as np

# Plotting libaries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

# Data type libaries
from datetime import datetime as dt

# File manipulation
import pickle
import pandas.io.sql as pd_sql
from sqlalchemy import create_engine
import psycopg2 as pg

# NLP libraries
import wikipedia as wiki
from nltk import word_tokenize, sent_tokenize,FreqDist
from nltk.corpus import stopwords
import gensim as gn
from gensim import corpora, models, similarities
from collections import defaultdict
from six import iteritems
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS

# Scraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
chromedriver = "/home/victoria/projects/metis/Project3/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

# Stats libaries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model, metrics
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


# Other libaries
import geopy

# Query dataframes from SQL

In [9]:
engine = create_engine('postgresql://ubuntu:password@52.14.207.9:5432/reactions', echo=False)

In [10]:
food_df = pd.read_sql('SELECT * FROM food_df_cleaned', engine)

In [11]:
outcomes_df = pd.read_sql('SELECT * FROM outcomes_df', engine)

# Create a dataframe from just the outcomes column and the outcomes df

In [50]:
outcomes_df_updated = outcomes_df

# Remove the non and none columns 

In [51]:
outcomes_df_updated.drop(columns = ['non', 'none'], inplace=True)

In [56]:
outcomes_df_updated.head(5)

Unnamed: 0,congenital_anomaly,death,disability,hospitalization,life_threatening,other_serious__important_medical_events_,req_intervention_to_prvnt_perm_imprmnt,serious_injuries_illness,visited_a_health_care_provider,visited_an_er
0,0,0,0,1,0,0,1,0,1,1
1,0,0,0,1,0,0,1,0,1,1
2,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0


In [57]:
outcomes_df_updated.shape

(34000, 10)

In [58]:
outcomes_df_updated.columns

Index(['congenital_anomaly', 'death', 'disability', 'hospitalization',
       'life_threatening', 'other_serious__important_medical_events_',
       'req_intervention_to_prvnt_perm_imprmnt', 'serious_injuries_illness',
       'visited_a_health_care_provider', 'visited_an_er'],
      dtype='object')

# Order the outcomes from most dire to least dire

There is more than one outcome per event(row). Since I want to help people decide whether they should seek medical care, I should try to predict the worst outcome possible. 

Although this outcome is categorical, for visualization purposes later on, it would be easier to order the labels by most to least dire. 

1. death
2. life_threatening
3. serious_injuries_illness
4. disability
5. other_serious__important_medical_events_
6. congenital_anomaly
7. req_intervention_to_prvnt_perm_imprmnt
8. hospitalization
9. visited_an_er
10. visited_a_health_care_provider

In [12]:
outcomes_ordered_dict = {'death': 0, 'life_threatening': 1, 'serious_injuries_illness': 2, 'disability': 3, 
                    'other_serious__important_medical_events_':4, 'congenital_anomaly': 5,
                   'req_intervention_to_prvnt_perm_imprmnt': 6, 'hospitalization': 7,
                   'visited_an_er': 8, 'visited_a_health_care_provider': 9}

In [5]:
with open('outcomes_ordered_dict.pkl', 'wb') as picklefile:
        pickle.dump(outcomes_ordered_dict, picklefile)

# Define a function to create a column of the worst outcome per row

In [59]:
'''
This function updates the worst outcome column value in the dataframe 
per row with the associated outcome label. Since I removed the columns 
that were outcome non or none, there will be rows that don't have an 
associated worst outcome (meaning they only had non or none as the value
for outcome). Therefore, I will encode those as -9 and drop those rows 
during EDA.
'''

def create_column_of_worst_outcome(outcomes_df):
    worst_outcomes = []
    for index, row in outcomes_df.iterrows():
        if row['death'] == 1:
            worst_outcomes.append(0)
        elif row['life_threatening'] == 1:
            worst_outcomes.append(1)
        elif row['serious_injuries_illness'] == 1:
            worst_outcomes.append(2)
        elif row['disability'] == 1:
            worst_outcomes.append(3)
        elif row['other_serious__important_medical_events_'] == 1:
            worst_outcomes.append(4)
        elif row['congenital_anomaly'] == 1:
            worst_outcomes.append(5)
        elif row['req_intervention_to_prvnt_perm_imprmnt'] == 1:
            worst_outcomes.append(6)
        elif row['hospitalization'] == 1:
            worst_outcomes.append(7)
        elif row['visited_an_er'] == 1:
            worst_outcomes.append(8)
        elif row['visited_a_health_care_provider'] == 1:
            worst_outcomes.append(9)
        else:
            worst_outcomes.append(-9)
    return worst_outcomes

# Test the function

In [60]:
test_df = outcomes_df[:100]
test_outcome_codes = create_column_of_worst_outcome(test_df)
test_df['test_outcome_codes'] = test_outcome_codes
test_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,congenital_anomaly,death,disability,hospitalization,life_threatening,other_serious__important_medical_events_,req_intervention_to_prvnt_perm_imprmnt,serious_injuries_illness,visited_a_health_care_provider,visited_an_er,test_outcome_codes
0,0,0,0,1,0,0,1,0,1,1,6
1,0,0,0,1,0,0,1,0,1,1,6
2,0,0,0,0,0,0,0,1,0,0,2
3,0,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0


# Run the function and add the output to the outcomes df 

In [61]:
outcome_codes = create_column_of_worst_outcome(outcomes_df_updated)
outcomes_df_updated['worst_outcome_code'] = outcome_codes
outcomes_df_updated.head(5)

Unnamed: 0,congenital_anomaly,death,disability,hospitalization,life_threatening,other_serious__important_medical_events_,req_intervention_to_prvnt_perm_imprmnt,serious_injuries_illness,visited_a_health_care_provider,visited_an_er,worst_outcome_code
0,0,0,0,1,0,0,1,0,1,1,6
1,0,0,0,1,0,0,1,0,1,1,6
2,0,0,0,0,0,0,0,1,0,0,2
3,0,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0


In [62]:
outcomes_df_updated.shape

(34000, 11)

# Pickle and upload dataframe to postgre SQL

In [63]:
with open('outcomes_df_coded.pkl', 'wb') as picklefile:
        pickle.dump(outcomes_df_updated, picklefile)

In [64]:
engine = create_engine('postgresql://ubuntu:password@52.14.207.9:5432/reactions', echo=False)

In [65]:
outcomes_df_updated.to_sql(name='outcomes_df_coded', con=engine, if_exists = 'replace', index=False)

# Summary

### What I did
1. Cleaned the non and none outcomes
2. Ordered the outcomes from most to least dire
3. Assigned a worst outcome to each row of the df

### What I will do next
1. Remember to drop rows with -9 as outcome 
2. EDA