###### Project: Adverse Medical Outcomes Prediction 
##### Data Scientist: Victoria M. Ng 

# Import libraries

In [2]:
# General system libraries
import os
import sys
from IPython.display import Image, Markdown
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dataframe libraries
import pandas as pd
from pandas import DataFrame, read_csv

# Number manipulation
import scipy.sparse
from scipy.ndimage.filters import generic_filter
import patsy
import numpy as np

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

# Data type libraries
from datetime import datetime as dt

# File manipulation
import pickle
import pandas.io.sql as pd_sql
from sqlalchemy import create_engine
import psycopg2 as pg

# NLP libraries
import wikipedia as wiki
from nltk import word_tokenize, sent_tokenize,FreqDist
from nltk.corpus import stopwords
import gensim as gn
from gensim import corpora, models, similarities
from collections import defaultdict
from six import iteritems
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS

# Scraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
chromedriver = "/home/victoria/projects/metis/Project3/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

# Stats libraries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model, metrics
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


# Other libraries
import geopy

  """)
2018-05-14 15:18:03,399 : INFO : 'pattern' package not found; tag filters are not available for English
  from pandas.core import datetools


# Query dataframes from SQL

In [3]:
# Read the connection URL from the environment so the database user, password
# and host are not stored in the notebook. Set it before starting Jupyter, e.g.
#   export REACTIONS_DB_URL='postgresql://<user>:<password>@<host>:5432/reactions'
# A missing variable raises KeyError here rather than failing later on a query.
engine = create_engine(os.environ['REACTIONS_DB_URL'], echo=False)

In [4]:
food_df = pd.read_sql('SELECT * FROM food_df_cleaned', engine)

In [5]:
symptoms_df = pd.read_sql('SELECT * FROM symptoms_df', engine)

In [9]:
outcomes_df = pd.read_sql('SELECT * FROM outcomes_df_coded', engine)

# Concatenate all dfs to make a full df 

In [10]:
# Place the three tables side by side. With axis=1, pandas aligns rows on the
# DataFrame index, which here is the default 0..n-1 index produced by read_sql.
# NOTE(review): the tables come from separate `SELECT *` queries with no
# ORDER BY, so this presumably relies on all three returning rows in the same
# order — confirm, or join on a shared key instead.
full_df = pd.concat([food_df, symptoms_df, outcomes_df], axis=1)

In [11]:
full_df.shape

(34000, 37)

In [12]:
full_df.columns

Index(['report_id', 'report_date', 'event_date', 'product_role',
       'product_name', 'industry_code', 'industry_name', 'victim_age',
       'victim_age_unit', 'victim_gender', 'outcomes', 'symptoms', 'diarrhea',
       'mood_swing', 'renal_function', 'upper_respiratory_tract_infection',
       'prothrombin_time', 'bleeding', 'nasal_congestion', 'drug_overdose',
       'angina', 'dysbiosis', 'overactive_bladder', 'suicide_terminology',
       'epileptic_seizure', 'nephrotoxicity', 'congenital_anomaly', 'death',
       'disability', 'hospitalization', 'life_threatening',
       'other_serious__important_medical_events_',
       'req_intervention_to_prvnt_perm_imprmnt', 'serious_injuries_illness',
       'visited_a_health_care_provider', 'visited_an_er',
       'worst_outcome_code'],
      dtype='object')

# Drop rows with -9 (non or none) outcome value

In [13]:
mask_to_drop = full_df['worst_outcome_code'] == -9

In [14]:
full_df = full_df[~mask_to_drop]

In [15]:
full_df.shape

(33979, 37)

In [32]:
full_df.head(5)

Unnamed: 0,report_id,report_date,event_date,product_role,product_name,industry_code,industry_name,victim_age,victim_age_unit,victim_gender,...,death,disability,hospitalization,life_threatening,other_serious__important_medical_events_,req_intervention_to_prvnt_perm_imprmnt,serious_injuries_illness,visited_a_health_care_provider,visited_an_er,worst_outcome_code
0,65325,1/1/2004,8/4/2003,Suspect,MIDWEST COUNTRY FAIR CHOCOLATE FLAVORED CHIPS,3,Bakery Prod/Dough/Mix/Icing,2.0,Year(s),Female,...,0,0,1,0,0,1,0,1,1,6
1,65325,1/1/2004,8/4/2003,Suspect,MIDWEST COUNTRY FAIR CHOCOLATE FLAVORED CHIPS,3,Bakery Prod/Dough/Mix/Icing,2.0,Year(s),Female,...,0,0,1,0,0,1,0,1,1,6
2,65345,1/1/2004,12/21/2003,Suspect,"FRITO LAY FUNYUNS ONION FLAVOR, ONION RINGS",7,Snack Food Item,10.0,Year(s),Male,...,0,0,0,0,0,0,1,0,0,2
3,65399,1/5/2004,11/22/2003,Suspect,METOBOLITE 356,54,Vit/Min/Prot/Unconv Diet(Human/Animal),51.0,Year(s),Male,...,1,0,0,0,0,0,0,0,0,0
4,65400,1/5/2004,9/5/2001,Suspect,METABOLIFE,54,Vit/Min/Prot/Unconv Diet(Human/Animal),45.0,Year(s),Female,...,1,0,0,0,0,0,0,0,0,0


# Reset the indices

In [43]:
# Rebuild a contiguous 0..n-1 index after the row filter above. Because
# drop=True is not passed, the previous labels are kept as a new 'index'
# column, which shows up in the later outputs and in the uploaded tables.
# NOTE(review): not idempotent — re-running this cell inserts a further
# column holding the old index, so run it exactly once per kernel session.
full_df.reset_index(inplace=True)

# Convert all victim ages into years so that it can be a continuous variable

In [33]:
full_df['victim_age_unit'].value_counts()

Year(s)      32929
Month(s)       811
Week(s)        120
Day(s)         118
Decade(s)        1
Name: victim_age_unit, dtype: int64

In [37]:
food_df['victim_age_unit'].value_counts()

Year(s)      32948
Month(s)       812
Week(s)        121
Day(s)         118
Decade(s)        1
Name: victim_age_unit, dtype: int64

In [38]:
print(full_df.shape)
print(food_df.shape)

(33979, 37)
(34000, 12)


In [34]:
def convert_age_to_years(food_df):
    '''
    Convert each victim's age to years.

    Some ages are listed in months, weeks, days, or decades rather than
    years. Months, weeks and days are divided by 12, 52 and 365
    respectively, decades are multiplied by 10, and any other unit
    (including 'Year(s)') leaves the age unchanged.

    Parameters
    ----------
    food_df : DataFrame
        Must contain the columns 'victim_age' and 'victim_age_unit'.

    Returns
    -------
    list
        One age in years per row, in row order.

    Notes
    -----
    Ages and units are paired by row position, not by index label, so the
    result is correct for any index (for example after rows have been
    filtered out and the index has gaps).
    '''
    # Units that are converted by dividing the recorded age.
    divisor_by_unit = {'Month(s)': 12, 'Week(s)': 52, 'Day(s)': 365}

    updated_ages = []
    for age, age_unit in zip(food_df['victim_age'], food_df['victim_age_unit']):
        if age_unit in divisor_by_unit:
            updated_ages.append(age / divisor_by_unit[age_unit])
        elif age_unit == 'Decade(s)':
            updated_ages.append(age * 10)
        else:
            # 'Year(s)' and any unrecognised or missing unit: keep the age as recorded.
            updated_ages.append(age)
    return updated_ages

In [44]:
test = convert_age_to_years(full_df)

In [45]:
test_df = full_df['victim_age']
test_unit = full_df['victim_age_unit']

In [46]:
test_zip = list(zip(test, test_df, test_unit))
test_zip

[(2.0, 2.0, 'Year(s)'),
 (2.0, 2.0, 'Year(s)'),
 (10.0, 10.0, 'Year(s)'),
 (51.0, 51.0, 'Year(s)'),
 (45.0, 45.0, 'Year(s)'),
 (54.0, 54.0, 'Year(s)'),
 (36.0, 36.0, 'Year(s)'),
 (33.0, 33.0, 'Year(s)'),
 (33.0, 33.0, 'Year(s)'),
 (33.0, 33.0, 'Year(s)'),
 (33.0, 33.0, 'Year(s)'),
 (33.0, 33.0, 'Year(s)'),
 (33.0, 33.0, 'Year(s)'),
 (33.0, 33.0, 'Year(s)'),
 (33.0, 33.0, 'Year(s)'),
 (33.0, 33.0, 'Year(s)'),
 (33.0, 33.0, 'Year(s)'),
 (14.0, 14.0, 'Year(s)'),
 (14.0, 14.0, 'Year(s)'),
 (14.0, 14.0, 'Year(s)'),
 (59.0, 59.0, 'Year(s)'),
 (43.0, 43.0, 'Year(s)'),
 (51.0, 51.0, 'Year(s)'),
 (35.0, 35.0, 'Year(s)'),
 (19.0, 19.0, 'Year(s)'),
 (4.0, 4.0, 'Year(s)'),
 (73.0, 73.0, 'Year(s)'),
 (10.0, 10.0, 'Year(s)'),
 (65.0, 65.0, 'Year(s)'),
 (2.0, 2.0, 'Year(s)'),
 (78.0, 78.0, 'Year(s)'),
 (66.0, 66.0, 'Year(s)'),
 (28.0, 28.0, 'Year(s)'),
 (13.0, 13.0, 'Year(s)'),
 (25.0, 25.0, 'Year(s)'),
 (4.0, 4.0, 'Year(s)'),
 (3.0, 3.0, 'Year(s)'),
 (70.0, 70.0, 'Year(s)'),
 (70.0, 70.0, 'Year(s)')

In [48]:
age_in_years = convert_age_to_years(full_df)

In [49]:
full_df['age_in_years'] = age_in_years

In [50]:
full_df.head(5)

Unnamed: 0,index,report_id,report_date,event_date,product_role,product_name,industry_code,industry_name,victim_age,victim_age_unit,...,disability,hospitalization,life_threatening,other_serious__important_medical_events_,req_intervention_to_prvnt_perm_imprmnt,serious_injuries_illness,visited_a_health_care_provider,visited_an_er,worst_outcome_code,age_in_years
0,0,65325,1/1/2004,8/4/2003,Suspect,MIDWEST COUNTRY FAIR CHOCOLATE FLAVORED CHIPS,3,Bakery Prod/Dough/Mix/Icing,2.0,Year(s),...,0,1,0,0,1,0,1,1,6,2.0
1,1,65325,1/1/2004,8/4/2003,Suspect,MIDWEST COUNTRY FAIR CHOCOLATE FLAVORED CHIPS,3,Bakery Prod/Dough/Mix/Icing,2.0,Year(s),...,0,1,0,0,1,0,1,1,6,2.0
2,2,65345,1/1/2004,12/21/2003,Suspect,"FRITO LAY FUNYUNS ONION FLAVOR, ONION RINGS",7,Snack Food Item,10.0,Year(s),...,0,0,0,0,0,1,0,0,2,10.0
3,3,65399,1/5/2004,11/22/2003,Suspect,METOBOLITE 356,54,Vit/Min/Prot/Unconv Diet(Human/Animal),51.0,Year(s),...,0,0,0,0,0,0,0,0,0,51.0
4,4,65400,1/5/2004,9/5/2001,Suspect,METABOLIFE,54,Vit/Min/Prot/Unconv Diet(Human/Animal),45.0,Year(s),...,0,0,0,0,0,0,0,0,0,45.0


In [51]:
full_df.shape

(33979, 39)

# Convert the product name categorical features into codes

In [52]:
# Make a categorized version of the product name 
full_df['product_name_cat'] = full_df['product_name'].astype('category')

In [53]:
full_df['product_name_codes'] = full_df['product_name_cat'].cat.codes

In [54]:
product_name_mapping = full_df['product_name_cat'].cat.categories

#### Since there are so many unique product names, it might not be useful to use this column for modeling. Instead, I will probably just use the industry code.

In [55]:
# Save the product-name categories so the integer codes in
# 'product_name_codes' can be translated back to names later.
with open('product_name_mapping.pkl', 'wb') as mapping_file:
    pickle.dump(product_name_mapping, mapping_file)

# Convert the victim gender column to codes

In [56]:
full_df['victim_gender_cat'] = full_df['victim_gender'].astype('category')

In [57]:
# Each code is the position of the value in the category list (see the mapping
# shown in the next cell). pandas assigns -1 to missing values.
# NOTE(review): confirm no -1 codes remain before modeling.
full_df['victim_gender_codes'] = full_df['victim_gender_cat'].cat.codes

In [58]:
victim_gender_mapping = full_df['victim_gender_cat'].cat.categories
victim_gender_mapping

Index(['Female', 'Male'], dtype='object')

In [59]:
# Save the gender categories so the integer codes in
# 'victim_gender_codes' can be translated back to labels later.
with open('victim_gender_mapping.pkl', 'wb') as mapping_file:
    pickle.dump(victim_gender_mapping, mapping_file)

In [60]:
full_df.columns

Index(['index', 'report_id', 'report_date', 'event_date', 'product_role',
       'product_name', 'industry_code', 'industry_name', 'victim_age',
       'victim_age_unit', 'victim_gender', 'outcomes', 'symptoms', 'diarrhea',
       'mood_swing', 'renal_function', 'upper_respiratory_tract_infection',
       'prothrombin_time', 'bleeding', 'nasal_congestion', 'drug_overdose',
       'angina', 'dysbiosis', 'overactive_bladder', 'suicide_terminology',
       'epileptic_seizure', 'nephrotoxicity', 'congenital_anomaly', 'death',
       'disability', 'hospitalization', 'life_threatening',
       'other_serious__important_medical_events_',
       'req_intervention_to_prvnt_perm_imprmnt', 'serious_injuries_illness',
       'visited_a_health_care_provider', 'visited_an_er', 'worst_outcome_code',
       'age_in_years', 'product_name_cat', 'product_name_codes',
       'victim_gender_cat', 'victim_gender_codes'],
      dtype='object')

# Upload the full df to SQL

In [61]:
# Read the connection URL from the environment so the database user, password
# and host are not stored in the notebook. Set it before starting Jupyter, e.g.
#   export REACTIONS_DB_URL='postgresql://<user>:<password>@<host>:5432/reactions'
# A missing variable raises KeyError here rather than failing later on upload.
engine = create_engine(os.environ['REACTIONS_DB_URL'], echo=False)

In [62]:
# if_exists='replace' drops any existing 'full_df' table and writes this frame
# in its place; index=False leaves the DataFrame index out of the table.
full_df.to_sql(name='full_df', con=engine, if_exists = 'replace', index=False)

# Create a new dataframe specifically for modeling

In [63]:
food_df_modeling = full_df.copy()

In [64]:
food_df_modeling.columns

Index(['index', 'report_id', 'report_date', 'event_date', 'product_role',
       'product_name', 'industry_code', 'industry_name', 'victim_age',
       'victim_age_unit', 'victim_gender', 'outcomes', 'symptoms', 'diarrhea',
       'mood_swing', 'renal_function', 'upper_respiratory_tract_infection',
       'prothrombin_time', 'bleeding', 'nasal_congestion', 'drug_overdose',
       'angina', 'dysbiosis', 'overactive_bladder', 'suicide_terminology',
       'epileptic_seizure', 'nephrotoxicity', 'congenital_anomaly', 'death',
       'disability', 'hospitalization', 'life_threatening',
       'other_serious__important_medical_events_',
       'req_intervention_to_prvnt_perm_imprmnt', 'serious_injuries_illness',
       'visited_a_health_care_provider', 'visited_an_er', 'worst_outcome_code',
       'age_in_years', 'product_name_cat', 'product_name_codes',
       'victim_gender_cat', 'victim_gender_codes'],
      dtype='object')

In [65]:
# Columns removed from the modeling frame, and why:
#   - report_id, report_date, event_date, product_role: report metadata that is
#     already known by the time the report is drafted, so it is not used as a
#     predictor.
#   - industry_name: redundant, because industry_code is kept.
#   - outcomes, symptoms: raw text columns that have already been vectorized.
#   - individual outcome flags (congenital_anomaly ... visited_an_er):
#     summarised by worst_outcome_code, which is kept as the target.
#   - product_name, victim_gender and their *_cat versions: the integer
#     *_codes columns are kept instead.
#   - victim_age, victim_age_unit: replaced by age_in_years.
columns_to_drop = [
    # report metadata
    'report_id', 'report_date', 'event_date', 'product_role',
    # superseded by coded / converted columns
    'product_name', 'industry_name', 'victim_age', 'victim_age_unit',
    'victim_gender', 'product_name_cat', 'victim_gender_cat',
    # raw text that has already been vectorized
    'outcomes', 'symptoms',
    # individual outcome flags
    'congenital_anomaly', 'death', 'disability', 'hospitalization',
    'life_threatening', 'other_serious__important_medical_events_',
    'req_intervention_to_prvnt_perm_imprmnt', 'serious_injuries_illness',
    'visited_a_health_care_provider', 'visited_an_er',
]

# Rebind instead of mutating in place so the cell reads as a plain
# "input frame -> output frame" step.
food_df_modeling = food_df_modeling.drop(columns=columns_to_drop)

In [66]:
food_df_modeling.head(5)

Unnamed: 0,index,industry_code,diarrhea,mood_swing,renal_function,upper_respiratory_tract_infection,prothrombin_time,bleeding,nasal_congestion,drug_overdose,angina,dysbiosis,overactive_bladder,suicide_terminology,epileptic_seizure,nephrotoxicity,worst_outcome_code,age_in_years,product_name_codes,victim_gender_codes
0,0,3,0,0,0,2,0,0,0,0,0,4,0,0,0,0,6,2.0,10655,0
1,1,3,0,0,0,2,0,0,0,0,0,4,0,0,0,0,6,2.0,10655,0
2,2,7,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,10.0,6037,1
3,3,54,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,51.0,10627,1
4,4,54,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,45.0,10549,0


In [67]:
food_df_modeling.columns # The index column will be removed during test train splitting

Index(['index', 'industry_code', 'diarrhea', 'mood_swing', 'renal_function',
       'upper_respiratory_tract_infection', 'prothrombin_time', 'bleeding',
       'nasal_congestion', 'drug_overdose', 'angina', 'dysbiosis',
       'overactive_bladder', 'suicide_terminology', 'epileptic_seizure',
       'nephrotoxicity', 'worst_outcome_code', 'age_in_years',
       'product_name_codes', 'victim_gender_codes'],
      dtype='object')

# Upload modeling dataframe to SQL

In [68]:
food_df_modeling.to_sql(name='modeling_df', con=engine, if_exists = 'replace', index=False)

# Summary

### What I did
1. Dropped rows that had a non or none outcome (encoded as -9 in previous step)
2. Converted all victim ages to years so that victim age is a proper continuous feature
3. Converted the victim gender to codes (0 for female and 1 for male)
4. Created a new dataframe for modeling

### What I will do next
1. Conduct the test train split for final modeling purposes