###### Project: Adverse Medical Outcomes Prediction 
##### Data Scientist: Victoria M. Ng 

# Import Libraries

In [28]:
# General system libraries
import os
import sys
from IPython.display import Image, Markdown
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dataframe libraries
import pandas as pd
from pandas import DataFrame, read_csv

# Number manipulation
import scipy.sparse
# Import from the public scipy.ndimage namespace; the `scipy.ndimage.filters`
# submodule path is deprecated in recent SciPy releases.
from scipy.ndimage import generic_filter
import patsy
import numpy as np

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

# Data type libraries
from datetime import datetime as dt

# File manipulation
import pickle
import pandas.io.sql as pd_sql
from sqlalchemy import create_engine
import psycopg2 as pg

# NLP libraries
import wikipedia as wiki
from nltk import word_tokenize, sent_tokenize,FreqDist
from nltk.corpus import stopwords
import gensim as gn
from gensim import corpora, models, similarities
from collections import defaultdict
from six import iteritems
# `sklearn.feature_extraction.stop_words` was removed from scikit-learn; the same
# frozenset is exported by `sklearn.feature_extraction.text`.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS

# Scraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
# Location of the chromedriver binary. Set CHROMEDRIVER_PATH to override the
# default so the notebook is not tied to one machine's directory layout.
chromedriver = os.environ.get(
    "CHROMEDRIVER_PATH",
    "/home/victoria/projects/metis/Project3/chromedriver",
)
# Publish the resolved path under the key the original notebook used.
os.environ["webdriver.chrome.driver"] = chromedriver

# Stats libraries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model, metrics
# `sklearn.cross_validation` was removed from scikit-learn; these helpers live in
# `sklearn.model_selection`, which this notebook already imports from.
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


# Other libraries
import geopy

# Query modeling dataframe

In [29]:
# SQLAlchemy engine for the `reactions` PostgreSQL database.
# Set REACTIONS_DB_URL in the environment to supply the connection URL (and its
# credentials) without editing the notebook.
# NOTE(review): the fallback keeps the original hardcoded URL, password included,
# so existing runs still work; rotate that password and remove the fallback.
engine = create_engine(
    os.environ.get(
        'REACTIONS_DB_URL',
        'postgresql://ubuntu:password@52.14.207.9:5432/reactions',
    ),
    echo=False,
)

In [30]:
# Pull every row and column of the `modeling_df` table into a DataFrame.
modeling_df = pd.read_sql(sql='SELECT * FROM modeling_df', con=engine)

# Conduct Test Train Split with all features

In [31]:
# List the column labels of the loaded frame.
modeling_df.columns

Index(['index', 'industry_code', 'diarrhea', 'mood_swing', 'renal_function',
       'upper_respiratory_tract_infection', 'prothrombin_time', 'bleeding',
       'nasal_congestion', 'drug_overdose', 'angina', 'dysbiosis',
       'overactive_bladder', 'suicide_terminology', 'epileptic_seizure',
       'nephrotoxicity', 'worst_outcome_code', 'age_in_years',
       'product_name_codes', 'victim_gender_codes'],
      dtype='object')

In [32]:
# (rows, columns) of the frame as loaded from the database.
modeling_df.shape

(33979, 20)

In [33]:
# Preview the first five rows (head() defaults to n=5).
modeling_df.head()

Unnamed: 0,index,industry_code,diarrhea,mood_swing,renal_function,upper_respiratory_tract_infection,prothrombin_time,bleeding,nasal_congestion,drug_overdose,angina,dysbiosis,overactive_bladder,suicide_terminology,epileptic_seizure,nephrotoxicity,worst_outcome_code,age_in_years,product_name_codes,victim_gender_codes
0,0,3,0,0,0,2,0,0,0,0,0,4,0,0,0,0,6,2.0,10655,0
1,1,3,0,0,0,2,0,0,0,0,0,4,0,0,0,0,6,2.0,10655,0
2,2,7,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,10.0,6037,1
3,3,54,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,51.0,10627,1
4,4,54,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,45.0,10549,0


In [35]:
# Inspect positions 825-834 of the stored 'index' column. In the saved output the
# values jump from 829 to 833, so this column is not a contiguous row counter.
modeling_df['index'].iloc[825:835]

825    825
826    826
827    827
828    828
829    829
830    833
831    834
832    835
833    836
834    837
Name: index, dtype: int64

# Drop index column (it was created after resetting index)

In [37]:
# Remove the leftover 'index' column.
# Rebind rather than mutate in place so re-running this cell is harmless, and
# ignore a missing column: the table is later written back to the database
# without 'index', so a fresh load of `modeling_df` will no longer contain it.
modeling_df = modeling_df.drop(columns=['index'], errors='ignore')

In [38]:
# Confirm the column count after dropping 'index'.
modeling_df.shape

(33979, 19)

# Upload new modeling df to sql

In [59]:
# SQLAlchemy engine used for the upload to the `reactions` PostgreSQL database.
# Set REACTIONS_DB_URL in the environment to supply the connection URL (and its
# credentials) without editing the notebook.
# NOTE(review): the fallback keeps the original hardcoded URL, password included,
# so existing runs still work; rotate that password and remove the fallback.
engine = create_engine(
    os.environ.get(
        'REACTIONS_DB_URL',
        'postgresql://ubuntu:password@52.14.207.9:5432/reactions',
    ),
    echo=False,
)

In [60]:
# Write the trimmed frame back to PostgreSQL.
# if_exists='replace' overwrites the existing `modeling_df` table, and
# index=False keeps the DataFrame index from being stored as a column.
modeling_df.to_sql(
    'modeling_df',
    engine,
    index=False,
    if_exists='replace',
)

In [39]:
# Features are every column except the target; the target is the outcome code.
X = modeling_df.drop(columns=['worst_outcome_code'])
y = modeling_df['worst_outcome_code']

# 70/30 split with a fixed seed, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=30,
    stratify=y,
)

In [40]:
# Persist the unsplit feature frame.
with open('X.pkl', 'wb') as fh:
    pickle.dump(X, fh)

In [41]:
# Persist the unsplit target series.
with open('y.pkl', 'wb') as fh:
    pickle.dump(y, fh)

## Scale the test train split with all features

In [42]:
# Standardize the features with a scaler fitted on the training rows only.
# fit() learns each column's mean and standard deviation from X_train.
ssX = StandardScaler().fit(X_train)

# Apply (x - mean) / std to the training rows using those learned statistics.
X_train = ssX.transform(X_train)

# The held-out rows are only transformed, never fitted, so the test set does not
# influence the scaling parameters.
X_test = ssX.transform(X_test)

## Review the test and train sets with all features

In [43]:
# Print the shape of each split, one per line, as a sanity check.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(23785, 18)
(10194, 18)
(23785,)
(10194,)


# Conduct test train split without product name code

I realized that the product name code column has thousands of unique values. That is probably too many to include individually as features, so I will also create a separate train/test split that excludes the product name codes.

In [44]:
# Copy of the modeling frame without the high-cardinality product code column.
modeling_df_noproduct = modeling_df.drop('product_name_codes', axis=1)

In [45]:
# Features are every remaining column except the target.
X_noproduct = modeling_df_noproduct.drop(columns=['worst_outcome_code'])
y_noproduct = modeling_df_noproduct['worst_outcome_code']

# 70/30 split with a fixed seed. Stratify on this split's own target rather than
# on `y` from the all-features split, so the cell does not depend on a variable
# created elsewhere in the notebook.
X_train_noproduct, X_test_noproduct, y_train_noproduct, y_test_noproduct = train_test_split(
    X_noproduct,
    y_noproduct,
    test_size=0.3,
    random_state=30,
    stratify=y_noproduct,
)

In [57]:
# Persist the unsplit feature frame that excludes product codes.
with open('X_noproduct.pkl', 'wb') as fh:
    pickle.dump(X_noproduct, fh)

In [58]:
# Persist the target series paired with the no-product feature frame.
with open('y_noproduct.pkl', 'wb') as fh:
    pickle.dump(y_noproduct, fh)

In [46]:
# (rows, columns) of the feature frame without product codes.
X_noproduct.shape

(33979, 17)

In [61]:
# Column labels of the no-product feature frame, as a plain Python list.
features_noproduct_list = X_noproduct.columns.tolist()

In [63]:
# Persist the feature-name list.
with open('features_noproduct_list.pkl', 'wb') as fh:
    pickle.dump(features_noproduct_list, fh)

In [47]:
# Standardize the no-product features with a scaler fitted on the training rows.
# fit() learns each column's mean and standard deviation from the training split.
ssX_noproduct = StandardScaler().fit(X_train_noproduct)

# Apply (x - mean) / std to the training rows using those learned statistics.
X_train_noproduct = ssX_noproduct.transform(X_train_noproduct)

# The held-out rows are only transformed, never fitted, so the test set does not
# influence the scaling parameters.
X_test_noproduct = ssX_noproduct.transform(X_test_noproduct)

In [48]:
# Print the shape of each no-product split, one per line, as a sanity check.
print(
    X_train_noproduct.shape,
    X_test_noproduct.shape,
    y_train_noproduct.shape,
    y_test_noproduct.shape,
    sep='\n',
)

(23785, 17)
(10194, 17)
(23785,)
(10194,)


# Pickle the test and train sets with all features

In [49]:
# Persist the scaled training features (all features).
with open('X_train.pkl', 'wb') as fh:
    pickle.dump(X_train, fh)

In [50]:
# Persist the scaled test features (all features).
with open('X_test.pkl', 'wb') as fh:
    pickle.dump(X_test, fh)

In [51]:
# Persist the training targets (all-features split).
with open('y_train.pkl', 'wb') as fh:
    pickle.dump(y_train, fh)

In [52]:
# Persist the test targets (all-features split).
with open('y_test.pkl', 'wb') as fh:
    pickle.dump(y_test, fh)

# Pickle the test and train sets without product names feature

In [53]:
# Persist the scaled training features (product codes excluded).
with open('X_train_noproduct.pkl', 'wb') as fh:
    pickle.dump(X_train_noproduct, fh)

In [54]:
# Persist the scaled test features (product codes excluded).
with open('X_test_noproduct.pkl', 'wb') as fh:
    pickle.dump(X_test_noproduct, fh)

In [55]:
# Persist the training targets (no-product split).
with open('y_train_noproduct.pkl', 'wb') as fh:
    pickle.dump(y_train_noproduct, fh)

In [56]:
# Persist the test targets (no-product split).
with open('y_test_noproduct.pkl', 'wb') as fh:
    pickle.dump(y_test_noproduct, fh)

# Summary

### What I did
1. Conducted a test train split on the full modeling df
2. Conducted a separate test train split on all data minus product codes

### What I will do next
1. Modeling using the decision tree classification model