###### Project: Adverse Medical Outcomes Prediction 
##### Data Scientist: Victoria M. Ng 

# Import Libraries

In [2]:
# General system libraries
import os
import sys
from IPython.display import Image, Markdown
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dataframe libraries
import pandas as pd
from pandas import DataFrame, read_csv

# Number manipulation
import scipy.sparse
from scipy.ndimage.filters import generic_filter
import patsy
import numpy as np

# Plotting libaries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

# Data type libaries
from datetime import datetime as dt

# File manipulation
import pickle
import pandas.io.sql as pd_sql
import psycopg2 as pg

# NLP libraries
import wikipedia as wiki
from nltk import word_tokenize, sent_tokenize,FreqDist
from nltk.corpus import stopwords
import gensim as gn
from gensim import corpora, models, similarities
from collections import defaultdict
from six import iteritems
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS

# Scraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
chromedriver = "/home/victoria/projects/metis/Project3/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

# Stats libaries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model, metrics
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


# Other libaries
import geopy

  """)
2018-05-11 07:02:53,015 : INFO : 'pattern' package not found; tag filters are not available for English
  from pandas.core import datetools


# Import all subsets of the corpus

In [3]:
with open('/Corpus/full_corpus_200_fixed.pkl', 'rb') as picklefile: 
    full_corpus_200_fixed = pickle.load(picklefile)

In [4]:
with open('/Corpus/full_corpus_400_fixed.pkl', 'rb') as picklefile: 
    full_corpus_400_fixed = pickle.load(picklefile)

In [5]:
with open('/Corpus/full_corpus_600_fixed.pkl', 'rb') as picklefile: 
    full_corpus_600_fixed = pickle.load(picklefile)

In [6]:
with open('/Corpus/full_corpus_800_fixed.pkl', 'rb') as picklefile: 
    full_corpus_800_fixed = pickle.load(picklefile)

In [7]:
with open('/Corpus/full_corpus_1000_fixed.pkl', 'rb') as picklefile: 
    full_corpus_1000_fixed = pickle.load(picklefile)

In [8]:
with open('/Corpus/full_corpus_1500_fixed.pkl', 'rb') as picklefile: 
    full_corpus_1500_fixed = pickle.load(picklefile)

In [9]:
with open('/Corpus/full_corpus_2000_fixed.pkl', 'rb') as picklefile: 
    full_corpus_2000_fixed = pickle.load(picklefile)

In [10]:
with open('/Corpus/full_corpus_2500_fixed.pkl', 'rb') as picklefile: 
    full_corpus_2500_fixed = pickle.load(picklefile)

In [11]:
with open('/Corpus/full_corpus_3000_fixed_again.pkl', 'rb') as picklefile: 
    full_corpus_3000_fixed_again = pickle.load(picklefile)

In [12]:
with open('/Corpus/full_corpus_remainder.pkl', 'rb') as picklefile: 
    full_corpus_remainder = pickle.load(picklefile)

# Concatenate all subsets of the corpus into one full corpus

In [13]:
wiki_symptom_full_corpus = full_corpus_200_fixed + full_corpus_400_fixed + full_corpus_600_fixed + full_corpus_800_fixed + full_corpus_1000_fixed + full_corpus_1500_fixed + full_corpus_2000_fixed + full_corpus_2500_fixed + full_corpus_3000_fixed_again + full_corpus_remainder

In [14]:
len(wiki_symptom_full_corpus)

3004

In [15]:
with open('wiki_symptom_full_corpus.pkl', 'wb') as picklefile:
        pickle.dump(wiki_symptom_full_corpus, picklefile)

# Vectorize the full wiki corpus

In [16]:
stop_words = list(STOP_WORDS)
stop_words.append('a')

In [17]:
cv_wiki_symptom_full_corpus = CountVectorizer(lowercase=True, stop_words=stop_words)

In [18]:
cv_wiki_symptom_full_corpus.fit(wiki_symptom_full_corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['even', 'between', 'otherwise', 'somehow', 'keep', 'hereupon', 'become', 'being', 'please', 'former', 'they', 'within', 'quite', 'say', 'was', 'sixty', 'fifty', 'perhaps', 'further', 'his', 'all', 'too', 'whereafter', 'last', 'ourselves', 'a', 'by', 'how', 'would', 'as', 'together', 'bec...will', 'rather', 'unless', 'next', 'never', 'except', 'besides', 'much', 'per', 'before', 'of', 'a'],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [19]:
wiki_symptom_full_corpus_dict = cv_wiki_symptom_full_corpus.vocabulary_
len(wiki_symptom_full_corpus_dict)

139329

In [20]:
wiki_symptom_full_corpus_dict.get('feces', 'Nope')

45612

In [21]:
wiki_symptom_full_corpus_vect = cv_wiki_symptom_full_corpus.transform(wiki_symptom_full_corpus)

# Apply PCA to reduce the number of columns( words) that will become the bins (categories ) for my KNN model

In [22]:
pca_20_wiki_symptom_full_corpus = PCA(n_components = 20)

In [23]:
pca_20_wiki_symptom_full_corpus.fit(wiki_symptom_full_corpus_vect.todense())

PCA(copy=True, iterated_power='auto', n_components=20, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [24]:
# matrix with 20 most ~'common' combinations of words
wiki_symptom_full_corpus_20_dim_matrix = pca_20_wiki_symptom_full_corpus.transform(wiki_symptom_full_corpus_vect.todense())

In [25]:
type(wiki_symptom_full_corpus_20_dim_matrix)

numpy.ndarray

In [26]:
wiki_symptom_full_corpus_20_dim_matrix.shape

(3004, 20)

#### Look at interesting attributes of the KNN model class

In [58]:
print(pca_20_wiki_symptom_full_corpus.explained_variance_ratio_)

[0.06438179 0.0311082  0.02742774 0.02549617 0.02085938 0.01899499
 0.01328094 0.01229413 0.01068638 0.01011975 0.00972889 0.00919736
 0.00884447 0.0081622  0.0079821  0.00793626 0.00765372 0.00740921
 0.00721613 0.00702847]


In [59]:
print(pca_20_wiki_symptom_full_corpus.singular_values_)

[1501.3102963  1043.58157987  979.90492598  944.77073001  854.55433267
  815.4708309   681.87284181  656.05122916  611.65176393  595.21491583
  583.60726721  567.4406782   556.44821252  534.55517032  528.6248833
  527.10479107  517.63683446  509.30145278  502.62158321  496.04295962]


In [60]:
print(pca_20_wiki_symptom_full_corpus.components_)

[[ 4.05397482e-04 -7.73336821e-06  9.69304006e-05 ...  9.47439348e-05
   7.28799498e-06  7.28799498e-06]
 [ 8.98259809e-04 -1.65733056e-05  1.97098597e-05 ... -4.90547406e-05
  -3.77344159e-06 -3.77344159e-06]
 [ 1.76587484e-04 -3.44689238e-05 -4.75878317e-05 ... -1.34368140e-04
  -1.03360107e-05 -1.03360107e-05]
 ...
 [-1.12422923e-04  9.65011374e-05  1.98605043e-04 ...  3.45732052e-04
   2.65947732e-05  2.65947732e-05]
 [ 6.27931967e-05 -2.28074793e-04 -2.04487902e-04 ...  3.24378432e-04
   2.49521870e-05  2.49521870e-05]
 [ 3.61698582e-04  3.08833923e-05  8.59353467e-05 ... -6.16059203e-04
  -4.73891695e-05 -4.73891695e-05]]


## Test how you're going to get your test train split

In [27]:
wiki_symptom_full_corpus_20_dim_matrix

array([[-17.91325639,  -2.35474632,  -0.12414797, ...,   0.97728442,
          1.441557  ,   0.46093553],
       [-19.19324951,  -3.60991213,  -0.57976562, ...,   3.29253031,
         -1.24793116,   0.46976203],
       [-18.63415974,  -2.11638159,   1.01150961, ...,   0.57562599,
         -0.93591457,   0.15758927],
       ...,
       [-15.80131469,   3.85749768,   2.49505228, ...,  -1.07114407,
         -1.91754479,   1.33740113],
       [  1.19297458,  -9.78163688,   5.20028238, ...,  -5.12938109,
         -2.34153901,  -7.61357924],
       [ 32.30433105, -10.47030214, -23.96821249, ...,   1.43799505,
          0.88625508, -16.34231153]])

In [28]:
type(wiki_symptom_full_corpus_20_dim_matrix[1])

numpy.ndarray

In [29]:
test_a = wiki_symptom_full_corpus_20_dim_matrix[1]
test_b = wiki_symptom_full_corpus_20_dim_matrix[2]
test_final = np.array([test_a, test_b])
test_final

array([[-1.91932495e+01, -3.60991213e+00, -5.79765617e-01,
         1.93507417e+00, -1.22329064e+00,  3.19618122e+00,
        -6.26147046e-01, -7.76237818e-04, -1.62891795e+00,
         1.74167825e+00,  5.50904117e-01, -2.53897081e+00,
         2.56192110e+00, -1.60691884e+00, -1.66295676e-01,
         2.31988715e+00, -1.96352069e+00,  3.29253031e+00,
        -1.24793116e+00,  4.69762028e-01],
       [-1.86341597e+01, -2.11638159e+00,  1.01150961e+00,
         1.30934891e+00, -1.51075949e+00,  6.02667167e+00,
         7.45225390e-01,  4.99584925e-02, -1.29999190e+00,
         2.01529449e+00,  1.64704970e-01, -7.03651107e-01,
        -1.12247023e+00, -3.70048853e-01, -9.00026923e-01,
         1.09761547e+00,  1.42795547e-01,  5.75625990e-01,
        -9.35914571e-01,  1.57589269e-01]])

In [30]:
type(test_final)

numpy.ndarray

# Create the test and train corpus matrices

Below are the bins I want to categorize my symptoms into and their corresponding index in the unique symptoms list. I tried to get a combination of both severe symptoms and common symptoms. I also tried to find symptoms that were generalizable but not too much that there can be both severe and common reactions mapped to the same symptom.
1. Diarrhea (69)
2. Mood_swing (104)
3. Renal function (859)
4. Upper respiratory tract infection (904)
5. Prothrombin time (1013) 
6. Bleeding (1141)
7. Nasal congestion (1261)
8. Drug overdose (1297)
9. Angina (1341)
10. Dysbiosis (1873)
11. Overactive bladder (2359)
12. Suicide terminology (2695)
13. Epileptic seizure (2843)
14. Nephrotoxicity (2953)

In [31]:
Diarrhea = wiki_symptom_full_corpus_20_dim_matrix[69]
Mood_swing = wiki_symptom_full_corpus_20_dim_matrix[104]
Renal_function = wiki_symptom_full_corpus_20_dim_matrix[859]
Upper_respiratory_tract_infection = wiki_symptom_full_corpus_20_dim_matrix[904]
Prothrombin_time = wiki_symptom_full_corpus_20_dim_matrix[1013]
Bleeding = wiki_symptom_full_corpus_20_dim_matrix[1141]
Nasal_congestion = wiki_symptom_full_corpus_20_dim_matrix[1261]
Drug_overdose = wiki_symptom_full_corpus_20_dim_matrix[1297]
Angina = wiki_symptom_full_corpus_20_dim_matrix[1341]
Dysbiosis = wiki_symptom_full_corpus_20_dim_matrix[1873]
Overactive_bladder = wiki_symptom_full_corpus_20_dim_matrix[2359]
Suicide_terminology = wiki_symptom_full_corpus_20_dim_matrix[2695]
Epileptic_seizure = wiki_symptom_full_corpus_20_dim_matrix[2843]
Nephrotoxicity = wiki_symptom_full_corpus_20_dim_matrix[2953]

In [32]:
target_indices = [69, 104, 859, 904, 1013, 1141, 1261, 1297, 1341, 1873, 2359, 2695, 2843, 2953]
len(target_indices)

14

In [41]:
mask = np.zeros(len(wiki_symptom_full_corpus_20_dim_matrix), dtype=bool)

In [46]:
mask[0]

False

In [47]:
mask[target_indices] = 1

In [50]:
X_train = wiki_symptom_full_corpus_20_dim_matrix[target_indices]
y_train = [x for x in range(len(target_indices))]

X_test = wiki_symptom_full_corpus_20_dim_matrix[~mask]

In [51]:
print(X_train.shape)
print(len(y_train))
print(X_test.shape)

(14, 20)
14
(2990, 20)


# Pickle the test train split for feature engineering purposes

In [54]:
with open('wiki_x_train.pkl', 'wb') as picklefile:
        pickle.dump(X_train, picklefile)

In [57]:
with open('wiki_y_train.pkl', 'wb') as picklefile:
        pickle.dump(y_train, picklefile)

In [56]:
with open('wiki_x_test.pkl', 'wb') as picklefile:
        pickle.dump(X_test, picklefile)

# Summary

### What I did
1. Concatenated all subsets of the Wikipedia corpus into one full corpus
2. Vectorized the corpus
3. Applied PCA to reduce the dimensionality of my matrix
4. Created the test and train matrices

### What I will do next
1. Apply a KNN N=1 model to the reduced dimension Wikipedia corpus matrix to categorize all symptoms for purposes of making them my features in my final model