###### Project: Adverse Medical Outcomes Prediction 
##### Data Scientist: Victoria M. Ng 

# Import libraries

In [3]:
# General system libraries
import os
import sys
from IPython.display import Image, Markdown
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dataframe libraries
import pandas as pd
from pandas import DataFrame, read_csv

# Number manipulation
import scipy.sparse
from scipy.ndimage.filters import generic_filter
import patsy
import numpy as np

# Plotting libaries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

# Data type libaries
from datetime import datetime as dt

# File manipulation
import pickle
import pandas.io.sql as pd_sql
from sqlalchemy import create_engine
import psycopg2 as pg

# NLP libraries
import wikipedia as wiki
from nltk import word_tokenize, sent_tokenize,FreqDist
from nltk.corpus import stopwords
import gensim as gn
from gensim import corpora, models, similarities
from collections import defaultdict
from six import iteritems
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS

# Scraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
chromedriver = "/home/victoria/projects/metis/Project3/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

# Stats libaries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model, metrics
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


# Other libaries
import geopy

  """)
2018-05-16 16:09:15,053 : INFO : 'pattern' package not found; tag filters are not available for English
  from pandas.core import datetools


# Query cleaned food df from SQL

In [2]:
engine = create_engine('postgresql://ubuntu:password@52.14.207.9:5432/reactions', echo=False)

In [3]:
food_df = pd.read_sql('SELECT * FROM food_df_cleaned', engine)

In [4]:
food_df_copy = food_df.copy()

In [5]:
food_df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34000 entries, 0 to 33999
Data columns (total 12 columns):
report_id          34000 non-null int64
report_date        34000 non-null object
event_date         34000 non-null object
product_role       34000 non-null object
product_name       34000 non-null object
industry_code      34000 non-null int64
industry_name      34000 non-null object
victim_age         34000 non-null float64
victim_age_unit    34000 non-null object
victim_gender      34000 non-null object
outcomes           34000 non-null object
symptoms           34000 non-null object
dtypes: float64(1), int64(2), object(9)
memory usage: 3.1+ MB


In [6]:
food_df_copy.head(5)

Unnamed: 0,report_id,report_date,event_date,product_role,product_name,industry_code,industry_name,victim_age,victim_age_unit,victim_gender,outcomes,symptoms
0,65325,1/1/2004,8/4/2003,Suspect,MIDWEST COUNTRY FAIR CHOCOLATE FLAVORED CHIPS,3,Bakery Prod/Dough/Mix/Icing,2.0,Year(s),Female,"VISITED AN ER, VISITED A HEALTH CARE PROVIDER,...","SWELLING FACE, RASH, WHEEZING, COUGH, HOSPITAL..."
1,65325,1/1/2004,8/4/2003,Suspect,MIDWEST COUNTRY FAIR CHOCOLATE FLAVORED CHIPS,3,Bakery Prod/Dough/Mix/Icing,2.0,Year(s),Female,"VISITED AN ER, VISITED A HEALTH CARE PROVIDER,...","SWELLING FACE, WHEEZING, COUGH, RASH, HOSPITAL..."
2,65345,1/1/2004,12/21/2003,Suspect,"FRITO LAY FUNYUNS ONION FLAVOR, ONION RINGS",7,Snack Food Item,10.0,Year(s),Male,NON-SERIOUS INJURIES/ ILLNESS,CHOKING
3,65399,1/5/2004,11/22/2003,Suspect,METOBOLITE 356,54,Vit/Min/Prot/Unconv Diet(Human/Animal),51.0,Year(s),Male,DEATH,"COMPLETED SUICIDE, STRESS SYMPTOMS, DEATH"
4,65400,1/5/2004,9/5/2001,Suspect,METABOLIFE,54,Vit/Min/Prot/Unconv Diet(Human/Animal),45.0,Year(s),Female,DEATH,"DEATH, MITRAL VALVE INCOMPETENCE"


# Create a separate dataframe with just the report id and symptoms 

In [28]:
symptoms_df = food_df_copy[['report_id', 'symptoms']].copy()
symptoms_df.head(5)
len(symptoms_df)

34000

# Import symptom lookup table

In [5]:
with open('symptoms_bin_dict.pkl', 'rb') as picklefile: 
    symptoms_bin_dict = pickle.load(picklefile)

# Import symptoms bin dict

In [None]:
with open('lookup_symptoms_dict.pkl', 'rb') as picklefile: 
    lookup_symptoms_dict = pickle.load(picklefile)

# Import symptom predictions (categorizations)

In [9]:
with open('symptoms_categorizations.pkl', 'rb') as picklefile: 
    symptoms_categorizations = pickle.load(picklefile)

# Add columns for each symptom bin

Below are the bins I want to categorize my symptoms into and their corresponding index in the unique symptoms list. I tried to get a combination of both severe symptoms and common symptoms. I also tried to find symptoms that were generalizable but not too much that there can be both severe and common reactions mapped to the same symptom.
1. Diarrhea (69)
2. Mood_swing (104)
3. Renal function (859)
4. Upper respiratory tract infection (904)
5. Prothrombin time (1013) 
6. Bleeding (1141)
7. Nasal congestion (1261)
8. Drug overdose (1297)
9. Angina (1341)
10. Dysbiosis (1873)
11. Overactive bladder (2359)
12. Suicide terminology (2695)
13. Epileptic seizure (2843)
14. Nephrotoxicity (2953)

In [37]:
symptoms_df['diarrhea'] = 0
symptoms_df['mood_swing'] = 0
symptoms_df['renal_function'] = 0
symptoms_df['upper_respiratory_tract_infection'] = 0
symptoms_df['prothrombin_time'] = 0
symptoms_df['bleeding'] = 0
symptoms_df['nasal_congestion'] = 0
symptoms_df['drug_overdose'] = 0
symptoms_df['angina'] = 0
symptoms_df['dysbiosis'] = 0
symptoms_df['overactive_bladder'] = 0
symptoms_df['suicide_terminology'] = 0
symptoms_df['epileptic_seizure'] = 0
symptoms_df['nephrotoxicity'] = 0

# Make a symptoms bin dict

In [1]:
symptoms_bin_dict = {0:'diarrhea', 1:'mood_swing', 2:'renal_function', 3: 'upper_respiratory_tract_infection', 
            4: 'prothrombin_time', 5:'bleeding', 6:'nasal_congestion', 7:'drug_overdose', 8: 'angina',
           9:'dysbiosis', 10:'overactive_bladder', 11:'suicide_terminology', 12: 'epileptic_seizure', 
           13:'nephrotoxicity'}

In [4]:
with open('symptoms_bin_dict.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_bin_dict, picklefile)

# Define a function to update the bin values according to the symptoms strings per row 

In [48]:
'''
This function attempts to map each symptom in the comma separated string
of symptoms per row in the dataframe to its associated symptom bin. 
Then it would update the symptom bin's value for that row by one so that 
there would be a count per row of each of the symptom bins.

There were some characters in the comma separated strings of symptoms
that were throwing off previous function's ability to find the associated
syptom bin in the dict.

Therefore, this function also prints a list of tuples of symptoms that 
couldn't be mapped and their associated bin. That way, I could clean up 
the characters and try to map them again.

The reason why these characters are present in the df but not in the dict
is because the count vectorizer function actually splits the symptom strings
differently than how I cleaned them.
'''

def update_bin_values(symptoms_df, lookup_symptoms_dict, bin_dict, start_index):
    symptoms_left_behind = []
    indices_to_redo = []
    for index, symptoms in enumerate(symptoms_df['symptoms'], start_index):
        for symptom in symptoms.split(","):
            try:
                symptom_label = lookup_symptoms_dict[symptom]
                symptom_bin = bin_dict[symptom_label]
                symptoms_df[symptom_bin][index] += 1
            except:
                symptoms_left_behind.append(symptom)
                indices_to_redo.append(index)
                continue
    print(set(symptoms_left_behind))
    print(indices_to_redo)
    return symptoms_df

In [49]:
'''
 This function attempts to re-map the symptoms that originally couldn't
 be mapped due to unexpected characters. 
'''

def redo_symptoms(redo_symptoms_dict, redo_symptoms_indices_list, symptoms_df, lookup_symptoms_dict, bin_dict, start_index):
    symptoms_left_behind = []
    indices_to_redo = []
    print(redo_symptoms_indices_list)
    for index in redo_symptoms_indices_list:
        print(index)
        for symptom in symptoms_df['symptoms'][index].split(","):
                if symptom in lookup_symptoms_dict:
                    continue
                else:
                    try:
                        symptom_label = redo_symptoms_dict[symptom]
                        symptom_bin = bin_dict[symptom_label]
                        symptoms_df[symptom_bin][index] += 1
                    except:
                        symptoms_left_behind.append(symptom)
                        indices_to_redo.append(index)
                        continue
    print(set(symptoms_left_behind))
    print(indices_to_redo)
    return symptoms_df

# Clean the symptoms string for each row

In [14]:
'''
This function attempts to clean each symptom in the string of symptoms for
each row. That would make it easier for binning them. 
'''
def clean_words(string):
    clean_strings = []
    string=string.lower()
    string=string.replace('. ','_')
    string=string.replace('(','_')
    string=string.replace(')','_')
    string=string.replace('/ ','_')
    string=string.replace(', ',',')
    string=string.replace('-', ',')
    string=string.replace('_', ' ')
    return string

# Clean symptoms in prep for binning

In [31]:
cleaned_symptoms = symptoms_df['symptoms'].apply(clean_words)

In [32]:
symptoms_df['symptoms'] = cleaned_symptoms

In [33]:
symptoms_df['symptoms']

0        swelling face,rash,wheezing,cough,hospitalisat...
1        swelling face,wheezing,cough,rash,hospitalisat...
2                                                  choking
3                  completed suicide,stress symptoms,death
4                          death,mitral valve incompetence
5                                 cerebrovascular accident
6        heart rate increased,dizziness,blood pressure ...
7        paraesthesia,physical examination,hospitalisat...
8        paraesthesia,physical examination,hospitalisat...
9        paraesthesia,physical examination,hospitalisat...
10       paraesthesia,physical examination,hospitalisat...
11       paraesthesia,physical examination,hospitalisat...
12       paraesthesia,physical examination,hospitalisat...
13       paraesthesia,physical examination,hospitalisat...
14       paraesthesia,physical examination,hospitalisat...
15       paraesthesia,physical examination,hospitalisat...
16       paraesthesia,physical examination,hospitalisat.

# Test the update bin value function

In [18]:
test_subset_symptoms_df = symptoms_df[:100]

In [19]:
updated_test_symptoms_df = update_bin_values(test_subset_symptoms_df, lookup_symptoms_dict, bin_dict)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{'mood altered', 'loose stools'}
[48, 62, 88, 88, 89, 89]


In [20]:
updated_test_symptoms_df[48:50]

Unnamed: 0,report_id,symptoms,diarrhea,mood_swing,renal_function,upper_respiratory_tract_infection,prothrombin_time,bleeding,nasal_congestion,drug_overdose,angina,dysbiosis,overactive_bladder,suicide_terminology,epileptic_seizure,nephrotoxicity
48,65665,"abdominal pain,loose stools,diarrhoea,flatulence",1,0,0,1,0,0,0,0,0,0,1,0,0,0
49,65677,diarrhoea,1,0,0,0,0,0,0,0,0,0,0,0,0,0


# Test the redo symptom binning function 

In [6]:
symptoms_bin_dict

{0: 'diarrhea',
 1: 'mood_swing',
 2: 'renal_function',
 3: 'upper_respiratory_tract_infection',
 4: 'prothrombin_time',
 5: 'bleeding',
 6: 'nasal_congestion',
 7: 'drug_overdose',
 8: 'angina',
 9: 'dysbiosis',
 10: 'overactive_bladder',
 11: 'suicide_terminology',
 12: 'epileptic_seizure',
 13: 'nephrotoxicity'}

In [21]:
redo_symptoms_dict = {'mood altered': 1, 'loose stools': 0}

In [22]:
redo_symptoms_indices_list = set([48, 62, 88, 88, 89, 89])

In [23]:
# I should see rows 48's diarrhea column go up by 1 value after I run the redo symptoms
updated_test_symptoms_df[48:50]

Unnamed: 0,report_id,symptoms,diarrhea,mood_swing,renal_function,upper_respiratory_tract_infection,prothrombin_time,bleeding,nasal_congestion,drug_overdose,angina,dysbiosis,overactive_bladder,suicide_terminology,epileptic_seizure,nephrotoxicity
48,65665,"abdominal pain,loose stools,diarrhoea,flatulence",1,0,0,1,0,0,0,0,0,0,1,0,0,0
49,65677,diarrhoea,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
updated_symptoms_test_redo = redo_symptoms(redo_symptoms_dict, redo_symptoms_indices_list, updated_test_symptoms_df, lookup_symptoms_dict, bin_dict)
updated_symptoms_test_redo[48:50]

{48, 89, 88, 62}
48
89


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


88
62
set()
[]


Unnamed: 0,report_id,symptoms,diarrhea,mood_swing,renal_function,upper_respiratory_tract_infection,prothrombin_time,bleeding,nasal_congestion,drug_overdose,angina,dysbiosis,overactive_bladder,suicide_terminology,epileptic_seizure,nephrotoxicity
48,65665,"abdominal pain,loose stools,diarrhoea,flatulence",2,0,0,1,0,0,0,0,0,0,1,0,0,0
49,65677,diarrhoea,1,0,0,0,0,0,0,0,0,0,0,0,0,0


# Apply function the symptoms dataframe to update the symptoms bin column values in subsets of at least 1000 

## Apply to the first 1000 rows of the symptoms dataframe

In [25]:
len(symptoms_df)

34000

In [None]:
update_bin_values(symptoms_df[:1000], lookup_symptoms_dict, bin_dict)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [189]:
# I'm not redoing parkinson's disease since it's a precondition rather than a possible symptom of intaking the product
redo_symptoms_dict_1000 = {'mood altered': 1, 'loose stools': 0, 'tourette\'s disorder':12}

In [190]:
redo_symptoms_indices_list_1000 = set([48, 62, 88, 88, 89, 89, 110, 153, 539, 702, 748])

In [191]:
symptoms_1000_fixed = redo_symptoms(redo_symptoms_dict_1000, redo_symptoms_indices_list_1000, symptoms_1000, lookup_symptoms_dict, bin_dict)

{153, 702, 748, 110, 48, 88, 89, 539, 62}
153
702
748


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


110
48
88
89
539
62
{"parkinson's disease"}
[110]


In [192]:
symptoms_1000_fixed

Unnamed: 0,report_id,symptoms,diarrhea,mood_swing,renal_function,upper_respiratory_tract_infection,prothrombin_time,bleeding,nasal_congestion,drug_overdose,angina,dysbiosis,overactive_bladder,suicide_terminology,epileptic_seizure,nephrotoxicity
0,65325,"swelling face,rash,wheezing,cough,hospitalisat...",0,0,0,2,0,0,0,0,0,4,0,0,0,0
1,65325,"swelling face,wheezing,cough,rash,hospitalisat...",0,0,0,2,0,0,0,0,0,4,0,0,0,0
2,65345,choking,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,65399,"completed suicide,stress symptoms,death",0,0,0,0,0,0,0,0,0,1,0,0,2,0
4,65400,"death,mitral valve incompetence",0,0,0,0,0,0,0,0,1,0,0,0,1,0
5,65403,cerebrovascular accident,0,0,0,0,0,0,0,0,1,0,0,0,0,0
6,65416,"heart rate increased,dizziness,blood pressure ...",0,0,0,0,0,1,2,0,0,1,0,0,0,0
7,65420,"paraesthesia,physical examination,hospitalisat...",0,0,0,0,1,1,2,0,3,2,0,0,0,0
8,65420,"paraesthesia,physical examination,hospitalisat...",0,0,0,0,1,1,2,0,3,2,0,0,0,0
9,65420,"paraesthesia,physical examination,hospitalisat...",0,0,0,0,1,1,2,0,3,2,0,0,0,0


In [193]:
with open('/Symptoms/binned_symptoms_1000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_1000_fixed, picklefile)

## Next 1000 (2000)

In [41]:
symptoms_2000 = update_bin_values(symptoms_df[1000:2000], lookup_symptoms_dict, bin_dict, 1000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


"meige's syndrome"
'loose stools'
'loose stools'
'mood altered'
'mood altered'
'mood altered'
'mood altered'
'mood altered'
'c'
'c'
{"meige's syndrome", 'loose stools', 'c', 'mood altered'}
[1384, 1391, 1594, 1632, 1634, 1635, 1636, 1637, 1821, 1821]


In [44]:
redo_symptoms_dict_2000 = {'meige\'s syndrome':12 , 'loose stools':0 , 'mood altered':1}

In [45]:
redo_symptoms_indices_list_2000 = set([1384, 1391, 1594, 1632, 1634, 1635, 1636, 1637, 1821, 1821])

In [47]:
symptoms_2000_fixed = redo_symptoms(redo_symptoms_dict_2000, redo_symptoms_indices_list_2000, symptoms_2000, lookup_symptoms_dict, bin_dict, 1000)

{1632, 1634, 1635, 1636, 1637, 1384, 1391, 1594, 1821}
1632
1634
1635
1636
1637


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


1384
1391
1594
1821
{'c'}
[1821, 1821]


In [50]:
with open('/Symptoms/binned_symptoms_2000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_2000_fixed, picklefile)

## Next 1000 (3000)

In [51]:
symptoms_3000 = update_bin_values(symptoms_df[2000:3000], lookup_symptoms_dict, bin_dict, 2000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{'creatinine renal clearance decreased', 'x', 'respiratory tract congestion', 'mood altered', 'c', 'prothrombin level increased'}
[2089, 2090, 2091, 2253, 2378, 2649, 2773, 2928, 2934, 2981]


In [56]:
redo_symptoms_dict_3000 = {'creatinine renal clearance decreased':2 , 'respiratory tract congestion': 3 , 'mood altered':1, 'prothrombin level increased':5}

In [54]:
redo_symptoms_indices_list_3000 = set([2089, 2090, 2091, 2253, 2378, 2649, 2773, 2928, 2934, 2981])

In [57]:
symptoms_3000_fixed = redo_symptoms(redo_symptoms_dict_3000, redo_symptoms_indices_list_3000, symptoms_3000, lookup_symptoms_dict, bin_dict, 2000)

{2981, 2089, 2090, 2091, 2378, 2253, 2928, 2773, 2934, 2649}
2981
2089
2090
2091


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


2378
2253
2928
2773
2934
2649
{'x', 'c'}
[2378, 2773]


In [58]:
with open('/Symptoms/binned_symptoms_3000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_3000_fixed, picklefile)

## Next 1000 (4000)

In [59]:
symptoms_4000 = update_bin_values(symptoms_df[3000:4000], lookup_symptoms_dict, bin_dict, 3000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{'x', 'respiratory tract congestion', 'mood altered', 'operative haemorrhage', 'lip and/or oral cavity cancer', 'nasal oedema'}
[3036, 3050, 3059, 3377, 3412, 3721, 3722, 3723, 3922, 3923, 3960]


In [61]:
# Not redoing the oral cancer since it's a precondition
redo_symptoms_dict_4000 = {'respiratory tract congestion': 3 , 'mood altered':1, 'operative haemorrhage': 5, 'nasal oedema':6}

In [62]:
redo_symptoms_indices_list_4000 = set([3036, 3050, 3059, 3377, 3412, 3721, 3722, 3723, 3922, 3923, 3960])

In [63]:
symptoms_4000_fixed = redo_symptoms(redo_symptoms_dict_4000, redo_symptoms_indices_list_4000, symptoms_4000, lookup_symptoms_dict, bin_dict, 3000)

{3721, 3050, 3722, 3723, 3377, 3922, 3059, 3412, 3923, 3960, 3036}
3721
3050
3722
3723
3377
3922
3059
3412
3923
3960
3036
{'x', 'lip and/or oral cavity cancer'}
[3721, 3722, 3723, 3412]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [64]:
with open('/Symptoms/binned_symptoms_4000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_4000_fixed, picklefile)

## Next 1000 (5000)

In [67]:
symptoms_5000 = update_bin_values(symptoms_df[4000:5000], lookup_symptoms_dict, bin_dict, 4000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{'cardiac discomfort', 'incorrect dose administered', 'respiratory tract congestion', 'mood altered'}
[4043, 4044, 4045, 4046, 4047, 4107, 4385, 4445, 4458]


In [69]:
redo_symptoms_dict_5000 = {'incorrect dose administered': 7, 'respiratory tract congestion':2 , 'cardiac discomfort': 8 , 'mood altered':1}

In [70]:
redo_symptoms_indices_list_5000 = set([4043, 4044, 4045, 4046, 4047, 4107, 4385, 4445, 4458])

In [71]:
symptoms_5000_fixed = redo_symptoms(redo_symptoms_dict_5000, redo_symptoms_indices_list_5000, symptoms_5000, lookup_symptoms_dict, bin_dict, 4000)

{4385, 4458, 4043, 4044, 4045, 4046, 4047, 4107, 4445}
4385
4458
4043
4044
4045
4046


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


4047
4107
4445
set()
[]


In [72]:
with open('/Symptoms/binned_symptoms_5000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_5000_fixed, picklefile)

## Next 1000 (6000)

In [73]:
symptoms_6000 = update_bin_values(symptoms_df[5000:6000], lookup_symptoms_dict, bin_dict, 5000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{"meniere's disease", 'respiratory tract congestion', 'mood altered', 'b', "peyronie's disease", "ewing's sarcoma"}
[5165, 5220, 5221, 5239, 5263, 5264, 5287, 5717, 5995]


In [75]:
redo_symptoms_dict_6000 = {'meniere\'s disease': 3, 'respiratory tract congestion': 3, 'mood altered': 1}

In [76]:
redo_symptoms_indices_list_6000 = set([5165, 5220, 5221, 5239, 5263, 5264, 5287, 5717, 5995])

In [77]:
symptoms_6000_fixed = redo_symptoms(redo_symptoms_dict_6000, redo_symptoms_indices_list_6000, symptoms_6000, lookup_symptoms_dict, bin_dict, 5000)

{5220, 5221, 5287, 5995, 5165, 5263, 5264, 5717, 5239}
5220
5221
5287
5995
5165
5263
5264
5717


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


5239
{"ewing's sarcoma", "peyronie's disease", 'b'}
[5287, 5995, 5165]


In [78]:
with open('/Symptoms/binned_symptoms_6000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_6000_fixed, picklefile)

## Next 1000 (7000)

In [79]:
symptoms_7000 = update_bin_values(symptoms_df[6000:7000], lookup_symptoms_dict, bin_dict, 6000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{'c', 'mood altered'}
[6044, 6045, 6169, 6170, 6171, 6172, 6173, 6174, 6175, 6176, 6177, 6178, 6179, 6180, 6181, 6182, 6183, 6184, 6185, 6186, 6187, 6188, 6189, 6190, 6191, 6192, 6193, 6194, 6195, 6196, 6197, 6198, 6199, 6200, 6201, 6202, 6203, 6204, 6205, 6206, 6207]


In [81]:
redo_symptoms_dict_7000 = {'mood altered': 1}

In [82]:
redo_symptoms_indices_list_7000 = set([6044, 6045, 6169, 6170, 6171, 6172, 6173, 6174, 6175, 6176, 6177, 6178, 6179, 6180, 6181, 6182, 6183, 6184, 6185, 6186, 6187, 6188, 6189, 6190, 6191, 6192, 6193, 6194, 6195, 6196, 6197, 6198, 6199, 6200, 6201, 6202, 6203, 6204, 6205, 6206, 6207])

In [83]:
symptoms_7000_fixed = redo_symptoms(redo_symptoms_dict_7000, redo_symptoms_indices_list_7000, symptoms_7000, lookup_symptoms_dict, bin_dict, 6000)

{6169, 6170, 6171, 6172, 6173, 6044, 6045, 6176, 6177, 6178, 6179, 6180, 6181, 6182, 6183, 6184, 6185, 6186, 6187, 6188, 6189, 6190, 6191, 6192, 6193, 6194, 6195, 6196, 6197, 6198, 6199, 6200, 6201, 6202, 6203, 6204, 6205, 6206, 6207, 6174, 6175}
6169
6170
6171
6172
6173
6044
6045
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6174
6175
{'c'}
[6169, 6170, 6171, 6172, 6173, 6176, 6177, 6178, 6179, 6180, 6181, 6182, 6183, 6184, 6185, 6186, 6187, 6188, 6189, 6190, 6191, 6192, 6193, 6194, 6195, 6196, 6197, 6198, 6199, 6200, 6201, 6202, 6203, 6204, 6205, 6206, 6207, 6174, 6175]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [84]:
with open('/Symptoms/binned_symptoms_7000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_7000_fixed, picklefile)

## Next 1000 (8000)

In [85]:
symptoms_8000 = update_bin_values(symptoms_df[7000:8000], lookup_symptoms_dict, bin_dict, 7000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{'mood altered', 'respiratory tract congestion', 'operative haemorrhage'}
[7346, 7347, 7348, 7349, 7350, 7833, 7835, 7989, 7990, 7991]


In [87]:
redo_symptoms_dict_8000 = {'mood altered': 1, 'respiratory tract congestion': 3, 'operative haemorrhage': 5}

In [88]:
redo_symptoms_indices_list_8000 = set([7346, 7347, 7348, 7349, 7350, 7833, 7835, 7989, 7990, 7991])

In [89]:
symptoms_8000_fixed = redo_symptoms(redo_symptoms_dict_8000, redo_symptoms_indices_list_8000, symptoms_8000, lookup_symptoms_dict, bin_dict, 7000)

{7991, 7346, 7347, 7348, 7349, 7350, 7989, 7990, 7833, 7835}
7991
7346
7347
7348


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


7349
7350
7989
7990
7833
7835
set()
[]


In [90]:
with open('/Symptoms/binned_symptoms_8000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_8000_fixed, picklefile)

## Next 1000 (9000)

In [91]:
symptoms_9000 = update_bin_values(symptoms_df[8000:9000], lookup_symptoms_dict, bin_dict, 8000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{"brunner's gland hyperplasia", 'respiratory tract congestion', 'mood altered'}
[8196, 8197, 8256, 8257, 8407, 8464, 8848, 8849]


In [93]:
redo_symptoms_dict_9000 = {'mood altered': 1, 'respiratory tract congestion': 3, 'brunner\'s gland hyperplasia': 5}

In [94]:
redo_symptoms_indices_list_9000 = set([8196, 8197, 8256, 8257, 8407, 8464, 8848, 8849])

In [95]:
symptoms_9000_fixed = redo_symptoms(redo_symptoms_dict_9000, redo_symptoms_indices_list_9000, symptoms_9000, lookup_symptoms_dict, bin_dict, 8000)

{8256, 8257, 8196, 8197, 8464, 8848, 8849, 8407}
8256
8257
8196
8197

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.



8464
8848
8849
8407
set()
[]


In [96]:
with open('/Symptoms/binned_symptoms_9000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_9000_fixed, picklefile)

## Next 1000 (10000)

In [97]:
symptoms_10000 = update_bin_values(symptoms_df[9000:10000], lookup_symptoms_dict, bin_dict, 9000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{'cardiac discomfort', 'respiratory tract congestion', 'mood altered'}
[9031, 9162, 9620, 9876, 9895, 9898]


In [99]:
redo_symptoms_dict_10000 = {'mood altered': 1, 'respiratory tract congestion': 3, 'cardiac discomfort': 8}

In [100]:
redo_symptoms_indices_list_10000 = set([9031, 9162, 9620, 9876, 9895, 9898])

In [101]:
symptoms_10000_fixed = redo_symptoms(redo_symptoms_dict_10000, redo_symptoms_indices_list_10000, symptoms_10000, lookup_symptoms_dict, bin_dict, 9000)

{9895, 9031, 9162, 9898, 9876, 9620}
9895
9031
9162
9898
9876
9620
set()
[]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [102]:
with open('/Symptoms/binned_symptoms_10000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_10000_fixed, picklefile)

## Next 2000 (12000)

In [103]:
symptoms_12000 = update_bin_values(symptoms_df[10000:12000], lookup_symptoms_dict, bin_dict, 10000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{'creatinine renal clearance decreased', 'x', 'respiratory tract congestion', 'mood altered', 'incorrect dose administered', 'cardiac discomfort', 'disbacteriosis'}
[10061, 10062, 10063, 10064, 10065, 10066, 10068, 10069, 10070, 10071, 10124, 10125, 10126, 10149, 10267, 10789, 10790, 10791, 10794, 10795, 10796, 10821, 10822, 10823, 10824, 10825, 10826, 10827, 10929, 10959, 11156, 11224, 11225, 11226, 11227, 11228, 11383, 11384, 11590, 11597, 11598, 11812, 11813, 11868]


In [105]:
redo_symptoms_dict_12000 = {'creatinine renal clearance decreased': 2, 'incorrect dose administered': 7, 'mood altered': 1, 'respiratory tract congestion': 3, 'cardiac discomfort': 8, 'disbacteriosis': 9}

In [106]:
redo_symptoms_indices_list_12000 = set([10061, 10062, 10063, 10064, 10065, 10066, 10068, 10069, 10070, 10071, 10124, 10125, 10126, 10149, 10267, 10789, 10790, 10791, 10794, 10795, 10796, 10821, 10822, 10823, 10824, 10825, 10826, 10827, 10929, 10959, 11156, 11224, 11225, 11226, 11227, 11228, 11383, 11384, 11590, 11597, 11598, 11812, 11813, 11868])

In [107]:
symptoms_12000_fixed = redo_symptoms(redo_symptoms_dict_12000, redo_symptoms_indices_list_12000, symptoms_12000, lookup_symptoms_dict, bin_dict, 10000)

{10124, 10125, 10126, 11156, 10267, 11812, 10149, 10789, 10790, 10791, 11813, 10794, 10795, 10796, 10929, 10821, 10822, 10823, 10824, 10825, 10826, 10827, 11590, 10061, 10062, 10063, 10064, 10065, 10066, 10959, 10068, 10069, 10070, 10071, 11224, 11225, 11226, 11227, 11228, 11868, 11597, 11598, 11383, 11384}
10124
10125
10126
11156
10267


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


11812
10149
10789
10790
10791
11813
10794
10795
10796
10929
10821
10822
10823
10824
10825
10826
10827
11590
10061
10062
10063
10064
10065
10066
10959
10068
10069
10070
10071
11224
11225
11226
11227
11228
11868
11597
11598
11383
11384
{'x'}
[10789, 10790, 10791, 10794, 10795, 10796, 10821, 10822, 10823, 10824, 10825, 10826, 10827]


In [108]:
with open('/Symptoms/binned_symptoms_12000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_12000_fixed, picklefile)

## Next 2000 (14000)

In [109]:
symptoms_14000 = update_bin_values(symptoms_df[12000:14000], lookup_symptoms_dict, bin_dict, 12000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{"meniere's disease", 'respiratory tract congestion', 'incorrect dose administered', 'cardiac discomfort', 'disbacteriosis'}
[12356, 12513, 12514, 12515, 12516, 12517, 12518, 12693, 12694, 12695, 12696, 12697, 12698, 13097, 13098, 13099, 13100, 13378, 13650, 13696, 13843]


In [111]:
redo_symptoms_dict_14000 = {'meniere\'s disease': 12, 'respiratory tract congestion': 3,'incorrect dose administered': 7,'cardiac discomfort': 8, 'disbacteriosis': 9}

In [112]:
redo_symptoms_indices_list_14000 = set([12356, 12513, 12514, 12515, 12516, 12517, 12518, 12693, 12694, 12695, 12696, 12697, 12698, 13097, 13098, 13099, 13100, 13378, 13650, 13696, 13843])

In [113]:
symptoms_14000_fixed = redo_symptoms(redo_symptoms_dict_14000, redo_symptoms_indices_list_14000, symptoms_14000, lookup_symptoms_dict, bin_dict, 12000)

{13696, 13843, 12693, 12694, 12695, 12696, 12697, 12698, 13097, 13098, 13099, 13100, 13378, 12356, 13650, 12513, 12514, 12515, 12516, 12517, 12518}
13696
13843
12693
12694
12695


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


12696
12697
12698
13097
13098
13099
13100
13378
12356
13650
12513
12514
12515
12516
12517
12518
set()
[]


In [114]:
with open('/Symptoms/binned_symptoms_14000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_14000_fixed, picklefile)

## Next 2000 (16000)

In [115]:
symptoms_16000 = update_bin_values(symptoms_df[14000:16000], lookup_symptoms_dict, bin_dict, 14000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{'x', 'respiratory tract congestion', 'mood altered', 'incorrect dose administered', 'c', "barrett's oesophagus", 'cardiac discomfort'}
[14043, 14281, 14553, 14658, 14706, 14866, 14891, 14911, 15019, 15242, 15507, 15508, 15509, 15510, 15684, 15685, 15686]


In [120]:
redo_symptoms_dict_16000 = {'respiratory tract congestion': 3,'incorrect dose administered': 7,'cardiac discomfort': 8, 'disbacteriosis': 9, 'barrett\'s oesophagus': 3, 'mood altered': 1}

In [118]:
redo_symptoms_indices_list_16000 = set([14043, 14281, 14553, 14658, 14706, 14866, 14891, 14911, 15019, 15242, 15507, 15508, 15509, 15510, 15684, 15685, 15686])

In [121]:
symptoms_16000_fixed = redo_symptoms(redo_symptoms_dict_16000, redo_symptoms_indices_list_16000, symptoms_16000, lookup_symptoms_dict, bin_dict, 14000)

{14658, 15684, 15685, 15686, 14281, 15242, 14891, 15019, 14706, 14866, 15507, 15508, 15509, 15510, 14553, 14043, 14911}
14658
15684
15685
15686


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


14281
15242
14891
15019
14706
14866
15507
15508
15509
15510
14553
14043
14911
{'x', 'c'}
[14553, 14911]


In [122]:
with open('/Symptoms/binned_symptoms_16000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_16000_fixed, picklefile)

## Next 4000 (20000)

In [123]:
symptoms_20000 = update_bin_values(symptoms_df[16000:20000], lookup_symptoms_dict, bin_dict, 16000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{'attention deficit/hyperactivity disorder', "wernicke's encephalopathy", 'respiratory tract congestion', 'mood altered', "hodgkin's lymphoma", "dementia alzheimer's type", 'incorrect dose administered', 'urine protein/creatinine ratio increased', 'cardiac discomfort', "basedow's disease", 'bladder irritation', "cow's milk intolerance", 'nasal oedema'}
[16070, 16506, 16908, 16909, 16910, 16911, 16912, 16934, 17041, 17052, 17052, 17167, 17168, 17169, 17170, 17240, 17480, 17784, 18234, 18343, 18391, 18392, 18435, 18779, 19128, 19169, 19170, 19257, 19290, 19291, 19967]


In [128]:
redo_symptoms_dict_20000 = {'attention deficit/hyperactivity disorder': 1, 'wernicke\'s encephalopathy': 1, 'respiratory tract congestion': 3, 'mood altered': 1, 'hodgkin\'s lymphoma': 4,'dementia alzheimer\'s type': 12, 'incorrect dose administered': 7, 'urine protein/creatinine ratio increased': 10, 'cardiac discomfort': 8, 'bladder irritation': 10, 'nasal oedema': 6}

In [129]:
redo_symptoms_indices_list_20000 = set([16070, 16506, 16908, 16909, 16910, 16911, 16912, 16934, 17041, 17052, 17052, 17167, 17168, 17169, 17170, 17240, 17480, 17784, 18234, 18343, 18391, 18392, 18435, 18779, 19128, 19169, 19170, 19257, 19290, 19291, 19967])

In [130]:
symptoms_20000_fixed = redo_symptoms(redo_symptoms_dict_20000, redo_symptoms_indices_list_20000, symptoms_20000, lookup_symptoms_dict, bin_dict, 16000)

{18435, 16908, 16909, 16910, 16911, 16912, 17041, 17167, 17168, 17169, 17170, 17052, 16934, 18343, 19128, 19257, 18234, 16070, 17480, 18391, 18392, 17240, 19290, 18779, 19291, 19169, 19170, 17784, 16506, 19967}
18435
16908
16909
16910
16911
16912
17041
17167
17168
17169
17170
17052
16934


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


18343
19128
19257
18234
16070
17480
18391
18392
17240
19290
18779
19291
19169
19170
17784
16506
19967
{"basedow's disease", "cow's milk intolerance"}
[16908, 16909, 16910, 16911, 16912, 17041, 19290, 19291]


In [131]:
with open('/Symptoms/binned_symptoms_20000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_20000_fixed, picklefile)

## Next 10000 (30000)

In [124]:
symptoms_30000 = update_bin_values(symptoms_df[20000:30000], lookup_symptoms_dict, bin_dict, 20000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{'attention deficit/hyperactivity disorder', 'x', 'respiratory tract congestion', 'mood altered', 'b', 'injurious ideation', 'c', 'total cholesterol/hdl ratio decreased', 'cardiac discomfort', 'bladder irritation', 'prothrombin level increased', "crohn's disease", 'seizure like phenomena'}
[20059, 20334, 20767, 20989, 20990, 20991, 21122, 21158, 22128, 23835, 24303, 24458, 24459, 24460, 24461, 24603, 24604, 24605, 24732, 24733, 24734, 24735, 24736, 24737, 24745, 24795, 25168, 25169, 25170, 25171, 25177, 25178, 25179, 25180, 25181, 25182, 25183, 25184, 25185, 25330, 25549, 25685, 25686, 25687, 25688, 25713, 25938, 26101, 26328, 26329, 26895, 27397, 27398, 27399, 27401, 27772, 27773, 27888, 27889, 27890, 27901, 27902, 27972, 27973, 27975, 28049, 28106, 28106, 28107, 28107, 28189, 28190, 28212, 28217, 28246, 28246, 28738, 28815, 28853]


In [133]:
redo_symptoms_dict_30000 = {'respiratory tract congestion': 3, 'mood altered': 1, 'injurious ideation': 11,'total cholesterol/hdl ratio decreased': 5,'cardiac discomfort': 8,'bladder irritation': 5,'prothrombin level increased': 4, 'seizure like phenomena': 12}

In [134]:
redo_symptoms_indices_list_30000 = set([20059, 20334, 20767, 20989, 20990, 20991, 21122, 21158, 22128, 23835, 24303, 24458, 24459, 24460, 24461, 24603, 24604, 24605, 24732, 24733, 24734, 24735, 24736, 24737, 24745, 24795, 25168, 25169, 25170, 25171, 25177, 25178, 25179, 25180, 25181, 25182, 25183, 25184, 25185, 25330, 25549, 25685, 25686, 25687, 25688, 25713, 25938, 26101, 26328, 26329, 26895, 27397, 27398, 27399, 27401, 27772, 27773, 27888, 27889, 27890, 27901, 27902, 27972, 27973, 27975, 28049, 28106, 28106, 28107, 28107, 28189, 28190, 28212, 28217, 28246, 28246, 28738, 28815, 28853])

In [135]:
symptoms_30000_fixed = redo_symptoms(redo_symptoms_dict_30000, redo_symptoms_indices_list_30000, symptoms_30000, lookup_symptoms_dict, bin_dict, 20000)

{21122, 28189, 27397, 27398, 27399, 27401, 24458, 24459, 24460, 24461, 26895, 28815, 28049, 24603, 23835, 24605, 24734, 24732, 24733, 24604, 20767, 24735, 24736, 24737, 21158, 28190, 28246, 24745, 28212, 28853, 28217, 27901, 28738, 27972, 27773, 27973, 27975, 28106, 28107, 25549, 25168, 25169, 25170, 25171, 25938, 25685, 25686, 25687, 25688, 25177, 25178, 20059, 24795, 25179, 25180, 25181, 25182, 25183, 25184, 25185, 27902, 20334, 24303, 22128, 25713, 25330, 26328, 26329, 26101, 27888, 27889, 27890, 27772, 20989, 20990, 20991}
21122
28189
27397
27398


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


27399
27401
24458
24459
24460
24461
26895
28815
28049
24603
23835
24605
24734
24732
24733
24604
20767
24735
24736
24737
21158
28190
28246
24745
28212
28853
28217
27901
28738
27972
27773
27973
27975
28106
28107
25549
25168
25169
25170
25171
25938
25685
25686
25687
25688
25177
25178
20059
24795
25179
25180
25181
25182
25183
25184
25185
27902
20334
24303
22128
25713
25330
26328
26329
26101
27888
27889
27890
27772
20989
20990
20991
{'attention deficit/hyperactivity disorder', 'x', 'b', 'c', "crohn's disease"}
[24603, 24605, 24734, 24732, 24733, 24604, 24735, 24736, 24737, 28217, 28106, 28106, 28107, 28107, 25549, 24303, 22128, 25713, 25330, 26328, 26329, 27888, 27889, 27890]


In [136]:
with open('/Symptoms/binned_symptoms_30000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_30000_fixed, picklefile)

## Remaining 4000 (34000)

In [126]:
symptoms_34000 = update_bin_values(symptoms_df[30000:34000], lookup_symptoms_dict, bin_dict, 30000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


{'x', 'respiratory tract congestion', 'mood altered', 'nephropathy toxic', "hodgkin's lymphoma", 'c', "basedow's disease", "cow's milk intolerance", 'seizure like phenomena', 'nasal oedema'}
[30431, 30678, 31838, 31877, 31968, 31969, 32246, 32247, 32417, 32418, 32419, 32420, 32421, 32422, 32423, 32596, 32619, 33032, 33033, 33034, 33035]


In [137]:
bin_dict

{0: 'diarrhea',
 1: 'mood_swing',
 2: 'renal_function',
 3: 'upper_respiratory_tract_infection',
 4: 'prothrombin_time',
 5: 'bleeding',
 6: 'nasal_congestion',
 7: 'drug_overdose',
 8: 'angina',
 9: 'dysbiosis',
 10: 'overactive_bladder',
 11: 'suicide_terminology',
 12: 'epileptic_seizure',
 13: 'nephrotoxicity'}

In [138]:
redo_symptoms_dict_34000 = {'respiratory tract congestion': 3, 'mood altered': 1,'nephropathy toxic': 13,'seizure like phenomena': 12, 'nasal oedema': 6}

In [139]:
redo_symptoms_indices_list_34000 = set([30431, 30678, 31838, 31877, 31968, 31969, 32246, 32247, 32417, 32418, 32419, 32420, 32421, 32422, 32423, 32596, 32619, 33032, 33033, 33034, 33035])

In [140]:
symptoms_34000_fixed = redo_symptoms(redo_symptoms_dict_34000, redo_symptoms_indices_list_34000, symptoms_34000, lookup_symptoms_dict, bin_dict, 30000)

{31877, 33032, 33033, 33034, 33035, 32417, 32418, 32419, 32420, 32421, 32422, 32423, 32596, 30678, 31838, 30431, 31968, 31969, 32619, 32246, 32247}
31877
33032
33033
33034
33035


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


32417
32418
32419
32420
32421
32422
32423
32596
30678
31838
30431
31968
31969
32619
32246
32247
{'x', "hodgkin's lymphoma", 'c', "basedow's disease", "cow's milk intolerance"}
[31877, 32417, 32418, 32419, 32420, 32421, 32422, 32423, 30431, 31968, 31969, 32246, 32247]


In [141]:
with open('/Symptoms/binned_symptoms_34000.pkl', 'wb') as picklefile:
        pickle.dump(symptoms_34000_fixed, picklefile)

# Concatenate all subsets into a full symptoms df

In [167]:
with open('/Symptoms/binned_symptoms_1000.pkl', 'rb') as picklefile: 
    binned_symptoms_1000 = pickle.load(picklefile)

In [142]:
with open('/Symptoms/binned_symptoms_2000.pkl', 'rb') as picklefile: 
    binned_symptoms_2000 = pickle.load(picklefile)

In [143]:
with open('/Symptoms/binned_symptoms_3000.pkl', 'rb') as picklefile: 
    binned_symptoms_3000 = pickle.load(picklefile)

In [144]:
with open('/Symptoms/binned_symptoms_4000.pkl', 'rb') as picklefile: 
    binned_symptoms_4000 = pickle.load(picklefile)

In [157]:
with open('/Symptoms/binned_symptoms_5000.pkl', 'rb') as picklefile: 
    binned_symptoms_5000 = pickle.load(picklefile)

In [145]:
with open('/Symptoms/binned_symptoms_6000.pkl', 'rb') as picklefile: 
    binned_symptoms_6000 = pickle.load(picklefile)

In [146]:
with open('/Symptoms/binned_symptoms_7000.pkl', 'rb') as picklefile: 
    binned_symptoms_7000 = pickle.load(picklefile)

In [147]:
with open('/Symptoms/binned_symptoms_8000.pkl', 'rb') as picklefile: 
    binned_symptoms_8000 = pickle.load(picklefile)

In [148]:
with open('/Symptoms/binned_symptoms_9000.pkl', 'rb') as picklefile: 
    binned_symptoms_9000 = pickle.load(picklefile)

In [149]:
with open('/Symptoms/binned_symptoms_10000.pkl', 'rb') as picklefile: 
    binned_symptoms_10000 = pickle.load(picklefile)

In [150]:
with open('/Symptoms/binned_symptoms_12000.pkl', 'rb') as picklefile: 
    binned_symptoms_12000 = pickle.load(picklefile)

In [151]:
with open('/Symptoms/binned_symptoms_14000.pkl', 'rb') as picklefile: 
    binned_symptoms_14000 = pickle.load(picklefile)

In [152]:
with open('/Symptoms/binned_symptoms_16000.pkl', 'rb') as picklefile: 
    binned_symptoms_16000 = pickle.load(picklefile)

In [153]:
with open('/Symptoms/binned_symptoms_20000.pkl', 'rb') as picklefile: 
    binned_symptoms_20000 = pickle.load(picklefile)

In [154]:
with open('/Symptoms/binned_symptoms_30000.pkl', 'rb') as picklefile: 
    binned_symptoms_30000 = pickle.load(picklefile)

In [155]:
with open('/Symptoms/binned_symptoms_34000.pkl', 'rb') as picklefile: 
    binned_symptoms_34000 = pickle.load(picklefile)

In [168]:
frames = [binned_symptoms_1000, binned_symptoms_2000, binned_symptoms_3000, binned_symptoms_4000, 
          binned_symptoms_5000, binned_symptoms_6000, binned_symptoms_7000, binned_symptoms_8000, 
          binned_symptoms_9000, binned_symptoms_10000,binned_symptoms_12000, binned_symptoms_14000, 
          binned_symptoms_16000, binned_symptoms_20000, binned_symptoms_30000, binned_symptoms_34000]

In [169]:
symptoms_df_final = pd.concat(frames)
symptoms_df_final.head(5)

Unnamed: 0,report_id,symptoms,diarrhea,mood_swing,renal_function,upper_respiratory_tract_infection,prothrombin_time,bleeding,nasal_congestion,drug_overdose,angina,dysbiosis,overactive_bladder,suicide_terminology,epileptic_seizure,nephrotoxicity
0,65325,"swelling face,rash,wheezing,cough,hospitalisat...",0,0,0,2,0,0,0,0,0,4,0,0,0,0
1,65325,"swelling face,wheezing,cough,rash,hospitalisat...",0,0,0,2,0,0,0,0,0,4,0,0,0,0
2,65345,choking,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,65399,"completed suicide,stress symptoms,death",0,0,0,0,0,0,0,0,0,1,0,0,2,0
4,65400,"death,mitral valve incompetence",0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [170]:
symptoms_df_final.drop(columns= ['report_id'], inplace=True)
symptoms_df_final.drop(columns= ['symptoms'], inplace=True)

In [173]:
symptoms_df_final.head(15)

Unnamed: 0,diarrhea,mood_swing,renal_function,upper_respiratory_tract_infection,prothrombin_time,bleeding,nasal_congestion,drug_overdose,angina,dysbiosis,overactive_bladder,suicide_terminology,epileptic_seizure,nephrotoxicity
0,0,0,0,2,0,0,0,0,0,4,0,0,0,0
1,0,0,0,2,0,0,0,0,0,4,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,2,0
4,0,0,0,0,0,0,0,0,1,0,0,0,1,0
5,0,0,0,0,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,1,2,0,0,1,0,0,0,0
7,0,0,0,0,1,1,2,0,3,2,0,0,0,0
8,0,0,0,0,1,1,2,0,3,2,0,0,0,0
9,0,0,0,0,1,1,2,0,3,2,0,0,0,0


# Upload the new dataframes to Postgre SQL

In [174]:
engine = create_engine('postgresql://ubuntu:password@52.14.207.9:5432/reactions', echo=False)

In [175]:
symptoms_df_final.to_sql(name='symptoms_df', con=engine, if_exists = 'replace', index=False)

# Summary

### What I did
1. Added the bins as columns of zeros to the symptoms df
2. Defined functions to update the bins according to the symptoms string within each row of the df
3. Applied the functions to update all rows with the associated symptom bin values

### What I will do next 
1. Process the outcomes (target) for my model